diff --git a/Changelog b/Changelog index 2eb785fe44145..4217449438a9d 100644 --- a/Changelog +++ b/Changelog @@ -16,6 +16,8 @@ version : - APV decoder and APV raw bitstream muxing and demuxing - APV parser - APV encoding support through a libopenapv wrapper +- VVC decoder supports all content of SCC (Screen Content Coding): + IBC (Intra Block Copy), Palette Mode and ACT (Adaptive Color Transform) version 7.1: diff --git a/MAINTAINERS b/MAINTAINERS index d1d87752b9193..0fba390938552 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -176,6 +176,7 @@ Codecs: dss_sp.c Oleksij Rempel dv.c Roman Shaposhnik dvbsubdec.c Anshul Maheshwari + dxv.*, dxvenc.* Emma Worley eacmv*, eaidct*, eat* Peter Ross exif.c, exif.h Thilo Borgmann ffv1* [2] Michael Niedermayer diff --git a/Makefile b/Makefile index e2250f6bc6f2c..877b0071f6cc7 100644 --- a/Makefile +++ b/Makefile @@ -19,14 +19,20 @@ vpath %/fate_config.sh.template $(SRC_PATH) TESTTOOLS = audiogen videogen rotozoom tiny_psnr tiny_ssim base64 audiomatch HOSTPROGS := $(TESTTOOLS:%=tests/%) doc/print_options -ALLFFLIBS = avcodec avdevice avfilter avformat avutil postproc swscale swresample +ALLFFLIBS = \ + avcodec \ + avdevice \ + avfilter \ + avformat \ + avutil \ + swscale \ + swresample \ # $(FFLIBS-yes) needs to be in linking order FFLIBS-$(CONFIG_AVDEVICE) += avdevice FFLIBS-$(CONFIG_AVFILTER) += avfilter FFLIBS-$(CONFIG_AVFORMAT) += avformat FFLIBS-$(CONFIG_AVCODEC) += avcodec -FFLIBS-$(CONFIG_POSTPROC) += postproc FFLIBS-$(CONFIG_SWRESAMPLE) += swresample FFLIBS-$(CONFIG_SWSCALE) += swscale @@ -104,8 +110,7 @@ SUBDIR_VARS := CLEANFILES FFLIBS HOSTPROGS TESTPROGS TOOLS \ ALTIVEC-OBJS VSX-OBJS MMX-OBJS X86ASM-OBJS \ MIPSFPU-OBJS MIPSDSPR2-OBJS MIPSDSP-OBJS MSA-OBJS \ MMI-OBJS LSX-OBJS LASX-OBJS RV-OBJS RVV-OBJS RVVB-OBJS \ - OBJS SLIBOBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS \ - SIMD128-OBJS + OBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS SIMD128-OBJS define RESET $(1) := diff --git a/configure b/configure index 0609dac4abc90..89a766b403d55 100755 --- a/configure +++ b/configure @@ -249,7 +249,7 @@ External library support: --enable-liblensfun enable lensfun lens correction [no] --enable-libmodplug enable ModPlug via libmodplug [no] --enable-libmp3lame enable MP3 encoding via libmp3lame [no] - --enable-liboapv enable APV encoding/decoding via liboapv [no] + --enable-liboapv enable APV encoding via liboapv [no] --enable-libopencore-amrnb enable AMR-NB de/encoding via libopencore-amrnb [no] --enable-libopencore-amrwb enable AMR-WB decoding via libopencore-amrwb [no] --enable-libopencv enable video filtering via libopencv [no] @@ -2661,6 +2661,7 @@ CONFIG_EXTRA=" vp56dsp vp8dsp vulkan_encode + vvc_sei wma_freqs wmv2dsp " @@ -2910,6 +2911,7 @@ mpegvideoenc_select="aandcttables fdctdsp me_cmp mpegvideo pixblockdsp" msmpeg4dec_select="h263_decoder" msmpeg4enc_select="h263_encoder" vc1dsp_select="h264chroma qpeldsp startcode" +vvc_sei_select="atsc_a53 golomb" wmv2dsp_select="qpeldsp" # decoders / encoders @@ -3147,7 +3149,7 @@ vp6f_decoder_select="vp6_decoder" vp7_decoder_select="h264pred videodsp vp8dsp" vp8_decoder_select="h264pred videodsp vp8dsp" vp9_decoder_select="videodsp vp9_parser vp9_superframe_split_bsf" -vvc_decoder_select="cabac cbs_h266 golomb videodsp" +vvc_decoder_select="cabac cbs_h266 golomb videodsp vvc_sei" wcmv_decoder_select="inflate_wrapper" webp_decoder_select="vp8_decoder exif" wmalossless_decoder_select="llauddsp" @@ -3747,6 +3749,7 @@ wav_demuxer_select="riffdec" wav_muxer_select="riffenc" webm_chunk_muxer_select="webm_muxer"
webm_dash_manifest_demuxer_select="matroska_demuxer" +whip_muxer_deps_any="dtls_protocol" wtv_demuxer_select="mpegts_demuxer riffdec" wtv_muxer_select="mpegts_muxer riffenc" xmv_demuxer_select="riffdec" @@ -3845,6 +3848,9 @@ srtp_protocol_select="rtp_protocol srtp" tcp_protocol_select="network" tls_protocol_deps_any="gnutls openssl schannel securetransport libtls mbedtls" tls_protocol_select="tcp_protocol" +# TODO: Support libtls, mbedtls, and gnutls. +dtls_protocol_deps_any="openssl" +dtls_protocol_select="udp_protocol" udp_protocol_select="network" udplite_protocol_select="network" unix_protocol_deps="sys_un_h" @@ -3879,6 +3885,7 @@ ass_filter_deps="libass" avgblur_opencl_filter_deps="opencl" avgblur_vulkan_filter_deps="vulkan spirv_compiler" azmq_filter_deps="libzmq" +blackdetect_vulkan_filter_deps="vulkan spirv_compiler" blackframe_filter_deps="gpl" blend_vulkan_filter_deps="vulkan spirv_compiler" boxblur_filter_deps="gpl" @@ -3986,6 +3993,7 @@ vpp_amf_filter_deps="amf" scale_qsv_filter_deps="libmfx" scale_qsv_filter_select="qsvvpp" scdet_filter_select="scene_sad" +scdet_vulkan_filter_deps="vulkan spirv_compiler" select_filter_select="scene_sad" sharpness_vaapi_filter_deps="vaapi" showcqt_filter_deps="avformat swscale" @@ -4159,6 +4167,8 @@ if test "$target_os_default" = aix; then arch_default=$(uname -p) strip_default="strip -X32_64" nm_default="nm -g -X32_64" +elif test "$MSYSTEM_CARCH" != ""; then + arch_default="$MSYSTEM_CARCH" else arch_default=$(uname -m) fi @@ -5293,7 +5303,7 @@ case "$arch" in arm*|iPad*|iPhone*) arch="arm" ;; - loongarch*) + loongarch*|loong64) arch="loongarch" ;; mips*|IP*) @@ -7192,6 +7202,14 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r } enabled vapoursynth && require_headers "vapoursynth/VSScript4.h vapoursynth/VapourSynth4.h" +enabled openssl && { + enabled whip_muxer && { + $pkg_config --exists --print-errors "openssl >= 1.0.1k" || + require_pkg_config openssl "openssl >= 1.0.1k" openssl/ssl.h SSL_library_init || + require_pkg_config openssl "openssl >= 1.0.1k" openssl/ssl.h OPENSSL_init_ssl + } +} + if enabled gcrypt; then GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" diff --git a/doc/APIchanges b/doc/APIchanges index d0869561f39db..91710bb27d6ff 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -2,6 +2,9 @@ The last version increases of all libraries were on 2025-03-28 API changes, most recent first: +2025-05-21 - xxxxxxxxxx - lavu 60.3.100 - avassert.h + Add av_unreachable() and av_assume() macros. + 2025-02-xx - xxxxxxxxxx - lavfi 10.10.100 - avfilter.h Add avfilter_link_get_hw_frames_ctx(). 
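As a rough sketch of how the new avassert.h additions listed above might be used. The argument forms shown here are assumptions only (a short explanatory message for av_unreachable(), a boolean condition for av_assume()); consult libavutil/avassert.h for the exact macro definitions.

    #include <stdint.h>
    #include "libavutil/avassert.h"

    /* Hypothetical helper: the default branch cannot be taken because the
     * caller validated fmt, so state that for readers and for the compiler
     * (the macro is expected to count as unreachable, hence no return here). */
    static int bytes_per_sample(int fmt)
    {
        switch (fmt) {
        case 0:  return 1;
        case 1:  return 2;
        case 2:  return 4;
        default: av_unreachable("fmt was validated by the caller");
        }
    }

    static void halve(uint8_t *dst, const uint8_t *src, int len)
    {
        /* Let the optimizer assume len is a positive multiple of 4, so the
         * loop can be vectorized without a scalar tail. */
        av_assume(len > 0 && (len & 3) == 0);
        for (int i = 0; i < len; i++)
            dst[i] = src[i] / 2;
    }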
diff --git a/doc/examples/filter_audio.c b/doc/examples/filter_audio.c index 8b237e2adf3b6..02222f591417c 100644 --- a/doc/examples/filter_audio.c +++ b/doc/examples/filter_audio.c @@ -270,7 +270,6 @@ int main(int argc, char *argv[]) AVFilterGraph *graph; AVFilterContext *src, *sink; AVFrame *frame; - uint8_t errstr[1024]; float duration; int err, nb_frames, i; @@ -354,7 +353,6 @@ int main(int argc, char *argv[]) return 0; fail: - av_strerror(err, errstr, sizeof(errstr)); - fprintf(stderr, "%s\n", errstr); + fprintf(stderr, "%s\n", av_err2str(err)); return 1; } diff --git a/doc/examples/qsv_decode.c b/doc/examples/qsv_decode.c index 5a6f3625aa6ee..ec91109480835 100644 --- a/doc/examples/qsv_decode.c +++ b/doc/examples/qsv_decode.c @@ -219,11 +219,8 @@ int main(int argc, char **argv) ret = decode_packet(decoder_ctx, frame, sw_frame, NULL, output_ctx); finish: - if (ret < 0) { - char buf[1024]; - av_strerror(ret, buf, sizeof(buf)); - fprintf(stderr, "%s\n", buf); - } + if (ret < 0) + fprintf(stderr, "%s\n", av_err2str(ret)); avformat_close_input(&input_ctx); diff --git a/doc/filters.texi b/doc/filters.texi index 679b71f29065c..63f55f5794e70 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -8634,7 +8634,7 @@ Filter out noisy pixels from @code{bitplane} set above. Default is disabled. @end table -@section blackdetect +@section blackdetect, blackdetect_vulkan Detect video intervals that are (almost) completely black. Can be useful to detect chapter transitions, commercials, or invalid @@ -8687,6 +8687,12 @@ the input video format, the range is [0-255] for YUV full-range formats and [16-235] for YUV non full-range formats. Default value is 0.10. + +@item alpha +If true, check the alpha channel instead of the luma channel. Detects frames +which are (almost) transparent, instead of frames which are almost black. + +Default value is disabled. @end table The following example sets the maximum pixel threshold to the minimum @@ -16259,6 +16265,16 @@ and @code{(oh-ph)/2}. Set the output placement width/height expressions, default values are @code{ow} and @code{oh}. +@item rotate +Rotate the input frame clockwise by the specified angle. + +@table @samp +@item 0, 360 +@item 90 +@item 180 +@item 270 +@end table + @item fps Set the output frame rate. This can be rational, e.g. @code{60000/1001}. If set to the special string @code{none} (the default), input timestamps will @@ -16311,6 +16327,18 @@ to double the input image resolution: -vf "libplacebo=w=iw*2:h=ih*2:extra_opts='upscaler=custom\:upscaler_preset=ewa_lanczos\:upscaler_blur=0.9812505644269356'" @end example + +@item shader_cache +File path of a cache directory that libplacebo will use to store and load +cached shader objects. This cache is not cleaned up automatically. If the +path does not end in a directory separator, the generated filenames will be +effectively prefixed by the last path component. All directories must already +exist. 
+ +@example +-vf "libplacebo=shader_cache=/tmp/pl-shader-" +@end example + @item colorspace @item color_primaries @item color_trc diff --git a/doc/htmlxref.cnf b/doc/htmlxref.cnf new file mode 100644 index 0000000000000..079c848a651a6 --- /dev/null +++ b/doc/htmlxref.cnf @@ -0,0 +1,6 @@ +ffmpeg mono ./ffmpeg.html +ffmpeg-filters mono ./ffmpeg-filters.html +ffmpeg-formats mono ./ffmpeg-formats.html +ffmpeg-resampler mono ./ffmpeg-resampler.html +ffmpeg-scaler mono ./ffmpeg-scaler.html +ffmpeg-utils mono ./ffmpeg-utils.html diff --git a/doc/muxers.texi b/doc/muxers.texi index 04b7f20b7e8cb..30c95c3d34e3b 100644 --- a/doc/muxers.texi +++ b/doc/muxers.texi @@ -3879,4 +3879,51 @@ ffmpeg -f webm_dash_manifest -i video1.webm \ manifest.xml @end example +@anchor{whip} +@section whip + +WebRTC (Real-Time Communication) muxer that supports sub-second latency streaming according to +the WHIP (WebRTC-HTTP ingestion protocol) specification. + +It uses HTTP as a signaling protocol to exchange SDP capabilities and ICE lite candidates. Then, +it uses STUN binding requests and responses to establish a session over UDP. Subsequently, it +initiates a DTLS handshake to exchange the SRTP encryption keys. Lastly, it splits video and +audio frames into RTP packets and encrypts them using SRTP. + +Ensure that you use H.264 without B frames and Opus for the audio codec. For example, to convert +an input file with @command{ffmpeg} to WebRTC: +@example +ffmpeg -re -i input.mp4 -acodec libopus -ar 48000 -ac 2 \ + -vcodec libx264 -profile:v baseline -tune zerolatency -threads 1 -bf 0 \ + -f whip "http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream" +@end example + +For this example, we have employed low latency options, resulting in an end-to-end latency of +approximately 150ms. + +@subsection Options + +This muxer supports the following options: + +@table @option + +@item handshake_timeout @var{integer} +Set the timeout in milliseconds for ICE and DTLS handshake. +Default value is 5000. + +@item pkt_size @var{integer} +Set the maximum size, in bytes, of RTP packets that send out. +Default value is 1500. + +@item authorization @var{string} +The optional Bearer token for WHIP Authorization. + +@item cert_file @var{string} +The optional certificate file path for DTLS. + +@item key_file @var{string} +The optional private key file path for DTLS. + +@end table + @c man end MUXERS diff --git a/doc/swscale-v2.txt b/doc/swscale-v2.txt new file mode 100644 index 0000000000000..3ae2b27036d50 --- /dev/null +++ b/doc/swscale-v2.txt @@ -0,0 +1,344 @@ +New swscale design to change everything (tm) +============================================ + +SwsGraph +-------- + +The entry point to the new architecture, SwsGraph is what coordinates +multiple "passes". These can include cascaded scaling passes, error diffusion +dithering, and so on. Or we could have separate passes for the vertical and +horizontal scaling. In between each SwsPass lies a fully allocated image buffer. +Graph passes may have different levels of threading, e.g. we can have a single +threaded error diffusion pass following a multi-threaded scaling pass. + +SwsGraph is internally recreated whenever the image format, dimensions or +settings change in any way. sws_scale_frame() is itself just a light-weight +wrapper that runs ff_sws_graph_create() whenever the format changes, splits +interlaced images into separate fields, and calls ff_sws_graph_run() on each. + +From the point of view of SwsGraph itself, all inputs are progressive. 
+ +SwsOp / SwsOpList +----------------- + +This is the newly introduced abstraction layer between the high-level format +handling logic and the low-level backing implementation. Each SwsOp is designed +to be as small and atomic as possible, with the possible exception of the +read / write operations due to their numerous variants. + +The basic idea is to split logic between three major components: + +1. The high-level format "business logic", which generates in a very + naive way a sequence of operations guaranteed to get you from point A + to point B. This logic is written with correctness in mind only, and + ignoring any performance concerns or low-level implementation decisions. + Semantically, everything is always decoded from the input format to + normalized (real valued) RGB, and then encoded back to output format. + + This code lives in libswscale/format.c + +2. The optimizer. This is where the "magic" happens, so to speak. The + optimizer's job is to take the abstract sequence of operations + produced by the high-level format analysis code and incrementally + optimize it. Each optimization step is designed to be minute and provably + lossless, or otherwise guarded behind the BITEXACT flag. This ensures that + the resulting output is always identical, no matter how many layers of + optimization we add. + + This code lives in libswscale/ops.c + +3. The compiler. Once we have a sequence of operations as output by the + optimizer, we "compile" this down to a callable function. This is then + applied by the dispatch wrapper by striping it over the input image. + + See libswscale/ops_backend.c for the reference backend, or + libswscale/x86/ops.c for a more complex SIMD example. + +This overall approach has a considerable number of benefits: + +1. It allows us to verify correctness of logic and spot semantic errors at a + very high level, by simply looking at the sequence of operations (available + by default at debug / verbose log level), without having to dig through the + multiple levels of complicated, interwoven format handling code that is + legacy swscale. + +2. Because most of the intelligence lives inside the powerful optimizer, we get + fast paths "for free" for any suitable format conversion, rather than having + to enumerate them one by one. SIMD code itself can be written in a very + general way and does not need to be tied to specific pixel formats - subsequent + low-level implementations can be strung together without much overhead. + +3. We can in the future, with relative ease, compile these operations + down to SPIR-V (or even LLVM IR) and generate efficient GPU or + target-machine specific implementations. This also opens the door to + adding hardware frame support to libswscale, and even transparently using + GPU acceleration for CPU frames. + +4. Platform-specific SIMD can be reduced down to a comparatively small set of + optimized routines, while still providing 100% coverage for all possible + pixel formats and operations. (As of writing, the x86 example backend has + about 60 unique implementations, of which 20 are trivial swizzles, 10 are + read/write ops, 10 are pixel type conversions and the remaining 20 are the + various logic/arithmetic ops). + +5. Backends hide behind a layer of abstraction offering them a considerable + degree of flexibility in how they want to implement their operations. For + example, the x86 backend has a dedicated function for compiling compatible + operations down to a single in-place pshufb instruction.
+ + Platform specific low level data is self-contained within its own setup() + function and private data structure, eliminating all reads into SwsContext + or the possibility of conflicts between platforms. + +6. We can compute an exact reference result for each operation with fixed + precision (ff_sws_op_apply_q), and use that to e.g. measure the amount of + error introduced by dithering, or even catch bugs in the reference C + implementation. (In theory - currently checkasm just compares against C) + +Examples of SwsOp in action +--------------------------- + +For illustration, here is the sequence of operations currently generated by +my prototype, for a conversion from RGB24 to YUV444P: + +Unoptimized operation list: + [ u8 .... -> ....] SWS_OP_READ : 3 elem(s) packed >> 0 + [ u8 .... -> ....] SWS_OP_SWIZZLE : 0123 + [ u8 .... -> ....] SWS_OP_RSHIFT : >> 0 + [ u8 .... -> ....] SWS_OP_CLEAR : {_ _ _ 0} + [ u8 .... -> ....] SWS_OP_CONVERT : u8 -> f32 + [f32 .... -> ....] SWS_OP_LINEAR : diag3+alpha [[1/255 0 0 0 0] [0 1/255 0 0 0] [0 0 1/255 0 0] [0 0 0 1 1]] + [f32 .... -> ....] SWS_OP_LINEAR : matrix3 [[0.299000 0.587000 0.114000 0 0] [-0.168736 -0.331264 1/2 0 0] [1/2 -0.418688 -57/701 0 0] [0 0 0 1 0]] + [f32 .... -> ....] SWS_OP_LINEAR : diag3+off3 [[219 0 0 0 16] [0 224 0 0 128] [0 0 224 0 128] [0 0 0 1 0]] + [f32 .... -> ....] SWS_OP_DITHER : 16x16 matrix + [f32 .... -> ....] SWS_OP_MAX : {0 0 0 0} <= x + [f32 .... -> ....] SWS_OP_MIN : x <= {255 255 255 _} + [f32 .... -> ....] SWS_OP_CONVERT : f32 -> u8 + [ u8 .... -> ....] SWS_OP_LSHIFT : << 0 + [ u8 .... -> ....] SWS_OP_SWIZZLE : 0123 + [ u8 .... -> ....] SWS_OP_WRITE : 3 elem(s) planar >> 0 + +This is optimized into the following sequence: + +Optimized operation list: + [ u8 XXXX -> +++X] SWS_OP_READ : 3 elem(s) packed >> 0 + [ u8 ...X -> +++X] SWS_OP_CONVERT : u8 -> f32 + [f32 ...X -> ...X] SWS_OP_LINEAR : matrix3+off3 [[0.256788 0.504129 0.097906 0 16] [-0.148223 -0.290993 112/255 0 128] [112/255 -0.367788 -0.071427 0 128] [0 0 0 1 0]] + [f32 ...X -> ...X] SWS_OP_DITHER : 16x16 matrix + [f32 ...X -> +++X] SWS_OP_CONVERT : f32 -> u8 + [ u8 ...X -> +++X] SWS_OP_WRITE : 3 elem(s) planar >> 0 + (X = unused, + = exact, 0 = zero) + +The extra metadata on the left of the operation list is just a dump of the +internal state used by the optimizer during optimization. It keeps track of +knowledge about the pixel values, such as their value range, whether or not +they're exact integers, and so on. + +In this example, you can see that the input values are exact (except for +the alpha channel, which is undefined), until the first SWS_OP_LINEAR +multiplies them by a noninteger constant. They regain their exact integer +status only after the (truncating) conversion to U8 in the output step. + +Example of more aggressive optimization +--------------------------------------- + +Conversion pass for gray -> rgb48: +Unoptimized operation list: + [ u8 .... -> ....] SWS_OP_READ : 1 elem(s) planar >> 0 + [ u8 .... -> ....] SWS_OP_SWIZZLE : 0123 + [ u8 .... -> ....] SWS_OP_RSHIFT : >> 0 + [ u8 .... -> ....] SWS_OP_CLEAR : {_ 0 0 0} + [ u8 .... -> ....] SWS_OP_CONVERT : u8 -> f32 + [f32 .... -> ....] SWS_OP_LINEAR : luma+alpha [[1/255 0 0 0 0] [0 1 0 0 0] [0 0 1 0 0] [0 0 0 1 1]] + [f32 .... -> ....] SWS_OP_LINEAR : matrix3 [[1 0 701/500 0 0] [1 -0.344136 -0.714136 0 0] [1 443/250 0 0 0] [0 0 0 1 0]] + [f32 .... -> ....] SWS_OP_LINEAR : diag3 [[65535 0 0 0 0] [0 65535 0 0 0] [0 0 65535 0 0] [0 0 0 1 0]] + [f32 .... -> ....] 
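To make the semantics of the optimized list concrete, here is a plain-C rendering of what those operations do for a single pixel. This is only an illustrative sketch: the real backends process whole blocks, apply the 16x16 dither matrix (omitted here), and are generated by the compiler described later in this document rather than written by hand.

    #include <stdint.h>

    /* One pixel of the optimized RGB24 -> YUV444P chain shown above:
     * READ (3 packed u8) -> CONVERT u8->f32 -> LINEAR matrix3+off3
     * -> DITHER (skipped here) -> CONVERT f32->u8 (truncating) -> WRITE (planar) */
    static void rgb24_pixel_to_yuv(const uint8_t rgb[3],
                                   uint8_t *y, uint8_t *u, uint8_t *v)
    {
        /* SWS_OP_CONVERT: u8 -> f32 */
        float r = rgb[0], g = rgb[1], b = rgb[2];

        /* SWS_OP_LINEAR, matrix3+off3: the folded BT.601 limited-range matrix
         * from the optimized list, with the 16/128/128 offsets merged in. */
        float yf =  0.256788f * r + 0.504129f * g + 0.097906f * b +  16.0f;
        float uf = -0.148223f * r - 0.290993f * g + (112.0f/255.0f) * b + 128.0f;
        float vf =  (112.0f/255.0f) * r - 0.367788f * g - 0.071427f * b + 128.0f;

        /* SWS_OP_CONVERT: f32 -> u8 (truncating); the MAX/MIN clamps from the
         * unoptimized list are gone because the matrix output already stays
         * within [0,255] for 8-bit input. */
        *y = (uint8_t)yf;
        *u = (uint8_t)uf;
        *v = (uint8_t)vf;
    }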
SWS_OP_MAX : {0 0 0 0} <= x + [f32 .... -> ....] SWS_OP_MIN : x <= {65535 65535 65535 _} + [f32 .... -> ....] SWS_OP_CONVERT : f32 -> u16 + [u16 .... -> ....] SWS_OP_LSHIFT : << 0 + [u16 .... -> ....] SWS_OP_SWIZZLE : 0123 + [u16 .... -> ....] SWS_OP_WRITE : 3 elem(s) packed >> 0 + +Optimized operation list: + [ u8 XXXX -> +XXX] SWS_OP_READ : 1 elem(s) planar >> 0 + [ u8 .XXX -> +XXX] SWS_OP_CONVERT : u8 -> u16 (expand) + [u16 .XXX -> +++X] SWS_OP_SWIZZLE : 0003 + [u16 ...X -> +++X] SWS_OP_WRITE : 3 elem(s) packed >> 0 + (X = unused, + = exact, 0 = zero) + +Here, the optimizer has managed to eliminate all of the unnecessary linear +operations on previously zero'd values, turn the resulting column matrix into +a swizzle operation, avoid the unnecessary dither (and round trip via float) +because the pixel values are guaranteed to be bit exact, and finally, turns +the multiplication by 65535 / 255 = 257 into a simple integer expand operation. + +As a final bonus, the x86 backend further optimizes this into a 12-byte shuffle: + pshufb = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1} + +time=208 us, ref=4212 us, speedup=20.236x faster (single thread) +time=57 us, ref=472 us, speedup=8.160x faster (multi thread) + +Compiler and underlying implementation layer (SwsOpChain) +--------------------------------------------------------- + +While the backend API is flexible enough to permit more exotic implementations +(e.g. using JIT code generation), we establish a common set of helpers for use +in "traditional" SIMD implementations. + +The basic idea is to have one "kernel" (or implementation) per operation, +and then just chain a list of these kernels together as separate function +calls. For best performance, we want to keep data in vector registers in +between function calls using a custom calling convention, thus avoiding any +unnecessary memory accesses. Additionally, we want the per-kernel overhead to +be as low as possible, with each kernel ideally just jumping directly into +the next kernel. + +As a result, we arrive at a design where we first divide the image into small +chunks, or "blocks", and then dispatch the "chain" of kernels on each chunk in +sequence. Each kernel processes a fixed number of pixels, with the overall +entry point taking care of looping. Remaining pixels (the "tail") are handled +generically by the backend-invariant dispatch code (located in ops.c), using a +partial memcpy into a suitably sized temporary buffer. + +To minimize the per-kernel function call overhead, we use a "continuation +passing style" for chaining kernels. Each operation computes its result and +then directly calls the next operation in the sequence, with the appropriate +internal function signature. 
+ +The C reference backend reads data into the stack and then passes the array +pointers to the next continuation as regular function arguments: + + void process(GlobalContext *ctx, OpContext *op, + block_t x, block_t y, block_t z, block_t w) + { + for (int i = 0; i < SWS_BLOCK_SIZE; i++) + // do something with x[i], y[i], z[i], w[i] + + op->next(ctx, &op[1], x, y, z, w); + } + +With type conversions pushing the new data onto the stack as well: + + void convert8to16(GlobalContext *ctx, OpContext *op, + block_t x, block_t y, block_t z, block_t w) + { + /* Pseudo-code */ + u16block_t x16 = (u16block_t) x; + u16block_t y16 = (u16block_t) y; + u16block_t z16 = (u16block_t) z; + u16block_t w16 = (u16block_t) w; + + op->next(ctx, &op[1], x16, y16, z16, w16); + } + +By contrast, the x86 backend always keeps the X/Y/Z/W values pinned in specific +vector registers (ymm0-ymm3 for the lower half, and ymm4-ymm7 for the second +half). + +Each kernel additionally has access to a 32 byte per-op context storing the +pointer to the next kernel plus 16 bytes of arbitrary private data. This is +used during construction of the function chain to place things like small +constants. + +In assembly, the per-kernel overhead looks like this: + + load $tmp, $arg1 + ... + add $arg1, 32 + jump $tmp + +This design gives vastly better performance than the alternative of returning +out to a central loop or "trampoline". This is partly because the order of +kernels within a chain is always the same, so the branch predictor can easily +remember the target address of each "jump" instruction. + +The only way to realistically improve on this design would be to directly +stitch the kernel body together using runtime code generation. + +Future considerations and limitations +------------------------------------- + +My current prototype has a number of severe limitations and opportunities +for improvements: + +1. It does not handle scaling at all. I am not yet entirely sure on how I want + to handle scaling; this includes handling of subsampled content. I have a + number of vague ideas in my head, but nothing where I can say with certainty + that it will work out well. + + It's possible that we won't come up with a perfect solution here, and will + need to decide on which set of compromises we are comfortable accepting: + + 1. Do we need the ability to scale YUV -> YUV by handling luma and chroma + independently? When downscaling 100x100 4:2:0 to 50x50 4:4:4, should we + support the option of reusing the chroma plane directly (even though + this would introduce a subpixel shift for typical chroma siting)? + + Looking towards zimg, I am also thinking that we probably also want to do + scaling on floating point values, since this is best for both performance + and accuracy, especially given that we need to go up to 32-bit intermediates + during scaling anyway. + + So far, the most promising approach seems to be to handle subsampled + input/output as a dedicated read/write operation type; perhaps even with a + fixed/static subsampling kernel. To avoid compromising on performance when + chroma resampling is not necessary, the optimizer could then relax the + pipeline to use non-interpolating read/writes when all intermediate + operations are component-independent. + +2. Since each operation is conceptually defined on 4-component pixels, we end + up defining a lot of variants of each implementation for each possible + *subset*. 
For example, we have four different implementations for + SWS_OP_SCALE in my current templates: + - op_scale_1000 + - op_scale_1001 + - op_scale_1110 + - op_scale_1111 + + This reflects the four different arrangements of pixel components that are + typically present (or absent). While best for performance, it does turn into + a bit of a chore when implementing these kernels. + + The only real alternative would be to either branch inside the kernel (bad), + or to use separate kernels for each individual component and chain them all + together. I have not yet tested whether the latter approach would be faster + after the latest round of refactors to the kernel glue code. + +3. I do not yet have any support for LUTs. But when I add them, something we + could do is have the optimizer pass automatically "promote" a sequence of + operations to LUTs. For example, any sequence that looks like: + + 1. [u8] SWS_OP_CONVERT -> X + 2. [X] ... // only per-component operations + 3. [X] SWS_OP_CONVERT -> Y + 4. [Y] SWS_OP_WRITE + + could be replaced by a LUT with 256 entries. This is especially important + for anything involving packed 8-bit input (e.g. rgb8, rgb4_byte). + + We also definitely want to hook this up to the existing CMS code for + transformations between different primaries. + +4. Because we rely on AVRational math to generate the coefficients for + operations, we need to be able to represent all pixel values as an + AVRational. However, this presents a challenge for 32-bit formats (e.g. + GRAY32, RGBA128), because their value range exceeds INT_MAX, which is the maximum + value representable by an AVRational. + + It's possible we may want to introduce an AVRational64 for this, or + perhaps more flexibly, extend AVRational to an AVFloating type which is + represented as { AVRational n; int exp; }, representing n * 2^exp. This + would preserve our ability to represent all pixel values exactly, while + opening up the range arbitrarily. + +5. Is there ever a situation where the use of floats introduces the risk of + non bit-exact output? For this reason, and possible performance advantages, + we may want to explore the use of a fixed-point 16 bit path as an alternative + to the floating point math. + + So far, I have managed to avoid any bit exactness issues inside the x86 + backend by ensuring that the order of linear operations is identical + between the C backend and the x86 backend, but this may not be practical + to guarantee on all backends. The x86 float code is also dramatically + faster than the old fixed point code, so I'm tentatively optimistic about + the lack of a need for a fixed point path.
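As a closing illustration of the kernel-chaining scheme from the compiler section above, here is a self-contained toy model of the continuation-passing dispatch. All names are invented for the sketch; the real backends pass four component blocks (X/Y/Z/W) plus a GlobalContext rather than a single byte array, and keep the data in vector registers between kernels.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BLOCK_SIZE 16

    typedef struct OpCtx OpCtx;
    typedef void (*KernelFn)(const OpCtx *op, uint8_t block[BLOCK_SIZE]);

    /* Per-op slot, mirroring the 32-byte context described above: a pointer
     * to the *next* kernel plus a little private data for this kernel. */
    struct OpCtx {
        KernelFn next;
        uint8_t  priv[16];
    };

    static void op_add(const OpCtx *op, uint8_t block[BLOCK_SIZE])
    {
        for (int i = 0; i < BLOCK_SIZE; i++)
            block[i] += op->priv[0];
        op->next(&op[1], block);       /* tail-call the continuation */
    }

    static void op_shl(const OpCtx *op, uint8_t block[BLOCK_SIZE])
    {
        for (int i = 0; i < BLOCK_SIZE; i++)
            block[i] <<= op->priv[0];
        op->next(&op[1], block);
    }

    static void op_write(const OpCtx *op, uint8_t block[BLOCK_SIZE])
    {
        (void)op;                      /* end of the chain: no continuation */
        fwrite(block, 1, BLOCK_SIZE, stdout);
    }

    int main(void)
    {
        uint8_t block[BLOCK_SIZE];
        memset(block, 3, sizeof(block));

        /* "Compiled" chain: add 1, shift left by 2, write. Slot i carries the
         * private data of kernel i and the address of kernel i+1. */
        const OpCtx chain[] = {
            { op_shl,   { 1 } },   /* op_add: +1, continue with op_shl    */
            { op_write, { 2 } },   /* op_shl: <<2, continue with op_write */
            { NULL,     { 0 } },   /* op_write's slot                     */
        };

        op_add(&chain[0], block);  /* the dispatcher enters the first kernel */
        return 0;
    }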
diff --git a/ffbuild/common.mak b/ffbuild/common.mak index 0e1eb1f62bcbe..ddf48923ea735 100644 --- a/ffbuild/common.mak +++ b/ffbuild/common.mak @@ -140,9 +140,9 @@ else endif # 1) Preprocess CSS to a minified version +%.css.min: TAG = SED %.css.min: %.css - # Must start with a tab in the real Makefile - sed 's!/\\*.*\\*/!!g' $< \ + $(M)sed 's!/\\*.*\\*/!!g' $< \ | tr '\n' ' ' \ | tr -s ' ' \ | sed 's/^ //; s/ $$//' \ @@ -151,6 +151,7 @@ endif ifdef CONFIG_RESOURCE_COMPRESSION # 2) Gzip the minified CSS +%.css.min.gz: TAG = GZIP %.css.min.gz: %.css.min $(M)gzip -nc9 $< > $@ @@ -159,6 +160,7 @@ ifdef CONFIG_RESOURCE_COMPRESSION $(BIN2C) $< $@ $(subst .,_,$(basename $(notdir $@))) # 4) Gzip the HTML file (no minification needed) +%.html.gz: TAG = GZIP %.html.gz: %.html $(M)gzip -nc9 $< > $@ @@ -197,7 +199,6 @@ endif include $(SRC_PATH)/ffbuild/arch.mak OBJS += $(OBJS-yes) -SLIBOBJS += $(SLIBOBJS-yes) SHLIBOBJS += $(SHLIBOBJS-yes) STLIBOBJS += $(STLIBOBJS-yes) FFLIBS := $($(NAME)_FFLIBS) $(FFLIBS-yes) $(FFLIBS) @@ -207,7 +208,6 @@ LDLIBS = $(FFLIBS:%=%$(BUILDSUF)) FFEXTRALIBS := $(LDLIBS:%=$(LD_LIB)) $(foreach lib,EXTRALIBS-$(NAME) $(FFLIBS:%=EXTRALIBS-%),$($(lib))) $(EXTRALIBS) OBJS := $(sort $(OBJS:%=$(SUBDIR)%)) -SLIBOBJS := $(sort $(SLIBOBJS:%=$(SUBDIR)%)) SHLIBOBJS := $(sort $(SHLIBOBJS:%=$(SUBDIR)%)) STLIBOBJS := $(sort $(STLIBOBJS:%=$(SUBDIR)%)) TESTOBJS := $(TESTOBJS:%=$(SUBDIR)tests/%) $(TESTPROGS:%=$(SUBDIR)tests/%.o) @@ -245,13 +245,12 @@ $(HOSTPROGS): %$(HOSTEXESUF): %.o $(OBJS): | $(sort $(dir $(OBJS))) $(HOBJS): | $(sort $(dir $(HOBJS))) $(HOSTOBJS): | $(sort $(dir $(HOSTOBJS))) -$(SLIBOBJS): | $(sort $(dir $(SLIBOBJS))) $(SHLIBOBJS): | $(sort $(dir $(SHLIBOBJS))) $(STLIBOBJS): | $(sort $(dir $(STLIBOBJS))) $(TESTOBJS): | $(sort $(dir $(TESTOBJS))) $(TOOLOBJS): | tools -OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SLIBOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS)) +OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS)) CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.ver *.version *.html.gz *.html.c *.css.gz *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a @@ -263,4 +262,4 @@ endef $(eval $(RULES)) --include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d) $(HOBJS:.o=.d) $(SHLIBOBJS:.o=.d) $(STLIBOBJS:.o=.d) $(SLIBOBJS:.o=.d)) $(OBJS:.o=$(DEFAULT_X86ASMD).d) +-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d) $(HOBJS:.o=.d) $(SHLIBOBJS:.o=.d) $(STLIBOBJS:.o=.d)) $(OBJS:.o=$(DEFAULT_X86ASMD).d) diff --git a/ffbuild/library.mak b/ffbuild/library.mak index 288c82a177041..569708c73b859 100644 --- a/ffbuild/library.mak +++ b/ffbuild/library.mak @@ -70,7 +70,7 @@ $(SUBDIR)lib$(NAME).ver: $(SUBDIR)lib$(NAME).v $(OBJS) $(SUBDIR)$(SLIBNAME): $(SUBDIR)$(SLIBNAME_WITH_MAJOR) $(Q)cd ./$(SUBDIR) && $(LN_S) $(SLIBNAME_WITH_MAJOR) $(SLIBNAME) -$(SUBDIR)$(SLIBNAME_WITH_MAJOR): $(OBJS) $(SHLIBOBJS) $(SLIBOBJS) $(SUBDIR)lib$(NAME).ver +$(SUBDIR)$(SLIBNAME_WITH_MAJOR): $(OBJS) $(SHLIBOBJS) $(SUBDIR)lib$(NAME).ver $(SLIB_CREATE_DEF_CMD) ifeq ($(RESPONSE_FILES),yes) $(Q)echo $$(filter %.o,$$^) > $$@.objs diff --git a/fftools/Makefile b/fftools/Makefile index 361a4fd574f97..b3c08ae5a0e8b 100644 --- a/fftools/Makefile +++ b/fftools/Makefile @@ -49,7 +49,6 @@ OBJS-ffprobe += \ fftools/textformat/tw_avio.o \ fftools/textformat/tw_buffer.o \ fftools/textformat/tw_stdout.o \ - $(OBJS-resman) \ OBJS-ffplay += 
fftools/ffplay_renderer.o @@ -93,4 +92,4 @@ uninstall-progs: $(RM) $(addprefix "$(BINDIR)/", $(ALLAVPROGS)) clean:: - $(RM) $(ALLAVPROGS) $(ALLAVPROGS_G) $(CLEANSUFFIXES:%=fftools/%) + $(RM) $(ALLAVPROGS) $(ALLAVPROGS_G) $(CLEANSUFFIXES:%=fftools/%) $(CLEANSUFFIXES:%=fftools/graph/%) $(CLEANSUFFIXES:%=fftools/textformat/%) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 964770df23238..de607cac931b8 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -309,7 +309,7 @@ const AVIOInterruptCB int_cb = { decode_interrupt_cb, NULL }; static void ffmpeg_cleanup(int ret) { - if (print_graphs || print_graphs_file) + if ((print_graphs || print_graphs_file) && nb_output_files > 0) print_filtergraphs(filtergraphs, nb_filtergraphs, input_files, nb_input_files, output_files, nb_output_files); if (do_benchmark) { @@ -344,6 +344,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&filter_nbthreads); + av_freep(&print_graphs_file); + av_freep(&print_graphs_format); + av_freep(&input_files); av_freep(&output_files); diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index 7fbf0ad5326d6..7868f3d85ffc1 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -39,6 +39,7 @@ #include "libavfilter/avfilter.h" #include "libavutil/avutil.h" +#include "libavutil/bprint.h" #include "libavutil/dict.h" #include "libavutil/eval.h" #include "libavutil/fifo.h" @@ -352,6 +353,18 @@ typedef struct OutputFilterOptions { typedef struct InputFilter { struct FilterGraph *graph; uint8_t *name; + int index; + + // filter data type + enum AVMediaType type; + + AVFilterContext *filter; + + char *input_name; + + /* for filters that are not yet bound to an input stream, + * this stores the input linklabel, if any */ + uint8_t *linklabel; } InputFilter; typedef struct OutputFilter { @@ -359,6 +372,11 @@ typedef struct OutputFilter { struct FilterGraph *graph; uint8_t *name; + int index; + + AVFilterContext *filter; + + char *output_name; /* for filters that are not yet bound to an output stream, * this stores the output linklabel, if any */ @@ -381,6 +399,9 @@ typedef struct FilterGraph { int nb_inputs; OutputFilter **outputs; int nb_outputs; + + const char *graph_desc; + struct AVBPrint graph_print_buf; } FilterGraph; enum DecoderFlags { diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c index b774606562788..e0c40ffe00a93 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -21,7 +21,6 @@ #include #include "ffmpeg.h" -#include "ffmpeg_filter.h" #include "graph/graphprint.h" #include "libavfilter/avfilter.h" @@ -44,6 +43,42 @@ // FIXME private header, used for mid_pred() #include "libavcodec/mathops.h" +typedef struct FilterGraphPriv { + FilterGraph fg; + + // name used for logging + char log_name[32]; + + int is_simple; + // true when the filtergraph contains only meta filters + // that do not modify the frame data + int is_meta; + // source filters are present in the graph + int have_sources; + int disable_conversions; + + unsigned nb_outputs_done; + + int nb_threads; + + // frame for temporarily holding output from the filtergraph + AVFrame *frame; + // frame for sending output to the encoder + AVFrame *frame_enc; + + Scheduler *sch; + unsigned sch_idx; +} FilterGraphPriv; + +static FilterGraphPriv *fgp_from_fg(FilterGraph *fg) +{ + return (FilterGraphPriv*)fg; +} + +static const FilterGraphPriv *cfgp_from_cfg(const FilterGraph *fg) +{ + return (const FilterGraphPriv*)fg; +} // data that is local to the filter thread and not visible outside of it typedef struct FilterGraphThread { @@ -66,6 
+101,141 @@ typedef struct FilterGraphThread { uint8_t *eof_out; } FilterGraphThread; +typedef struct InputFilterPriv { + InputFilter ifilter; + + InputFilterOptions opts; + + // used to hold submitted input + AVFrame *frame; + + // source data type: AVMEDIA_TYPE_SUBTITLE for sub2video, + // same as type otherwise + enum AVMediaType type_src; + + int eof; + int bound; + int drop_warned; + uint64_t nb_dropped; + + // parameters configured for this input + int format; + + int width, height; + AVRational sample_aspect_ratio; + enum AVColorSpace color_space; + enum AVColorRange color_range; + + int sample_rate; + AVChannelLayout ch_layout; + + AVRational time_base; + + AVFrameSideData **side_data; + int nb_side_data; + + AVFifo *frame_queue; + + AVBufferRef *hw_frames_ctx; + + int displaymatrix_present; + int displaymatrix_applied; + int32_t displaymatrix[9]; + + int downmixinfo_present; + AVDownmixInfo downmixinfo; + + struct { + AVFrame *frame; + + int64_t last_pts; + int64_t end_pts; + + /// marks if sub2video_update should force an initialization + unsigned int initialize; + } sub2video; +} InputFilterPriv; + +static InputFilterPriv *ifp_from_ifilter(InputFilter *ifilter) +{ + return (InputFilterPriv*)ifilter; +} + +typedef struct FPSConvContext { + AVFrame *last_frame; + /* number of frames emitted by the video-encoding sync code */ + int64_t frame_number; + /* history of nb_frames_prev, i.e. the number of times the + * previous frame was duplicated by vsync code in recent + * do_video_out() calls */ + int64_t frames_prev_hist[3]; + + uint64_t dup_warning; + + int last_dropped; + int dropped_keyframe; + + enum VideoSyncMethod vsync_method; + + AVRational framerate; + AVRational framerate_max; + const AVRational *framerate_supported; + int framerate_clip; +} FPSConvContext; + +typedef struct OutputFilterPriv { + OutputFilter ofilter; + + void *log_parent; + char log_name[32]; + + /* desired output stream properties */ + int format; + int width, height; + int sample_rate; + AVChannelLayout ch_layout; + enum AVColorSpace color_space; + enum AVColorRange color_range; + + AVFrameSideData **side_data; + int nb_side_data; + + // time base in which the output is sent to our downstream + // does not need to match the filtersink's timebase + AVRational tb_out; + // at least one frame with the above timebase was sent + // to our downstream, so it cannot change anymore + int tb_out_locked; + + AVRational sample_aspect_ratio; + + AVDictionary *sws_opts; + AVDictionary *swr_opts; + + // those are only set if no format is specified and the encoder gives us multiple options + // They point directly to the relevant lists of the encoder. 
+ const int *formats; + const AVChannelLayout *ch_layouts; + const int *sample_rates; + const enum AVColorSpace *color_spaces; + const enum AVColorRange *color_ranges; + + AVRational enc_timebase; + int64_t trim_start_us; + int64_t trim_duration_us; + // offset for output timestamps, in AV_TIME_BASE_Q + int64_t ts_offset; + int64_t next_pts; + FPSConvContext fps; + + unsigned flags; +} OutputFilterPriv; + +static OutputFilterPriv *ofp_from_ofilter(OutputFilter *ofilter) +{ + return (OutputFilterPriv*)ofilter; +} + typedef struct FilterCommand { char *target; char *command; @@ -146,7 +316,7 @@ static void sub2video_push_ref(InputFilterPriv *ifp, int64_t pts) av_assert1(frame->data[0]); ifp->sub2video.last_pts = frame->pts = pts; - ret = av_buffersrc_add_frame_flags(ifp->filter, frame, + ret = av_buffersrc_add_frame_flags(ifp->ifilter.filter, frame, AV_BUFFERSRC_FLAG_KEEP_REF | AV_BUFFERSRC_FLAG_PUSH); if (ret != AVERROR_EOF && ret < 0) @@ -480,10 +650,10 @@ static OutputFilter *ofilter_alloc(FilterGraph *fg, enum AVMediaType type) ofp->format = -1; ofp->color_space = AVCOL_SPC_UNSPECIFIED; ofp->color_range = AVCOL_RANGE_UNSPECIFIED; - ofp->index = fg->nb_outputs - 1; + ofilter->index = fg->nb_outputs - 1; snprintf(ofp->log_name, sizeof(ofp->log_name), "%co%d", - av_get_media_type_string(type)[0], ofp->index); + av_get_media_type_string(type)[0], ofilter->index); return ofilter; } @@ -499,10 +669,10 @@ static int ifilter_bind_ist(InputFilter *ifilter, InputStream *ist, av_assert0(!ifp->bound); ifp->bound = 1; - if (ifp->type != ist->par->codec_type && - !(ifp->type == AVMEDIA_TYPE_VIDEO && ist->par->codec_type == AVMEDIA_TYPE_SUBTITLE)) { + if (ifilter->type != ist->par->codec_type && + !(ifilter->type == AVMEDIA_TYPE_VIDEO && ist->par->codec_type == AVMEDIA_TYPE_SUBTITLE)) { av_log(fgp, AV_LOG_ERROR, "Tried to connect %s stream to %s filtergraph input\n", - av_get_media_type_string(ist->par->codec_type), av_get_media_type_string(ifp->type)); + av_get_media_type_string(ist->par->codec_type), av_get_media_type_string(ifilter->type)); return AVERROR(EINVAL); } @@ -517,8 +687,12 @@ static int ifilter_bind_ist(InputFilter *ifilter, InputStream *ist, if (ret < 0) return ret; + ifilter->input_name = av_strdup(ifp->opts.name); + if (!ifilter->input_name) + return AVERROR(EINVAL); + ret = sch_connect(fgp->sch, - src, SCH_FILTER_IN(fgp->sch_idx, ifp->index)); + src, SCH_FILTER_IN(fgp->sch_idx, ifilter->index)); if (ret < 0) return ret; @@ -553,19 +727,23 @@ static int ifilter_bind_dec(InputFilterPriv *ifp, Decoder *dec, av_assert0(!ifp->bound); ifp->bound = 1; - if (ifp->type != dec->type) { + if (ifp->ifilter.type != dec->type) { av_log(fgp, AV_LOG_ERROR, "Tried to connect %s decoder to %s filtergraph input\n", - av_get_media_type_string(dec->type), av_get_media_type_string(ifp->type)); + av_get_media_type_string(dec->type), av_get_media_type_string(ifp->ifilter.type)); return AVERROR(EINVAL); } - ifp->type_src = ifp->type; + ifp->type_src = ifp->ifilter.type; ret = dec_filter_add(dec, &ifp->ifilter, &ifp->opts, vs, &src); if (ret < 0) return ret; - ret = sch_connect(fgp->sch, src, SCH_FILTER_IN(fgp->sch_idx, ifp->index)); + ifp->ifilter.input_name = av_strdup(ifp->opts.name); + if (!ifp->ifilter.input_name) + return AVERROR(EINVAL); + + ret = sch_connect(fgp->sch, src, SCH_FILTER_IN(fgp->sch_idx, ifp->ifilter.index)); if (ret < 0) return ret; @@ -634,8 +812,8 @@ int ofilter_bind_enc(OutputFilter *ofilter, unsigned sched_idx_enc, ofp->trim_start_us = opts->trim_start_us; ofp->trim_duration_us = 
opts->trim_duration_us; - ofp->name = av_strdup(opts->name); - if (!ofp->name) + ofilter->output_name = av_strdup(opts->name); + if (!ofilter->output_name) return AVERROR(EINVAL); ret = av_dict_copy(&ofp->sws_opts, opts->sws_opts, 0); @@ -655,7 +833,7 @@ int ofilter_bind_enc(OutputFilter *ofilter, unsigned sched_idx_enc, ofp->log_parent = NULL; av_strlcpy(ofp->log_name, fgp->log_name, sizeof(ofp->log_name)); } else - av_strlcatf(ofp->log_name, sizeof(ofp->log_name), "->%s", ofp->name); + av_strlcatf(ofp->log_name, sizeof(ofp->log_name), "->%s", ofilter->output_name); switch (ofilter->type) { case AVMEDIA_TYPE_VIDEO: @@ -714,7 +892,7 @@ int ofilter_bind_enc(OutputFilter *ofilter, unsigned sched_idx_enc, break; } - ret = sch_connect(fgp->sch, SCH_FILTER_OUT(fgp->sch_idx, ofp->index), + ret = sch_connect(fgp->sch, SCH_FILTER_OUT(fgp->sch_idx, ofilter->index), SCH_ENC(sched_idx_enc)); if (ret < 0) return ret; @@ -728,16 +906,16 @@ static int ofilter_bind_ifilter(OutputFilter *ofilter, InputFilterPriv *ifp, OutputFilterPriv *ofp = ofp_from_ofilter(ofilter); av_assert0(!ofilter->bound); - av_assert0(ofilter->type == ifp->type); + av_assert0(ofilter->type == ifp->ifilter.type); ofilter->bound = 1; av_freep(&ofilter->linklabel); - ofp->name = av_strdup(opts->name); - if (!ofp->name) + ofilter->output_name = av_strdup(opts->name); + if (!ofilter->output_name) return AVERROR(EINVAL); - av_strlcatf(ofp->log_name, sizeof(ofp->log_name), "->%s", ofp->name); + av_strlcatf(ofp->log_name, sizeof(ofp->log_name), "->%s", ofilter->output_name); return 0; } @@ -753,18 +931,18 @@ static int ifilter_bind_fg(InputFilterPriv *ifp, FilterGraph *fg_src, int out_id av_assert0(!ifp->bound); ifp->bound = 1; - if (ifp->type != ofilter_src->type) { + if (ifp->ifilter.type != ofilter_src->type) { av_log(fgp, AV_LOG_ERROR, "Tried to connect %s output to %s input\n", av_get_media_type_string(ofilter_src->type), - av_get_media_type_string(ifp->type)); + av_get_media_type_string(ifp->ifilter.type)); return AVERROR(EINVAL); } - ifp->type_src = ifp->type; + ifp->type_src = ifp->ifilter.type; memset(&opts, 0, sizeof(opts)); - snprintf(name, sizeof(name), "fg:%d:%d", fgp->fg.index, ifp->index); + snprintf(name, sizeof(name), "fg:%d:%d", fgp->fg.index, ifp->ifilter.index); opts.name = name; ret = ofilter_bind_ifilter(ofilter_src, ifp, &opts); @@ -772,7 +950,7 @@ static int ifilter_bind_fg(InputFilterPriv *ifp, FilterGraph *fg_src, int out_id return ret; ret = sch_connect(fgp->sch, SCH_FILTER_OUT(fg_src->index, out_idx), - SCH_FILTER_IN(fgp->sch_idx, ifp->index)); + SCH_FILTER_IN(fgp->sch_idx, ifp->ifilter.index)); if (ret < 0) return ret; @@ -795,7 +973,7 @@ static InputFilter *ifilter_alloc(FilterGraph *fg) if (!ifp->frame) return NULL; - ifp->index = fg->nb_inputs - 1; + ifilter->index = fg->nb_inputs - 1; ifp->format = -1; ifp->color_space = AVCOL_SPC_UNSPECIFIED; ifp->color_range = AVCOL_RANGE_UNSPECIFIED; @@ -832,10 +1010,11 @@ void fg_free(FilterGraph **pfg) av_frame_free(&ifp->opts.fallback); av_buffer_unref(&ifp->hw_frames_ctx); - av_freep(&ifp->linklabel); + av_freep(&ifilter->linklabel); av_freep(&ifp->opts.name); av_frame_side_data_free(&ifp->side_data, &ifp->nb_side_data); av_freep(&ifilter->name); + av_freep(&ifilter->input_name); av_freep(&fg->inputs[j]); } av_freep(&fg->inputs); @@ -849,14 +1028,14 @@ void fg_free(FilterGraph **pfg) av_freep(&ofilter->linklabel); av_freep(&ofilter->name); + av_freep(&ofilter->output_name); av_freep(&ofilter->apad); - av_freep(&ofp->name); av_channel_layout_uninit(&ofp->ch_layout); 
av_frame_side_data_free(&ofp->side_data, &ofp->nb_side_data); av_freep(&fg->outputs[j]); } av_freep(&fg->outputs); - av_freep(&fgp->graph_desc); + av_freep(&fg->graph_desc); av_frame_free(&fgp->frame); av_frame_free(&fgp->frame_enc); @@ -909,7 +1088,7 @@ int fg_create(FilterGraph **pfg, char *graph_desc, Scheduler *sch) } fg->class = &fg_class; - fgp->graph_desc = graph_desc; + fg->graph_desc = graph_desc; fgp->disable_conversions = !auto_conversion_filters; fgp->nb_threads = -1; fgp->sch = sch; @@ -928,7 +1107,7 @@ int fg_create(FilterGraph **pfg, char *graph_desc, Scheduler *sch) return AVERROR(ENOMEM);; graph->nb_threads = 1; - ret = graph_parse(fg, graph, fgp->graph_desc, &inputs, &outputs, + ret = graph_parse(fg, graph, fg->graph_desc, &inputs, &outputs, hw_device_for_filter()); if (ret < 0) goto fail; @@ -945,21 +1124,19 @@ int fg_create(FilterGraph **pfg, char *graph_desc, Scheduler *sch) for (AVFilterInOut *cur = inputs; cur; cur = cur->next) { InputFilter *const ifilter = ifilter_alloc(fg); - InputFilterPriv *ifp; if (!ifilter) { ret = AVERROR(ENOMEM); goto fail; } - ifp = ifp_from_ifilter(ifilter); - ifp->linklabel = cur->name; + ifilter->linklabel = cur->name; cur->name = NULL; - ifp->type = avfilter_pad_get_type(cur->filter_ctx->input_pads, + ifilter->type = avfilter_pad_get_type(cur->filter_ctx->input_pads, cur->pad_idx); - if (ifp->type != AVMEDIA_TYPE_VIDEO && ifp->type != AVMEDIA_TYPE_AUDIO) { + if (ifilter->type != AVMEDIA_TYPE_VIDEO && ifilter->type != AVMEDIA_TYPE_AUDIO) { av_log(fg, AV_LOG_FATAL, "Only video and audio filters supported " "currently.\n"); ret = AVERROR(ENOSYS); @@ -1070,23 +1247,22 @@ int fg_create_simple(FilterGraph **pfg, static int fg_complex_bind_input(FilterGraph *fg, InputFilter *ifilter) { - FilterGraphPriv *fgp = fgp_from_fg(fg); InputFilterPriv *ifp = ifp_from_ifilter(ifilter); InputStream *ist = NULL; - enum AVMediaType type = ifp->type; + enum AVMediaType type = ifilter->type; ViewSpecifier vs = { .type = VIEW_SPECIFIER_TYPE_NONE }; const char *spec; char *p; int i, ret; - if (ifp->linklabel && !strncmp(ifp->linklabel, "dec:", 4)) { + if (ifilter->linklabel && !strncmp(ifilter->linklabel, "dec:", 4)) { // bind to a standalone decoder int dec_idx; - dec_idx = strtol(ifp->linklabel + 4, &p, 0); + dec_idx = strtol(ifilter->linklabel + 4, &p, 0); if (dec_idx < 0 || dec_idx >= nb_decoders) { av_log(fg, AV_LOG_ERROR, "Invalid decoder index %d in filtergraph description %s\n", - dec_idx, fgp->graph_desc); + dec_idx, fg->graph_desc); return AVERROR(EINVAL); } @@ -1102,7 +1278,7 @@ static int fg_complex_bind_input(FilterGraph *fg, InputFilter *ifilter) av_log(fg, AV_LOG_ERROR, "Error binding a decoder to filtergraph input %s\n", ifilter->name); return ret; - } else if (ifp->linklabel) { + } else if (ifilter->linklabel) { StreamSpecifier ss; AVFormatContext *s; AVStream *st = NULL; @@ -1119,25 +1295,25 @@ static int fg_complex_bind_input(FilterGraph *fg, InputFilter *ifilter) OutputFilter *ofilter = fg_src->outputs[j]; if (!ofilter->bound && ofilter->linklabel && - !strcmp(ofilter->linklabel, ifp->linklabel)) { + !strcmp(ofilter->linklabel, ifilter->linklabel)) { av_log(fg, AV_LOG_VERBOSE, "Binding input with label '%s' to filtergraph output %d:%d\n", - ifp->linklabel, i, j); + ifilter->linklabel, i, j); ret = ifilter_bind_fg(ifp, fg_src, j); if (ret < 0) av_log(fg, AV_LOG_ERROR, "Error binding filtergraph input %s\n", - ifp->linklabel); + ifilter->linklabel); return ret; } } } // bind to an explicitly specified demuxer stream - file_idx = 
strtol(ifp->linklabel, &p, 0); + file_idx = strtol(ifilter->linklabel, &p, 0); if (file_idx < 0 || file_idx >= nb_input_files) { av_log(fg, AV_LOG_FATAL, "Invalid file index %d in filtergraph description %s.\n", - file_idx, fgp->graph_desc); + file_idx, fg->graph_desc); return AVERROR(EINVAL); } s = input_files[file_idx]->ctx; @@ -1171,14 +1347,14 @@ static int fg_complex_bind_input(FilterGraph *fg, InputFilter *ifilter) stream_specifier_uninit(&ss); if (!st) { av_log(fg, AV_LOG_FATAL, "Stream specifier '%s' in filtergraph description %s " - "matches no streams.\n", p, fgp->graph_desc); + "matches no streams.\n", p, fg->graph_desc); return AVERROR(EINVAL); } ist = input_files[file_idx]->streams[st->index]; av_log(fg, AV_LOG_VERBOSE, "Binding input with label '%s' to input stream %d:%d\n", - ifp->linklabel, ist->file->index, ist->index); + ifilter->linklabel, ist->file->index, ist->index); } else { ist = ist_find_unused(type); if (!ist) { @@ -1191,7 +1367,7 @@ static int fg_complex_bind_input(FilterGraph *fg, InputFilter *ifilter) av_log(fg, AV_LOG_VERBOSE, "Binding unlabeled input %d to input stream %d:%d\n", - ifp->index, ist->file->index, ist->index); + ifilter->index, ist->file->index, ist->index); } av_assert0(ist); @@ -1340,8 +1516,8 @@ static int configure_output_video_filter(FilterGraphPriv *fgp, AVFilterGraph *gr int ret; char name[255]; - snprintf(name, sizeof(name), "out_%s", ofp->name); - ret = avfilter_graph_create_filter(&ofp->filter, + snprintf(name, sizeof(name), "out_%s", ofilter->output_name); + ret = avfilter_graph_create_filter(&ofilter->filter, avfilter_get_by_name("buffersink"), name, NULL, NULL, graph); @@ -1360,7 +1536,7 @@ static int configure_output_video_filter(FilterGraphPriv *fgp, AVFilterGraph *gr av_strlcatf(args, sizeof(args), ":%s=%s", e->key, e->value); } - snprintf(name, sizeof(name), "scaler_out_%s", ofp->name); + snprintf(name, sizeof(name), "scaler_out_%s", ofilter->output_name); if ((ret = avfilter_graph_create_filter(&filter, avfilter_get_by_name("scale"), name, args, NULL, graph)) < 0) return ret; @@ -1396,14 +1572,14 @@ static int configure_output_video_filter(FilterGraphPriv *fgp, AVFilterGraph *gr pad_idx = 0; } - snprintf(name, sizeof(name), "trim_out_%s", ofp->name); + snprintf(name, sizeof(name), "trim_out_%s", ofilter->output_name); ret = insert_trim(fgp, ofp->trim_start_us, ofp->trim_duration_us, &last_filter, &pad_idx, name); if (ret < 0) return ret; - if ((ret = avfilter_link(last_filter, pad_idx, ofp->filter, 0)) < 0) + if ((ret = avfilter_link(last_filter, pad_idx, ofilter->filter, 0)) < 0) return ret; return 0; @@ -1419,8 +1595,8 @@ static int configure_output_audio_filter(FilterGraphPriv *fgp, AVFilterGraph *gr char name[255]; int ret; - snprintf(name, sizeof(name), "out_%s", ofp->name); - ret = avfilter_graph_create_filter(&ofp->filter, + snprintf(name, sizeof(name), "out_%s", ofilter->output_name); + ret = avfilter_graph_create_filter(&ofilter->filter, avfilter_get_by_name("abuffersink"), name, NULL, NULL, graph); if (ret < 0) @@ -1457,7 +1633,7 @@ static int configure_output_audio_filter(FilterGraphPriv *fgp, AVFilterGraph *gr if (args.len) { AVFilterContext *format; - snprintf(name, sizeof(name), "format_out_%s", ofp->name); + snprintf(name, sizeof(name), "format_out_%s", ofilter->output_name); ret = avfilter_graph_create_filter(&format, avfilter_get_by_name("aformat"), name, args.str, NULL, graph); @@ -1477,13 +1653,13 @@ static int configure_output_audio_filter(FilterGraphPriv *fgp, AVFilterGraph *gr fgp->have_sources = 1; } - 
snprintf(name, sizeof(name), "trim for output %s", ofp->name); + snprintf(name, sizeof(name), "trim for output %s", ofilter->output_name); ret = insert_trim(fgp, ofp->trim_start_us, ofp->trim_duration_us, &last_filter, &pad_idx, name); if (ret < 0) goto fail; - if ((ret = avfilter_link(last_filter, pad_idx, ofp->filter, 0)) < 0) + if ((ret = avfilter_link(last_filter, pad_idx, ofilter->filter, 0)) < 0) goto fail; fail: av_bprint_finalize(&args, NULL); @@ -1532,8 +1708,8 @@ static int configure_input_video_filter(FilterGraph *fg, AVFilterGraph *graph, snprintf(name, sizeof(name), "graph %d input from stream %s", fg->index, ifp->opts.name); - ifp->filter = avfilter_graph_alloc_filter(graph, buffer_filt, name); - if (!ifp->filter) { + ifilter->filter = avfilter_graph_alloc_filter(graph, buffer_filt, name); + if (!ifilter->filter) { ret = AVERROR(ENOMEM); goto fail; } @@ -1551,16 +1727,16 @@ static int configure_input_video_filter(FilterGraph *fg, AVFilterGraph *graph, par->side_data = ifp->side_data; par->nb_side_data = ifp->nb_side_data; - ret = av_buffersrc_parameters_set(ifp->filter, par); + ret = av_buffersrc_parameters_set(ifilter->filter, par); if (ret < 0) goto fail; av_freep(&par); - ret = avfilter_init_dict(ifp->filter, NULL); + ret = avfilter_init_dict(ifilter->filter, NULL); if (ret < 0) goto fail; - last_filter = ifp->filter; + last_filter = ifilter->filter; desc = av_pix_fmt_desc_get(ifp->format); av_assert0(desc); @@ -1654,7 +1830,7 @@ static int configure_input_audio_filter(FilterGraph *fg, AVFilterGraph *graph, av_bprintf(&args, ":channels=%d", ifp->ch_layout.nb_channels); snprintf(name, sizeof(name), "graph_%d_in_%s", fg->index, ifp->opts.name); - if ((ret = avfilter_graph_create_filter(&ifp->filter, abuffer_filt, + if ((ret = avfilter_graph_create_filter(&ifilter->filter, abuffer_filt, name, args.str, NULL, graph)) < 0) return ret; @@ -1663,11 +1839,11 @@ static int configure_input_audio_filter(FilterGraph *fg, AVFilterGraph *graph, return AVERROR(ENOMEM); par->side_data = ifp->side_data; par->nb_side_data = ifp->nb_side_data; - ret = av_buffersrc_parameters_set(ifp->filter, par); + ret = av_buffersrc_parameters_set(ifilter->filter, par); av_free(par); if (ret < 0) return ret; - last_filter = ifp->filter; + last_filter = ifilter->filter; snprintf(name, sizeof(name), "trim for input stream %s", ifp->opts.name); ret = insert_trim(fg, ifp->opts.trim_start_us, ifp->opts.trim_end_us, @@ -1684,7 +1860,7 @@ static int configure_input_audio_filter(FilterGraph *fg, AVFilterGraph *graph, static int configure_input_filter(FilterGraph *fg, AVFilterGraph *graph, InputFilter *ifilter, AVFilterInOut *in) { - switch (ifp_from_ifilter(ifilter)->type) { + switch (ifilter->type) { case AVMEDIA_TYPE_VIDEO: return configure_input_video_filter(fg, graph, ifilter, in); case AVMEDIA_TYPE_AUDIO: return configure_input_audio_filter(fg, graph, ifilter, in); default: av_assert0(0); return 0; @@ -1694,9 +1870,9 @@ static int configure_input_filter(FilterGraph *fg, AVFilterGraph *graph, static void cleanup_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) { for (int i = 0; i < fg->nb_outputs; i++) - ofp_from_ofilter(fg->outputs[i])->filter = NULL; + fg->outputs[i]->filter = NULL; for (int i = 0; i < fg->nb_inputs; i++) - ifp_from_ifilter(fg->inputs[i])->filter = NULL; + fg->inputs[i]->filter = NULL; avfilter_graph_free(&fgt->graph); } @@ -1733,7 +1909,7 @@ static int configure_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) AVFilterInOut *inputs, *outputs, *cur; int ret = AVERROR_BUG, i, simple = 
filtergraph_is_simple(fg); int have_input_eof = 0; - const char *graph_desc = fgp->graph_desc; + const char *graph_desc = fg->graph_desc; cleanup_filtergraph(fg, fgt); fgt->graph = avfilter_graph_alloc(); @@ -1810,7 +1986,7 @@ static int configure_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) int nb_sd; OutputFilter *ofilter = fg->outputs[i]; OutputFilterPriv *ofp = ofp_from_ofilter(ofilter); - AVFilterContext *sink = ofp->filter; + AVFilterContext *sink = ofilter->filter; ofp->format = av_buffersink_get_format(sink); @@ -1850,6 +2026,7 @@ static int configure_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) } for (int i = 0; i < fg->nb_inputs; i++) { + InputFilter *ifilter = fg->inputs[i]; InputFilterPriv *ifp = ifp_from_ifilter(fg->inputs[i]); AVFrame *tmp; while (av_fifo_read(ifp->frame_queue, &tmp, 1) >= 0) { @@ -1860,7 +2037,7 @@ static int configure_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) if (ifp->displaymatrix_applied) av_frame_remove_side_data(tmp, AV_FRAME_DATA_DISPLAYMATRIX); } - ret = av_buffersrc_add_frame(ifp->filter, tmp); + ret = av_buffersrc_add_frame(ifilter->filter, tmp); } av_frame_free(&tmp); if (ret < 0) @@ -1870,9 +2047,9 @@ static int configure_filtergraph(FilterGraph *fg, FilterGraphThread *fgt) /* send the EOFs for the finished inputs */ for (int i = 0; i < fg->nb_inputs; i++) { - InputFilterPriv *ifp = ifp_from_ifilter(fg->inputs[i]); + InputFilter *ifilter = fg->inputs[i]; if (fgt->eof_in[i]) { - ret = av_buffersrc_add_frame(ifp->filter, NULL); + ret = av_buffersrc_add_frame(ifilter->filter, NULL); if (ret < 0) goto fail; have_input_eof = 1; @@ -1902,7 +2079,7 @@ static int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *fr if (ret < 0) return ret; - ifp->time_base = (ifp->type == AVMEDIA_TYPE_AUDIO) ? (AVRational){ 1, frame->sample_rate } : + ifp->time_base = (ifilter->type == AVMEDIA_TYPE_AUDIO) ? (AVRational){ 1, frame->sample_rate } : (ifp->opts.flags & IFILTER_FLAG_CFR) ? av_inv_q(ifp->opts.framerate) : frame->time_base; @@ -1992,12 +2169,11 @@ static int choose_input(const FilterGraph *fg, const FilterGraphThread *fgt) for (int i = 0; i < fg->nb_inputs; i++) { InputFilter *ifilter = fg->inputs[i]; - InputFilterPriv *ifp = ifp_from_ifilter(ifilter); if (fgt->eof_in[i]) continue; - nb_requests = av_buffersrc_get_nb_failed_requests(ifp->filter); + nb_requests = av_buffersrc_get_nb_failed_requests(ifilter->filter); if (nb_requests > nb_requests_max) { nb_requests_max = nb_requests; best_input = i; @@ -2041,7 +2217,7 @@ static int choose_out_timebase(OutputFilterPriv *ofp, AVFrame *frame) fr = fps->framerate; if (!fr.num) { - AVRational fr_sink = av_buffersink_get_frame_rate(ofp->filter); + AVRational fr_sink = av_buffersink_get_frame_rate(ofilter->filter); if (fr_sink.num > 0 && fr_sink.den > 0) fr = fr_sink; } @@ -2294,16 +2470,16 @@ static int close_output(OutputFilterPriv *ofp, FilterGraphThread *fgt) "No filtered frames for output stream, trying to " "initialize anyway.\n"); - ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->index, frame); + ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->ofilter.index, frame); if (ret < 0) { av_frame_unref(frame); return ret; } } - fgt->eof_out[ofp->index] = 1; + fgt->eof_out[ofp->ofilter.index] = 1; - ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->index, NULL); + ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->ofilter.index, NULL); return (ret == AVERROR_EOF) ? 
0 : ret; } @@ -2356,12 +2532,12 @@ static int fg_output_frame(OutputFilterPriv *ofp, FilterGraphThread *fgt, } // send the frame to consumers - ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->index, frame_out); + ret = sch_filter_send(fgp->sch, fgp->sch_idx, ofp->ofilter.index, frame_out); if (ret < 0) { av_frame_unref(frame_out); - if (!fgt->eof_out[ofp->index]) { - fgt->eof_out[ofp->index] = 1; + if (!fgt->eof_out[ofp->ofilter.index]) { + fgt->eof_out[ofp->ofilter.index] = 1; fgp->nb_outputs_done++; } @@ -2394,13 +2570,13 @@ static int fg_output_step(OutputFilterPriv *ofp, FilterGraphThread *fgt, AVFrame *frame) { FilterGraphPriv *fgp = fgp_from_fg(ofp->ofilter.graph); - AVFilterContext *filter = ofp->filter; + AVFilterContext *filter = ofp->ofilter.filter; FrameData *fd; int ret; ret = av_buffersink_get_frame_flags(filter, frame, AV_BUFFERSINK_FLAG_NO_REQUEST); - if (ret == AVERROR_EOF && !fgt->eof_out[ofp->index]) { + if (ret == AVERROR_EOF && !fgt->eof_out[ofp->ofilter.index]) { ret = fg_output_frame(ofp, fgt, NULL); return (ret < 0) ? ret : 1; } else if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { @@ -2412,7 +2588,7 @@ static int fg_output_step(OutputFilterPriv *ofp, FilterGraphThread *fgt, return ret; } - if (fgt->eof_out[ofp->index]) { + if (fgt->eof_out[ofp->ofilter.index]) { av_frame_unref(frame); return 0; } @@ -2587,7 +2763,7 @@ static int sub2video_frame(InputFilter *ifilter, AVFrame *frame, int buffer) if (ifp->sub2video.end_pts < INT64_MAX) sub2video_update(ifp, INT64_MAX, NULL); - return av_buffersrc_add_frame(ifp->filter, NULL); + return av_buffersrc_add_frame(ifilter->filter, NULL); } ifp->width = frame->width ? frame->width : ifp->width; @@ -2604,16 +2780,16 @@ static int send_eof(FilterGraphThread *fgt, InputFilter *ifilter, InputFilterPriv *ifp = ifp_from_ifilter(ifilter); int ret; - if (fgt->eof_in[ifp->index]) + if (fgt->eof_in[ifilter->index]) return 0; - fgt->eof_in[ifp->index] = 1; + fgt->eof_in[ifilter->index] = 1; - if (ifp->filter) { + if (ifilter->filter) { pts = av_rescale_q_rnd(pts, tb, ifp->time_base, AV_ROUND_NEAR_INF | AV_ROUND_PASS_MINMAX); - ret = av_buffersrc_close(ifp->filter, pts, AV_BUFFERSRC_FLAG_PUSH); + ret = av_buffersrc_close(ifilter->filter, pts, AV_BUFFERSRC_FLAG_PUSH); if (ret < 0) return ret; } else { @@ -2682,7 +2858,7 @@ static int send_frame(FilterGraph *fg, FilterGraphThread *fgt, int need_reinit = 0, ret; /* determine if the parameters for this input changed */ - switch (ifp->type) { + switch (ifilter->type) { case AVMEDIA_TYPE_AUDIO: if (ifp->format != frame->format || ifp->sample_rate != frame->sample_rate || @@ -2802,7 +2978,7 @@ static int send_frame(FilterGraph *fg, FilterGraphThread *fgt, return AVERROR(ENOMEM); fd->wallclock[LATENCY_PROBE_FILTER_PRE] = av_gettime_relative(); - ret = av_buffersrc_add_frame_flags(ifp->filter, frame, + ret = av_buffersrc_add_frame_flags(ifilter->filter, frame, AV_BUFFERSRC_FLAG_PUSH); if (ret < 0) { av_frame_unref(frame); @@ -2821,7 +2997,7 @@ static void fg_thread_set_name(const FilterGraph *fg) OutputFilterPriv *ofp = ofp_from_ofilter(fg->outputs[0]); snprintf(name, sizeof(name), "%cf%s", av_get_media_type_string(ofp->ofilter.type)[0], - ofp->name); + ofp->ofilter.output_name); } else { snprintf(name, sizeof(name), "fc%d", fg->index); } diff --git a/fftools/ffmpeg_filter.h b/fftools/ffmpeg_filter.h deleted file mode 100644 index 94b94beece6be..0000000000000 --- a/fftools/ffmpeg_filter.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFTOOLS_FFMPEG_FILTER_H -#define FFTOOLS_FFMPEG_FILTER_H - -#include "ffmpeg.h" - -#include - -#include "ffmpeg_sched.h" -#include "sync_queue.h" - -#include "libavfilter/avfilter.h" - -#include "libavutil/avutil.h" -#include "libavutil/dict.h" -#include "libavutil/fifo.h" -#include "libavutil/pixfmt.h" -#include "libavutil/rational.h" -#include "libavutil/bprint.h" -#include "libavutil/channel_layout.h" -#include "libavutil/downmix_info.h" - -typedef struct FilterGraphPriv { - FilterGraph fg; - - // name used for logging - char log_name[32]; - - int is_simple; - // true when the filtergraph contains only meta filters - // that do not modify the frame data - int is_meta; - // source filters are present in the graph - int have_sources; - int disable_conversions; - - unsigned nb_outputs_done; - - const char *graph_desc; - - int nb_threads; - - // frame for temporarily holding output from the filtergraph - AVFrame *frame; - // frame for sending output to the encoder - AVFrame *frame_enc; - - Scheduler *sch; - unsigned sch_idx; - - AVBPrint graph_print_buf; - -} FilterGraphPriv; - -static inline FilterGraphPriv *fgp_from_fg(FilterGraph *fg) -{ - return (FilterGraphPriv*)fg; -} - -static inline const FilterGraphPriv *cfgp_from_cfg(const FilterGraph *fg) -{ - return (const FilterGraphPriv*)fg; -} - -typedef struct InputFilterPriv { - InputFilter ifilter; - - InputFilterOptions opts; - - int index; - - AVFilterContext *filter; - - // used to hold submitted input - AVFrame *frame; - - /* for filters that are not yet bound to an input stream, - * this stores the input linklabel, if any */ - uint8_t *linklabel; - - // filter data type - enum AVMediaType type; - // source data type: AVMEDIA_TYPE_SUBTITLE for sub2video, - // same as type otherwise - enum AVMediaType type_src; - - int eof; - int bound; - int drop_warned; - uint64_t nb_dropped; - - // parameters configured for this input - int format; - - int width, height; - AVRational sample_aspect_ratio; - enum AVColorSpace color_space; - enum AVColorRange color_range; - - int sample_rate; - AVChannelLayout ch_layout; - - AVRational time_base; - - AVFrameSideData **side_data; - int nb_side_data; - - AVFifo *frame_queue; - - AVBufferRef *hw_frames_ctx; - - int displaymatrix_present; - int displaymatrix_applied; - int32_t displaymatrix[9]; - - int downmixinfo_present; - AVDownmixInfo downmixinfo; - - struct { - AVFrame *frame; - - int64_t last_pts; - int64_t end_pts; - - /// marks if sub2video_update should force an initialization - unsigned int initialize; - } sub2video; -} InputFilterPriv; - -static inline InputFilterPriv *ifp_from_ifilter(InputFilter *ifilter) -{ - return (InputFilterPriv*)ifilter; -} - -typedef struct FPSConvContext { - AVFrame *last_frame; - /* number of frames emitted by the video-encoding 
sync code */ - int64_t frame_number; - /* history of nb_frames_prev, i.e. the number of times the - * previous frame was duplicated by vsync code in recent - * do_video_out() calls */ - int64_t frames_prev_hist[3]; - - uint64_t dup_warning; - - int last_dropped; - int dropped_keyframe; - - enum VideoSyncMethod vsync_method; - - AVRational framerate; - AVRational framerate_max; - const AVRational *framerate_supported; - int framerate_clip; -} FPSConvContext; - - -typedef struct OutputFilterPriv { - OutputFilter ofilter; - - int index; - - void *log_parent; - char log_name[32]; - - char *name; - - AVFilterContext *filter; - - /* desired output stream properties */ - int format; - int width, height; - int sample_rate; - AVChannelLayout ch_layout; - enum AVColorSpace color_space; - enum AVColorRange color_range; - - AVFrameSideData **side_data; - int nb_side_data; - - // time base in which the output is sent to our downstream - // does not need to match the filtersink's timebase - AVRational tb_out; - // at least one frame with the above timebase was sent - // to our downstream, so it cannot change anymore - int tb_out_locked; - - AVRational sample_aspect_ratio; - - AVDictionary *sws_opts; - AVDictionary *swr_opts; - - // those are only set if no format is specified and the encoder gives us multiple options - // They point directly to the relevant lists of the encoder. - const int *formats; - const AVChannelLayout *ch_layouts; - const int *sample_rates; - const enum AVColorSpace *color_spaces; - const enum AVColorRange *color_ranges; - - AVRational enc_timebase; - int64_t trim_start_us; - int64_t trim_duration_us; - // offset for output timestamps, in AV_TIME_BASE_Q - int64_t ts_offset; - int64_t next_pts; - FPSConvContext fps; - - unsigned flags; -} OutputFilterPriv; - -static inline OutputFilterPriv *ofp_from_ofilter(OutputFilter *ofilter) -{ - return (OutputFilterPriv*)ofilter; -} - -#endif /* FFTOOLS_FFMPEG_FILTER_H */ diff --git a/fftools/ffprobe.c b/fftools/ffprobe.c index 80ce38e73bd2e..1346ed33c5c81 100644 --- a/fftools/ffprobe.c +++ b/fftools/ffprobe.c @@ -457,6 +457,43 @@ static inline int show_tags(AVTextFormatContext *tfc, AVDictionary *tags, int se return ret; } +static void print_displaymatrix(AVTextFormatContext *tfc, const int32_t matrix[9]) +{ + double rotation = av_display_rotation_get(matrix); + if (isnan(rotation)) + rotation = 0; + avtext_print_integers(tfc, "displaymatrix", (void*)matrix, 9, " %11d", 3, 4, 1); + print_int("rotation", rotation); +} + +static void print_mastering_display_metadata(AVTextFormatContext *tfc, + const AVMasteringDisplayMetadata *metadata) +{ + if (metadata->has_primaries) { + print_q("red_x", metadata->display_primaries[0][0], '/'); + print_q("red_y", metadata->display_primaries[0][1], '/'); + print_q("green_x", metadata->display_primaries[1][0], '/'); + print_q("green_y", metadata->display_primaries[1][1], '/'); + print_q("blue_x", metadata->display_primaries[2][0], '/'); + print_q("blue_y", metadata->display_primaries[2][1], '/'); + + print_q("white_point_x", metadata->white_point[0], '/'); + print_q("white_point_y", metadata->white_point[1], '/'); + } + + if (metadata->has_luminance) { + print_q("min_luminance", metadata->min_luminance, '/'); + print_q("max_luminance", metadata->max_luminance, '/'); + } +} + +static void print_context_light_level(AVTextFormatContext *tfc, + const AVContentLightMetadata *metadata) +{ + print_int("max_content", metadata->MaxCLL); + print_int("max_average", metadata->MaxFALL); +} + static void 
print_dovi_metadata(AVTextFormatContext *tfc, const AVDOVIMetadata *dovi) { if (!dovi) @@ -929,121 +966,98 @@ static void print_pkt_side_data(AVTextFormatContext *tfc, const AVPacketSideData *sd, SectionID id_data) { - const char *name = av_packet_side_data_name(sd->type); - - avtext_print_section_header(tfc, sd, id_data); - print_str("side_data_type", name ? name : "unknown"); - if (sd->type == AV_PKT_DATA_DISPLAYMATRIX && sd->size >= 9*4) { - double rotation = av_display_rotation_get((int32_t *)sd->data); - if (isnan(rotation)) - rotation = 0; - avtext_print_integers(tfc, "displaymatrix", sd->data, 9, " %11d", 3, 4, 1); - print_int("rotation", rotation); - } else if (sd->type == AV_PKT_DATA_STEREO3D) { - const AVStereo3D *stereo = (AVStereo3D *)sd->data; - print_str("type", av_stereo3d_type_name(stereo->type)); - print_int("inverted", !!(stereo->flags & AV_STEREO3D_FLAG_INVERT)); - print_str("view", av_stereo3d_view_name(stereo->view)); - print_str("primary_eye", av_stereo3d_primary_eye_name(stereo->primary_eye)); - print_int("baseline", stereo->baseline); - print_q("horizontal_disparity_adjustment", stereo->horizontal_disparity_adjustment, '/'); - print_q("horizontal_field_of_view", stereo->horizontal_field_of_view, '/'); - } else if (sd->type == AV_PKT_DATA_SPHERICAL) { - const AVSphericalMapping *spherical = (AVSphericalMapping *)sd->data; - print_str("projection", av_spherical_projection_name(spherical->projection)); - if (spherical->projection == AV_SPHERICAL_CUBEMAP) { - print_int("padding", spherical->padding); - } else if (spherical->projection == AV_SPHERICAL_EQUIRECTANGULAR_TILE) { - size_t l, t, r, b; - av_spherical_tile_bounds(spherical, par->width, par->height, - &l, &t, &r, &b); - print_int("bound_left", l); - print_int("bound_top", t); - print_int("bound_right", r); - print_int("bound_bottom", b); - } - - print_int("yaw", (double) spherical->yaw / (1 << 16)); - print_int("pitch", (double) spherical->pitch / (1 << 16)); - print_int("roll", (double) spherical->roll / (1 << 16)); - } else if (sd->type == AV_PKT_DATA_SKIP_SAMPLES && sd->size == 10) { - print_int("skip_samples", AV_RL32(sd->data)); - print_int("discard_padding", AV_RL32(sd->data + 4)); - print_int("skip_reason", AV_RL8(sd->data + 8)); - print_int("discard_reason", AV_RL8(sd->data + 9)); - } else if (sd->type == AV_PKT_DATA_MASTERING_DISPLAY_METADATA) { - AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data; - - if (metadata->has_primaries) { - print_q("red_x", metadata->display_primaries[0][0], '/'); - print_q("red_y", metadata->display_primaries[0][1], '/'); - print_q("green_x", metadata->display_primaries[1][0], '/'); - print_q("green_y", metadata->display_primaries[1][1], '/'); - print_q("blue_x", metadata->display_primaries[2][0], '/'); - print_q("blue_y", metadata->display_primaries[2][1], '/'); - - print_q("white_point_x", metadata->white_point[0], '/'); - print_q("white_point_y", metadata->white_point[1], '/'); - } - - if (metadata->has_luminance) { - print_q("min_luminance", metadata->min_luminance, '/'); - print_q("max_luminance", metadata->max_luminance, '/'); - } - } else if (sd->type == AV_PKT_DATA_CONTENT_LIGHT_LEVEL) { - AVContentLightMetadata *metadata = (AVContentLightMetadata *)sd->data; - print_int("max_content", metadata->MaxCLL); - print_int("max_average", metadata->MaxFALL); - } else if (sd->type == AV_PKT_DATA_AMBIENT_VIEWING_ENVIRONMENT) { - print_ambient_viewing_environment( - tfc, (const AVAmbientViewingEnvironment *)sd->data); - } else if (sd->type == 
AV_PKT_DATA_DYNAMIC_HDR10_PLUS) { - AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus *)sd->data; - print_dynamic_hdr10_plus(tfc, metadata); - } else if (sd->type == AV_PKT_DATA_DOVI_CONF) { - AVDOVIDecoderConfigurationRecord *dovi = (AVDOVIDecoderConfigurationRecord *)sd->data; - const char *comp = "unknown"; - print_int("dv_version_major", dovi->dv_version_major); - print_int("dv_version_minor", dovi->dv_version_minor); - print_int("dv_profile", dovi->dv_profile); - print_int("dv_level", dovi->dv_level); - print_int("rpu_present_flag", dovi->rpu_present_flag); - print_int("el_present_flag", dovi->el_present_flag); - print_int("bl_present_flag", dovi->bl_present_flag); - print_int("dv_bl_signal_compatibility_id", dovi->dv_bl_signal_compatibility_id); - switch (dovi->dv_md_compression) - { - case AV_DOVI_COMPRESSION_NONE: comp = "none"; break; - case AV_DOVI_COMPRESSION_LIMITED: comp = "limited"; break; - case AV_DOVI_COMPRESSION_RESERVED: comp = "reserved"; break; - case AV_DOVI_COMPRESSION_EXTENDED: comp = "extended"; break; - } - print_str("dv_md_compression", comp); - } else if (sd->type == AV_PKT_DATA_AUDIO_SERVICE_TYPE) { - enum AVAudioServiceType *t = (enum AVAudioServiceType *)sd->data; - print_int("service_type", *t); - } else if (sd->type == AV_PKT_DATA_MPEGTS_STREAM_ID) { - print_int("id", *sd->data); - } else if (sd->type == AV_PKT_DATA_CPB_PROPERTIES) { - const AVCPBProperties *prop = (AVCPBProperties *)sd->data; - print_int("max_bitrate", prop->max_bitrate); - print_int("min_bitrate", prop->min_bitrate); - print_int("avg_bitrate", prop->avg_bitrate); - print_int("buffer_size", prop->buffer_size); - print_int("vbv_delay", prop->vbv_delay); - } else if (sd->type == AV_PKT_DATA_WEBVTT_IDENTIFIER || - sd->type == AV_PKT_DATA_WEBVTT_SETTINGS) { - if (do_show_data) - avtext_print_data(tfc, "data", sd->data, sd->size); - avtext_print_data_hash(tfc, "data_hash", sd->data, sd->size); - } else if (sd->type == AV_PKT_DATA_FRAME_CROPPING && sd->size >= sizeof(uint32_t) * 4) { - print_int("crop_top", AV_RL32(sd->data)); - print_int("crop_bottom", AV_RL32(sd->data + 4)); - print_int("crop_left", AV_RL32(sd->data + 8)); - print_int("crop_right", AV_RL32(sd->data + 12)); - } else if (sd->type == AV_PKT_DATA_AFD && sd->size > 0) { - print_int("active_format", *sd->data); - } + const char *name = av_packet_side_data_name(sd->type); + + avtext_print_section_header(tfc, sd, id_data); + print_str("side_data_type", name ? 
name : "unknown"); + if (sd->type == AV_PKT_DATA_DISPLAYMATRIX && sd->size >= 9*4) { + print_displaymatrix(tfc, (const int32_t*)sd->data); + } else if (sd->type == AV_PKT_DATA_STEREO3D) { + const AVStereo3D *stereo = (AVStereo3D *)sd->data; + print_str("type", av_stereo3d_type_name(stereo->type)); + print_int("inverted", !!(stereo->flags & AV_STEREO3D_FLAG_INVERT)); + print_str("view", av_stereo3d_view_name(stereo->view)); + print_str("primary_eye", av_stereo3d_primary_eye_name(stereo->primary_eye)); + print_int("baseline", stereo->baseline); + print_q("horizontal_disparity_adjustment", stereo->horizontal_disparity_adjustment, '/'); + print_q("horizontal_field_of_view", stereo->horizontal_field_of_view, '/'); + } else if (sd->type == AV_PKT_DATA_SPHERICAL) { + const AVSphericalMapping *spherical = (AVSphericalMapping *)sd->data; + print_str("projection", av_spherical_projection_name(spherical->projection)); + if (spherical->projection == AV_SPHERICAL_CUBEMAP) { + print_int("padding", spherical->padding); + } else if (spherical->projection == AV_SPHERICAL_EQUIRECTANGULAR_TILE) { + size_t l, t, r, b; + av_spherical_tile_bounds(spherical, par->width, par->height, + &l, &t, &r, &b); + print_int("bound_left", l); + print_int("bound_top", t); + print_int("bound_right", r); + print_int("bound_bottom", b); + } + + print_int("yaw", (double) spherical->yaw / (1 << 16)); + print_int("pitch", (double) spherical->pitch / (1 << 16)); + print_int("roll", (double) spherical->roll / (1 << 16)); + } else if (sd->type == AV_PKT_DATA_SKIP_SAMPLES && sd->size == 10) { + print_int("skip_samples", AV_RL32(sd->data)); + print_int("discard_padding", AV_RL32(sd->data + 4)); + print_int("skip_reason", AV_RL8(sd->data + 8)); + print_int("discard_reason", AV_RL8(sd->data + 9)); + } else if (sd->type == AV_PKT_DATA_MASTERING_DISPLAY_METADATA) { + print_mastering_display_metadata(tfc, (AVMasteringDisplayMetadata *)sd->data); + } else if (sd->type == AV_PKT_DATA_CONTENT_LIGHT_LEVEL) { + print_context_light_level(tfc, (AVContentLightMetadata *)sd->data); + } else if (sd->type == AV_PKT_DATA_AMBIENT_VIEWING_ENVIRONMENT) { + print_ambient_viewing_environment( + tfc, (const AVAmbientViewingEnvironment *)sd->data); + } else if (sd->type == AV_PKT_DATA_DYNAMIC_HDR10_PLUS) { + AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus *)sd->data; + print_dynamic_hdr10_plus(tfc, metadata); + } else if (sd->type == AV_PKT_DATA_DOVI_CONF) { + AVDOVIDecoderConfigurationRecord *dovi = (AVDOVIDecoderConfigurationRecord *)sd->data; + const char *comp = "unknown"; + print_int("dv_version_major", dovi->dv_version_major); + print_int("dv_version_minor", dovi->dv_version_minor); + print_int("dv_profile", dovi->dv_profile); + print_int("dv_level", dovi->dv_level); + print_int("rpu_present_flag", dovi->rpu_present_flag); + print_int("el_present_flag", dovi->el_present_flag); + print_int("bl_present_flag", dovi->bl_present_flag); + print_int("dv_bl_signal_compatibility_id", dovi->dv_bl_signal_compatibility_id); + switch (dovi->dv_md_compression) + { + case AV_DOVI_COMPRESSION_NONE: comp = "none"; break; + case AV_DOVI_COMPRESSION_LIMITED: comp = "limited"; break; + case AV_DOVI_COMPRESSION_RESERVED: comp = "reserved"; break; + case AV_DOVI_COMPRESSION_EXTENDED: comp = "extended"; break; + } + print_str("dv_md_compression", comp); + } else if (sd->type == AV_PKT_DATA_AUDIO_SERVICE_TYPE) { + enum AVAudioServiceType *t = (enum AVAudioServiceType *)sd->data; + print_int("service_type", *t); + } else if (sd->type == AV_PKT_DATA_MPEGTS_STREAM_ID) { + 
print_int("id", *sd->data); + } else if (sd->type == AV_PKT_DATA_CPB_PROPERTIES) { + const AVCPBProperties *prop = (AVCPBProperties *)sd->data; + print_int("max_bitrate", prop->max_bitrate); + print_int("min_bitrate", prop->min_bitrate); + print_int("avg_bitrate", prop->avg_bitrate); + print_int("buffer_size", prop->buffer_size); + print_int("vbv_delay", prop->vbv_delay); + } else if (sd->type == AV_PKT_DATA_WEBVTT_IDENTIFIER || + sd->type == AV_PKT_DATA_WEBVTT_SETTINGS) { + if (do_show_data) + avtext_print_data(tfc, "data", sd->data, sd->size); + avtext_print_data_hash(tfc, "data_hash", sd->data, sd->size); + } else if (sd->type == AV_PKT_DATA_FRAME_CROPPING && sd->size >= sizeof(uint32_t) * 4) { + print_int("crop_top", AV_RL32(sd->data)); + print_int("crop_bottom", AV_RL32(sd->data + 4)); + print_int("crop_left", AV_RL32(sd->data + 8)); + print_int("crop_right", AV_RL32(sd->data + 12)); + } else if (sd->type == AV_PKT_DATA_AFD && sd->size > 0) { + print_int("active_format", *sd->data); + } } static void print_private_data(AVTextFormatContext *tfc, void *priv_data) @@ -1279,11 +1293,7 @@ static void print_frame_side_data(AVTextFormatContext *tfc, name = av_frame_side_data_name(sd->type); print_str("side_data_type", name ? name : "unknown"); if (sd->type == AV_FRAME_DATA_DISPLAYMATRIX && sd->size >= 9*4) { - double rotation = av_display_rotation_get((int32_t *)sd->data); - if (isnan(rotation)) - rotation = 0; - avtext_print_integers(tfc, "displaymatrix", sd->data, 9, " %11d", 3, 4, 1); - print_int("rotation", rotation); + print_displaymatrix(tfc, (const int32_t*)sd->data); } else if (sd->type == AV_FRAME_DATA_AFD && sd->size > 0) { print_int("active_format", *sd->data); } else if (sd->type == AV_FRAME_DATA_GOP_TIMECODE && sd->size >= 8) { @@ -1303,31 +1313,12 @@ static void print_frame_side_data(AVTextFormatContext *tfc, } avtext_print_section_footer(tfc); } else if (sd->type == AV_FRAME_DATA_MASTERING_DISPLAY_METADATA) { - AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data; - - if (metadata->has_primaries) { - print_q("red_x", metadata->display_primaries[0][0], '/'); - print_q("red_y", metadata->display_primaries[0][1], '/'); - print_q("green_x", metadata->display_primaries[1][0], '/'); - print_q("green_y", metadata->display_primaries[1][1], '/'); - print_q("blue_x", metadata->display_primaries[2][0], '/'); - print_q("blue_y", metadata->display_primaries[2][1], '/'); - - print_q("white_point_x", metadata->white_point[0], '/'); - print_q("white_point_y", metadata->white_point[1], '/'); - } - - if (metadata->has_luminance) { - print_q("min_luminance", metadata->min_luminance, '/'); - print_q("max_luminance", metadata->max_luminance, '/'); - } + print_mastering_display_metadata(tfc, (AVMasteringDisplayMetadata *)sd->data); } else if (sd->type == AV_FRAME_DATA_DYNAMIC_HDR_PLUS) { AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus *)sd->data; print_dynamic_hdr10_plus(tfc, metadata); } else if (sd->type == AV_FRAME_DATA_CONTENT_LIGHT_LEVEL) { - AVContentLightMetadata *metadata = (AVContentLightMetadata *)sd->data; - print_int("max_content", metadata->MaxCLL); - print_int("max_average", metadata->MaxFALL); + print_context_light_level(tfc, (AVContentLightMetadata *)sd->data); } else if (sd->type == AV_FRAME_DATA_ICC_PROFILE) { const AVDictionaryEntry *tag = av_dict_get(sd->metadata, "name", NULL, AV_DICT_MATCH_CASE); if (tag) diff --git a/fftools/graph/graphprint.c b/fftools/graph/graphprint.c index fc94a75797935..e4c6886cf8eef 100644 --- a/fftools/graph/graphprint.c +++ 
b/fftools/graph/graphprint.c @@ -28,7 +28,7 @@ #include "graphprint.h" -#include "fftools/ffmpeg_filter.h" +#include "fftools/ffmpeg.h" #include "fftools/ffmpeg_mux.h" #include "libavutil/avassert.h" @@ -318,6 +318,7 @@ static void print_link(GraphPrintContext *gpc, AVFilterLink *link) if (hw_frames_ctx && hw_frames_ctx->data) print_hwframescontext(gpc, (AVHWFramesContext *)hw_frames_ctx->data); + av_buffer_unref(&hw_frames_ctx); } static char sanitize_char(const char c) @@ -478,19 +479,18 @@ static void init_sections(void) static void print_filtergraph_single(GraphPrintContext *gpc, FilterGraph *fg, AVFilterGraph *graph) { AVTextFormatContext *tfc = gpc->tfc; - FilterGraphPriv *fgp = fgp_from_fg(fg); AVDictionary *input_map = NULL; AVDictionary *output_map = NULL; print_int("graph_index", fg->index); print_fmt("name", "Graph %d.%d", gpc->id_prefix_num, fg->index); print_fmt("id", "Graph_%d_%d", gpc->id_prefix_num, fg->index); - print_str("description", fgp->graph_desc); + print_str("description", fg->graph_desc); print_section_header_id(gpc, SECTION_ID_GRAPH_INPUTS, "Input_File", 0); for (int i = 0; i < fg->nb_inputs; i++) { - InputFilterPriv *ifilter = ifp_from_ifilter(fg->inputs[i]); + InputFilter *ifilter = fg->inputs[i]; enum AVMediaType media_type = ifilter->type; avtext_print_section_header(tfc, NULL, SECTION_ID_GRAPH_INPUT); @@ -507,8 +507,8 @@ static void print_filtergraph_single(GraphPrintContext *gpc, FilterGraph *fg, AV if (ifilter->linklabel && ifilter->filter) av_dict_set(&input_map, ifilter->filter->name, (const char *)ifilter->linklabel, 0); - else if (ifilter->opts.name && ifilter->filter) - av_dict_set(&input_map, ifilter->filter->name, (const char *)ifilter->opts.name, 0); + else if (ifilter->input_name && ifilter->filter) + av_dict_set(&input_map, ifilter->filter->name, (const char *)ifilter->input_name, 0); print_str("media_type", av_get_media_type_string(media_type)); @@ -520,13 +520,13 @@ static void print_filtergraph_single(GraphPrintContext *gpc, FilterGraph *fg, AV print_section_header_id(gpc, SECTION_ID_GRAPH_OUTPUTS, "Output_File", 0); for (int i = 0; i < fg->nb_outputs; i++) { - OutputFilterPriv *ofilter = ofp_from_ofilter(fg->outputs[i]); + OutputFilter *ofilter = fg->outputs[i]; avtext_print_section_header(tfc, NULL, SECTION_ID_GRAPH_OUTPUT); print_int("output_index", ofilter->index); - print_str("name", ofilter->name); + print_str("name", ofilter->output_name); if (fg->outputs[i]->linklabel) print_str("link_label", (const char*)fg->outputs[i]->linklabel); @@ -536,11 +536,11 @@ static void print_filtergraph_single(GraphPrintContext *gpc, FilterGraph *fg, AV print_str("filter_name", ofilter->filter->filter->name); } - if (ofilter->name && ofilter->filter) - av_dict_set(&output_map, ofilter->filter->name, ofilter->name, 0); + if (ofilter->output_name && ofilter->filter) + av_dict_set(&output_map, ofilter->filter->name, ofilter->output_name, 0); - print_str("media_type", av_get_media_type_string(fg->outputs[i]->type)); + print_str("media_type", av_get_media_type_string(ofilter->type)); avtext_print_section_footer(tfc); // SECTION_ID_GRAPH_OUTPUT } @@ -556,7 +556,7 @@ static void print_filtergraph_single(GraphPrintContext *gpc, FilterGraph *fg, AV if (gpc->is_diagram) { print_fmt("name", "Graph %d.%d", gpc->id_prefix_num, fg->index); - print_str("description", fgp->graph_desc); + print_str("description", fg->graph_desc); print_str("id", sec_ctx.context_id); } @@ -780,6 +780,8 @@ static int print_streams(GraphPrintContext *gpc, InputFile **ifiles, int nb_ifil 
avtext_print_section_header(tfc, &sec_ctx, SECTION_ID_OUTPUTSTREAMS); + av_freep(&sec_ctx.context_id); + for (int i = 0; i < of->nb_streams; i++) { OutputStream *ost = of->streams[i]; const AVCodecDescriptor *codec_desc = avcodec_descriptor_get(ost->st->codecpar->codec_id); @@ -862,6 +864,8 @@ static void uninit_graphprint(GraphPrintContext *gpc) // Finalize the print buffer if it was initialized av_bprint_finalize(&gpc->pbuf, NULL); + + av_freep(&gpc); } static int init_graphprint(GraphPrintContext **pgpc, AVBPrint *target_buf) @@ -870,8 +874,6 @@ static int init_graphprint(GraphPrintContext **pgpc, AVBPrint *target_buf) AVTextFormatContext *tfc = NULL; AVTextWriterContext *wctx = NULL; GraphPrintContext *gpc = NULL; - char *w_args = NULL; - char *w_name; int ret; init_sections(); @@ -879,19 +881,7 @@ static int init_graphprint(GraphPrintContext **pgpc, AVBPrint *target_buf) av_bprint_init(target_buf, 0, AV_BPRINT_SIZE_UNLIMITED); - if (!print_graphs_format) - print_graphs_format = av_strdup("json"); - if (!print_graphs_format) { - ret = AVERROR(ENOMEM); - goto fail; - } - - w_name = av_strtok(print_graphs_format, "=", &w_args); - if (!w_name) { - av_log(NULL, AV_LOG_ERROR, "No name specified for the filter graph output format\n"); - ret = AVERROR(EINVAL); - goto fail; - } + const char *w_name = print_graphs_format ? print_graphs_format : "json"; text_formatter = avtext_get_formatter_by_name(w_name); if (!text_formatter) { @@ -908,6 +898,9 @@ static int init_graphprint(GraphPrintContext **pgpc, AVBPrint *target_buf) } AVTextFormatOptions tf_options = { .show_optional_fields = -1 }; + const char *w_args = print_graphs_format ? strchr(print_graphs_format, '=') : NULL; + if (w_args) + ++w_args; // consume '=' ret = avtext_context_open(&tfc, text_formatter, wctx, w_args, sections, FF_ARRAY_ELEMS(sections), tf_options, NULL); if (ret < 0) { goto fail; @@ -962,11 +955,10 @@ int print_filtergraph(FilterGraph *fg, AVFilterGraph *graph) { GraphPrintContext *gpc = NULL; AVTextFormatContext *tfc; - FilterGraphPriv *fgp = fgp_from_fg(fg); - AVBPrint *target_buf = &fgp->graph_print_buf; + AVBPrint *target_buf = &fg->graph_print_buf; int ret; - if (!fg || !fgp) { + if (!fg) { av_log(NULL, AV_LOG_ERROR, "Invalid filter graph provided\n"); return AVERROR(EINVAL); } @@ -1030,8 +1022,7 @@ static int print_filtergraphs_priv(FilterGraph **graphs, int nb_graphs, InputFil avtext_print_section_header(tfc, NULL, SECTION_ID_FILTERGRAPHS); for (int i = 0; i < nb_graphs; i++) { - FilterGraphPriv *fgp = fgp_from_fg(graphs[i]); - AVBPrint *graph_buf = &fgp->graph_print_buf; + AVBPrint *graph_buf = &graphs[i]->graph_print_buf; if (graph_buf->len > 0) { avtext_print_section_header(tfc, NULL, SECTION_ID_FILTERGRAPH); @@ -1048,8 +1039,7 @@ static int print_filtergraphs_priv(FilterGraph **graphs, int nb_graphs, InputFil OutputStream *ost = of->streams[i]; if (ost->fg_simple) { - FilterGraphPriv *fgp = fgp_from_fg(ost->fg_simple); - AVBPrint *graph_buf = &fgp->graph_print_buf; + AVBPrint *graph_buf = &ost->fg_simple->graph_print_buf; if (graph_buf->len > 0) { avtext_print_section_header(tfc, NULL, SECTION_ID_FILTERGRAPH); @@ -1080,7 +1070,6 @@ static int print_filtergraphs_priv(FilterGraph **graphs, int nb_graphs, InputFil } avio_write(avio, (const unsigned char *)target_buf.str, FFMIN(target_buf.len, target_buf.size - 1)); - avio_flush(avio); if ((ret = avio_closep(&avio)) < 0) av_log(NULL, AV_LOG_ERROR, "Error closing graph output file, loss of information possible: %s\n", av_err2str(ret)); @@ -1103,5 +1092,7 @@ static 
int print_filtergraphs_priv(FilterGraph **graphs, int nb_graphs, InputFil int print_filtergraphs(FilterGraph **graphs, int nb_graphs, InputFile **ifiles, int nb_ifiles, OutputFile **ofiles, int nb_ofiles) { - return print_filtergraphs_priv(graphs, nb_graphs, ifiles, nb_ifiles, ofiles, nb_ofiles); + int ret = print_filtergraphs_priv(graphs, nb_graphs, ifiles, nb_ifiles, ofiles, nb_ofiles); + ff_resman_uninit(); + return ret; } diff --git a/fftools/resources/.gitignore b/fftools/resources/.gitignore index 5f496535a605f..bda2c59a1c9ed 100644 --- a/fftools/resources/.gitignore +++ b/fftools/resources/.gitignore @@ -2,3 +2,5 @@ *.css.c *.html.gz *.css.gz +*.min +*.min.gz diff --git a/fftools/resources/resman.c b/fftools/resources/resman.c index a9e21626fa81c..aa53e96bf4cc6 100644 --- a/fftools/resources/resman.c +++ b/fftools/resources/resman.c @@ -32,7 +32,6 @@ #endif #include "resman.h" -#include "fftools/ffmpeg_filter.h" #include "libavutil/avassert.h" #include "libavutil/pixdesc.h" #include "libavutil/dict.h" @@ -61,7 +60,7 @@ typedef struct ResourceManagerContext { static AVMutex mutex = AV_MUTEX_INITIALIZER; -ResourceManagerContext *resman_ctx = NULL; +static ResourceManagerContext resman_ctx = { .class = &resman_class }; #if CONFIG_RESOURCE_COMPRESSION @@ -118,39 +117,11 @@ static int decompress_gzip(ResourceManagerContext *ctx, uint8_t *in, unsigned in } #endif -static ResourceManagerContext *get_resman_context(void) -{ - ResourceManagerContext *res = resman_ctx; - - ff_mutex_lock(&mutex); - - if (res) - goto end; - - res = av_mallocz(sizeof(ResourceManagerContext)); - if (!res) { - av_log(NULL, AV_LOG_ERROR, "Failed to allocate resource manager context\n"); - goto end; - } - - res->class = &resman_class; - resman_ctx = res; - -end: - ff_mutex_unlock(&mutex); - return res; -} - - void ff_resman_uninit(void) { ff_mutex_lock(&mutex); - if (resman_ctx) { - if (resman_ctx->resource_dic) - av_dict_free(&resman_ctx->resource_dic); - av_freep(&resman_ctx); - } + av_dict_free(&resman_ctx.resource_dic); ff_mutex_unlock(&mutex); } @@ -158,14 +129,11 @@ void ff_resman_uninit(void) char *ff_resman_get_string(FFResourceId resource_id) { - ResourceManagerContext *ctx = get_resman_context(); + ResourceManagerContext *ctx = &resman_ctx; FFResourceDefinition resource_definition = { 0 }; AVDictionaryEntry *dic_entry; char *res = NULL; - if (!ctx) - return NULL; - for (unsigned i = 0; i < FF_ARRAY_ELEMS(resource_definitions); ++i) { FFResourceDefinition def = resource_definitions[i]; if (def.resource_id == resource_id) { @@ -174,10 +142,7 @@ char *ff_resman_get_string(FFResourceId resource_id) } } - if (!resource_definition.name) { - av_log(ctx, AV_LOG_ERROR, "Unable to find resource with ID %d\n", resource_id); - return NULL; - } + av_assert1(resource_definition.name); ff_mutex_lock(&mutex); @@ -194,13 +159,13 @@ char *ff_resman_get_string(FFResourceId resource_id) int ret = decompress_gzip(ctx, (uint8_t *)resource_definition.data, *resource_definition.data_len, &out, &out_len); if (ret) { - av_log(NULL, AV_LOG_ERROR, "Unable to decompress the resource with ID %d\n", resource_id); + av_log(ctx, AV_LOG_ERROR, "Unable to decompress the resource with ID %d\n", resource_id); goto end; } dict_ret = av_dict_set(&ctx->resource_dic, resource_definition.name, out, 0); if (dict_ret < 0) { - av_log(NULL, AV_LOG_ERROR, "Failed to store decompressed resource in dictionary: %d\n", dict_ret); + av_log(ctx, AV_LOG_ERROR, "Failed to store decompressed resource in dictionary: %d\n", dict_ret); av_freep(&out); goto end; } 
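The resman.c hunks above drop the lazily heap-allocated singleton in favour of a statically initialized context, and keep caching each (possibly gzip-compressed) resource string in a dictionary guarded by a mutex. A minimal standalone sketch of that lookup-or-populate pattern follows; it is an illustration, not FFmpeg code: plain C with POSIX threads, and res_get_string(), res_uninit(), expand_resource() plus the fixed-size array are hypothetical stand-ins for ff_resman_get_string(), ff_resman_uninit(), decompress_gzip() and the AVDictionary.

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    /* Statically initialized context: no lazy allocation, nothing to NULL-check. */
    typedef struct ResCache {
        pthread_mutex_t lock;
        char *entries[4];           /* cached strings, indexed by resource id */
    } ResCache;

    static ResCache cache = { .lock = PTHREAD_MUTEX_INITIALIZER };

    /* Stand-in for the real per-resource expansion step (gzip in the patch). */
    static char *expand_resource(int id)
    {
        static const char *raw[4] = { "graph.css", "graph.html", "style.min", "page.min" };
        return strdup(raw[id]);
    }

    /* Return the cached string for id, populating the cache on first use. */
    const char *res_get_string(int id)
    {
        const char *ret;

        if (id < 0 || id >= 4)
            return NULL;

        pthread_mutex_lock(&cache.lock);
        if (!cache.entries[id])
            cache.entries[id] = expand_resource(id);   /* expand once, keep it */
        ret = cache.entries[id];
        pthread_mutex_unlock(&cache.lock);

        return ret;
    }

    /* Called once at shutdown, analogous to ff_resman_uninit() in the patch. */
    void res_uninit(void)
    {
        pthread_mutex_lock(&cache.lock);
        for (int i = 0; i < 4; i++) {
            free(cache.entries[i]);
            cache.entries[i] = NULL;
        }
        pthread_mutex_unlock(&cache.lock);
    }

Static initialization removes the allocation-failure path the old get_resman_context() had to handle, and the uninit routine stays safe to call even if no resource was ever requested, which is why the patch can invoke ff_resman_uninit() unconditionally from print_filtergraphs().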
@@ -210,7 +175,7 @@ char *ff_resman_get_string(FFResourceId resource_id) dict_ret = av_dict_set(&ctx->resource_dic, resource_definition.name, (const char *)resource_definition.data, 0); if (dict_ret < 0) { - av_log(NULL, AV_LOG_ERROR, "Failed to store resource in dictionary: %d\n", dict_ret); + av_log(ctx, AV_LOG_ERROR, "Failed to store resource in dictionary: %d\n", dict_ret); goto end; } @@ -218,7 +183,7 @@ char *ff_resman_get_string(FFResourceId resource_id) dic_entry = av_dict_get(ctx->resource_dic, resource_definition.name, NULL, 0); if (!dic_entry) { - av_log(NULL, AV_LOG_ERROR, "Failed to retrieve resource from dictionary after storing it\n"); + av_log(ctx, AV_LOG_ERROR, "Failed to retrieve resource from dictionary after storing it\n"); goto end; } } diff --git a/fftools/textformat/avtextformat.c b/fftools/textformat/avtextformat.c index bb90e66918ef4..14779e6f0cb26 100644 --- a/fftools/textformat/avtextformat.c +++ b/fftools/textformat/avtextformat.c @@ -43,8 +43,8 @@ static const struct { double bin_val; double dec_val; - const char *bin_str; - const char *dec_str; + char bin_str[4]; + char dec_str[4]; } si_prefixes[] = { { 1.0, 1.0, "", "" }, { 1.024e3, 1e3, "Ki", "K" }, @@ -681,34 +681,28 @@ int avtextwriter_context_open(AVTextWriterContext **pwctx, const AVTextWriter *w return ret; } -static const AVTextFormatter *registered_formatters[9 + 1]; - -static void formatters_register_all(void) +static const AVTextFormatter *const registered_formatters[] = { - static int initialized; - - if (initialized) - return; - initialized = 1; - - registered_formatters[0] = &avtextformatter_default; - registered_formatters[1] = &avtextformatter_compact; - registered_formatters[2] = &avtextformatter_csv; - registered_formatters[3] = &avtextformatter_flat; - registered_formatters[4] = &avtextformatter_ini; - registered_formatters[5] = &avtextformatter_json; - registered_formatters[6] = &avtextformatter_xml; - registered_formatters[7] = &avtextformatter_mermaid; - registered_formatters[8] = &avtextformatter_mermaidhtml; -} + &avtextformatter_default, + &avtextformatter_compact, + &avtextformatter_csv, + &avtextformatter_flat, + &avtextformatter_ini, + &avtextformatter_json, + &avtextformatter_xml, + &avtextformatter_mermaid, + &avtextformatter_mermaidhtml, + NULL +}; const AVTextFormatter *avtext_get_formatter_by_name(const char *name) { - formatters_register_all(); - - for (int i = 0; registered_formatters[i]; i++) - if (!strcmp(registered_formatters[i]->name, name)) + for (int i = 0; registered_formatters[i]; i++) { + const char *end; + if (av_strstart(name, registered_formatters[i]->name, &end) && + (*end == '\0' || *end == '=')) return registered_formatters[i]; + } return NULL; } diff --git a/fftools/textformat/tf_mermaid.c b/fftools/textformat/tf_mermaid.c index 6147cf6eeabb8..d3b9131adad93 100644 --- a/fftools/textformat/tf_mermaid.c +++ b/fftools/textformat/tf_mermaid.c @@ -153,7 +153,6 @@ typedef struct MermaidContext { } section_data[SECTION_MAX_NB_LEVELS]; unsigned nb_link_captions[SECTION_MAX_NB_LEVELS]; ///< generic print buffer dedicated to each section, - AVBPrint section_pbuf[SECTION_MAX_NB_LEVELS]; ///< generic print buffer dedicated to each section, AVBPrint link_buf; ///< print buffer for writing diagram links AVDictionary *link_dict; } MermaidContext; @@ -216,6 +215,32 @@ static av_cold int mermaid_init_html(AVTextFormatContext *tfc) return 0; } +static av_cold int mermaid_uninit(AVTextFormatContext *tfc) +{ + MermaidContext *mmc = tfc->priv; + + 
av_bprint_finalize(&mmc->link_buf, NULL); + av_dict_free(&mmc->link_dict); + + for (unsigned i = 0; i < SECTION_MAX_NB_LEVELS; i++) { + av_freep(&mmc->section_data[i].dest_id); + av_freep(&mmc->section_data[i].section_id); + av_freep(&mmc->section_data[i].src_id); + av_freep(&mmc->section_data[i].section_type); + } + + return 0; +} + +static void set_str(const char **dst, const char *src) +{ + if (*dst) + av_freep(dst); + + if (src) + *dst = av_strdup(src); +} + #define MM_INDENT() writer_printf(tfc, "%*c", mmc->indent_level * 2, ' ') static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *data) @@ -266,6 +291,8 @@ static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *d break; } + av_bprint_finalize(&css_buf, NULL); + av_freep(&directive); return; } @@ -310,7 +337,7 @@ static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *d } mmc->section_data[tfc->level].subgraph_start_incomplete = 1; - mmc->section_data[tfc->level].section_id = av_strdup(sec_ctx->context_id); + set_str(&mmc->section_data[tfc->level].section_id, sec_ctx->context_id); } if (section->flags & AV_TEXTFORMAT_SECTION_FLAG_IS_SHAPE) { @@ -322,7 +349,7 @@ static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *d if (sec_ctx->context_id) { - mmc->section_data[tfc->level].section_id = av_strdup(sec_ctx->context_id); + set_str(&mmc->section_data[tfc->level].section_id, sec_ctx->context_id); switch (mmc->diagram_config->diagram_type) { case AV_DIAGRAMTYPE_GRAPH: @@ -352,7 +379,7 @@ static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *d av_log(tfc, AV_LOG_ERROR, "Unable to write shape start. Missing id field. Section: %s", section->name); } - mmc->section_data[tfc->level].section_id = av_strdup(sec_ctx->context_id); + set_str(&mmc->section_data[tfc->level].section_id, sec_ctx->context_id); } @@ -371,7 +398,7 @@ static void mermaid_print_section_header(AVTextFormatContext *tfc, const void *d mmc->nb_link_captions[tfc->level] = 0; if (sec_ctx && sec_ctx->context_type) - mmc->section_data[tfc->level].section_type = av_strdup(sec_ctx->context_type); + set_str(&mmc->section_data[tfc->level].section_type, sec_ctx->context_type); ////if (section->flags & AV_TEXTFORMAT_SECTION_FLAG_HAS_TYPE) { //// AVBPrint buf; @@ -533,17 +560,17 @@ static void mermaid_print_value(AVTextFormatContext *tfc, const char *key, int exit = 0; if (section->id_key && !strcmp(section->id_key, key)) { - mmc->section_data[tfc->level].section_id = av_strdup(str); + set_str(&mmc->section_data[tfc->level].section_id, str); exit = 1; } if (section->dest_id_key && !strcmp(section->dest_id_key, key)) { - mmc->section_data[tfc->level].dest_id = av_strdup(str); + set_str(&mmc->section_data[tfc->level].dest_id, str); exit = 1; } if (section->src_id_key && !strcmp(section->src_id_key, key)) { - mmc->section_data[tfc->level].src_id = av_strdup(str); + set_str(&mmc->section_data[tfc->level].src_id, str); exit = 1; } @@ -636,6 +663,7 @@ const AVTextFormatter avtextformatter_mermaid = { .name = "mermaid", .priv_size = sizeof(MermaidContext), .init = mermaid_init, + .uninit = mermaid_uninit, .print_section_header = mermaid_print_section_header, .print_section_footer = mermaid_print_section_footer, .print_integer = mermaid_print_int, @@ -649,6 +677,7 @@ const AVTextFormatter avtextformatter_mermaidhtml = { .name = "mermaidhtml", .priv_size = sizeof(MermaidContext), .init = mermaid_init_html, + .uninit = mermaid_uninit, .print_section_header = 
mermaid_print_section_header, .print_section_footer = mermaid_print_section_footer, .print_integer = mermaid_print_int, diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c index 94f42681272e3..c6b2ce1230476 100644 --- a/libavcodec/4xm.c +++ b/libavcodec/4xm.c @@ -337,7 +337,8 @@ static inline void mcdc(uint16_t *dst, const uint16_t *src, int log2w, } break; default: - av_assert0(0); + av_unreachable("log2w starts at 3 and gets only decremented during " + "recursive calls to decode_p_block"); } } diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 77734dff24585..fb3fb7f7f72de 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -42,6 +42,7 @@ OBJS = ac3_parser.o \ dv_profile.o \ encode.o \ get_buffer.o \ + hashtable.o \ imgconvert.o \ jni.o \ lcevcdec.o \ @@ -811,6 +812,7 @@ OBJS-$(CONFIG_VP9_V4L2M2M_DECODER) += v4l2_m2m_dec.o OBJS-$(CONFIG_VQA_DECODER) += vqavideo.o OBJS-$(CONFIG_VQC_DECODER) += vqcdec.o OBJS-$(CONFIG_VVC_DECODER) += executor.o h2645data.o +OBJS-$(CONFIG_VVC_SEI) += h2645_sei.o aom_film_grain.o h274.o OBJS-$(CONFIG_WADY_DPCM_DECODER) += dpcm.o OBJS-$(CONFIG_WAVARC_DECODER) += wavarc.o OBJS-$(CONFIG_WAVPACK_DECODER) += wavpack.o wavpackdata.o dsd.o @@ -1325,6 +1327,7 @@ TESTPROGS = avcodec \ bitstream_le \ celp_math \ codec_desc \ + hashtable \ htmlsubtitles \ jpeg2000dwt \ mathops \ diff --git a/libavcodec/aac/aacdec_ac.c b/libavcodec/aac/aacdec_ac.c index 7e5077cd19d8e..5104604fa580a 100644 --- a/libavcodec/aac/aacdec_ac.c +++ b/libavcodec/aac/aacdec_ac.c @@ -91,10 +91,7 @@ uint32_t ff_aac_ac_get_pk(uint32_t c) void ff_aac_ac_update_context(AACArithState *state, int idx, uint16_t a, uint16_t b) { - state->cur[0] = a + b + 1; - if (state->cur[0] > 0xF) - state->cur[0] = 0xF; - + state->cur[0] = FFMIN(a + b + 1, 0xF); state->cur[3] = state->cur[2]; state->cur[2] = state->cur[1]; state->cur[1] = state->cur[0]; diff --git a/libavcodec/aarch64/pixblockdsp_init_aarch64.c b/libavcodec/aarch64/pixblockdsp_init_aarch64.c index e4bac722f88f3..404f3680a619d 100644 --- a/libavcodec/aarch64/pixblockdsp_init_aarch64.c +++ b/libavcodec/aarch64/pixblockdsp_init_aarch64.c @@ -21,7 +21,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/aarch64/cpu.h" -#include "libavcodec/avcodec.h" #include "libavcodec/pixblockdsp.h" void ff_get_pixels_neon(int16_t *block, const uint8_t *pixels, @@ -30,7 +29,6 @@ void ff_diff_pixels_neon(int16_t *block, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride); av_cold void ff_pixblockdsp_init_aarch64(PixblockDSPContext *c, - AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h index 2386c15ad00a3..ccd437f700ab4 100644 --- a/libavcodec/ac3.h +++ b/libavcodec/ac3.h @@ -81,17 +81,6 @@ typedef float SHORTFLOAT; #define AC3_LEVEL(x) ROUND15((x) * FIXR15(M_SQRT1_2)) -/* pre-defined gain values */ -#define LEVEL_PLUS_3DB M_SQRT2 -#define LEVEL_PLUS_1POINT5DB 1.1892071150027209 -#define LEVEL_MINUS_1POINT5DB 0.8408964152537145 -#define LEVEL_MINUS_3DB M_SQRT1_2 -#define LEVEL_MINUS_4POINT5DB 0.5946035575013605 -#define LEVEL_MINUS_6DB 0.5000000000000000 -#define LEVEL_MINUS_9DB 0.3535533905932738 -#define LEVEL_ZERO 0.0000000000000000 -#define LEVEL_ONE 1.0000000000000000 - typedef struct AC3BitAllocParameters { int sr_code; int sr_shift; diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c index 49b170c235084..5eacab44751ad 100644 --- a/libavcodec/ac3dec.c +++ b/libavcodec/ac3dec.c @@ -46,142 +46,32 @@ #include "decode.h" #include 
"kbdwin.h" -/** - * table for ungrouping 3 values in 7 bits. - * used for exponents and bap=2 mantissas - */ -static uint8_t ungroup_3_in_7_bits_tab[128][3]; - -/** tables for ungrouping mantissas */ -static int b1_mantissas[32][3]; -static int b2_mantissas[128][3]; -static int b3_mantissas[8]; -static int b4_mantissas[128][2]; -static int b5_mantissas[16]; - -/** - * Quantization table: levels for symmetric. bits for asymmetric. - * reference: Table 7.18 Mapping of bap to Quantizer - */ -static const uint8_t quantization_tab[16] = { - 0, 3, 5, 7, 11, 15, - 5, 6, 7, 8, 9, 10, 11, 12, 14, 16 -}; - #if (!USE_FIXED) /** dynamic range table. converts codes to scale factors. */ static float dynamic_range_tab[256]; float ff_ac3_heavy_dynamic_range_tab[256]; -#endif - -/** Adjustments in dB gain */ -static const float gain_levels[9] = { - LEVEL_PLUS_3DB, - LEVEL_PLUS_1POINT5DB, - LEVEL_ONE, - LEVEL_MINUS_1POINT5DB, - LEVEL_MINUS_3DB, - LEVEL_MINUS_4POINT5DB, - LEVEL_MINUS_6DB, - LEVEL_ZERO, - LEVEL_MINUS_9DB -}; - -/** Adjustments in dB gain (LFE, +10 to -21 dB) */ -static const float gain_levels_lfe[32] = { - 3.162275, 2.818382, 2.511886, 2.238719, 1.995261, 1.778278, 1.584893, - 1.412536, 1.258924, 1.122018, 1.000000, 0.891251, 0.794328, 0.707946, - 0.630957, 0.562341, 0.501187, 0.446683, 0.398107, 0.354813, 0.316227, - 0.281838, 0.251188, 0.223872, 0.199526, 0.177828, 0.158489, 0.141253, - 0.125892, 0.112201, 0.100000, 0.089125 -}; - -/** - * Table for default stereo downmixing coefficients - * reference: Section 7.8.2 Downmixing Into Two Channels - */ -static const uint8_t ac3_default_coeffs[8][5][2] = { - { { 2, 7 }, { 7, 2 }, }, - { { 4, 4 }, }, - { { 2, 7 }, { 7, 2 }, }, - { { 2, 7 }, { 5, 5 }, { 7, 2 }, }, - { { 2, 7 }, { 7, 2 }, { 6, 6 }, }, - { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 8, 8 }, }, - { { 2, 7 }, { 7, 2 }, { 6, 7 }, { 7, 6 }, }, - { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 6, 7 }, { 7, 6 }, }, -}; - -/** - * Symmetrical Dequantization - * reference: Section 7.3.3 Expansion of Mantissas for Symmetrical Quantization - * Tables 7.19 to 7.23 - */ -static inline int -symmetric_dequant(int code, int levels) -{ - return ((code - (levels >> 1)) * (1 << 24)) / levels; -} /* * Initialize tables at runtime. 
*/ -static av_cold void ac3_tables_init(void) +static av_cold void ac3_float_tables_init(void) { - int i; - - /* generate table for ungrouping 3 values in 7 bits - reference: Section 7.1.3 Exponent Decoding */ - for (i = 0; i < 128; i++) { - ungroup_3_in_7_bits_tab[i][0] = i / 25; - ungroup_3_in_7_bits_tab[i][1] = (i % 25) / 5; - ungroup_3_in_7_bits_tab[i][2] = (i % 25) % 5; - } - - /* generate grouped mantissa tables - reference: Section 7.3.5 Ungrouping of Mantissas */ - for (i = 0; i < 32; i++) { - /* bap=1 mantissas */ - b1_mantissas[i][0] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][0], 3); - b1_mantissas[i][1] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][1], 3); - b1_mantissas[i][2] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][2], 3); - } - for (i = 0; i < 128; i++) { - /* bap=2 mantissas */ - b2_mantissas[i][0] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][0], 5); - b2_mantissas[i][1] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][1], 5); - b2_mantissas[i][2] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][2], 5); - - /* bap=4 mantissas */ - b4_mantissas[i][0] = symmetric_dequant(i / 11, 11); - b4_mantissas[i][1] = symmetric_dequant(i % 11, 11); - } - /* generate ungrouped mantissa tables - reference: Tables 7.21 and 7.23 */ - for (i = 0; i < 7; i++) { - /* bap=3 mantissas */ - b3_mantissas[i] = symmetric_dequant(i, 7); - } - for (i = 0; i < 15; i++) { - /* bap=5 mantissas */ - b5_mantissas[i] = symmetric_dequant(i, 15); - } - -#if (!USE_FIXED) /* generate dynamic range table reference: Section 7.7.1 Dynamic Range Control */ - for (i = 0; i < 256; i++) { + for (int i = 0; i < 256; i++) { int v = (i >> 5) - ((i >> 7) << 3) - 5; dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0x1F) | 0x20); } /* generate compr dynamic range table reference: Section 7.7.2 Heavy Compression */ - for (i = 0; i < 256; i++) { + for (int i = 0; i < 256; i++) { int v = (i >> 4) - ((i >> 7) << 4) - 4; ff_ac3_heavy_dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0xF) | 0x10); } -#endif + ff_ac3_init_static(); } +#endif static void ac3_downmix(AVCodecContext *avctx) { @@ -206,7 +96,6 @@ static void ac3_downmix(AVCodecContext *avctx) */ static av_cold int ac3_decode_init(AVCodecContext *avctx) { - static AVOnce init_static_once = AV_ONCE_INIT; AC3DecodeContext *s = avctx->priv_data; const float scale = 1.0f; int i, ret; @@ -247,7 +136,12 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) s->dlyptr[i] = s->delay[i]; } - ff_thread_once(&init_static_once, ac3_tables_init); +#if USE_FIXED + ff_ac3_init_static(); +#else + static AVOnce init_static_once = AV_ONCE_INIT; + ff_thread_once(&init_static_once, ac3_float_tables_init); +#endif return 0; } @@ -404,8 +298,8 @@ static int parse_frame_header(AC3DecodeContext *s) static int set_downmix_coeffs(AC3DecodeContext *s) { int i; - float cmix = gain_levels[s-> center_mix_level]; - float smix = gain_levels[s->surround_mix_level]; + float cmix = ff_ac3_gain_levels[s-> center_mix_level]; + float smix = ff_ac3_gain_levels[s->surround_mix_level]; float norm0, norm1; float downmix_coeffs[2][AC3_MAX_CHANNELS]; @@ -418,8 +312,8 @@ static int set_downmix_coeffs(AC3DecodeContext *s) } for (i = 0; i < s->fbw_channels; i++) { - downmix_coeffs[0][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]]; - downmix_coeffs[1][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]]; + downmix_coeffs[0][i] = ff_ac3_gain_levels[ff_ac3_default_coeffs[s->channel_mode][i][0]]; + downmix_coeffs[1][i] = 
ff_ac3_gain_levels[ff_ac3_default_coeffs[s->channel_mode][i][1]]; } if (s->channel_mode > 1 && s->channel_mode & 1) { downmix_coeffs[0][1] = downmix_coeffs[1][1] = cmix; @@ -479,9 +373,9 @@ static int decode_exponents(AC3DecodeContext *s, av_log(s->avctx, AV_LOG_ERROR, "expacc %d is out-of-range\n", expacc); return AVERROR_INVALIDDATA; } - dexp[i++] = ungroup_3_in_7_bits_tab[expacc][0]; - dexp[i++] = ungroup_3_in_7_bits_tab[expacc][1]; - dexp[i++] = ungroup_3_in_7_bits_tab[expacc][2]; + dexp[i++] = ff_ac3_ungroup_3_in_7_bits_tab[expacc][0]; + dexp[i++] = ff_ac3_ungroup_3_in_7_bits_tab[expacc][1]; + dexp[i++] = ff_ac3_ungroup_3_in_7_bits_tab[expacc][2]; } /* convert to absolute exps and expand groups */ @@ -576,9 +470,9 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma mantissa = m->b1_mant[m->b1]; } else { int bits = get_bits(gbc, 5); - mantissa = b1_mantissas[bits][0]; - m->b1_mant[1] = b1_mantissas[bits][1]; - m->b1_mant[0] = b1_mantissas[bits][2]; + mantissa = ff_ac3_bap1_mantissas[bits][0]; + m->b1_mant[1] = ff_ac3_bap1_mantissas[bits][1]; + m->b1_mant[0] = ff_ac3_bap1_mantissas[bits][2]; m->b1 = 2; } break; @@ -588,14 +482,14 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma mantissa = m->b2_mant[m->b2]; } else { int bits = get_bits(gbc, 7); - mantissa = b2_mantissas[bits][0]; - m->b2_mant[1] = b2_mantissas[bits][1]; - m->b2_mant[0] = b2_mantissas[bits][2]; + mantissa = ff_ac3_bap2_mantissas[bits][0]; + m->b2_mant[1] = ff_ac3_bap2_mantissas[bits][1]; + m->b2_mant[0] = ff_ac3_bap2_mantissas[bits][2]; m->b2 = 2; } break; case 3: - mantissa = b3_mantissas[get_bits(gbc, 3)]; + mantissa = ff_ac3_bap3_mantissas[get_bits(gbc, 3)]; break; case 4: if (m->b4) { @@ -603,13 +497,13 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma mantissa = m->b4_mant; } else { int bits = get_bits(gbc, 7); - mantissa = b4_mantissas[bits][0]; - m->b4_mant = b4_mantissas[bits][1]; + mantissa = ff_ac3_bap4_mantissas[bits][0]; + m->b4_mant = ff_ac3_bap4_mantissas[bits][1]; m->b4 = 1; } break; case 5: - mantissa = b5_mantissas[get_bits(gbc, 4)]; + mantissa = ff_ac3_bap5_mantissas[get_bits(gbc, 4)]; break; default: /* 6 to 15 */ /* Shift mantissa and sign-extend it. 
*/ @@ -617,7 +511,7 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma av_log(s->avctx, AV_LOG_ERROR, "bap %d is invalid in plain AC-3\n", bap); bap = 15; } - mantissa = (unsigned)get_sbits(gbc, quantization_tab[bap]) << (24 - quantization_tab[bap]); + mantissa = (unsigned)get_sbits(gbc, ff_ac3_quantization_tab[bap]) << (24 - ff_ac3_quantization_tab[bap]); break; } coeffs[freq] = mantissa >> exps[freq]; @@ -1620,10 +1514,10 @@ static int ac3_decode_frame(AVCodecContext *avctx, AVFrame *frame, s->output_mode = AC3_CHMODE_STEREO; } - s->loro_center_mix_level = gain_levels[s-> center_mix_level]; - s->loro_surround_mix_level = gain_levels[s->surround_mix_level]; - s->ltrt_center_mix_level = gain_levels[s-> center_mix_level_ltrt]; - s->ltrt_surround_mix_level = gain_levels[s->surround_mix_level_ltrt]; + s->loro_center_mix_level = ff_ac3_gain_levels[s-> center_mix_level]; + s->loro_surround_mix_level = ff_ac3_gain_levels[s->surround_mix_level]; + s->ltrt_center_mix_level = ff_ac3_gain_levels[s-> center_mix_level_ltrt]; + s->ltrt_surround_mix_level = ff_ac3_gain_levels[s->surround_mix_level_ltrt]; switch (s->preferred_downmix) { case AC3_DMIXMOD_LTRT: s->preferred_stereo_downmix = AV_DOWNMIX_TYPE_LTRT; @@ -1862,12 +1756,12 @@ static int ac3_decode_frame(AVCodecContext *avctx, AVFrame *frame, downmix_info->preferred_downmix_type = AV_DOWNMIX_TYPE_UNKNOWN; break; } - downmix_info->center_mix_level = gain_levels[s-> center_mix_level]; - downmix_info->center_mix_level_ltrt = gain_levels[s-> center_mix_level_ltrt]; - downmix_info->surround_mix_level = gain_levels[s-> surround_mix_level]; - downmix_info->surround_mix_level_ltrt = gain_levels[s->surround_mix_level_ltrt]; + downmix_info->center_mix_level = ff_ac3_gain_levels[s-> center_mix_level]; + downmix_info->center_mix_level_ltrt = ff_ac3_gain_levels[s-> center_mix_level_ltrt]; + downmix_info->surround_mix_level = ff_ac3_gain_levels[s-> surround_mix_level]; + downmix_info->surround_mix_level_ltrt = ff_ac3_gain_levels[s->surround_mix_level_ltrt]; if (s->lfe_mix_level_exists) - downmix_info->lfe_mix_level = gain_levels_lfe[s->lfe_mix_level]; + downmix_info->lfe_mix_level = ff_eac3_gain_levels_lfe[s->lfe_mix_level]; else downmix_info->lfe_mix_level = 0.0; // -inf dB } diff --git a/libavcodec/ac3dec_data.c b/libavcodec/ac3dec_data.c index a3794ab223d12..0f5402c335670 100644 --- a/libavcodec/ac3dec_data.c +++ b/libavcodec/ac3dec_data.c @@ -21,10 +21,11 @@ /** * @file - * Tables taken directly from the AC-3 spec. + * Tables taken directly from the AC-3 spec or derived from it. */ #include "ac3dec_data.h" +#include "libavutil/thread.h" /** * Table used to ungroup 3 values stored in 5 bits. @@ -42,6 +43,124 @@ const uint8_t ff_ac3_ungroup_3_in_5_bits_tab[32][3] = { { 3, 0, 1 }, { 3, 0, 2 }, { 3, 1, 0 }, { 3, 1, 1 } }; +/** + * table for ungrouping 3 values in 7 bits. 
+ * used for exponents and bap=2 mantissas + */ +uint8_t ff_ac3_ungroup_3_in_7_bits_tab[128][3]; + +/** + * Symmetrical Dequantization + * reference: Section 7.3.3 Expansion of Mantissas for Symmetrical Quantization + * Tables 7.19 to 7.23 + */ +#define SYMMETRIC_DEQUANT(code, levels) (((code - (levels >> 1)) * (1 << 24)) / levels) +/** + * Ungrouped mantissa tables; the extra entry is padding to avoid range checks + */ +/** + * Table 7.21 + */ +const int ff_ac3_bap3_mantissas[7 + 1] = { + SYMMETRIC_DEQUANT(0, 7), + SYMMETRIC_DEQUANT(1, 7), + SYMMETRIC_DEQUANT(2, 7), + SYMMETRIC_DEQUANT(3, 7), + SYMMETRIC_DEQUANT(4, 7), + SYMMETRIC_DEQUANT(5, 7), + SYMMETRIC_DEQUANT(6, 7), +}; +/** + * Table 7.23 + */ +const int ff_ac3_bap5_mantissas[15 + 1] = { + SYMMETRIC_DEQUANT(0, 15), + SYMMETRIC_DEQUANT(1, 15), + SYMMETRIC_DEQUANT(2, 15), + SYMMETRIC_DEQUANT(3, 15), + SYMMETRIC_DEQUANT(4, 15), + SYMMETRIC_DEQUANT(5, 15), + SYMMETRIC_DEQUANT(6, 15), + SYMMETRIC_DEQUANT(7, 15), + SYMMETRIC_DEQUANT(8, 15), + SYMMETRIC_DEQUANT(9, 15), + SYMMETRIC_DEQUANT(10, 15), + SYMMETRIC_DEQUANT(11, 15), + SYMMETRIC_DEQUANT(12, 15), + SYMMETRIC_DEQUANT(13, 15), + SYMMETRIC_DEQUANT(14, 15), +}; + +int ff_ac3_bap1_mantissas[32][3]; +int ff_ac3_bap2_mantissas[128][3]; +int ff_ac3_bap4_mantissas[128][2]; + +static inline int +symmetric_dequant(int code, int levels) +{ + return SYMMETRIC_DEQUANT(code, levels); +} + +static av_cold void ac3_init_static(void) +{ + /* generate table for ungrouping 3 values in 7 bits + reference: Section 7.1.3 Exponent Decoding */ + for (int i = 0; i < 128; ++i) { + ff_ac3_ungroup_3_in_7_bits_tab[i][0] = i / 25; + ff_ac3_ungroup_3_in_7_bits_tab[i][1] = (i % 25) / 5; + ff_ac3_ungroup_3_in_7_bits_tab[i][2] = (i % 25) % 5; + } + + /* generate grouped mantissa tables + reference: Section 7.3.5 Ungrouping of Mantissas */ + for (int i = 0; i < 32; ++i) { + /* bap=1 mantissas */ + ff_ac3_bap1_mantissas[i][0] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][0], 3); + ff_ac3_bap1_mantissas[i][1] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][1], 3); + ff_ac3_bap1_mantissas[i][2] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][2], 3); + } + for (int i = 0; i < 128; ++i) { + /* bap=2 mantissas */ + ff_ac3_bap2_mantissas[i][0] = symmetric_dequant(ff_ac3_ungroup_3_in_7_bits_tab[i][0], 5); + ff_ac3_bap2_mantissas[i][1] = symmetric_dequant(ff_ac3_ungroup_3_in_7_bits_tab[i][1], 5); + ff_ac3_bap2_mantissas[i][2] = symmetric_dequant(ff_ac3_ungroup_3_in_7_bits_tab[i][2], 5); + + /* bap=4 mantissas */ + ff_ac3_bap4_mantissas[i][0] = symmetric_dequant(i / 11, 11); + ff_ac3_bap4_mantissas[i][1] = symmetric_dequant(i % 11, 11); + } +} + +av_cold void ff_ac3_init_static(void) +{ + static AVOnce ac3_init_static_once = AV_ONCE_INIT; + ff_thread_once(&ac3_init_static_once, ac3_init_static); +} + +/** + * Quantization table: levels for symmetric. bits for asymmetric. 
+ * reference: Table 7.18 Mapping of bap to Quantizer
+ */
+const uint8_t ff_ac3_quantization_tab[16] = {
+    0, 3, 5, 7, 11, 15,
+    5, 6, 7, 8, 9, 10, 11, 12, 14, 16
+};
+
+/**
+ * Table for default stereo downmixing coefficients
+ * reference: Section 7.8.2 Downmixing Into Two Channels
+ */
+const uint8_t ff_ac3_default_coeffs[8][5][2] = {
+    { { 2, 7 }, { 7, 2 }, },
+    { { 4, 4 }, },
+    { { 2, 7 }, { 7, 2 }, },
+    { { 2, 7 }, { 5, 5 }, { 7, 2 }, },
+    { { 2, 7 }, { 7, 2 }, { 6, 6 }, },
+    { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 8, 8 }, },
+    { { 2, 7 }, { 7, 2 }, { 6, 7 }, { 7, 6 }, },
+    { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 6, 7 }, { 7, 6 }, },
+};
+
 const uint8_t ff_eac3_hebap_tab[64] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
     8, 8, 9, 9, 9, 10, 10, 10, 10, 11,
@@ -57,3 +176,12 @@ const uint8_t ff_eac3_hebap_tab[64] = {
  */
 const uint8_t ff_eac3_default_spx_band_struct[17] =
     { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
+
+/** Adjustments in dB gain (LFE, +10 to -21 dB) */
+const float ff_eac3_gain_levels_lfe[32] = {
+    3.162275, 2.818382, 2.511886, 2.238719, 1.995261, 1.778278, 1.584893,
+    1.412536, 1.258924, 1.122018, 1.000000, 0.891251, 0.794328, 0.707946,
+    0.630957, 0.562341, 0.501187, 0.446683, 0.398107, 0.354813, 0.316227,
+    0.281838, 0.251188, 0.223872, 0.199526, 0.177828, 0.158489, 0.141253,
+    0.125892, 0.112201, 0.100000, 0.089125
+};
diff --git a/libavcodec/ac3dec_data.h b/libavcodec/ac3dec_data.h
index 975b52ef2cb48..613871627bea0 100644
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -24,9 +24,31 @@
 
 #include <stdint.h>
 
+#include "libavutil/attributes_internal.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+
 extern const uint8_t ff_ac3_ungroup_3_in_5_bits_tab[32][3];
+extern uint8_t ff_ac3_ungroup_3_in_7_bits_tab[128][3];
+
+extern const int ff_ac3_bap3_mantissas[ 7 + 1];
+extern const int ff_ac3_bap5_mantissas[15 + 1];
+
+/** tables for ungrouping mantissas */
+extern int ff_ac3_bap1_mantissas[32][3];
+extern int ff_ac3_bap2_mantissas[128][3];
+extern int ff_ac3_bap4_mantissas[128][2];
+
+extern const uint8_t ff_ac3_quantization_tab[16];
+
+extern const uint8_t ff_ac3_default_coeffs[8][5][2];
 
 extern const uint8_t ff_eac3_hebap_tab[64];
 extern const uint8_t ff_eac3_default_spx_band_struct[17];
+extern const float ff_eac3_gain_levels_lfe[32];
+
+void ff_ac3_init_static(void);
+
+FF_VISIBILITY_POP_HIDDEN
 
 #endif /* AVCODEC_AC3DEC_DATA_H */
diff --git a/libavcodec/ac3defs.h b/libavcodec/ac3defs.h
index ff92f0ac4ab99..f9b1be059faa4 100644
--- a/libavcodec/ac3defs.h
+++ b/libavcodec/ac3defs.h
@@ -34,6 +34,17 @@
 #define AC3_CRITICAL_BANDS 50
 #define AC3_MAX_CPL_BANDS  18
 
+/* pre-defined gain values */
+#define LEVEL_PLUS_3DB          M_SQRT2
+#define LEVEL_PLUS_1POINT5DB    1.1892071150027209
+#define LEVEL_MINUS_1POINT5DB   0.8408964152537145
+#define LEVEL_MINUS_3DB         M_SQRT1_2
+#define LEVEL_MINUS_4POINT5DB   0.5946035575013605
+#define LEVEL_MINUS_6DB         0.5000000000000000
+#define LEVEL_MINUS_9DB         0.3535533905932738
+#define LEVEL_ZERO              0.0000000000000000
+#define LEVEL_ONE               1.0000000000000000
+
 /* exponent encoding strategy */
 #define EXP_REUSE 0
 #define EXP_NEW   1
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 3649289865213..a316d4e4d745e 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -71,10 +71,7 @@ static const float surmixlev_options[SURMIXLEV_NUM_OPTIONS] = {
 };
 
 #define EXTMIXLEV_NUM_OPTIONS 8
-static const float extmixlev_options[EXTMIXLEV_NUM_OPTIONS] = {
-    LEVEL_PLUS_3DB, LEVEL_PLUS_1POINT5DB, LEVEL_ONE, LEVEL_MINUS_1POINT5DB,
-    LEVEL_MINUS_3DB, LEVEL_MINUS_4POINT5DB, LEVEL_MINUS_6DB,
LEVEL_ZERO -}; +#define extmixlev_options ff_ac3_gain_levels /* The first two options apply only to the AC-3 encoders; * the rest is also valid for EAC-3. When modifying it, @@ -1638,6 +1635,8 @@ static void ac3_output_frame_header(AC3EncodeContext *s, PutBitContext *pb) { AC3EncOptions *opt = &s->options; + put_bits_assume_flushed(pb); + put_bits(pb, 16, 0x0b77); /* frame header */ put_bits(pb, 16, 0); /* crc1: will be filled later */ put_bits(pb, 2, s->bit_alloc.sr_code); diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c index 48c89a8ba007d..b38e7237b3479 100644 --- a/libavcodec/ac3tab.c +++ b/libavcodec/ac3tab.c @@ -25,6 +25,7 @@ */ #include "libavutil/channel_layout.h" +#include "libavutil/mathematics.h" #include "ac3tab.h" @@ -147,6 +148,19 @@ const uint16_t ff_ac3_fast_gain_tab[8]= { 0x080, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, }; +/** Adjustments in dB gain */ +const float ff_ac3_gain_levels[9] = { + LEVEL_PLUS_3DB, + LEVEL_PLUS_1POINT5DB, + LEVEL_ONE, + LEVEL_MINUS_1POINT5DB, + LEVEL_MINUS_3DB, + LEVEL_MINUS_4POINT5DB, + LEVEL_MINUS_6DB, + LEVEL_ZERO, + LEVEL_MINUS_9DB +}; + const uint64_t ff_eac3_custom_channel_map_locations[16][2] = { { 1, AV_CH_FRONT_LEFT }, { 1, AV_CH_FRONT_CENTER }, diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h index dcef643acb8d1..3f83ce7b8c5c4 100644 --- a/libavcodec/ac3tab.h +++ b/libavcodec/ac3tab.h @@ -26,6 +26,9 @@ #include "ac3defs.h" +#include "libavutil/attributes_internal.h" + +FF_VISIBILITY_PUSH_HIDDEN extern const uint16_t ff_ac3_frame_size_tab[38][3]; extern const uint8_t ff_ac3_channels_tab[8]; extern const uint16_t ff_ac3_channel_layout_tab[8]; @@ -43,7 +46,9 @@ extern const int16_t ff_ac3_floor_tab[8]; extern const uint16_t ff_ac3_fast_gain_tab[8]; extern const uint8_t ff_ac3_band_start_tab[AC3_CRITICAL_BANDS+1]; extern const uint8_t ff_ac3_bin_to_band_tab[253]; +extern const float ff_ac3_gain_levels[9]; extern const uint64_t ff_eac3_custom_channel_map_locations[16][2]; +FF_VISIBILITY_POP_HIDDEN #define COMMON_CHANNEL_MAP \ { { 0, 1, }, { 0, 1, 2, } },\ diff --git a/libavcodec/adpcm.c b/libavcodec/adpcm.c index e20b60e05fef4..622cf54b40e94 100644 --- a/libavcodec/adpcm.c +++ b/libavcodec/adpcm.c @@ -2319,7 +2319,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, AVFrame *frame, } ) /* End of CASE */ default: - av_assert0(0); // unsupported codec_id should not happen + av_unreachable("There are cases for all codec ids using adpcm_decode_frame"); } if (avpkt->size && bytestream2_tell(&gb) == 0) { diff --git a/libavcodec/amfenc_h264.c b/libavcodec/amfenc_h264.c index cfcc5482f06a9..260139f14fad5 100644 --- a/libavcodec/amfenc_h264.c +++ b/libavcodec/amfenc_h264.c @@ -468,26 +468,61 @@ static av_cold int amf_encode_init_h264(AVCodecContext *avctx) } // B-Frames - if (ctx->max_consecutive_b_frames != -1) { - AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_MAX_CONSECUTIVE_BPICTURES, ctx->max_consecutive_b_frames); - if (ctx->max_b_frames != -1) { - AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_PATTERN, ctx->max_b_frames); - if (res != AMF_OK) { - res = ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_PATTERN, &var); - av_log(ctx, AV_LOG_WARNING, "B-frames=%d is not supported by this GPU, switched to %d\n", - ctx->max_b_frames, (int)var.int64Value); - ctx->max_b_frames = (int)var.int64Value; + AMFVariantStruct is_adaptive_b_frames = { 0 }; + res = ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_ADAPTIVE_MINIGOP, &is_adaptive_b_frames); + if 
(ctx->max_consecutive_b_frames != -1 || ctx->max_b_frames != -1 || is_adaptive_b_frames.boolValue == true) { + + //Get the capability of encoder + AMFCaps *encoder_caps = NULL; + ctx->encoder->pVtbl->GetCaps(ctx->encoder, &encoder_caps); + if (encoder_caps != NULL) + { + res = encoder_caps->pVtbl->GetProperty(encoder_caps, AMF_VIDEO_ENCODER_CAP_BFRAMES, &var); + if (res == AMF_OK) { + + //encoder supports H.264 B-frame + if(var.boolValue == true){ + //adaptive b-frames is higher priority than max_b_frames + if (is_adaptive_b_frames.boolValue == true) + { + //force AMF_VIDEO_ENCODER_MAX_CONSECUTIVE_BPICTURES to 3 + AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_MAX_CONSECUTIVE_BPICTURES, 3); + + if(ctx->pa_lookahead_buffer_depth < 1) + { + //force AMF_PA_LOOKAHEAD_BUFFER_DEPTH to 1 if not set or smaller than 1 + AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_PA_LOOKAHEAD_BUFFER_DEPTH, 1); + } + } + else { + if (ctx->max_b_frames != -1) { + //in case user sets B-frames + AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_PATTERN, ctx->max_b_frames); + if (res != AMF_OK) { + res = ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_PATTERN, &var); + av_log(ctx, AV_LOG_WARNING, "B-frames=%d is not supported by this GPU, switched to %d\n", ctx->max_b_frames, (int)var.int64Value); + ctx->max_b_frames = (int)var.int64Value; + } + AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_MAX_CONSECUTIVE_BPICTURES, ctx->max_b_frames); + } + } + + } + //encoder doesn't support H.264 B-frame + else { + av_log(ctx, AV_LOG_WARNING, "The current GPU in use does not support H.264 B-frame encoding, there will be no B-frame in bitstream.\n"); + } + } else { + //Can't get the capability of encoder + av_log(ctx, AV_LOG_WARNING, "Unable to get H.264 B-frame capability.\n"); + av_log(ctx, AV_LOG_WARNING, "There will be no B-frame in bitstream.\n"); } - if (ctx->max_consecutive_b_frames < ctx->max_b_frames) { - av_log(ctx, AVERROR_BUG, "Maxium B frames needs to be greater than the specified B frame count.\n"); - } - } - } - else { - if (ctx->max_b_frames != -1) { - av_log(ctx, AVERROR_BUG, "Maxium number of B frames needs to be specified.\n"); + + encoder_caps->pVtbl->Release(encoder_caps); + encoder_caps = NULL; } } + res = ctx->encoder->pVtbl->GetProperty(ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_PATTERN, &var); if ((int)var.int64Value) { AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_B_PIC_DELTA_QP, ctx->b_frame_delta_qp); diff --git a/libavcodec/amrwbdec.c b/libavcodec/amrwbdec.c index 929fc30a3ce07..91fb870a6408d 100644 --- a/libavcodec/amrwbdec.c +++ b/libavcodec/amrwbdec.c @@ -556,7 +556,8 @@ static void decode_fixed_vector(float *fixed_vector, const uint16_t *pulse_hi, ((int) pulse_hi[i] << 11), 4, 1); break; default: - av_assert2(0); + av_unreachable("Everything >= MODE_SID is impossible: MODE_SID is patchwelcome," + "> MODE_SID is invalid"); } memset(fixed_vector, 0, sizeof(float) * AMRWB_SFR_SIZE); diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c index 5481c0178c03e..121338ad0ce7c 100644 --- a/libavcodec/arm/pixblockdsp_init_arm.c +++ b/libavcodec/arm/pixblockdsp_init_arm.c @@ -21,7 +21,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/arm/cpu.h" -#include "libavcodec/avcodec.h" #include "libavcodec/pixblockdsp.h" void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, @@ -39,7 +38,6 @@ void ff_diff_pixels_unaligned_neon(int16_t *block, const 
uint8_t *s1, const uint8_t *s2, ptrdiff_t stride); av_cold void ff_pixblockdsp_init_arm(PixblockDSPContext *c, - AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c index 52666ee5473f3..883edd046860a 100644 --- a/libavcodec/asvenc.c +++ b/libavcodec/asvenc.c @@ -26,6 +26,7 @@ #include "config_components.h" #include "libavutil/attributes.h" +#include "libavutil/intreadwrite.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" @@ -44,6 +45,10 @@ typedef struct ASVEncContext { PutBitContext pb; + void (*get_pixels)(int16_t *restrict block, + const uint8_t *pixels, + ptrdiff_t stride); + PixblockDSPContext pdsp; FDCTDSPContext fdsp; DECLARE_ALIGNED(32, int16_t, block)[6][64]; @@ -61,40 +66,43 @@ enum { static inline void asv1_put_level(PutBitContext *pb, int level) { unsigned int index = level + 3; + unsigned n, code; if (index <= 6) { - put_bits(pb, ff_asv_level_tab[index][1], ff_asv_level_tab[index][0]); + n = ff_asv_level_tab[index][1]; + code = ff_asv_level_tab[index][0]; } else { - put_bits(pb, 3, 0); /* Escape code */ - put_sbits(pb, 8, level); + n = 3 + 8; + code = (0 /* Escape code */ << 8) | (level & 0xFF); } + put_bits(pb, n, code); } static inline void asv2_put_level(ASVEncContext *a, PutBitContext *pb, int level) { unsigned int index = level + 31; + unsigned n, code; if (index <= 62) { - put_bits_le(pb, ff_asv2_level_tab[index][1], ff_asv2_level_tab[index][0]); + n = ff_asv2_level_tab[index][1]; + code = ff_asv2_level_tab[index][0]; } else { - put_bits_le(pb, 5, 0); /* Escape code */ if (level < -128 || level > 127) { av_log(a->c.avctx, AV_LOG_WARNING, "Clipping level %d, increase qscale\n", level); level = av_clip_int8(level); } - put_bits_le(pb, 8, level & 0xFF); + n = 5 + 8; + code = (level & 0xFF) << 5 | /* Escape code */ 0; } + put_bits_le(pb, n, code); } static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64]) { - int i; - int nc_count = 0; - put_bits(&a->pb, 8, (block[0] + 32) >> 6); block[0] = 0; - for (i = 0; i < 10; i++) { + for (unsigned i = 0, nc_bits = 0, nc_val = 0; i < 10; i++) { const int index = ff_asv_scantab[4 * i]; int ccp = 0; @@ -112,10 +120,11 @@ static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64]) ccp |= 1; if (ccp) { - for (; nc_count; nc_count--) - put_bits(&a->pb, 2, 2); /* Skip */ - - put_bits(&a->pb, ff_asv_ccp_tab[ccp][1], ff_asv_ccp_tab[ccp][0]); + put_bits(&a->pb, nc_bits + ff_asv_ccp_tab[ccp][1], + nc_val << ff_asv_ccp_tab[ccp][1] /* Skip */ | + ff_asv_ccp_tab[ccp][0]); + nc_bits = 0; + nc_val = 0; if (ccp & 8) asv1_put_level(&a->pb, block[index + 0]); @@ -126,7 +135,8 @@ static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64]) if (ccp & 1) asv1_put_level(&a->pb, block[index + 9]); } else { - nc_count++; + nc_bits += 2; + nc_val = (nc_val << 2) | 2; } } put_bits(&a->pb, 5, 0xF); /* End of block */ @@ -145,8 +155,8 @@ static inline void asv2_encode_block(ASVEncContext *a, int16_t block[64]) count >>= 2; - put_bits_le(&a->pb, 4, count); - put_bits_le(&a->pb, 8, (block[0] + 32) >> 6); + put_bits_le(&a->pb, 4 + 8, count /* 4 bits */ | + (/* DC */(block[0] + 32) >> 6) << 4); block[0] = 0; for (i = 0; i <= count; i++) { @@ -213,74 +223,92 @@ static inline void dct_get(ASVEncContext *a, const AVFrame *frame, const uint8_t *ptr_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8; const uint8_t *ptr_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8; - 
a->pdsp.get_pixels(block[0], ptr_y, linesize); - a->pdsp.get_pixels(block[1], ptr_y + 8, linesize); - a->pdsp.get_pixels(block[2], ptr_y + 8 * linesize, linesize); - a->pdsp.get_pixels(block[3], ptr_y + 8 * linesize + 8, linesize); + a->get_pixels(block[0], ptr_y, linesize); + a->get_pixels(block[1], ptr_y + 8, linesize); + a->get_pixels(block[2], ptr_y + 8 * linesize, linesize); + a->get_pixels(block[3], ptr_y + 8 * linesize + 8, linesize); for (i = 0; i < 4; i++) a->fdsp.fdct(block[i]); if (!(a->c.avctx->flags & AV_CODEC_FLAG_GRAY)) { - a->pdsp.get_pixels(block[4], ptr_cb, frame->linesize[1]); - a->pdsp.get_pixels(block[5], ptr_cr, frame->linesize[2]); + a->get_pixels(block[4], ptr_cb, frame->linesize[1]); + a->get_pixels(block[5], ptr_cr, frame->linesize[2]); for (i = 4; i < 6; i++) a->fdsp.fdct(block[i]); } } -static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *pict, int *got_packet) +static void handle_partial_mb(ASVEncContext *a, const uint8_t *const data[3], + const int linesizes[3], + int valid_width, int valid_height) { - ASVEncContext *const a = avctx->priv_data; - const ASVCommonContext *const c = &a->c; - int size, ret; - - if (pict->width % 16 || pict->height % 16) { - AVFrame *clone = av_frame_alloc(); - int i; - - if (!clone) - return AVERROR(ENOMEM); - clone->format = pict->format; - clone->width = FFALIGN(pict->width, 16); - clone->height = FFALIGN(pict->height, 16); - ret = av_frame_get_buffer(clone, 0); - if (ret < 0) { - av_frame_free(&clone); - return ret; + const int nb_blocks = a->c.avctx->flags & AV_CODEC_FLAG_GRAY ? 4 : 6; + static const struct Descriptor { + uint8_t x_offset, y_offset; + uint8_t component, subsampling; + } block_descriptor[] = { + { 0, 0, 0, 0 }, { 8, 0, 0, 0 }, { 0, 8, 0, 0 }, { 8, 8, 0, 0 }, + { 0, 0, 1, 1 }, { 0, 0, 2, 1 }, + }; + + for (int i = 0; i < nb_blocks; ++i) { + const struct Descriptor *const desc = block_descriptor + i; + int width_avail = AV_CEIL_RSHIFT(valid_width, desc->subsampling) - desc->x_offset; + int height_avail = AV_CEIL_RSHIFT(valid_height, desc->subsampling) - desc->y_offset; + + if (width_avail <= 0 || height_avail <= 0) { + // This block is outside of the visible part; don't replicate pixels, + // just zero the block, so that only the dc value will be coded. 
+            memset(a->block[i], 0, sizeof(a->block[i]));
+            continue;
         }
-
-        ret = av_frame_copy(clone, pict);
-        if (ret < 0) {
-            av_frame_free(&clone);
-            return ret;
+        width_avail = FFMIN(width_avail, 8);
+        height_avail = FFMIN(height_avail, 8);
+
+        ptrdiff_t linesize = linesizes[desc->component];
+        const uint8_t *src = data[desc->component] + desc->y_offset * linesize + desc->x_offset;
+        int16_t *block = a->block[i];
+
+        for (int h = 0;; block += 8, src += linesize) {
+            int16_t last;
+            for (int w = 0; w < width_avail; ++w)
+                last = block[w] = src[w];
+            for (int w = width_avail; w < 8; ++w)
+                block[w] = last;
+            if (++h == height_avail)
+                break;
         }
-
-        for (i = 0; i<3; i++) {
-            int x, y;
-            int w = AV_CEIL_RSHIFT(pict->width, !!i);
-            int h = AV_CEIL_RSHIFT(pict->height, !!i);
-            int w2 = AV_CEIL_RSHIFT(clone->width, !!i);
-            int h2 = AV_CEIL_RSHIFT(clone->height, !!i);
-            for (y=0; y<h; y++)
-                for (x=w; x<w2; x++)
-                    clone->data[i][x + y*clone->linesize[i]] =
-                        clone->data[i][w - 1 + y*clone->linesize[i]];
-            for (y=h; y<h2; y++)
-                for (x=0; x<w2; x++)
-                    clone->data[i][x + y*clone->linesize[i]] =
-                        clone->data[i][x + (h-1)*clone->linesize[i]];
+        const int16_t *const last_row = block;
+        for (int h = height_avail; h < 8; ++h) {
+            block += 8;
+            AV_COPY128(block, last_row);
         }
-        ret = encode_frame(avctx, pkt, clone, got_packet);
-        av_frame_free(&clone);
-        return ret;
+        a->fdsp.fdct(a->block[i]);
     }
+    encode_mb(a, a->block);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    ASVEncContext *const a = avctx->priv_data;
+    const ASVCommonContext *const c = &a->c;
+    int size, ret;
+
     ret = ff_alloc_packet(avctx, pkt, c->mb_height * c->mb_width * MAX_MB_SIZE + 3);
     if (ret < 0)
         return ret;
 
+    if (!PIXBLOCKDSP_8BPP_GET_PIXELS_SUPPORTS_UNALIGNED &&
+        ((uintptr_t)pict->data[0] & 7 || pict->linesize[0] & 7 ||
+         (uintptr_t)pict->data[1] & 7 || pict->linesize[1] & 7 ||
+         (uintptr_t)pict->data[2] & 7 || pict->linesize[2] & 7))
+        a->get_pixels = a->pdsp.get_pixels_unaligned;
+    else
+        a->get_pixels = a->pdsp.get_pixels;
+
     init_put_bits(&a->pb, pkt->data, pkt->size);
 
     for (int mb_y = 0; mb_y < c->mb_height2; mb_y++) {
@@ -290,19 +318,37 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    if (c->mb_width2 != c->mb_width) {
-        int mb_x = c->mb_width2;
+    if (avctx->width & 15) {
+        const uint8_t *src[3] = {
+            pict->data[0] + c->mb_width2 * 16,
+            pict->data[1] + c->mb_width2 * 8,
+            pict->data[2] + c->mb_width2 * 8,
+        };
+        int available_width = avctx->width & 15;
+
         for (int mb_y = 0; mb_y < c->mb_height2; mb_y++) {
-            dct_get(a, pict, mb_x, mb_y);
-            encode_mb(a, a->block);
+            handle_partial_mb(a, src, pict->linesize, available_width, 16);
+            src[0] += 16 * pict->linesize[0];
+            src[1] += 8 * pict->linesize[1];
+            src[2] += 8 * pict->linesize[2];
         }
     }
 
-    if (c->mb_height2 != c->mb_height) {
-        int mb_y = c->mb_height2;
-        for (int mb_x = 0; mb_x < c->mb_width; mb_x++) {
-            dct_get(a, pict, mb_x, mb_y);
-            encode_mb(a, a->block);
+    if (avctx->height & 15) {
+        const uint8_t *src[3] = {
+            pict->data[0] + c->mb_height2 * 16 * pict->linesize[0],
+            pict->data[1] + c->mb_height2 * 8 * pict->linesize[1],
+            pict->data[2] + c->mb_height2 * 8 * pict->linesize[2],
+        };
+        int available_height = avctx->height & 15;
+
+        for (int remaining = avctx->width;; remaining -= 16) {
+            handle_partial_mb(a, src, pict->linesize, remaining, available_height);
+            if (remaining <= 16)
+                break;
+            src[0] += 16;
+            src[1] += 8;
+            src[2] += 8;
         }
     }
 
@@ -333,7 +379,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     ff_asv_common_init(avctx);
 
     ff_fdctdsp_init(&a->fdsp, avctx);
-
ff_pixblockdsp_init(&a->pdsp, avctx); + ff_pixblockdsp_init(&a->pdsp, 8); if (avctx->global_quality <= 0) avctx->global_quality = 4 * FF_QUALITY_SCALE; @@ -345,8 +391,8 @@ static av_cold int encode_init(AVCodecContext *avctx) if (!avctx->extradata) return AVERROR(ENOMEM); avctx->extradata_size = 8; - AV_WLA(32, avctx->extradata, inv_qscale); - ((uint32_t *) avctx->extradata)[1] = av_le2ne32(AV_RL32("ASUS")); + AV_WL32A(avctx->extradata, inv_qscale); + AV_WL32A(avctx->extradata + 4, MKTAG('A', 'S', 'U', 'S')); for (i = 0; i < 64; i++) { if (a->fdsp.fdct == ff_fdct_ifast) { diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c index faa3daa9e6cd8..fe156fa4821e1 100644 --- a/libavcodec/atrac3.c +++ b/libavcodec/atrac3.c @@ -526,7 +526,7 @@ static void reverse_matrixing(float *su1, float *su2, int *prev_code, } break; default: - av_assert1(0); + av_unreachable("curr_code/matrix_coeff_index_* values are stored in two bits"); } } } diff --git a/libavcodec/avdct.c b/libavcodec/avdct.c index f995e73eab44d..5322b181bcf21 100644 --- a/libavcodec/avdct.c +++ b/libavcodec/avdct.c @@ -119,7 +119,7 @@ int avcodec_dct_init(AVDCT *dsp) #if CONFIG_PIXBLOCKDSP { PixblockDSPContext pdsp; - ff_pixblockdsp_init(&pdsp, avctx); + ff_pixblockdsp_init(&pdsp, dsp->bits_per_sample); COPY(pdsp, get_pixels); COPY(pdsp, get_pixels_unaligned); } diff --git a/libavcodec/bsf/dovi_rpu.c b/libavcodec/bsf/dovi_rpu.c index 5dccd4bc7e76e..84b271f736b93 100644 --- a/libavcodec/bsf/dovi_rpu.c +++ b/libavcodec/bsf/dovi_rpu.c @@ -228,8 +228,8 @@ static int dovi_rpu_init(AVBSFContext *bsf) } else { av_log(bsf, AV_LOG_WARNING, "No Dolby Vision configuration record " "found? Generating one, but results may be invalid.\n"); - ret = ff_dovi_configure_ext(&s->enc, bsf->par_out, NULL, s->compression, - FF_COMPLIANCE_NORMAL); + ret = ff_dovi_configure_from_codedpar(&s->enc, bsf->par_out, NULL, s->compression, + FF_COMPLIANCE_NORMAL); if (ret < 0) return ret; /* Be conservative in accepting all compressed RPUs */ diff --git a/libavcodec/cbs_apv_syntax_template.c b/libavcodec/cbs_apv_syntax_template.c index ca66349141e54..fc8a08ff31dd0 100644 --- a/libavcodec/cbs_apv_syntax_template.c +++ b/libavcodec/cbs_apv_syntax_template.c @@ -543,11 +543,11 @@ static int FUNC(metadata)(CodedBitstreamContext *ctx, RWContext *rw, return AVERROR_INVALIDDATA; } + current->metadata_count = p + 1; + CHECK(FUNC(metadata_payload)(ctx, rw, pl)); metadata_bytes_left -= pl->payload_size; - - current->metadata_count = p + 1; if (metadata_bytes_left == 0) break; } diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c index 369e3ac876994..fa70a8fb7b97e 100644 --- a/libavcodec/cbs_h2645.c +++ b/libavcodec/cbs_h2645.c @@ -2310,6 +2310,28 @@ static const SEIMessageTypeDescriptor cbs_sei_h266_types[] = { SEI_MESSAGE_TYPE_END }; +static const SEIMessageTypeDescriptor cbs_sei_h274_types[] = { + { + SEI_TYPE_FILM_GRAIN_CHARACTERISTICS, + 1, 0, + sizeof(SEIRawFilmGrainCharacteristics), + SEI_MESSAGE_RW(sei, film_grain_characteristics), + }, + { + SEI_TYPE_DISPLAY_ORIENTATION, + 1, 0, + sizeof(SEIRawDisplayOrientation), + SEI_MESSAGE_RW(sei, display_orientation) + }, + { + SEI_TYPE_FRAME_FIELD_INFO, + 1, 0, + sizeof(SEIRawFrameFieldInformation), + SEI_MESSAGE_RW(sei, frame_field_information) + }, + SEI_MESSAGE_TYPE_END, +}; + const SEIMessageTypeDescriptor *ff_cbs_sei_find_type(CodedBitstreamContext *ctx, int payload_type) { @@ -2335,6 +2357,13 @@ const SEIMessageTypeDescriptor *ff_cbs_sei_find_type(CodedBitstreamContext *ctx, return &codec_list[i]; } + if 
(ctx->codec->codec_id == AV_CODEC_ID_H266) { + for (i = 0; cbs_sei_h274_types[i].type >= 0; i++) { + if (cbs_sei_h274_types[i].type == payload_type) + return &cbs_sei_h274_types[i]; + } + } + for (i = 0; cbs_sei_common_types[i].type >= 0; i++) { if (cbs_sei_common_types[i].type == payload_type) return &cbs_sei_common_types[i]; diff --git a/libavcodec/cbs_sei.h b/libavcodec/cbs_sei.h index 15ef3415aba84..81867b79a7e06 100644 --- a/libavcodec/cbs_sei.h +++ b/libavcodec/cbs_sei.h @@ -97,6 +97,46 @@ typedef struct SEIRawAmbientViewingEnvironment { uint16_t ambient_light_y; } SEIRawAmbientViewingEnvironment; +typedef struct SEIRawFilmGrainCharacteristics { + uint8_t fg_characteristics_cancel_flag; + uint8_t fg_model_id; + uint8_t fg_separate_colour_description_present_flag; + uint8_t fg_bit_depth_luma_minus8; + uint8_t fg_bit_depth_chroma_minus8; + uint8_t fg_full_range_flag; + uint8_t fg_colour_primaries; + uint8_t fg_transfer_characteristics; + uint8_t fg_matrix_coeffs; + uint8_t fg_blending_mode_id; + uint8_t fg_log2_scale_factor; + uint8_t fg_comp_model_present_flag[3]; + uint8_t fg_num_intensity_intervals_minus1[3]; + uint8_t fg_num_model_values_minus1[3]; + uint8_t fg_intensity_interval_lower_bound[3][256]; + uint8_t fg_intensity_interval_upper_bound[3][256]; + int16_t fg_comp_model_value[3][256][6]; + uint8_t fg_characteristics_persistence_flag; +} SEIRawFilmGrainCharacteristics; + +typedef struct SEIRawDisplayOrientation { + uint8_t display_orientation_cancel_flag; + uint8_t display_orientation_persistence_flag; + uint8_t display_orientation_transform_type; + uint8_t display_orientation_reserved_zero_3bits; +} SEIRawDisplayOrientation; + +typedef struct SEIRawFrameFieldInformation { + uint8_t ffi_field_pic_flag; + uint8_t ffi_bottom_field_flag; + uint8_t ffi_pairing_indicated_flag; + uint8_t ffi_paired_with_next_field_flag; + uint8_t ffi_display_fields_from_frame_flag; + uint8_t ffi_top_field_first_flag; + uint8_t ffi_display_elemental_periods_minus1; + uint8_t ffi_source_scan_type; + uint8_t ffi_duplicate_flag; +} SEIRawFrameFieldInformation; + typedef struct SEIRawMessage { uint32_t payload_type; uint32_t payload_size; diff --git a/libavcodec/cbs_sei_syntax_template.c b/libavcodec/cbs_sei_syntax_template.c index 0205bb47aa3f4..e6863a0fd7833 100644 --- a/libavcodec/cbs_sei_syntax_template.c +++ b/libavcodec/cbs_sei_syntax_template.c @@ -224,6 +224,103 @@ SEI_FUNC(ambient_viewing_environment, return 0; } +SEI_FUNC(film_grain_characteristics, + (CodedBitstreamContext *ctx, RWContext *rw, + SEIRawFilmGrainCharacteristics *current, + SEIMessageState *state)) +{ + int err, c, i, j; + + HEADER("Film Grain Characteristics"); + + flag(fg_characteristics_cancel_flag); + if (!current->fg_characteristics_cancel_flag) { + int filmGrainBitDepth[3]; + + u(2, fg_model_id, 0, 1); + flag(fg_separate_colour_description_present_flag); + if (current->fg_separate_colour_description_present_flag) { + ub(3, fg_bit_depth_luma_minus8); + ub(3, fg_bit_depth_chroma_minus8); + flag(fg_full_range_flag); + ub(8, fg_colour_primaries); + ub(8, fg_transfer_characteristics); + ub(8, fg_matrix_coeffs); + } + + filmGrainBitDepth[0] = current->fg_bit_depth_luma_minus8 + 8; + filmGrainBitDepth[1] = + filmGrainBitDepth[2] = current->fg_bit_depth_chroma_minus8 + 8; + + u(2, fg_blending_mode_id, 0, 1); + ub(4, fg_log2_scale_factor); + for (c = 0; c < 3; c++) + flags(fg_comp_model_present_flag[c], 1, c); + + for (c = 0; c < 3; c++) { + if (current->fg_comp_model_present_flag[c]) { + ubs(8, fg_num_intensity_intervals_minus1[c], 
1, c); + us(3, fg_num_model_values_minus1[c], 0, 5, 1, c); + for (i = 0; i <= current->fg_num_intensity_intervals_minus1[c]; i++) { + ubs(8, fg_intensity_interval_lower_bound[c][i], 2, c, i); + ubs(8, fg_intensity_interval_upper_bound[c][i], 2, c, i); + for (j = 0; j <= current->fg_num_model_values_minus1[c]; j++) + ses(fg_comp_model_value[c][i][j], 0 - current->fg_model_id * (1 << (filmGrainBitDepth[c] - 1)), + ((1 << filmGrainBitDepth[c]) - 1) - current->fg_model_id * (1 << (filmGrainBitDepth[c] - 1)), + 3, c, i, j); + } + } + } + flag(fg_characteristics_persistence_flag); + } + + return 0; +} + +SEI_FUNC(display_orientation, (CodedBitstreamContext *ctx, RWContext *rw, + SEIRawDisplayOrientation *current, + SEIMessageState *state)) +{ + int err; + + HEADER("Display Orientation"); + + flag(display_orientation_cancel_flag); + if (!current->display_orientation_cancel_flag) { + flag(display_orientation_persistence_flag); + u(3, display_orientation_transform_type, 0, 7); + ub(3, display_orientation_reserved_zero_3bits); + } + + return 0; +} + +SEI_FUNC(frame_field_information, (CodedBitstreamContext *ctx, RWContext *rw, + SEIRawFrameFieldInformation *current, + SEIMessageState *state)) +{ + int err; + + HEADER("Frame-field information"); + + flag(ffi_field_pic_flag); + if (current->ffi_field_pic_flag) { + flag(ffi_bottom_field_flag); + flag(ffi_pairing_indicated_flag); + if (current->ffi_pairing_indicated_flag) + flag(ffi_paired_with_next_field_flag); + } else { + flag(ffi_display_fields_from_frame_flag); + if (current->ffi_display_fields_from_frame_flag) + flag(ffi_top_field_first_flag); + u(8, ffi_display_elemental_periods_minus1, 0, 0xff); + } + u(2, ffi_source_scan_type, 0, 3); + flag(ffi_duplicate_flag); + + return 0; +} + static int FUNC(message)(CodedBitstreamContext *ctx, RWContext *rw, SEIRawMessage *current) { diff --git a/libavcodec/cuviddec.c b/libavcodec/cuviddec.c index 6575f0f6b14a9..3437ee2109c23 100644 --- a/libavcodec/cuviddec.c +++ b/libavcodec/cuviddec.c @@ -424,6 +424,7 @@ static int CUDAAPI cuvid_handle_picture_display(void *opaque, CUVIDPARSERDISPINF AVCodecContext *avctx = opaque; CuvidContext *ctx = avctx->priv_data; CuvidParsedFrame parsed_frame = { { 0 } }; + int ret; parsed_frame.dispinfo = *dispinfo; ctx->internal_error = 0; @@ -432,13 +433,20 @@ static int CUDAAPI cuvid_handle_picture_display(void *opaque, CUVIDPARSERDISPINF parsed_frame.dispinfo.progressive_frame = ctx->progressive_sequence; if (ctx->deint_mode_current == cudaVideoDeinterlaceMode_Weave) { - av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + ret = av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "Writing frame to fifo failed!\n"); } else { parsed_frame.is_deinterlacing = 1; - av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + ret = av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "Writing first frame to fifo failed!\n"); + if (!ctx->drop_second_field) { parsed_frame.second_field = 1; - av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + ret = av_fifo_write(ctx->frame_queue, &parsed_frame, 1); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "Writing second frame to fifo failed!\n"); } } @@ -497,7 +505,12 @@ static int cuvid_decode_packet(AVCodecContext *avctx, const AVPacket *avpkt) ctx->decoder_flushing = 1; } - ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &cupkt)); + // When flushing, only actually flush cuvid when the output buffer has been fully emptied. 
+ // CUVID happily dumps out a ton of frames with no regard for its own available surfaces. + if (!ctx->decoder_flushing || (ctx->decoder_flushing && !av_fifo_can_read(ctx->frame_queue))) + ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &cupkt)); + else + ret = 0; if (ret < 0) goto error; diff --git a/libavcodec/decode.c b/libavcodec/decode.c index c2b2dd6e3b6e6..ef0956838137f 100644 --- a/libavcodec/decode.c +++ b/libavcodec/decode.c @@ -1590,22 +1590,49 @@ static void update_frame_props(AVCodecContext *avctx, AVFrame *frame) } } -static void attach_post_process_data(AVCodecContext *avctx, AVFrame *frame) +static int attach_post_process_data(AVCodecContext *avctx, AVFrame *frame) { AVCodecInternal *avci = avctx->internal; DecodeContext *dc = decode_ctx(avci); if (dc->lcevc_frame) { FrameDecodeData *fdd = frame->private_ref; + FFLCEVCFrame *frame_ctx; + int ret; - fdd->post_process_opaque = av_refstruct_ref(dc->lcevc); - fdd->post_process_opaque_free = ff_lcevc_unref; - fdd->post_process = ff_lcevc_process; + frame_ctx = av_mallocz(sizeof(*frame_ctx)); + if (!frame_ctx) + return AVERROR(ENOMEM); + + frame_ctx->frame = av_frame_alloc(); + if (!frame_ctx->frame) { + av_free(frame_ctx); + return AVERROR(ENOMEM); + } + + frame_ctx->lcevc = av_refstruct_ref(dc->lcevc); + frame_ctx->frame->width = frame->width; + frame_ctx->frame->height = frame->height; + frame_ctx->frame->format = frame->format; frame->width = dc->width; frame->height = dc->height; + + ret = avctx->get_buffer2(avctx, frame_ctx->frame, 0); + if (ret < 0) { + ff_lcevc_unref(frame_ctx); + return ret; + } + + validate_avframe_allocation(avctx, frame_ctx->frame); + + fdd->post_process_opaque = frame_ctx; + fdd->post_process_opaque_free = ff_lcevc_unref; + fdd->post_process = ff_lcevc_process; } dc->lcevc_frame = 0; + + return 0; } int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags) @@ -1666,7 +1693,9 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags) if (ret < 0) goto fail; - attach_post_process_data(avctx, frame); + ret = attach_post_process_data(avctx, frame); + if (ret < 0) + goto fail; end: if (avctx->codec_type == AVMEDIA_TYPE_VIDEO && !override_dimensions && diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index a8f8ab3cd925a..7a5978c137dfb 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -423,7 +423,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) ff_fdctdsp_init(&ctx->m.fdsp, avctx); ff_mpv_idct_init(&ctx->m.c); ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx); - ff_pixblockdsp_init(&ctx->m.pdsp, avctx); + ff_pixblockdsp_init(&ctx->m.pdsp, ctx->bit_depth); ff_dct_encode_init(&ctx->m); if (ctx->profile != AV_PROFILE_DNXHD) diff --git a/libavcodec/dolby_e_parse.c b/libavcodec/dolby_e_parse.c index ffedcd99a44c0..fc20eae5b4a06 100644 --- a/libavcodec/dolby_e_parse.c +++ b/libavcodec/dolby_e_parse.c @@ -18,6 +18,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/avassert.h" #include "get_bits.h" #include "put_bits.h" #include "dolby_e.h" @@ -88,7 +89,7 @@ int ff_dolby_e_convert_input(DBEContext *s, int nb_words, int key) AV_WB24(dst, AV_RB24(src) ^ key); break; default: - av_assert0(0); + av_unreachable("ff_dolby_e_parse_header() only sets 16, 20, 24 and errors out otherwise"); } return init_get_bits(&s->gb, s->buffer, nb_words * s->word_bits); diff --git a/libavcodec/dovi_rpu.h b/libavcodec/dovi_rpu.h index f3ccc27ae87e7..1b74983205dba 100644 --- a/libavcodec/dovi_rpu.h +++ 
b/libavcodec/dovi_rpu.h @@ -133,9 +133,10 @@ int ff_dovi_attach_side_data(DOVIContext *s, AVFrame *frame); /** * Configure the encoder for Dolby Vision encoding. Generates a configuration - * record in s->cfg, and attaches it to avctx->coded_side_data. Sets the correct - * profile and compatibility ID based on the tagged AVCodecParameters colorspace - * metadata, and the correct level based on the resolution and tagged framerate. + * record in s->cfg, and attaches it to codecpar->coded_side_data. Sets the + * correct profile and compatibility ID based on the tagged AVCodecParameters + * colorspace metadata, and the correct level based on the resolution and + * tagged framerate. * * `metadata` should point to the first frame's RPU, if available. If absent, * auto-detection will be performed, but this can sometimes lead to inaccurate @@ -143,13 +144,13 @@ int ff_dovi_attach_side_data(DOVIContext *s, AVFrame *frame); * * Returns 0 or a negative error code. */ -int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, - const AVDOVIMetadata *metadata, - enum AVDOVICompression compression, - int strict_std_compliance); +int ff_dovi_configure_from_codedpar(DOVIContext *s, AVCodecParameters *codecpar, + const AVDOVIMetadata *metadata, + enum AVDOVICompression compression, + int strict_std_compliance); /** - * Helper wrapper around `ff_dovi_configure_ext` which infers the codec + * Variant of `ff_dovi_configure_from_codedpar` which infers the codec * parameters from an AVCodecContext. */ int ff_dovi_configure(DOVIContext *s, AVCodecContext *avctx); diff --git a/libavcodec/dovi_rpuenc.c b/libavcodec/dovi_rpuenc.c index 2e1f8be08ee16..b05ad0a358818 100644 --- a/libavcodec/dovi_rpuenc.c +++ b/libavcodec/dovi_rpuenc.c @@ -20,16 +20,17 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/attributes.h" #include "libavutil/avassert.h" #include "libavutil/crc.h" #include "libavutil/mem.h" +#include "libavutil/refstruct.h" #include "avcodec.h" #include "dovi_rpu.h" #include "itut35.h" #include "put_bits.h" #include "put_golomb.h" -#include "libavutil/refstruct.h" static const struct { uint64_t pps; // maximum pixels per second @@ -52,10 +53,18 @@ static const struct { [13] = {7680*4320*120u, 7680, 240, 800}, }; -int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, - const AVDOVIMetadata *metadata, - enum AVDOVICompression compression, - int strict_std_compliance) +static av_cold int dovi_configure_ext(DOVIContext *s, enum AVCodecID codec_id, + const AVDOVIMetadata *metadata, + enum AVDOVICompression compression, + int strict_std_compliance, + int width, int height, + AVRational framerate, + enum AVPixelFormat pix_format, + enum AVColorSpace color_space, + enum AVColorPrimaries color_primaries, + enum AVColorTransferCharacteristic color_trc, + AVPacketSideData **coded_side_data, + int *nb_coded_side_data) { AVDOVIDecoderConfigurationRecord *cfg; const AVDOVIRpuDataHeader *hdr = NULL; @@ -76,7 +85,7 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, compression > AV_DOVI_COMPRESSION_EXTENDED) return AVERROR(EINVAL); - switch (codecpar->codec_id) { + switch (codec_id) { case AV_CODEC_ID_AV1: dv_profile = 10; break; case AV_CODEC_ID_H264: dv_profile = 9; break; case AV_CODEC_ID_HEVC: @@ -86,25 +95,23 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, } /* This is likely to be proprietary IPTPQc2 */ - if (codecpar->color_space == AVCOL_SPC_IPT_C2 || - (codecpar->color_space == 
AVCOL_SPC_UNSPECIFIED && - codecpar->color_trc == AVCOL_TRC_UNSPECIFIED)) + if (color_space == AVCOL_SPC_IPT_C2 || + (color_space == AVCOL_SPC_UNSPECIFIED && + color_trc == AVCOL_TRC_UNSPECIFIED)) dv_profile = 5; else dv_profile = 8; break; default: - /* No other encoder should be calling this! */ - av_assert0(0); - return AVERROR_BUG; + av_unreachable("ff_dovi_configure only used with AV1, H.264 and HEVC"); } if (strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) { if (dv_profile == 9) { - if (codecpar->format != AV_PIX_FMT_YUV420P) + if (pix_format != AV_PIX_FMT_YUV420P) dv_profile = 0; } else { - if (codecpar->format != AV_PIX_FMT_YUV420P10) + if (pix_format != AV_PIX_FMT_YUV420P10) dv_profile = 0; } } @@ -131,17 +138,17 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, } /* fall through */ case 8: /* HEVC (or AV1) with BL compatibility */ - if (codecpar->color_space == AVCOL_SPC_BT2020_NCL && - codecpar->color_primaries == AVCOL_PRI_BT2020 && - codecpar->color_trc == AVCOL_TRC_SMPTE2084) { + if (color_space == AVCOL_SPC_BT2020_NCL && + color_primaries == AVCOL_PRI_BT2020 && + color_trc == AVCOL_TRC_SMPTE2084) { bl_compat_id = 1; - } else if (codecpar->color_space == AVCOL_SPC_BT2020_NCL && - codecpar->color_primaries == AVCOL_PRI_BT2020 && - codecpar->color_trc == AVCOL_TRC_ARIB_STD_B67) { + } else if (color_space == AVCOL_SPC_BT2020_NCL && + color_primaries == AVCOL_PRI_BT2020 && + color_trc == AVCOL_TRC_ARIB_STD_B67) { bl_compat_id = 4; - } else if (codecpar->color_space == AVCOL_SPC_BT709 && - codecpar->color_primaries == AVCOL_PRI_BT709 && - codecpar->color_trc == AVCOL_TRC_BT709) { + } else if (color_space == AVCOL_SPC_BT709 && + color_primaries == AVCOL_PRI_BT709 && + color_trc == AVCOL_TRC_BT709) { bl_compat_id = 2; } } @@ -175,9 +182,9 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, } } - pps = codecpar->width * codecpar->height; - if (codecpar->framerate.num) { - pps = pps * codecpar->framerate.num / codecpar->framerate.den; + pps = width * height; + if (framerate.num) { + pps = pps * framerate.num / framerate.den; } else { pps *= 25; /* sanity fallback */ } @@ -186,7 +193,7 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, for (int i = 1; i < FF_ARRAY_ELEMS(dv_levels); i++) { if (pps > dv_levels[i].pps) continue; - if (codecpar->width > dv_levels[i].width) + if (width > dv_levels[i].width) continue; /* In theory, we should also test the bitrate when known, and * distinguish between main and high tier. In practice, just ignore @@ -199,12 +206,12 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, if (!dv_level) { if (strict_std_compliance >= FF_COMPLIANCE_STRICT) { av_log(s->logctx, AV_LOG_ERROR, "Coded PPS (%"PRIu64") and width (%d) " - "exceed Dolby Vision limitations\n", pps, codecpar->width); + "exceed Dolby Vision limitations\n", pps, width); return AVERROR(EINVAL); } else { av_log(s->logctx, AV_LOG_WARNING, "Coded PPS (%"PRIu64") and width (%d) " "exceed Dolby Vision limitations. 
Ignoring, resulting file " - "may be non-conforming.\n", pps, codecpar->width); + "may be non-conforming.\n", pps, width); dv_level = FF_ARRAY_ELEMS(dv_levels) - 1; } } @@ -213,8 +220,8 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, if (!cfg) return AVERROR(ENOMEM); - if (!av_packet_side_data_add(&codecpar->coded_side_data, - &codecpar->nb_coded_side_data, + if (!av_packet_side_data_add(coded_side_data, + nb_coded_side_data, AV_PKT_DATA_DOVI_CONF, cfg, cfg_size, 0)) { av_free(cfg); return AVERROR(ENOMEM); @@ -238,19 +245,22 @@ int ff_dovi_configure_ext(DOVIContext *s, AVCodecParameters *codecpar, return 0; } -int ff_dovi_configure(DOVIContext *s, AVCodecContext *avctx) +av_cold int ff_dovi_configure_from_codedpar(DOVIContext *s, AVCodecParameters *par, + const AVDOVIMetadata *metadata, + enum AVDOVICompression compression, + int strict_std_compliance) { - int ret; - const AVFrameSideData *sd; - const AVDOVIMetadata *metadata = NULL; - AVCodecParameters *codecpar = avcodec_parameters_alloc(); - if (!codecpar) - return AVERROR(ENOMEM); - - ret = avcodec_parameters_from_context(codecpar, avctx); - if (ret < 0) - goto fail; + return dovi_configure_ext(s, par->codec_id, metadata, compression, + strict_std_compliance, par->width, par->height, + par->framerate, par->format, par->color_space, + par->color_primaries, par->color_trc, + &par->coded_side_data, &par->nb_coded_side_data); +} +av_cold int ff_dovi_configure(DOVIContext *s, AVCodecContext *avctx) +{ + const AVDOVIMetadata *metadata = NULL; + const AVFrameSideData *sd; sd = av_frame_side_data_get(avctx->decoded_side_data, avctx->nb_decoded_side_data, AV_FRAME_DATA_DOVI_METADATA); @@ -258,16 +268,11 @@ int ff_dovi_configure(DOVIContext *s, AVCodecContext *avctx) metadata = (const AVDOVIMetadata *) sd->data; /* Current encoders cannot handle metadata compression during encoding */ - ret = ff_dovi_configure_ext(s, codecpar, metadata, AV_DOVI_COMPRESSION_NONE, - avctx->strict_std_compliance); - if (ret < 0) - goto fail; - - ret = avcodec_parameters_to_context(avctx, codecpar); - -fail: - avcodec_parameters_free(&codecpar); - return ret; + return dovi_configure_ext(s, avctx->codec_id, metadata, AV_DOVI_COMPRESSION_NONE, + avctx->strict_std_compliance, avctx->width, + avctx->height, avctx->framerate, avctx->pix_fmt, + avctx->colorspace, avctx->color_primaries, avctx->color_trc, + &avctx->coded_side_data, &avctx->nb_coded_side_data); } /* Compares only the static DM metadata parts of AVDOVIColorMetadata (excluding diff --git a/libavcodec/dvenc.c b/libavcodec/dvenc.c index c7fc930b4b1db..a477b84261bfa 100644 --- a/libavcodec/dvenc.c +++ b/libavcodec/dvenc.c @@ -63,6 +63,8 @@ typedef struct DVEncContext { DVwork_chunk work_chunks[4 * 12 * 27]; int quant_deadzone; + + PixblockDSPContext pdsp; } DVEncContext; @@ -70,7 +72,6 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx) { DVEncContext *s = avctx->priv_data; FDCTDSPContext fdsp; - PixblockDSPContext pdsp; int ret; s->avctx = avctx; @@ -108,12 +109,10 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx) } memset(&fdsp,0, sizeof(fdsp)); - memset(&pdsp,0, sizeof(pdsp)); ff_fdctdsp_init(&fdsp, avctx); - ff_pixblockdsp_init(&pdsp, avctx); - s->get_pixels = pdsp.get_pixels; s->fdct[0] = fdsp.fdct; s->fdct[1] = fdsp.fdct248; + ff_pixblockdsp_init(&s->pdsp, 8); #if !CONFIG_HARDCODED_TABLES { @@ -1201,6 +1200,14 @@ static int dvvideo_encode_frame(AVCodecContext *c, AVPacket *pkt, DVEncContext *s = c->priv_data; int ret; + if 
(!PIXBLOCKDSP_8BPP_GET_PIXELS_SUPPORTS_UNALIGNED &&
+        ((uintptr_t)frame->data[0] & 7 || frame->linesize[0] & 7 ||
+         (uintptr_t)frame->data[1] & 7 || frame->linesize[1] & 7 ||
+         (uintptr_t)frame->data[2] & 7 || frame->linesize[2] & 7))
+        s->get_pixels = s->pdsp.get_pixels_unaligned;
+    else
+        s->get_pixels = s->pdsp.get_pixels;
+
     if ((ret = ff_get_encode_buffer(c, pkt, s->sys->frame_size, 0)) < 0)
         return ret;
     /* Fixme: Only zero the part that is not overwritten later. */
diff --git a/libavcodec/dxv.h b/libavcodec/dxv.h
index 71cfddec858de..184813e427b73 100644
--- a/libavcodec/dxv.h
+++ b/libavcodec/dxv.h
@@ -1,6 +1,6 @@
 /*
  * Resolume DXV common
- * Copyright (C) 2024 Connor Worley
+ * Copyright (C) 2024 Emma Worley
  *
  * This file is part of FFmpeg.
  *
diff --git a/libavcodec/dxvenc.c b/libavcodec/dxvenc.c
index 808d8daedb529..ee6a0a5b367e4 100644
--- a/libavcodec/dxvenc.c
+++ b/libavcodec/dxvenc.c
@@ -1,6 +1,6 @@
 /*
  * Resolume DXV encoder
- * Copyright (C) 2024 Connor Worley
+ * Copyright (C) 2024 Emma Worley
  *
  * This file is part of FFmpeg.
  *
@@ -21,7 +21,7 @@
 
 #include <stddef.h>
 
-#include "libavutil/crc.h"
+#include "libavcodec/hashtable.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
@@ -34,77 +34,19 @@
 #define DXV_HEADER_LENGTH 12
 
+/*
+ * Resolume will refuse to display frames that are not padded to 16x16 pixels.
+ */
+#define DXV_ALIGN(x) FFALIGN(x, 16)
+
 /*
  * DXV uses LZ-like back-references to avoid copying words that have already
  * appeared in the decompressed stream. Using a simple hash table (HT)
  * significantly speeds up the lookback process while encoding.
  */
-#define LOOKBACK_HT_ELEMS 0x40000
+#define LOOKBACK_HT_ELEMS 0x20202
 #define LOOKBACK_WORDS 0x20202
 
-typedef struct HTEntry {
-    uint32_t key;
-    uint32_t pos;
-} HTEntry;
-
-static void ht_init(HTEntry *ht)
-{
-    for (size_t i = 0; i < LOOKBACK_HT_ELEMS; i++) {
-        ht[i].pos = -1;
-    }
-}
-
-static uint32_t ht_lookup_and_upsert(HTEntry *ht, const AVCRC *hash_ctx,
-                                     uint32_t key, uint32_t pos)
-{
-    uint32_t ret = -1;
-    size_t hash = av_crc(hash_ctx, 0, (uint8_t*)&key, 4) % LOOKBACK_HT_ELEMS;
-    for (size_t i = hash; i < hash + LOOKBACK_HT_ELEMS; i++) {
-        size_t wrapped_index = i % LOOKBACK_HT_ELEMS;
-        HTEntry *entry = &ht[wrapped_index];
-        if (entry->key == key || entry->pos == -1) {
-            ret = entry->pos;
-            entry->key = key;
-            entry->pos = pos;
-            break;
-        }
-    }
-    return ret;
-}
-
-static void ht_delete(HTEntry *ht, const AVCRC *hash_ctx,
-                      uint32_t key, uint32_t pos)
-{
-    HTEntry *removed_entry = NULL;
-    size_t removed_hash;
-    size_t hash = av_crc(hash_ctx, 0, (uint8_t*)&key, 4) % LOOKBACK_HT_ELEMS;
-
-    for (size_t i = hash; i < hash + LOOKBACK_HT_ELEMS; i++) {
-        size_t wrapped_index = i % LOOKBACK_HT_ELEMS;
-        HTEntry *entry = &ht[wrapped_index];
-        if (entry->pos == -1)
-            return;
-        if (removed_entry) {
-            size_t candidate_hash = av_crc(hash_ctx, 0, (uint8_t*)&entry->key, 4) % LOOKBACK_HT_ELEMS;
-            if ((wrapped_index > removed_hash && (candidate_hash <= removed_hash || candidate_hash > wrapped_index)) ||
-                (wrapped_index < removed_hash && (candidate_hash <= removed_hash && candidate_hash > wrapped_index))) {
-                *removed_entry = *entry;
-                entry->pos = -1;
-                removed_entry = entry;
-                removed_hash = wrapped_index;
-            }
-        } else if (entry->key == key) {
-            if (entry->pos <= pos) {
-                entry->pos = -1;
-                removed_entry = entry;
-                removed_hash = wrapped_index;
-            } else {
-                return;
-            }
-        }
-    }
-}
-
 typedef struct DXVEncContext {
     AVClass *class;
 
@@ -121,10 +63,9 @@ typedef struct DXVEncContext {
     DXVTextureFormat tex_fmt;
int (*compress_tex)(AVCodecContext *avctx); - const AVCRC *crc_ctx; - - HTEntry color_lookback_ht[LOOKBACK_HT_ELEMS]; - HTEntry lut_lookback_ht[LOOKBACK_HT_ELEMS]; + FFHashtableContext *color_ht; + FFHashtableContext *lut_ht; + FFHashtableContext *combo_ht; } DXVEncContext; /* Converts an index offset value to a 2-bit opcode and pushes it to a stream. @@ -159,58 +100,63 @@ static int dxv_compress_dxt1(AVCodecContext *avctx) DXVEncContext *ctx = avctx->priv_data; PutByteContext *pbc = &ctx->pbc; void *value; - uint32_t color, lut, idx, color_idx, lut_idx, prev_pos, state = 16, pos = 2, op = 0; + uint64_t combo; + uint32_t color, lut, idx, combo_idx, prev_pos, old_pos, state = 16, pos = 0, op = 0; + + ff_hashtable_clear(ctx->color_ht); + ff_hashtable_clear(ctx->lut_ht); + ff_hashtable_clear(ctx->combo_ht); - ht_init(ctx->color_lookback_ht); - ht_init(ctx->lut_lookback_ht); + ff_hashtable_set(ctx->combo_ht, ctx->tex_data, &pos); bytestream2_put_le32(pbc, AV_RL32(ctx->tex_data)); + ff_hashtable_set(ctx->color_ht, ctx->tex_data, &pos); + pos++; bytestream2_put_le32(pbc, AV_RL32(ctx->tex_data + 4)); - - ht_lookup_and_upsert(ctx->color_lookback_ht, ctx->crc_ctx, AV_RL32(ctx->tex_data), 0); - ht_lookup_and_upsert(ctx->lut_lookback_ht, ctx->crc_ctx, AV_RL32(ctx->tex_data + 4), 1); + ff_hashtable_set(ctx->lut_ht, ctx->tex_data + 4, &pos); + pos++; while (pos + 2 <= ctx->tex_size / 4) { - idx = 0; - - color = AV_RL32(ctx->tex_data + pos * 4); - prev_pos = ht_lookup_and_upsert(ctx->color_lookback_ht, ctx->crc_ctx, color, pos); - color_idx = prev_pos != -1 ? pos - prev_pos : 0; + combo = AV_RL64(ctx->tex_data + pos * 4); + combo_idx = ff_hashtable_get(ctx->combo_ht, &combo, &prev_pos) ? pos - prev_pos : 0; + idx = combo_idx; + PUSH_OP(2); if (pos >= LOOKBACK_WORDS) { - uint32_t old_pos = pos - LOOKBACK_WORDS; - uint32_t old_color = AV_RL32(ctx->tex_data + old_pos * 4); - ht_delete(ctx->color_lookback_ht, ctx->crc_ctx, old_color, old_pos); + old_pos = pos - LOOKBACK_WORDS; + if (ff_hashtable_get(ctx->combo_ht, ctx->tex_data + old_pos * 4, &prev_pos) && prev_pos <= old_pos) + ff_hashtable_delete(ctx->combo_ht, ctx->tex_data + old_pos * 4); } - pos++; + ff_hashtable_set(ctx->combo_ht, &combo, &pos); - lut = AV_RL32(ctx->tex_data + pos * 4); - if (color_idx && lut == AV_RL32(ctx->tex_data + (pos - color_idx) * 4)) { - idx = color_idx; - } else { - idx = 0; - prev_pos = ht_lookup_and_upsert(ctx->lut_lookback_ht, ctx->crc_ctx, lut, pos); - lut_idx = prev_pos != -1 ? pos - prev_pos : 0; + color = AV_RL32(ctx->tex_data + pos * 4); + if (!combo_idx) { + idx = ff_hashtable_get(ctx->color_ht, &color, &prev_pos) ? pos - prev_pos : 0; + PUSH_OP(2); + if (!idx) + bytestream2_put_le32(pbc, color); } if (pos >= LOOKBACK_WORDS) { - uint32_t old_pos = pos - LOOKBACK_WORDS; - uint32_t old_lut = AV_RL32(ctx->tex_data + old_pos * 4); - ht_delete(ctx->lut_lookback_ht, ctx->crc_ctx, old_lut, old_pos); + old_pos = pos - LOOKBACK_WORDS; + if (ff_hashtable_get(ctx->color_ht, ctx->tex_data + old_pos * 4, &prev_pos) && prev_pos <= old_pos) + ff_hashtable_delete(ctx->color_ht, ctx->tex_data + old_pos * 4); } + ff_hashtable_set(ctx->color_ht, &color, &pos); pos++; - PUSH_OP(2); - - if (!idx) { - idx = color_idx; - PUSH_OP(2); - if (!idx) - bytestream2_put_le32(pbc, color); - - idx = lut_idx; + lut = AV_RL32(ctx->tex_data + pos * 4); + if (!combo_idx) { + idx = ff_hashtable_get(ctx->lut_ht, &lut, &prev_pos) ? 
pos - prev_pos : 0; PUSH_OP(2); if (!idx) - bytestream2_put_le32(pbc, lut); + bytestream2_put_le32(pbc, lut); + } + if (pos >= LOOKBACK_WORDS) { + old_pos = pos - LOOKBACK_WORDS; + if (ff_hashtable_get(ctx->lut_ht, ctx->tex_data + old_pos * 4, &prev_pos) && prev_pos <= old_pos) + ff_hashtable_delete(ctx->lut_ht, ctx->tex_data + old_pos * 4); } + ff_hashtable_set(ctx->lut_ht, &lut, &pos); + pos++; } return 0; @@ -231,12 +177,50 @@ static int dxv_encode(AVCodecContext *avctx, AVPacket *pkt, return ret; if (ctx->enc.tex_funct) { + uint8_t *safe_data[4] = {frame->data[0], 0, 0, 0}; + int safe_linesize[4] = {frame->linesize[0], 0, 0, 0}; + + if (avctx->width != DXV_ALIGN(avctx->width) || avctx->height != DXV_ALIGN(avctx->height)) { + ret = av_image_alloc( + safe_data, + safe_linesize, + DXV_ALIGN(avctx->width), + DXV_ALIGN(avctx->height), + avctx->pix_fmt, + 1); + if (ret < 0) + return ret; + + av_image_copy2( + safe_data, + safe_linesize, + frame->data, + frame->linesize, + avctx->pix_fmt, + avctx->width, + avctx->height); + + if (avctx->width != DXV_ALIGN(avctx->width)) { + for (int y = 0; y < avctx->height; y++) { + memset(safe_data[0] + y * safe_linesize[0] + frame->linesize[0], 0, safe_linesize[0] - frame->linesize[0]); + } + } + if (avctx->height != DXV_ALIGN(avctx->height)) { + for (int y = avctx->height; y < DXV_ALIGN(avctx->height); y++) { + memset(safe_data[0] + y * safe_linesize[0], 0, safe_linesize[0]); + } + } + } + ctx->enc.tex_data.out = ctx->tex_data; - ctx->enc.frame_data.in = frame->data[0]; - ctx->enc.stride = frame->linesize[0]; - ctx->enc.width = avctx->width; - ctx->enc.height = avctx->height; + ctx->enc.frame_data.in = safe_data[0]; + ctx->enc.stride = safe_linesize[0]; + ctx->enc.width = DXV_ALIGN(avctx->width); + ctx->enc.height = DXV_ALIGN(avctx->height); ff_texturedsp_exec_compress_threads(avctx, &ctx->enc); + + if (safe_data[0] != frame->data[0]) + av_freep(&safe_data[0]); } else { /* unimplemented: YCoCg formats */ return AVERROR_INVALIDDATA; @@ -275,14 +259,6 @@ static av_cold int dxv_init(AVCodecContext *avctx) return ret; } - if (avctx->width % TEXTURE_BLOCK_W || avctx->height % TEXTURE_BLOCK_H) { - av_log(avctx, - AV_LOG_ERROR, - "Video size %dx%d is not multiple of "AV_STRINGIFY(TEXTURE_BLOCK_W)"x"AV_STRINGIFY(TEXTURE_BLOCK_H)".\n", - avctx->width, avctx->height); - return AVERROR_INVALIDDATA; - } - ff_texturedspenc_init(&texdsp); switch (ctx->tex_fmt) { @@ -296,21 +272,25 @@ static av_cold int dxv_init(AVCodecContext *avctx) return AVERROR_INVALIDDATA; } ctx->enc.raw_ratio = 16; - ctx->tex_size = avctx->width / TEXTURE_BLOCK_W * - avctx->height / TEXTURE_BLOCK_H * + ctx->tex_size = DXV_ALIGN(avctx->width) / TEXTURE_BLOCK_W * + DXV_ALIGN(avctx->height) / TEXTURE_BLOCK_H * ctx->enc.tex_ratio; - ctx->enc.slice_count = av_clip(avctx->thread_count, 1, avctx->height / TEXTURE_BLOCK_H); + ctx->enc.slice_count = av_clip(avctx->thread_count, 1, DXV_ALIGN(avctx->height) / TEXTURE_BLOCK_H); ctx->tex_data = av_malloc(ctx->tex_size); if (!ctx->tex_data) { return AVERROR(ENOMEM); } - ctx->crc_ctx = av_crc_get_table(AV_CRC_32_IEEE); - if (!ctx->crc_ctx) { - av_log(avctx, AV_LOG_ERROR, "Could not initialize CRC table.\n"); - return AVERROR_BUG; - } + ret = ff_hashtable_alloc(&ctx->color_ht, sizeof(uint32_t), sizeof(uint32_t), LOOKBACK_HT_ELEMS); + if (ret < 0) + return ret; + ret = ff_hashtable_alloc(&ctx->lut_ht, sizeof(uint32_t), sizeof(uint32_t), LOOKBACK_HT_ELEMS); + if (ret < 0) + return ret; + ret = ff_hashtable_alloc(&ctx->combo_ht, sizeof(uint64_t), sizeof(uint32_t), 
LOOKBACK_HT_ELEMS); + if (ret < 0) + return ret; return 0; } @@ -321,6 +301,10 @@ static av_cold int dxv_close(AVCodecContext *avctx) av_freep(&ctx->tex_data); + ff_hashtable_freep(&ctx->color_ht); + ff_hashtable_freep(&ctx->lut_ht); + ff_hashtable_freep(&ctx->combo_ht); + return 0; } diff --git a/libavcodec/eac3enc.c b/libavcodec/eac3enc.c index 3590b821a33ee..10b1ab337c648 100644 --- a/libavcodec/eac3enc.c +++ b/libavcodec/eac3enc.c @@ -135,6 +135,8 @@ static void eac3_output_frame_header(AC3EncodeContext *s, PutBitContext *pb) int blk, ch; AC3EncOptions *opt = &s->options; + put_bits_assume_flushed(pb); + put_bits(pb, 16, 0x0b77); /* sync word */ /* BSI header */ diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c index 40209f99359ab..463f46e091ef1 100644 --- a/libavcodec/ffv1enc.c +++ b/libavcodec/ffv1enc.c @@ -629,7 +629,6 @@ av_cold int ff_ffv1_encode_init(AVCodecContext *avctx) if (s->ec < 0) { if (s->version >= 4) { s->ec = 2; - s->crcref = 0x7a8c4079; } else if (s->version >= 3) { s->ec = 1; } else @@ -639,8 +638,10 @@ av_cold int ff_ffv1_encode_init(AVCodecContext *avctx) // CRC requires version 3+ if (s->ec == 1) s->version = FFMAX(s->version, 3); - if (s->ec == 2) + if (s->ec == 2) { s->version = FFMAX(s->version, 4); + s->crcref = 0x7a8c4079; + } if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) { av_log(avctx, AV_LOG_ERROR, "Version 2 or 4 needed for requested features but version 2 or 4 is experimental and not enabled\n"); diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c index 64f3c420c51a7..aaf82159eef81 100644 --- a/libavcodec/ffv1enc_template.c +++ b/libavcodec/ffv1enc_template.c @@ -38,19 +38,13 @@ RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc, if (bits == 0) return 0; - if (ac != AC_GOLOMB_RICE) { - if (c->bytestream_end - c->bytestream < w * 35) { + if (sc->slice_coding_mode == 1) { + av_assert0(ac != AC_GOLOMB_RICE); + if (c->bytestream_end - c->bytestream < (w * bits + 7LL)>>3) { av_log(logctx, AV_LOG_ERROR, "encoded Range Coder frame too large\n"); return AVERROR_INVALIDDATA; } - } else { - if (put_bytes_left(&sc->pb, 0) < w * 4) { - av_log(logctx, AV_LOG_ERROR, "encoded Golomb Rice frame too large\n"); - return AVERROR_INVALIDDATA; - } - } - if (sc->slice_coding_mode == 1) { for (x = 0; x < w; x++) { int i; int v = sample[0][x]; @@ -62,6 +56,18 @@ RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc, return 0; } + if (ac != AC_GOLOMB_RICE) { + if (c->bytestream_end - c->bytestream < w * 35) { + av_log(logctx, AV_LOG_ERROR, "encoded Range Coder frame too large\n"); + return AVERROR_INVALIDDATA; + } + } else { + if (put_bytes_left(&sc->pb, 0) < w * 4) { + av_log(logctx, AV_LOG_ERROR, "encoded Golomb Rice frame too large\n"); + return AVERROR_INVALIDDATA; + } + } + for (x = 0; x < w; x++) { int diff, context; diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index 42a98a5efa244..259bc75d4c8ff 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -37,6 +37,9 @@ #define LG_ALIGN_W 32 #define LG_ALIGN_H 32 +/* Unlike the decoder, we need 4 lines (but really only 3) */ +#define RGB_LINECACHE 4 + typedef struct VulkanEncodeFFv1FrameData { /* Output data */ AVBufferRef *out_data_ref; @@ -71,8 +74,8 @@ typedef struct VulkanEncodeFFv1Context { size_t max_heap_size; FFVulkanShader setup; + FFVulkanShader rct_search; FFVulkanShader reset; - FFVulkanShader rct; FFVulkanShader enc; /* Constant read-only buffers */ @@ -86,10 +89,6 @@ typedef 
struct VulkanEncodeFFv1Context { /* Output data buffer */ AVBufferPool *out_data_pool; - AVBufferPool *pkt_data_pool; - - /* Temporary data buffer */ - AVBufferPool *tmp_data_pool; /* Slice results buffer */ AVBufferPool *results_data_pool; @@ -103,6 +102,7 @@ typedef struct VulkanEncodeFFv1Context { int num_h_slices; int num_v_slices; int force_pcm; + int optimize_rct; int is_rgb; int ppi; @@ -114,19 +114,16 @@ extern const char *ff_source_rangecoder_comp; extern const char *ff_source_ffv1_vlc_comp; extern const char *ff_source_ffv1_common_comp; extern const char *ff_source_ffv1_reset_comp; -extern const char *ff_source_ffv1_enc_common_comp; -extern const char *ff_source_ffv1_enc_rct_comp; -extern const char *ff_source_ffv1_enc_vlc_comp; -extern const char *ff_source_ffv1_enc_ac_comp; +extern const char *ff_source_ffv1_rct_search_comp; extern const char *ff_source_ffv1_enc_setup_comp; extern const char *ff_source_ffv1_enc_comp; -extern const char *ff_source_ffv1_enc_rgb_comp; typedef struct FFv1VkParameters { VkDeviceAddress slice_state; VkDeviceAddress scratch_data; VkDeviceAddress out_data; + int32_t fmt_lut[4]; int32_t sar[2]; uint32_t chroma_shift[2]; @@ -134,7 +131,9 @@ typedef struct FFv1VkParameters { uint32_t context_count; uint32_t crcref; uint32_t slice_size_max; + int rct_offset; + uint8_t extend_lookup[8]; uint8_t bits_per_raw_sample; uint8_t context_model; uint8_t version; @@ -144,13 +143,15 @@ typedef struct FFv1VkParameters { uint8_t components; uint8_t planes; uint8_t codec_planes; + uint8_t planar_rgb; uint8_t transparency; uint8_t colorspace; uint8_t pic_mode; uint8_t ec; uint8_t ppi; uint8_t chunks; - uint8_t padding[1]; + uint8_t rct_search; + uint8_t padding[3]; } FFv1VkParameters; static void add_push_data(FFVulkanShader *shd) @@ -160,6 +161,7 @@ static void add_push_data(FFVulkanShader *shd) GLSLC(1, u8buf scratch_data; ); GLSLC(1, u8buf out_data; ); GLSLC(0, ); + GLSLC(1, ivec4 fmt_lut; ); GLSLC(1, ivec2 sar; ); GLSLC(1, uvec2 chroma_shift; ); GLSLC(0, ); @@ -167,7 +169,9 @@ static void add_push_data(FFVulkanShader *shd) GLSLC(1, uint context_count; ); GLSLC(1, uint32_t crcref; ); GLSLC(1, uint32_t slice_size_max; ); + GLSLC(1, int rct_offset; ); GLSLC(0, ); + GLSLC(1, uint8_t extend_lookup[8]; ); GLSLC(1, uint8_t bits_per_raw_sample; ); GLSLC(1, uint8_t context_model; ); GLSLC(1, uint8_t version; ); @@ -177,120 +181,81 @@ static void add_push_data(FFVulkanShader *shd) GLSLC(1, uint8_t components; ); GLSLC(1, uint8_t planes; ); GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t planar_rgb; ); GLSLC(1, uint8_t transparency; ); GLSLC(1, uint8_t colorspace; ); GLSLC(1, uint8_t pic_mode; ); GLSLC(1, uint8_t ec; ); GLSLC(1, uint8_t ppi; ); GLSLC(1, uint8_t chunks; ); - GLSLC(1, uint8_t padding[1]; ); + GLSLC(1, uint8_t rct_search; ); + GLSLC(1, uint8_t padding[3]; ); GLSLC(0, }; ); ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters), VK_SHADER_STAGE_COMPUTE_BIT); } -static int run_rct(AVCodecContext *avctx, FFVkExecContext *exec, - AVFrame *enc_in, VkImageView *enc_in_views, - AVFrame **intermediate_frame, VkImageView *intermediate_views, - VkImageMemoryBarrier2 *img_bar, int *nb_img_bar, - VkBufferMemoryBarrier2 *buf_bar, int *nb_buf_bar, - FFVkBuffer *slice_data_buf, uint32_t slice_data_size) +typedef struct FFv1VkRCTSearchParameters { + int fmt_lut[4]; + int rct_offset; + uint8_t planar_rgb; + uint8_t transparency; + uint8_t key_frame; + uint8_t force_pcm; + uint8_t version; + uint8_t micro_version; + uint8_t padding[2]; +} FFv1VkRCTSearchParameters; + 
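Not part of the patch, but worth keeping in mind when reading the struct above: it is pushed verbatim into the shader's scalar-layout push_constant block, so size and member offsets must match on both sides. A purely illustrative compile-time sanity check (names taken from the struct above) could look like:

#include <assert.h>  /* static_assert (C11) */
#include <stddef.h>  /* offsetof */

/* Illustrative only: catches accidental size/offset drift between the C
 * struct and the mirrored GLSL push_constant block. With 4-byte ints the
 * struct is 28 bytes, satisfying Vulkan's rule that push-constant ranges
 * be a multiple of 4 bytes. */
static_assert(sizeof(FFv1VkRCTSearchParameters) % 4 == 0,
              "push-constant struct size must be a multiple of 4 bytes");
static_assert(offsetof(FFv1VkRCTSearchParameters, rct_offset) == 4 * sizeof(int),
              "fmt_lut must occupy the first 16 bytes");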
+static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec, + AVFrame *enc_in, VkImageView *enc_in_views, + FFVkBuffer *slice_data_buf, uint32_t slice_data_size) { - int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; FFV1Context *f = &fv->ctx; FFVulkanFunctions *vk = &fv->s.vkfn; AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data; - FFv1VkRCTParameters pd; - - /* Create a temporaty frame */ - *intermediate_frame = av_frame_alloc(); - if (!(*intermediate_frame)) - return AVERROR(ENOMEM); - - RET(av_hwframe_get_buffer(fv->intermediate_frames_ref, - *intermediate_frame, 0)); - - RET(ff_vk_exec_add_dep_frame(&fv->s, exec, *intermediate_frame, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); - RET(ff_vk_create_imageviews(&fv->s, exec, intermediate_views, - *intermediate_frame, - fv->rep_fmt)); + FFv1VkRCTSearchParameters pd; /* Update descriptors */ - ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct, - 1, 0, 0, + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct_search, + 0, 0, 0, slice_data_buf, 0, slice_data_size*f->slice_count, VK_FORMAT_UNDEFINED); - ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct, + ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search, enc_in, enc_in_views, - 1, 1, - VK_IMAGE_LAYOUT_GENERAL, - VK_NULL_HANDLE); - ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct, - *intermediate_frame, intermediate_views, - 1, 2, + 0, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); - ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar, nb_img_bar, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL, - VK_QUEUE_FAMILY_IGNORED); - - /* Prep the input/output images */ - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pImageMemoryBarriers = img_bar, - .imageMemoryBarrierCount = *nb_img_bar, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = *nb_buf_bar, - }); - *nb_img_bar = 0; - if (*nb_buf_bar) { - slice_data_buf->stage = buf_bar[0].dstStageMask; - slice_data_buf->access = buf_bar[0].dstAccessMask; - *nb_buf_bar = 0; - } + ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct_search); - /* Run the shader */ - ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct); - pd = (FFv1VkRCTParameters) { - .offset = 1 << f->bits_per_raw_sample, - .bits = f->bits_per_raw_sample, + pd = (FFv1VkRCTSearchParameters) { + .rct_offset = 1 << f->bits_per_raw_sample, .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) && (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1), .transparency = f->transparency, + .key_frame = f->key_frame, + .force_pcm = fv->force_pcm, + .version = f->version, + .micro_version = f->micro_version, }; - /* For some reason the C FFv1 encoder/decoder treats these differently */ - if (src_hwfc->sw_format == AV_PIX_FMT_GBRP10 || - src_hwfc->sw_format == AV_PIX_FMT_GBRP12 || - src_hwfc->sw_format == AV_PIX_FMT_GBRP14) + if (avctx->sw_pix_fmt == AV_PIX_FMT_GBRP10 || + avctx->sw_pix_fmt == AV_PIX_FMT_GBRP12 || + avctx->sw_pix_fmt == AV_PIX_FMT_GBRP14) memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int)); else - ff_vk_set_perm(src_hwfc->sw_format, pd.fmt_lut, 1); + ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1); - ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct, + ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct_search, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); vk->CmdDispatch(exec->buf, 
fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); - /* Add a post-dispatch barrier before encoding */ - ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar, nb_img_bar, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL, - VK_QUEUE_FAMILY_IGNORED); - -fail: - return err; + return 0; } static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, @@ -305,13 +270,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, VulkanEncodeFFv1FrameData *fd = exec->opaque; FFv1VkParameters pd; - AVFrame *intermediate_frame = NULL; - - /* Temporary data */ - size_t tmp_data_size; - AVBufferRef *tmp_data_ref; - FFVkBuffer *tmp_data_buf; - /* Slice data */ AVBufferRef *slice_data_ref; FFVkBuffer *slice_data_buf; @@ -330,11 +288,11 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, uint32_t context_count = f->context_count[f->context_model]; const AVPixFmtDescriptor *fmt_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt); - VkImageView in_views[AV_NUM_DATA_POINTERS]; - VkImageView intermediate_views[AV_NUM_DATA_POINTERS]; + AVFrame *src = (AVFrame *)pict; + VkImageView src_views[AV_NUM_DATA_POINTERS]; - AVFrame *enc_in = (AVFrame *)pict; - VkImageView *enc_in_views = in_views; + AVFrame *tmp = NULL; + VkImageView tmp_views[AV_NUM_DATA_POINTERS]; VkImageMemoryBarrier2 img_bar[37]; int nb_img_bar = 0; @@ -356,17 +314,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, f->slice_count = f->max_slice_count; - /* Allocate temporary data buffer */ - tmp_data_size = f->slice_count*CONTEXT_SIZE; - RET(ff_vk_get_pooled_buffer(&fv->s, &fv->tmp_data_pool, - &tmp_data_ref, - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, - NULL, tmp_data_size, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); - tmp_data_buf = (FFVkBuffer *)tmp_data_ref->data; - ff_vk_exec_add_dep_buf(&fv->s, exec, &tmp_data_ref, 1, 0); - /* Allocate slice buffer data */ if (f->ac == AC_GOLOMB_RICE) plane_state_size = 8; @@ -419,33 +366,53 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, maxsize, - maxsize < fv->max_heap_size ? - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0)); + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + (maxsize < fv->max_heap_size ? + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0) | + (!(fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) ? 
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0x0))); out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1); /* Prepare input frame */ - RET(ff_vk_exec_add_dep_frame(&fv->s, exec, enc_in, + RET(ff_vk_exec_add_dep_frame(&fv->s, exec, src, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); - RET(ff_vk_create_imageviews(&fv->s, exec, enc_in_views, enc_in, + RET(ff_vk_create_imageviews(&fv->s, exec, src_views, src, fv->rep_fmt)); - ff_vk_frame_barrier(&fv->s, exec, enc_in, img_bar, &nb_img_bar, + ff_vk_frame_barrier(&fv->s, exec, src, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); - /* Setup shader needs the original input */ + if (fv->is_rgb) { + /* Create a temporaty frame */ + tmp = av_frame_alloc(); + if (!(tmp)) + return AVERROR(ENOMEM); + + RET(av_hwframe_get_buffer(fv->intermediate_frames_ref, + tmp, 0)); + + RET(ff_vk_exec_add_dep_frame(&fv->s, exec, tmp, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(&fv->s, exec, tmp_views, + tmp, + fv->rep_fmt)); + } + + /* Setup shader */ ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->setup, 1, 0, 0, slice_data_buf, 0, slice_data_size*f->slice_count, VK_FORMAT_UNDEFINED); ff_vk_shader_update_img_array(&fv->s, exec, &fv->setup, - enc_in, enc_in_views, + src, src_views, 1, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); @@ -467,6 +434,25 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, }; } + if (fv->optimize_rct) { + RET(run_rct_search(avctx, exec, + src, src_views, + slice_data_buf, slice_data_size)); + + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_data_buf->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_data_buf->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .size = slice_data_size*f->slice_count, + .offset = 0, + }; + } + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -485,7 +471,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, ff_vk_exec_bind_shader(&fv->s, exec, &fv->setup); pd = (FFv1VkParameters) { .slice_state = slice_data_buf->address + f->slice_count*256, - .scratch_data = tmp_data_buf->address, .out_data = out_data_buf->address, .bits_per_raw_sample = f->bits_per_raw_sample, .sar[0] = pict->sample_aspect_ratio.num, @@ -495,6 +480,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .plane_state_size = plane_state_size, .context_count = context_count, .crcref = f->crcref, + .rct_offset = 1 << f->bits_per_raw_sample, .slice_size_max = out_data_buf->size / f->slice_count, .context_model = fv->ctx.context_model, .version = f->version, @@ -504,6 +490,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .components = fmt_desc->nb_components, .planes = av_pix_fmt_count_planes(avctx->sw_pix_fmt), .codec_planes = f->plane_count, + .planar_rgb = ff_vk_mt_is_np_rgb(avctx->sw_pix_fmt) && + (ff_vk_count_images((AVVkFrame *)src->data[0]) > 1), .transparency = f->transparency, .colorspace = f->colorspace, .pic_mode = !(pict->flags & 
AV_FRAME_FLAG_INTERLACED) ? 3 : @@ -511,12 +499,37 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .ec = f->ec, .ppi = fv->ppi, .chunks = fv->chunks, + .rct_search = fv->optimize_rct, }; + + /* For some reason the C FFv1 encoder/decoder treats these differently */ + if (avctx->sw_pix_fmt == AV_PIX_FMT_GBRP10 || + avctx->sw_pix_fmt == AV_PIX_FMT_GBRP12 || + avctx->sw_pix_fmt == AV_PIX_FMT_GBRP14) + memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int)); + else + ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1); + + for (int i = 0; i < f->quant_table_count; i++) + pd.extend_lookup[i] = (f->quant_tables[i][3][127] != 0) || + (f->quant_tables[i][4][127] != 0); ff_vk_shader_update_push_const(&fv->s, exec, &fv->setup, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + /* Clean up temporary image */ + if (fv->is_rgb) { + AVVkFrame *vkf = (AVVkFrame *)tmp->data[0]; + vk->CmdClearColorImage(exec->buf, vkf->img[0], VK_IMAGE_LAYOUT_GENERAL, + &((VkClearColorValue) { 0 }), + 1, &((VkImageSubresourceRange) { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + })); + } + /* Setup shader modified the slice data buffer */ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, @@ -570,19 +583,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, f->plane_count); } - /* Run RCT shader */ - if (fv->is_rgb) { - RET(run_rct(avctx, exec, - enc_in, enc_in_views, - &intermediate_frame, intermediate_views, - img_bar, &nb_img_bar, buf_bar, &nb_buf_bar, - slice_data_buf, slice_data_size)); - - /* Use the new frame */ - enc_in = intermediate_frame; - enc_in_views = intermediate_views; - } - /* If the reset shader ran, insert a barrier now. */ if (f->key_frame || f->version > 3) { /* Reset shader modified the slice data buffer */ @@ -601,6 +601,15 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, }; } + if (fv->is_rgb) { + ff_vk_frame_barrier(&fv->s, exec, tmp, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + } + /* Final barrier before encoding */ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, @@ -623,7 +632,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, 0, slice_data_size*f->slice_count, VK_FORMAT_UNDEFINED); ff_vk_shader_update_img_array(&fv->s, exec, &fv->enc, - enc_in, enc_in_views, + src, src_views, 1, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); @@ -632,6 +641,12 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, results_data_buf, 0, results_data_buf->size, VK_FORMAT_UNDEFINED); + if (fv->is_rgb) + ff_vk_shader_update_img_array(&fv->s, exec, &fv->enc, + tmp, tmp_views, + 1, 3, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); ff_vk_exec_bind_shader(&fv->s, exec, &fv->enc); ff_vk_shader_update_push_const(&fv->s, exec, &fv->enc, @@ -648,20 +663,20 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, /* This, if needed, was referenced by the execution context * as it was declared as a dependency. 
*/ - av_frame_free(&intermediate_frame); + av_frame_free(&tmp); return 0; fail: - av_frame_free(&intermediate_frame); + av_frame_free(&tmp); ff_vk_exec_discard_deps(&fv->s, exec); return err; } -static int download_slices(AVCodecContext *avctx, +static int transfer_slices(AVCodecContext *avctx, VkBufferCopy *buf_regions, int nb_regions, VulkanEncodeFFv1FrameData *fd, - AVBufferRef *pkt_data_ref) + uint8_t *dst, AVBufferRef *dst_ref) { int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; @@ -669,11 +684,20 @@ static int download_slices(AVCodecContext *avctx, FFVkExecContext *exec; FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; - FFVkBuffer *pkt_data_buf = (FFVkBuffer *)pkt_data_ref->data; + + AVBufferRef *mapped_ref; + FFVkBuffer *mapped_buf; VkBufferMemoryBarrier2 buf_bar[8]; int nb_buf_bar = 0; + err = ff_vk_host_map_buffer(&fv->s, &mapped_ref, dst, dst_ref, + VK_BUFFER_USAGE_TRANSFER_DST_BIT); + if (err < 0) + return err; + + mapped_buf = (FFVkBuffer *)mapped_ref->data; + /* Transfer the slices */ exec = ff_vk_exec_get(&fv->s, &fv->transfer_exec_pool); ff_vk_exec_start(&fv->s, exec); @@ -681,7 +705,8 @@ static int download_slices(AVCodecContext *avctx, ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 0); fd->out_data_ref = NULL; /* Ownership passed */ - ff_vk_exec_add_dep_buf(&fv->s, exec, &pkt_data_ref, 1, 1); + ff_vk_exec_add_dep_buf(&fv->s, exec, &mapped_ref, 1, 0); + mapped_ref = NULL; /* Ownership passed */ /* Ensure the output buffer is finished */ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { @@ -705,8 +730,11 @@ static int download_slices(AVCodecContext *avctx, out_data_buf->access = buf_bar[0].dstAccessMask; nb_buf_bar = 0; + for (int i = 0; i < nb_regions; i++) + buf_regions[i].dstOffset += mapped_buf->virtual_offset; + vk->CmdCopyBuffer(exec->buf, - out_data_buf->buf, pkt_data_buf->buf, + out_data_buf->buf, mapped_buf->buf, nb_regions, buf_regions); /* Submit */ @@ -717,18 +745,6 @@ static int download_slices(AVCodecContext *avctx, /* We need the encoded data immediately */ ff_vk_exec_wait(&fv->s, exec); - /* Invalidate slice/output data if needed */ - if (!(pkt_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { - VkMappedMemoryRange invalidate_data = { - .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, - .memory = pkt_data_buf->mem, - .offset = 0, - .size = VK_WHOLE_SIZE, - }; - vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev, - 1, &invalidate_data); - } - return 0; } @@ -739,13 +755,9 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, VulkanEncodeFFv1Context *fv = avctx->priv_data; FFV1Context *f = &fv->ctx; FFVulkanFunctions *vk = &fv->s.vkfn; - - /* Packet data */ - AVBufferRef *pkt_data_ref; - FFVkBuffer *pkt_data_buf; - VulkanEncodeFFv1FrameData *fd = exec->opaque; + FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; FFVkBuffer *results_data_buf = (FFVkBuffer *)fd->results_data_ref->data; uint64_t *sc; @@ -782,20 +794,9 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024)); av_buffer_unref(&fd->results_data_ref); /* No need for this buffer anymore */ - /* Allocate packet buffer */ - err = ff_vk_get_pooled_buffer(&fv->s, &fv->pkt_data_pool, - &pkt_data_ref, - VK_BUFFER_USAGE_TRANSFER_DST_BIT, - NULL, pkt->size, - VK_MEMORY_PROPERTY_HOST_CACHED_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - if (err < 0) + /* Allocate packet */ + if ((err = ff_get_encode_buffer(avctx, pkt, pkt->size, 0)) < 0) 
return err; - pkt_data_buf = (FFVkBuffer *)pkt_data_ref->data; - - /* Setup packet data */ - pkt->data = pkt_data_buf->mapped_mem; - pkt->buf = pkt_data_ref; pkt->pts = fd->pts; pkt->dts = fd->pts; @@ -808,8 +809,37 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, fd->frame_opaque_ref = NULL; } - return download_slices(avctx, fv->buf_regions, f->slice_count, fd, - pkt_data_ref); + /* Try using host mapped memory transfers first */ + if (fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) { + err = transfer_slices(avctx, fv->buf_regions, f->slice_count, fd, + pkt->data, pkt->buf); + if (err >= 0) + return err; + } + + /* Invalidate slice/output data if needed */ + if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + VkMappedMemoryRange invalidate_data = { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = out_data_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev, + 1, &invalidate_data); + } + + /* Copy each slice */ + for (int i = 0; i < f->slice_count; i++) { + VkBufferCopy *region = &fv->buf_regions[i]; + memcpy(pkt->data + region->dstOffset, + out_data_buf->mapped_mem + region->srcOffset, + region->size); + } + + av_buffer_unref(&fd->out_data_ref); + + return 0; } static int vulkan_encode_ffv1_receive_packet(AVCodecContext *avctx, @@ -870,6 +900,7 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format) { int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; AVHWFramesContext *frames_ctx; AVVulkanFramesContext *vk_frames; @@ -880,12 +911,13 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format) frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data; frames_ctx->format = AV_PIX_FMT_VULKAN; frames_ctx->sw_format = sw_format; - frames_ctx->width = FFALIGN(fv->s.frames->width, 32); - frames_ctx->height = FFALIGN(fv->s.frames->height, 32); + frames_ctx->width = fv->s.frames->width; + frames_ctx->height = f->num_v_slices*RGB_LINECACHE; vk_frames = frames_ctx->hwctx; vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; - vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT; + vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; err = av_hwframe_ctx_init(fv->intermediate_frames_ref); @@ -953,6 +985,7 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd) FFV1Context *f = &fv->ctx; int smp_bits = fv->ctx.use32bit ? 
32 : 16; + av_bprintf(&shd->src, "#define RGB_LINECACHE %i\n" ,RGB_LINECACHE); av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n" ,CONTEXT_SIZE); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n" ,MAX_QUANT_TABLE_MASK); @@ -961,6 +994,9 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd) av_bprintf(&shd->src, "#define GOLOMB\n" ); } + if (fv->is_rgb) + av_bprintf(&shd->src, "#define RGB\n"); + GLSLF(0, #define TYPE int%i_t ,smp_bits); GLSLF(0, #define VTYPE2 i%ivec2 ,smp_bits); GLSLF(0, #define VTYPE3 i%ivec3 ,smp_bits); @@ -972,32 +1008,48 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd) GLSLD(ff_source_ffv1_common_comp); } -static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +static int init_rct_search_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) { int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; - FFVulkanShader *shd = &fv->setup; + FFV1Context *f = &fv->ctx; + FFVulkanShader *shd = &fv->rct_search; FFVulkanDescriptorSetBinding *desc_set; uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; - RET(ff_vk_shader_init(&fv->s, shd, "ffv1_setup", + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct_search", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", - "GL_EXT_buffer_reference2" }, 2, - 1, 1, 1, + "GL_EXT_buffer_reference2", + "GL_EXT_null_initializer" }, 3, + 32, 32, 1, 0)); /* Common codec header */ GLSLD(ff_source_common_comp); - add_push_data(shd); + + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, ivec4 fmt_lut; ); + GLSLC(1, int rct_offset; ); + GLSLC(1, uint8_t planar_rgb; ); + GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t key_frame; ); + GLSLC(1, uint8_t force_pcm; ); + GLSLC(1, uint8_t version; ); + GLSLC(1, uint8_t micro_version; ); + GLSLC(1, uint8_t padding[3]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkResetParameters), + VK_SHADER_STAGE_COMPUTE_BIT); av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + /* Never used */ desc_set = (FFVulkanDescriptorSetBinding []) { { .name = "rangecoder_static_buf", @@ -1006,7 +1058,7 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .mem_layout = "scalar", .buf_content = "uint8_t zero_one_state[512];", }, - { /* This descriptor is never used */ + { .name = "quant_buf", .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, .stages = VK_SHADER_STAGE_COMPUTE_BIT, @@ -1015,7 +1067,7 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", }, }; - RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0)); + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 1)); define_shared_code(avctx, shd); @@ -1024,7 +1076,8 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .name = "slice_data_buf", .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "SliceContext slice_ctx[1024];", + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, }, { .name = "src", @@ -1039,7 +1092,7 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) }; RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0)); - GLSLD(ff_source_ffv1_enc_setup_comp); + 
GLSLD(ff_source_ffv1_rct_search_comp); RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -1054,44 +1107,33 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) return err; } -static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) { int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; - FFVulkanShader *shd = &fv->reset; + FFV1Context *f = &fv->ctx; + FFVulkanShader *shd = &fv->setup; FFVulkanDescriptorSetBinding *desc_set; uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; - int wg_dim = FFMIN(fv->s.props.properties.limits.maxComputeWorkGroupSize[0], 1024); - RET(ff_vk_shader_init(&fv->s, shd, "ffv1_reset", + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_setup", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - wg_dim, 1, 1, + 1, 1, 1, 0)); /* Common codec header */ GLSLD(ff_source_common_comp); - - GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); - GLSLF(1, uint context_count[%i]; ,MAX_QUANT_TABLES); - GLSLC(1, u8buf slice_state; ); - GLSLC(1, uint plane_state_size; ); - GLSLC(1, uint8_t codec_planes; ); - GLSLC(1, uint8_t key_frame; ); - GLSLC(1, uint8_t version; ); - GLSLC(1, uint8_t micro_version; ); - GLSLC(1, uint8_t padding[1]; ); - GLSLC(0, }; ); - ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkResetParameters), - VK_SHADER_STAGE_COMPUTE_BIT); + add_push_data(shd); av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + av_bprintf(&shd->src, "#define FULL_RENORM\n"); desc_set = (FFVulkanDescriptorSetBinding []) { { @@ -1101,7 +1143,7 @@ static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .mem_layout = "scalar", .buf_content = "uint8_t zero_one_state[512];", }, - { + { /* This descriptor is never used */ .name = "quant_buf", .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, .stages = VK_SHADER_STAGE_COMPUTE_BIT, @@ -1118,14 +1160,24 @@ static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) { .name = "slice_data_buf", .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "SliceContext slice_ctx[1024];", + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format, + fv->rep_fmt), + .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, }; - RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 1, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0)); - GLSLD(ff_source_ffv1_reset_comp); + GLSLD(ff_source_ffv1_enc_setup_comp); RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -1140,49 +1192,40 @@ static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) return err; } -static int init_rct_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) +static int init_reset_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) { int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; - FFVulkanShader *shd = &fv->rct; + FFV1Context *f = 
&fv->ctx; + FFVulkanShader *shd = &fv->reset; FFVulkanDescriptorSetBinding *desc_set; uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; - int wg_count = sqrt(fv->s.props.properties.limits.maxComputeWorkGroupInvocations); - - enum AVPixelFormat intermediate_fmt = get_supported_rgb_buffer_fmt(avctx); - if (intermediate_fmt == AV_PIX_FMT_NONE) { - av_log(avctx, AV_LOG_ERROR, "Unable to find a supported compatible " - "pixel format for RCT buffer!\n"); - return AVERROR(ENOTSUP); - } - - RET(init_indirect(avctx, intermediate_fmt)); + int wg_dim = FFMIN(fv->s.props.properties.limits.maxComputeWorkGroupSize[0], 1024); - RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct", + RET(ff_vk_shader_init(&fv->s, shd, "ffv1_reset", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - wg_count, wg_count, 1, + wg_dim, 1, 1, 0)); /* Common codec header */ GLSLD(ff_source_common_comp); GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); - GLSLC(1, ivec4 fmt_lut; ); - GLSLC(1, int offset; ); - GLSLC(1, uint8_t bits; ); - GLSLC(1, uint8_t planar_rgb; ); - GLSLC(1, uint8_t color_planes; ); - GLSLC(1, uint8_t transparency; ); + GLSLF(1, uint context_count[%i]; ,MAX_QUANT_TABLES); + GLSLC(1, u8buf slice_state; ); + GLSLC(1, uint plane_state_size; ); + GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t key_frame; ); GLSLC(1, uint8_t version; ); GLSLC(1, uint8_t micro_version; ); - GLSLC(1, uint8_t padding[2]; ); + GLSLC(1, uint8_t padding[1]; ); GLSLC(0, }; ); - ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters), + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkResetParameters), VK_SHADER_STAGE_COMPUTE_BIT); av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); @@ -1216,32 +1259,13 @@ static int init_rct_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .mem_quali = "readonly", .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "SliceContext slice_ctx[1024];", - }, - { - .name = "src", - .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format, - fv->rep_fmt), - .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - }, - { - .name = "dst", - .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(intermediate_fmt, - fv->rep_fmt), - .elems = av_pix_fmt_count_planes(intermediate_fmt), - .mem_quali = "writeonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, }, }; - RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 1, 0, 0)); - GLSLD(ff_source_ffv1_enc_rct_comp); + GLSLD(ff_source_ffv1_reset_comp); RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -1264,19 +1288,16 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) FFVulkanShader *shd = &fv->enc; FFVulkanDescriptorSetBinding *desc_set; - AVHWFramesContext *frames_ctx = fv->intermediate_frames_ref ? 
- (AVHWFramesContext *)fv->intermediate_frames_ref->data : - fv->s.frames; - uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; + int use_cached_reader = fv->ctx.ac != AC_GOLOMB_RICE; RET(ff_vk_shader_init(&fv->s, shd, "ffv1_enc", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - 1, 1, 1, + use_cached_reader ? CONTEXT_SIZE : 1, 1, 1, 0)); /* Common codec header */ @@ -1288,6 +1309,9 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + if (use_cached_reader) + av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n"); + desc_set = (FFVulkanDescriptorSetBinding []) { { .name = "rangecoder_static_buf", @@ -1322,15 +1346,16 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .name = "slice_data_buf", .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "SliceContext slice_ctx[1024];", + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, }, { .name = "src", .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format, + .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format, fv->rep_fmt), - .elems = av_pix_fmt_count_planes(frames_ctx->sw_format), + .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), .mem_quali = "readonly", .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, @@ -1341,21 +1366,24 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) .mem_quali = "writeonly", .buf_content = "uint64_t slice_results[2048];", }, + { /* place holder for desc_set[3] */ + }, }; - RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0)); - - /* Assemble the shader body */ - GLSLD(ff_source_ffv1_enc_common_comp); - - if (f->ac == AC_GOLOMB_RICE) - GLSLD(ff_source_ffv1_enc_vlc_comp); - else - GLSLD(ff_source_ffv1_enc_ac_comp); + if (fv->is_rgb) { + AVHWFramesContext *intermediate_frames_ctx; + intermediate_frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data; + desc_set[3] = (FFVulkanDescriptorSetBinding) { + .name = "tmp", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(intermediate_frames_ctx->sw_format, + FF_VK_REP_NATIVE), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }; + } + RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3 + fv->is_rgb, 0, 0)); - if (fv->is_rgb) - GLSLD(ff_source_ffv1_enc_rgb_comp); - else - GLSLD(ff_source_ffv1_enc_comp); + GLSLD(ff_source_ffv1_enc_comp); RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -1463,22 +1491,24 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) f->num_v_slices = 32; } } else if (f->num_h_slices && f->num_v_slices <= 0) { - f->num_v_slices = 1024 / f->num_h_slices; + f->num_v_slices = MAX_SLICES / f->num_h_slices; } else if (f->num_v_slices && f->num_h_slices <= 0) { - f->num_h_slices = 1024 / f->num_v_slices; + f->num_h_slices = MAX_SLICES / f->num_v_slices; } f->num_h_slices = FFMIN(f->num_h_slices, avctx->width); f->num_v_slices = FFMIN(f->num_v_slices, avctx->height); - if (f->num_h_slices * f->num_v_slices > 1024) { + if (f->num_h_slices * f->num_v_slices > MAX_SLICES) { av_log(avctx, AV_LOG_ERROR, "Too many slices (%i), maximum supported " - "by the standard is 1024\n", - 
f->num_h_slices * f->num_v_slices); + "by the standard is %i\n", + f->num_h_slices * f->num_v_slices, MAX_SLICES); return AVERROR_PATCHWELCOME; } } + f->max_slice_count = f->num_h_slices * f->num_v_slices; + if ((err = ff_ffv1_write_extradata(avctx)) < 0) return err; @@ -1584,6 +1614,17 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) if (!fv->is_rgb && f->bits_per_raw_sample > 8) fv->rep_fmt = FF_VK_REP_INT; + /* Init rct search shader */ + fv->optimize_rct = fv->is_rgb && f->version >= 4 && + !fv->force_pcm && fv->optimize_rct; + if (fv->optimize_rct) { + err = init_rct_search_shader(avctx, spv); + if (err < 0) { + spv->uninit(&spv); + return err; + } + } + /* Init setup shader */ err = init_setup_shader(avctx, spv); if (err < 0) { @@ -1598,13 +1639,15 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) return err; } - /* Init RCT shader */ if (fv->is_rgb) { - err = init_rct_shader(avctx, spv); - if (err < 0) { - spv->uninit(&spv); - return err; + enum AVPixelFormat intermediate_fmt = get_supported_rgb_buffer_fmt(avctx); + if (intermediate_fmt == AV_PIX_FMT_NONE) { + av_log(avctx, AV_LOG_ERROR, "Unable to find a supported compatible " + "pixel format for RCT buffer!\n"); + return AVERROR(ENOTSUP); } + + RET(init_indirect(avctx, intermediate_fmt)); } /* Encode shader */ @@ -1674,7 +1717,6 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) for (int i = 0; i < fv->async_depth; i++) fv->exec_pool.contexts[i].opaque = &fv->exec_ctx_info[i]; - f->max_slice_count = f->num_h_slices * f->num_v_slices; fv->buf_regions = av_malloc_array(f->max_slice_count, sizeof(*fv->buf_regions)); if (!fv->buf_regions) return AVERROR(ENOMEM); @@ -1691,9 +1733,9 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) ff_vk_exec_pool_free(&fv->s, &fv->transfer_exec_pool); ff_vk_shader_free(&fv->s, &fv->enc); - ff_vk_shader_free(&fv->s, &fv->rct); ff_vk_shader_free(&fv->s, &fv->reset); ff_vk_shader_free(&fv->s, &fv->setup); + ff_vk_shader_free(&fv->s, &fv->rct_search); if (fv->exec_ctx_info) { for (int i = 0; i < fv->async_depth; i++) { @@ -1710,8 +1752,6 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) av_buffer_pool_uninit(&fv->results_data_pool); av_buffer_pool_uninit(&fv->out_data_pool); - av_buffer_pool_uninit(&fv->pkt_data_pool); - av_buffer_pool_uninit(&fv->tmp_data_pool); av_buffer_unref(&fv->keyframe_slice_data_ref); av_buffer_pool_uninit(&fv->slice_data_pool); @@ -1730,8 +1770,8 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) #define OFFSET(x) offsetof(VulkanEncodeFFv1Context, x) #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM static const AVOption vulkan_encode_ffv1_options[] = { - { "slicecrc", "Protect slices with CRCs", OFFSET(ctx.ec), AV_OPT_TYPE_BOOL, - { .i64 = -1 }, -1, 1, VE }, + { "slicecrc", "Protect slices with CRCs", OFFSET(ctx.ec), AV_OPT_TYPE_INT, + { .i64 = -1 }, -1, 2, VE }, { "context", "Context model", OFFSET(ctx.context_model), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, { "coder", "Coder type", OFFSET(ctx.ac), AV_OPT_TYPE_INT, @@ -1752,13 +1792,16 @@ static const AVOption vulkan_encode_ffv1_options[] = { { .i64 = QTABLE_GT8BIT }, INT_MIN, INT_MAX, VE, .unit = "qtable" }, { "slices_h", "Number of horizontal slices", OFFSET(num_h_slices), AV_OPT_TYPE_INT, - { .i64 = -1 }, -1, 1024, VE }, + { .i64 = -1 }, -1, MAX_SLICES, VE }, { "slices_v", "Number of vertical slices", OFFSET(num_v_slices), AV_OPT_TYPE_INT, - { .i64 = -1 }, -1, 1024, VE }, + { .i64 = -1 }, -1, 
MAX_SLICES, VE }, { "force_pcm", "Code all slices with no prediction", OFFSET(force_pcm), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, + { "rct_search", "Run a search for RCT parameters (level 4 only)", OFFSET(optimize_rct), AV_OPT_TYPE_BOOL, + { .i64 = 1 }, 0, 1, VE }, + { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, INT_MAX, VE }, diff --git a/libavcodec/flvdec.c b/libavcodec/flvdec.c index 627538ef83d42..e8c159563967a 100644 --- a/libavcodec/flvdec.c +++ b/libavcodec/flvdec.c @@ -89,8 +89,6 @@ int ff_flv_decode_picture_header(MpegEncContext *s) skip_bits1(&s->gb); /* deblocking flag */ s->chroma_qscale = s->qscale = get_bits(&s->gb, 5); - s->h263_plus = 0; - s->h263_long_vectors = 0; /* PEI */ diff --git a/libavcodec/flvenc.c b/libavcodec/flvenc.c index df1a650222edb..8f07c3c778119 100644 --- a/libavcodec/flvenc.c +++ b/libavcodec/flvenc.c @@ -22,13 +22,14 @@ #include "flvenc.h" #include "mpegvideo.h" #include "mpegvideoenc.h" +#include "put_bits.h" int ff_flv_encode_picture_header(MPVMainEncContext *const m) { MPVEncContext *const s = &m->s; int format; - align_put_bits(&s->pb); + put_bits_assume_flushed(&s->pb); put_bits(&s->pb, 17, 1); /* 0: H.263 escape codes 1: 11-bit escape codes */ diff --git a/libavcodec/h261enc.c b/libavcodec/h261enc.c index 70f5f2b09c984..c217fb6233d1c 100644 --- a/libavcodec/h261enc.c +++ b/libavcodec/h261enc.c @@ -35,6 +35,7 @@ #include "h261.h" #include "h261enc.h" #include "mpegvideoenc.h" +#include "put_bits.h" #define H261_MAX_RUN 26 #define H261_MAX_LEVEL 15 @@ -72,7 +73,7 @@ static int h261_encode_picture_header(MPVMainEncContext *const m) MPVEncContext *const s = &h->s.s; int temp_ref; - align_put_bits(&s->pb); + put_bits_assume_flushed(&s->pb); put_bits(&s->pb, 20, 0x10); /* PSC */ diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c index 2f8bd73665e91..37abf3382ebfa 100644 --- a/libavcodec/h263dec.c +++ b/libavcodec/h263dec.c @@ -110,7 +110,8 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx) ff_mpv_unquantize_init(&unquant_dsp_ctx, avctx->flags & AV_CODEC_FLAG_BITEXACT, 0); // dct_unquantize defaults for H.263; - // they might change on a per-frame basis for MPEG-4. + // they might change on a per-frame basis for MPEG-4; + // dct_unquantize_inter will be unset for MSMPEG4 codecs later. 
s->dct_unquantize_intra = unquant_dsp_ctx.dct_unquantize_h263_intra; s->dct_unquantize_inter = unquant_dsp_ctx.dct_unquantize_h263_inter; @@ -150,9 +151,7 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx) s->h263_flv = 1; break; default: - av_log(avctx, AV_LOG_ERROR, "Unsupported codec %d\n", - avctx->codec->id); - return AVERROR(ENOSYS); + av_unreachable("Switch contains a case for every codec using ff_h263_decode_init()"); } if (avctx->codec_tag == AV_RL32("L263") || avctx->codec_tag == AV_RL32("S263")) @@ -174,6 +173,12 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx) return 0; } +static void report_decode_progress(MpegEncContext *s) +{ + if (s->pict_type != AV_PICTURE_TYPE_B && !s->partitioned_frame && !s->er.error_occurred) + ff_thread_progress_report(&s->cur_pic.ptr->progress, s->mb_y); +} + static int decode_slice(MpegEncContext *s) { const int part_mask = s->partitioned_frame @@ -278,8 +283,8 @@ static int decode_slice(MpegEncContext *s) if (++s->mb_x >= s->mb_width) { s->mb_x = 0; + report_decode_progress(s); ff_mpeg_draw_horiz_band(s, s->mb_y * mb_size, mb_size); - ff_mpv_report_decode_progress(s); s->mb_y++; } return 0; @@ -305,8 +310,8 @@ static int decode_slice(MpegEncContext *s) ff_h263_loop_filter(s); } + report_decode_progress(s); ff_mpeg_draw_horiz_band(s, s->mb_y * mb_size, mb_size); - ff_mpv_report_decode_progress(s); s->mb_x = 0; } @@ -531,11 +536,6 @@ int ff_h263_decode_frame(AVCodecContext *avctx, AVFrame *pict, } } - if (s->codec_id == AV_CODEC_ID_H263 || - s->codec_id == AV_CODEC_ID_H263P || - s->codec_id == AV_CODEC_ID_H263I) - s->gob_index = H263_GOB_HEIGHT(s->height); - /* skip B-frames if we don't have reference frames */ if (!s->last_pic.ptr && (s->pict_type == AV_PICTURE_TYPE_B || s->droppable)) diff --git a/libavcodec/h263dec.h b/libavcodec/h263dec.h index 633d4aa577453..c1306c7ec5a15 100644 --- a/libavcodec/h263dec.h +++ b/libavcodec/h263dec.h @@ -55,7 +55,7 @@ int ff_h263_decode_mba(MpegEncContext *s); /** * Print picture info if FF_DEBUG_PICT_INFO is set. */ -void ff_h263_show_pict_info(MpegEncContext *s); +void ff_h263_show_pict_info(MpegEncContext *s, int h263_plus); int ff_intel_h263_decode_picture_header(MpegEncContext *s); int ff_h263_decode_mb(MpegEncContext *s, diff --git a/libavcodec/h2645_parse.c b/libavcodec/h2645_parse.c index 82816999e842c..fa57911c08bbb 100644 --- a/libavcodec/h2645_parse.c +++ b/libavcodec/h2645_parse.c @@ -22,6 +22,7 @@ #include "config.h" +#include "libavutil/error.h" #include "libavutil/intmath.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem.h" @@ -588,8 +589,9 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length, } else ret = h264_parse_nal_header(nal, logctx); if (ret < 0) { - av_log(logctx, AV_LOG_WARNING, "Invalid NAL unit %d, skipping.\n", - nal->type); + av_log(logctx, AV_LOG_WARNING, + "Failed to parse header of NALU (type %d): \"%s\". Skipping NALU.\n", + nal->type, av_err2str(ret)); continue; } diff --git a/libavcodec/h2645_sei.c b/libavcodec/h2645_sei.c index c7950a4a45ae7..d17c4fb5f9bae 100644 --- a/libavcodec/h2645_sei.c +++ b/libavcodec/h2645_sei.c @@ -44,8 +44,9 @@ #include "h2645_sei.h" #include "itut35.h" -#define IS_H264(codec_id) (CONFIG_H264_SEI && CONFIG_HEVC_SEI ? codec_id == AV_CODEC_ID_H264 : CONFIG_H264_SEI) -#define IS_HEVC(codec_id) (CONFIG_H264_SEI && CONFIG_HEVC_SEI ? codec_id == AV_CODEC_ID_HEVC : CONFIG_HEVC_SEI) +#define IS_H264(codec_id) (CONFIG_H264_SEI && (CONFIG_HEVC_SEI || CONFIG_VVC_SEI ) ? 
codec_id == AV_CODEC_ID_H264 : CONFIG_H264_SEI) +#define IS_HEVC(codec_id) (CONFIG_HEVC_SEI && (CONFIG_H264_SEI || CONFIG_VVC_SEI ) ? codec_id == AV_CODEC_ID_HEVC : CONFIG_HEVC_SEI) +#define IS_VVC(codec_id) (CONFIG_VVC_SEI && (CONFIG_H264_SEI || CONFIG_HEVC_SEI) ? codec_id == AV_CODEC_ID_VVC : CONFIG_VVC_SEI ) #if CONFIG_HEVC_SEI static int decode_registered_user_data_dynamic_hdr_plus(HEVCSEIDynamicHDRPlus *s, @@ -427,7 +428,7 @@ static int decode_film_grain_characteristics(H2645SEIFilmGrainCharacteristics *h } } } - if (IS_HEVC(codec_id)) + if (!IS_H264(codec_id)) h->persistence_flag = get_bits1(gb); else h->repetition_period = get_ue_golomb_long(gb); @@ -854,7 +855,7 @@ FF_ENABLE_DEPRECATION_WARNINGS fgp->subsampling_x = fgp->subsampling_y = 0; h274->model_id = fgc->model_id; - if (fgc->separate_colour_description_present_flag) { + if (IS_VVC(codec_id) || fgc->separate_colour_description_present_flag) { fgp->bit_depth_luma = fgc->bit_depth_luma; fgp->bit_depth_chroma = fgc->bit_depth_chroma; fgp->color_range = fgc->full_range + 1; diff --git a/libavcodec/h2645_sei.h b/libavcodec/h2645_sei.h index abc49760d9f1b..f2ad7147c6059 100644 --- a/libavcodec/h2645_sei.h +++ b/libavcodec/h2645_sei.h @@ -108,7 +108,7 @@ typedef struct H2645SEIFilmGrainCharacteristics { uint8_t intensity_interval_upper_bound[3][256]; int16_t comp_model_value[3][256][6]; int repetition_period; //< H.264 only - int persistence_flag; //< HEVC only + int persistence_flag; //< HEVC/VVC } H2645SEIFilmGrainCharacteristics; typedef struct H2645SEIMasteringDisplay { diff --git a/libavcodec/h2645_vui.c b/libavcodec/h2645_vui.c index e5c7bf46f9b3d..0e576c15632a3 100644 --- a/libavcodec/h2645_vui.c +++ b/libavcodec/h2645_vui.c @@ -67,11 +67,16 @@ void ff_h2645_decode_common_vui_params(GetBitContext *gb, H2645VUI *vui, void *l vui->matrix_coeffs = get_bits(gb, 8); // Set invalid values to "unspecified" - if (!av_color_primaries_name(vui->colour_primaries)) + if (vui->colour_primaries == AVCOL_PRI_RESERVED0 || + vui->colour_primaries == AVCOL_PRI_RESERVED || + !av_color_primaries_name(vui->colour_primaries)) vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; - if (!av_color_transfer_name(vui->transfer_characteristics)) + if (vui->transfer_characteristics == AVCOL_TRC_RESERVED0 || + vui->transfer_characteristics == AVCOL_TRC_RESERVED || + !av_color_transfer_name(vui->transfer_characteristics)) vui->transfer_characteristics = AVCOL_TRC_UNSPECIFIED; - if (!av_color_space_name(vui->matrix_coeffs)) + if (vui->matrix_coeffs == AVCOL_SPC_RESERVED || + !av_color_space_name(vui->matrix_coeffs)) vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; } } diff --git a/libavcodec/h274.c b/libavcodec/h274.c index 5709200322e61..e46926e4cc625 100644 --- a/libavcodec/h274.c +++ b/libavcodec/h274.c @@ -26,7 +26,11 @@ */ #include "libavutil/avassert.h" +#include "libavutil/bswap.h" +#include "libavutil/crc.h" #include "libavutil/imgutils.h" +#include "libavutil/md5.h" +#include "libavutil/mem.h" #include "h274.h" @@ -790,3 +794,154 @@ static const int8_t R64T[64][64] = { 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1, } }; + +static int verify_plane_md5(struct AVMD5 *ctx, + const uint8_t *src, const int w, const int h, const int stride, + const uint8_t *expected) +{ +#define MD5_SIZE 16 + uint8_t md5[MD5_SIZE]; + av_md5_init(ctx); + for (int j = 0; j < h; j++) { + av_md5_update(ctx, src, w); + src += stride; + } + av_md5_final(ctx, md5); + + if (memcmp(md5, expected, MD5_SIZE)) + return AVERROR_INVALIDDATA; + + return 0; +} + +static int 
verify_plane_crc(const uint8_t *src, const int w, const int h, const int stride, + uint16_t expected) +{ + uint32_t crc = 0x0F1D; // CRC-16-CCITT-AUG + const AVCRC *ctx = av_crc_get_table(AV_CRC_16_CCITT); + + expected = av_le2ne32(expected); + for (int j = 0; j < h; j++) { + crc = av_crc(ctx, crc, src, w); + src += stride; + } + crc = av_bswap16(crc); + + if (crc != expected) + return AVERROR_INVALIDDATA; + + return 0; +} + +#define CAL_CHECKSUM(pixel) ((pixel) ^ xor_mask) +static int verify_plane_checksum(const uint8_t *src, const int w, const int h, const int stride, const int ps, + uint32_t expected) +{ + uint32_t checksum = 0; + expected = av_le2ne32(expected); + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + const int xor_mask = (x & 0xFF) ^ (y & 0xFF) ^ (x >> 8) ^ (y >> 8); + checksum += CAL_CHECKSUM(src[x << ps]); + if (ps) + checksum += CAL_CHECKSUM(src[(x << ps) + 1]); + } + src += stride; + } + + if (checksum != expected) + return AVERROR_INVALIDDATA; + + return 0; +} + +enum { + HASH_MD5SUM, + HASH_CRC, + HASH_CHECKSUM, + HASH_LAST = HASH_CHECKSUM, +}; + +struct H274HashContext { + int type; + struct AVMD5 *ctx; +}; + +void ff_h274_hash_freep(H274HashContext **ctx) +{ + if (*ctx) { + H274HashContext *c = *ctx; + if (c->ctx) + av_free(c->ctx); + av_freep(ctx); + } +} + +int ff_h274_hash_init(H274HashContext **ctx, const int type) +{ + H274HashContext *c; + + if (type > HASH_LAST || !ctx) + return AVERROR(EINVAL); + + c = *ctx; + if (c) { + if (c->type != type) { + if (c->type == HASH_MD5SUM) + av_freep(&c->ctx); + c->type = type; + } + } else { + c = av_mallocz(sizeof(H274HashContext)); + if (!c) + return AVERROR(ENOMEM); + c->type = type; + *ctx = c; + } + + if (type == HASH_MD5SUM && !c->ctx) { + c->ctx = av_md5_alloc(); + if (!c->ctx) + return AVERROR(ENOMEM); + } + + return 0; +} + +int ff_h274_hash_verify(H274HashContext *c, const H274SEIPictureHash *hash, + const AVFrame *frame, const int coded_width, const int coded_height) +{ + const AVPixFmtDescriptor *desc; + int err = 0; + + if (!c || !hash || !frame) + return AVERROR(EINVAL); + + if (c->type != hash->hash_type) + return AVERROR(EINVAL); + + desc = av_pix_fmt_desc_get(frame->format); + if (!desc) + return AVERROR(EINVAL); + + for (int i = 0; i < desc->nb_components; i++) { + const int w = i ? (coded_width >> desc->log2_chroma_w) : coded_width; + const int h = i ? 
(coded_height >> desc->log2_chroma_h) : coded_height; + const int ps = desc->comp[i].step - 1; + const uint8_t *src = frame->data[i]; + const int stride = frame->linesize[i]; + + if (c->type == HASH_MD5SUM) + err = verify_plane_md5(c->ctx, src, w << ps, h, stride, hash->md5[i]); + else if (c->type == HASH_CRC) + err = verify_plane_crc(src, w << ps, h, stride, hash->crc[i]); + else if (c->type == HASH_CHECKSUM) + err = verify_plane_checksum(src, w, h, stride, ps, hash->checksum[i]); + if (err < 0) + goto fail; + } + +fail: + return err; +} diff --git a/libavcodec/h274.h b/libavcodec/h274.h index cebc8becb37ba..055dd591d2966 100644 --- a/libavcodec/h274.h +++ b/libavcodec/h274.h @@ -64,4 +64,29 @@ int ff_h274_apply_film_grain(AVFrame *out, const AVFrame *in, H274FilmGrainDatabase *db, const AVFilmGrainParams *params); +typedef struct H274HashContext H274HashContext; + +typedef struct H274SEIPictureHash { + int present; + union { + uint8_t md5[3][16]; + uint16_t crc[3]; + uint32_t checksum[3]; + }; + uint8_t hash_type; +} H274SEIPictureHash; + +int ff_h274_hash_init(H274HashContext **c, int type); +int ff_h274_hash_verify(H274HashContext *c, const H274SEIPictureHash *hash, + const AVFrame *frame, int coded_width, int coded_height); +void ff_h274_hash_freep(H274HashContext **c); + +typedef struct H274SEIFrameFieldInfo { + int present; + int picture_struct; + uint8_t display_elemental_periods; + uint8_t source_scan_type; + uint8_t duplicate_flag; +} H274SEIFrameFieldInfo; + #endif /* AVCODEC_H274_H */ diff --git a/libavcodec/hashtable.c b/libavcodec/hashtable.c new file mode 100644 index 0000000000000..151476176ba10 --- /dev/null +++ b/libavcodec/hashtable.c @@ -0,0 +1,214 @@ +/* + * Generic hashtable + * Copyright (C) 2025 Emma Worley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "libavutil/crc.h" +#include "libavutil/error.h" +#include "libavutil/mem.h" +#include "hashtable.h" + +#define ALIGN _Alignof(size_t) + +struct FFHashtableContext { + size_t key_size; + size_t key_size_aligned; + size_t val_size; + size_t val_size_aligned; + size_t entry_size; + size_t max_entries; + size_t nb_entries; + const AVCRC *crc; + uint8_t *table; + uint8_t *swapbuf; +}; + +/* + * Hash table entries are comprised of a probe sequence length (PSL), key, and + * value. When the PSL of an entry is zero, it means it is not occupied by a + * key/value pair. When the PSL is non-zero, it represents the "distance" of + * the entry from its "home" location plus one, where the "home" location is + * hash(key) % max_entries.
+ */ + +#define ENTRY_PSL_VAL(entry) (*(size_t*)(entry)) +#define ENTRY_KEY_PTR(entry) ((entry) + FFALIGN(sizeof(size_t), ALIGN)) +#define ENTRY_VAL_PTR(entry) (ENTRY_KEY_PTR(entry) + ctx->key_size_aligned) + +#define KEYS_EQUAL(k1, k2) (!memcmp((k1), (k2), ctx->key_size)) + +int ff_hashtable_alloc(struct FFHashtableContext **ctx, size_t key_size, size_t val_size, size_t max_entries) +{ + struct FFHashtableContext *res = av_malloc(sizeof(struct FFHashtableContext)); + if (!res) + return AVERROR(ENOMEM); + res->key_size = key_size; + res->key_size_aligned = FFALIGN(key_size, ALIGN); + res->val_size = val_size; + res->val_size_aligned = FFALIGN(val_size, ALIGN); + res->entry_size = FFALIGN(sizeof(size_t), ALIGN) + + res->key_size_aligned + + res->val_size_aligned; + res->max_entries = max_entries; + res->nb_entries = 0; + res->crc = av_crc_get_table(AV_CRC_32_IEEE); + if (!res->crc) { + ff_hashtable_freep(&res); + return AVERROR_BUG; + } + res->table = av_calloc(res->max_entries, res->entry_size); + if (!res->table) { + ff_hashtable_freep(&res); + return AVERROR(ENOMEM); + } + + res->swapbuf = av_calloc(2, res->key_size_aligned + res->val_size_aligned); + if (!res->swapbuf) { + ff_hashtable_freep(&res); + return AVERROR(ENOMEM); + } + *ctx = res; + return 0; +} + +static size_t hash_key(const struct FFHashtableContext *ctx, const void *key) +{ + return av_crc(ctx->crc, 0, key, ctx->key_size) % ctx->max_entries; +} + +int ff_hashtable_get(const struct FFHashtableContext *ctx, const void *key, void *val) +{ + if (!ctx->nb_entries) + return 0; + + size_t hash = hash_key(ctx, key); + + for (size_t psl = 1; psl <= ctx->max_entries; psl++) { + size_t wrapped_index = (hash + psl) % ctx->max_entries; + uint8_t *entry = ctx->table + wrapped_index * ctx->entry_size; + if (ENTRY_PSL_VAL(entry) < psl) + // When PSL stops increasing it means there are no further entries + // with the same key hash. + return 0; + if (KEYS_EQUAL(ENTRY_KEY_PTR(entry), key)) { + memcpy(val, ENTRY_VAL_PTR(entry), ctx->val_size); + return 1; + } + } + return 0; +} + +int ff_hashtable_set(struct FFHashtableContext *ctx, const void *key, const void *val) +{ + int swapping = 0; + size_t psl = 1; + size_t hash = hash_key(ctx, key); + size_t wrapped_index = hash % ctx->max_entries; + uint8_t *set = ctx->swapbuf; + uint8_t *tmp = ctx->swapbuf + ctx->key_size_aligned + ctx->val_size_aligned; + + memcpy(set, key, ctx->key_size); + memcpy(set + ctx->key_size_aligned, val, ctx->val_size); + + for (size_t i = 0; i < ctx->max_entries; i++) { + if (++wrapped_index == ctx->max_entries) + wrapped_index = 0; + uint8_t *entry = ctx->table + wrapped_index * ctx->entry_size; + if (!ENTRY_PSL_VAL(entry) || (!swapping && KEYS_EQUAL(ENTRY_KEY_PTR(entry), set))) { + if (!ENTRY_PSL_VAL(entry)) + ctx->nb_entries++; + ENTRY_PSL_VAL(entry) = psl; + memcpy(ENTRY_KEY_PTR(entry), set, ctx->key_size_aligned + ctx->val_size); + return 1; + } + if (ENTRY_PSL_VAL(entry) < psl) { + // When PSL stops increasing it means there are no further entries + // with the same key hash. We can only hope to find an unoccupied + // entry. + if (ctx->nb_entries == ctx->max_entries) + // The table is full so inserts are impossible. + return 0; + // Robin Hood hash tables "steal from the rich" by minimizing the + // PSL of the inserted entry. 
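            // Worked example (illustrative, not from the original source):
            // with max_entries = 8, suppose an incoming key hashes to slot 2
            // while slots 3, 4 and 5 hold entries with PSLs 3, 2 and 1.
            // Probing reaches slot 5 with psl == 3, which beats the resident
            // PSL of 1 (that entry is "rich", sitting right next to its home
            // slot), so the incoming key/value is written there with PSL 3
            // and the evicted entry keeps probing from slot 6 with its PSL
            // continuing at 2. Keeping probe lengths this even is what lets
            // ff_hashtable_get() stop as soon as it sees a PSL smaller than
            // its own probe count.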
+ swapping = 1; + // set needs to swap with entry + memcpy(tmp, ENTRY_KEY_PTR(entry), ctx->key_size_aligned + ctx->val_size_aligned); + memcpy(ENTRY_KEY_PTR(entry), set, ctx->key_size_aligned + ctx->val_size_aligned); + FFSWAP(uint8_t*, set, tmp); + FFSWAP(size_t, psl, ENTRY_PSL_VAL(entry)); + } + psl++; + } + return 0; +} + +int ff_hashtable_delete(struct FFHashtableContext *ctx, const void *key) +{ + if (!ctx->nb_entries) + return 0; + + uint8_t *next_entry; + size_t hash = hash_key(ctx, key); + size_t wrapped_index = hash % ctx->max_entries; + + for (size_t psl = 1; psl <= ctx->max_entries; psl++) { + if (++wrapped_index == ctx->max_entries) + wrapped_index = 0; + uint8_t *entry = ctx->table + wrapped_index * ctx->entry_size; + if (ENTRY_PSL_VAL(entry) < psl) + // When PSL stops increasing it means there are no further entries + // with the same key hash. + return 0; + if (KEYS_EQUAL(ENTRY_KEY_PTR(entry), key)) { + ENTRY_PSL_VAL(entry) = 0; + // Shift each following entry that will benefit from a reduced PSL. + for (psl++; psl <= ctx->max_entries; psl++) { + if (++wrapped_index == ctx->max_entries) + wrapped_index = 0; + next_entry = ctx->table + wrapped_index * ctx->entry_size; + if (ENTRY_PSL_VAL(next_entry) <= 1) { + ctx->nb_entries--; + return 1; + } + memcpy(entry, next_entry, ctx->entry_size); + ENTRY_PSL_VAL(entry)--; + ENTRY_PSL_VAL(next_entry) = 0; + entry = next_entry; + } + } + }; + return 0; +} + +void ff_hashtable_clear(struct FFHashtableContext *ctx) +{ + memset(ctx->table, 0, ctx->entry_size * ctx->max_entries); +} + +void ff_hashtable_freep(struct FFHashtableContext **ctx) +{ + if (*ctx) { + av_freep(&(*ctx)->table); + av_freep(&(*ctx)->swapbuf); + } + av_freep(ctx); +} diff --git a/libavcodec/hashtable.h b/libavcodec/hashtable.h new file mode 100644 index 0000000000000..f81b4bb93f478 --- /dev/null +++ b/libavcodec/hashtable.h @@ -0,0 +1,94 @@ +/* + * Generic hashtable + * Copyright (C) 2024 Emma Worley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_HASHTABLE_H +#define AVCODEC_HASHTABLE_H + +#include + +/* Implements a hash table using Robin Hood open addressing. + * See: https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf + * + * Keys are placed in the table based on their CRC value and are considered + * equal when they are bytewise-identical. + */ + +typedef struct FFHashtableContext FFHashtableContext; + +/** + * Create a fixed-sized Robin Hood hash table. 
+ * + * @param ctx context to allocate and initialize + * @param key_size size of key type in bytes + * @param val_size size of value type in bytes + * @param max_entries maximum number of key-value pairs to store + * + * @return zero on success, nonzero on error + */ +int ff_hashtable_alloc(struct FFHashtableContext **ctx, size_t key_size, size_t val_size, size_t max_entries); + +/** + * Look up a value from a hash table given a key. + * + * @param ctx hash table context + * @param key pointer to key data + * @param val destination pointer for value data + * + * @return 1 if the key is found, zero if the key is not found + */ +int ff_hashtable_get(const struct FFHashtableContext *ctx, const void *key, void *val); + +/** + * Store a value in a hash table given a key. + * + * @param ctx hash table context + * @param key pointer to key data + * @param val pointer for value data + * + * @return 1 if the key is written, zero if the key is not written due to the hash table reaching max capacity + */ +int ff_hashtable_set(struct FFHashtableContext *ctx, const void *key, const void *val); + +/** + * Delete a value from a hash table given a key. + * + * @param ctx hash table context + * @param key pointer to key data + * + * @return 1 if the key is deleted, zero if the key is not deleted due to not being found + */ +int ff_hashtable_delete(struct FFHashtableContext *ctx, const void *key); + +/** + * Delete all values from a hash table. + * + * @param ctx hash table context + */ +void ff_hashtable_clear(struct FFHashtableContext *ctx); + +/** + * Free a hash table. + * + * @param ctx hash table context + */ +void ff_hashtable_freep(struct FFHashtableContext **ctx); + +#endif diff --git a/libavcodec/hevc/hevcdec.c b/libavcodec/hevc/hevcdec.c index a7a91769fec94..636df5a4e9e1c 100644 --- a/libavcodec/hevc/hevcdec.c +++ b/libavcodec/hevc/hevcdec.c @@ -1110,7 +1110,7 @@ static int hls_slice_header(SliceHeader *sh, const HEVCContext *s, GetBitContext if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) { unsigned num_entry_point_offsets = get_ue_golomb_long(gb); // It would be possible to bound this tighter but this here is simpler - if (num_entry_point_offsets > get_bits_left(gb)) { + if (num_entry_point_offsets > get_bits_left(gb) || num_entry_point_offsets > UINT16_MAX) { av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); return AVERROR_INVALIDDATA; } diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c index 80494c9749d07..db0e02ee934f3 100644 --- a/libavcodec/hpeldsp.c +++ b/libavcodec/hpeldsp.c @@ -314,9 +314,6 @@ CALL_2X_PIXELS(OPNAME ## _pixels16_y2_8_c, \ CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_8_c, \ OPNAME ## _pixels8_xy2_8_c, \ 8) \ -CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_8_c, \ - OPNAME ## _pixels8_8_c, \ - 8) \ CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_8_c, \ OPNAME ## _no_rnd_pixels8_x2_8_c, \ 8) \ @@ -330,6 +327,8 @@ CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_8_c, \ #define op_avg(a, b) a = rnd_avg32(a, b) #define op_put(a, b) a = b #define put_no_rnd_pixels8_8_c put_pixels8_8_c +#define put_no_rnd_pixels16_8_c put_pixels16_8_c +#define avg_no_rnd_pixels16_8_c avg_pixels16_8_c PIXOP2(avg, op_avg) PIXOP2(put, op_put) #undef op_avg diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c index 4259a117dc27b..8a71c7ef7733b 100644 --- a/libavcodec/idctdsp.c +++ b/libavcodec/idctdsp.c @@ -276,6 +276,10 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx) c->idct = ff_faanidct; c->perm_type = 
FF_IDCT_PERM_NONE; #endif /* CONFIG_FAANIDCT */ +#if CONFIG_MPEG4_DECODER + } else if (avctx->idct_algo == FF_IDCT_XVID) { + ff_xvid_idct_init(c); +#endif } else { // accurate/default c->idct_put = ff_simple_idct_put_int16_8bit; c->idct_add = ff_simple_idct_add_int16_8bit; @@ -289,9 +293,6 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx) c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; c->add_pixels_clamped = ff_add_pixels_clamped_c; - if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID) - ff_xvid_idct_init(c, avctx); - #if ARCH_AARCH64 ff_idctdsp_init_aarch64(c, avctx, high_bit_depth); #elif ARCH_ARM diff --git a/libavcodec/intelh263dec.c b/libavcodec/intelh263dec.c index 374dfdc0de1f3..02016e93bfef2 100644 --- a/libavcodec/intelh263dec.c +++ b/libavcodec/intelh263dec.c @@ -19,6 +19,7 @@ */ #include "codec_internal.h" +#include "h263.h" #include "mpegvideo.h" #include "mpegvideodec.h" #include "h263data.h" @@ -56,7 +57,6 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s) av_log(s->avctx, AV_LOG_ERROR, "Intel H.263 free format not supported\n"); return -1; } - s->h263_plus = 0; s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb); @@ -119,7 +119,9 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s) if (skip_1stop_8data_bits(&s->gb) < 0) return AVERROR_INVALIDDATA; - ff_h263_show_pict_info(s); + s->gob_index = H263_GOB_HEIGHT(s->height); + + ff_h263_show_pict_info(s, 0); return 0; } diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c index d19bdc4dab3cd..21c78f3cb5bd6 100644 --- a/libavcodec/ituh263dec.c +++ b/libavcodec/ituh263dec.c @@ -77,7 +77,8 @@ static const int16_t h263_mb_type_b_map[15]= { MB_TYPE_INTRA4x4 | MB_TYPE_CBP | MB_TYPE_QUANT, }; -void ff_h263_show_pict_info(MpegEncContext *s){ +void ff_h263_show_pict_info(MpegEncContext *s, int h263_plus) +{ if(s->avctx->debug&FF_DEBUG_PICT_INFO){ av_log(s->avctx, AV_LOG_DEBUG, "qp:%d %c size:%d rnd:%d%s%s%s%s%s%s%s%s%s %d/%d\n", s->qscale, av_get_picture_type_char(s->pict_type), @@ -85,7 +86,7 @@ void ff_h263_show_pict_info(MpegEncContext *s){ s->obmc ? " AP" : "", s->umvplus ? " UMV" : "", s->h263_long_vectors ? " LONG" : "", - s->h263_plus ? " +" : "", + h263_plus ? " +" : "", s->h263_aic ? " AIC" : "", s->alt_inter_vlc ? " AIV" : "", s->modified_quant ? 
" MQ" : "", @@ -1089,6 +1090,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s) { int format, width, height, i, ret; uint32_t startcode; + int h263_plus; align_get_bits(&s->gb); @@ -1137,7 +1139,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s) */ if (format != 7 && format != 6) { - s->h263_plus = 0; + h263_plus = 0; /* H.263v1 */ width = ff_h263_format[format][0]; height = ff_h263_format[format][1]; @@ -1166,7 +1168,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s) int ufep; /* H.263v2 */ - s->h263_plus = 1; + h263_plus = 1; ufep = get_bits(&s->gb, 3); /* Update Full Extended PTYPE */ /* ufep other than 0 and 1 are reserved */ @@ -1314,6 +1316,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s) s->mb_height = (s->height + 15) / 16; s->mb_num = s->mb_width * s->mb_height; + s->gob_index = H263_GOB_HEIGHT(s->height); + if (s->pb_frame) { skip_bits(&s->gb, 3); /* Temporal reference for B-pictures */ if (s->custom_pcf) @@ -1364,7 +1368,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s) s->c_dc_scale_table= ff_mpeg1_dc_scale_table; } - ff_h263_show_pict_info(s); + ff_h263_show_pict_info(s, h263_plus); + if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO") && get_bits_left(&s->gb) >= 85 + 13*3*16 + 50){ int i,j; for(i=0; i<85; i++) av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb)); diff --git a/libavcodec/ituh263enc.c b/libavcodec/ituh263enc.c index 8be7ee4636fdf..4fdf9cf40281c 100644 --- a/libavcodec/ituh263enc.c +++ b/libavcodec/ituh263enc.c @@ -46,6 +46,7 @@ #include "mathops.h" #include "mpegutils.h" #include "internal.h" +#include "put_bits.h" /** * Table of number of bits a motion vector component needs. @@ -230,7 +231,9 @@ static int h263_encode_picture_header(MPVMainEncContext *const m) int best_error= INT_MAX; int custom_pcf; - if(s->c.h263_plus){ + put_bits_assume_flushed(&s->pb); + + if (s->c.codec_id == AV_CODEC_ID_H263P) { for(i=0; i<2; i++){ int div, error; div= (s->c.avctx->time_base.num*1800000LL + 500LL*s->c.avctx->time_base.den) / ((1000LL+i)*s->c.avctx->time_base.den); @@ -247,8 +250,6 @@ static int h263_encode_picture_header(MPVMainEncContext *const m) coded_frame_rate= 1800000; coded_frame_rate_base= (1000+best_clock_code)*best_divisor; - align_put_bits(&s->pb); - put_bits(&s->pb, 22, 0x20); /* PSC */ temp_ref= s->c.picture_number * (int64_t)coded_frame_rate * s->c.avctx->time_base.num / //FIXME use timestamp (coded_frame_rate_base * (int64_t)s->c.avctx->time_base.den); @@ -261,7 +262,7 @@ static int h263_encode_picture_header(MPVMainEncContext *const m) put_bits(&s->pb, 1, 0); /* freeze picture release off */ format = ff_match_2uint16(ff_h263_format, FF_ARRAY_ELEMS(ff_h263_format), s->c.width, s->c.height); - if (!s->c.h263_plus) { + if (s->c.codec_id != AV_CODEC_ID_H263P) { /* H.263v1 */ put_bits(&s->pb, 3, format); put_bits(&s->pb, 1, (s->c.pict_type == AV_PICTURE_TYPE_P)); @@ -841,6 +842,9 @@ av_cold void ff_h263_encode_init(MPVMainEncContext *const m) if (s->c.modified_quant) s->c.chroma_qscale_table = ff_h263_chroma_qscale_table; + // Only used for H.263 and H.263+ + s->c.gob_index = H263_GOB_HEIGHT(s->c.height); + // use fcodes >1 only for MPEG-4 & H.263 & H.263+ FIXME switch(s->c.codec_id){ case AV_CODEC_ID_H263P: diff --git a/libavcodec/lcevcdec.c b/libavcodec/lcevcdec.c index 2fe06b8800be8..102f6f32e9513 100644 --- a/libavcodec/lcevcdec.c +++ b/libavcodec/lcevcdec.c @@ -47,7 +47,7 @@ static LCEVC_ColorFormat map_format(int format) return LCEVC_ColorFormat_Unknown; } -static int alloc_base_frame(void 
*logctx, LCEVC_DecoderHandle decoder, +static int alloc_base_frame(void *logctx, FFLCEVCContext *lcevc, const AVFrame *frame, LCEVC_PictureHandle *picture) { LCEVC_PictureDesc desc; @@ -70,22 +70,22 @@ static int alloc_base_frame(void *logctx, LCEVC_DecoderHandle decoder, desc.sampleAspectRatioDen = frame->sample_aspect_ratio.den; /* Allocate LCEVC Picture */ - res = LCEVC_AllocPicture(decoder, &desc, picture); + res = LCEVC_AllocPicture(lcevc->decoder, &desc, picture); if (res != LCEVC_Success) { return AVERROR_EXTERNAL; } - res = LCEVC_LockPicture(decoder, *picture, LCEVC_Access_Write, &lock); + res = LCEVC_LockPicture(lcevc->decoder, *picture, LCEVC_Access_Write, &lock); if (res != LCEVC_Success) return AVERROR_EXTERNAL; - res = LCEVC_GetPicturePlaneCount(decoder, *picture, &planes); + res = LCEVC_GetPicturePlaneCount(lcevc->decoder, *picture, &planes); if (res != LCEVC_Success) return AVERROR_EXTERNAL; for (unsigned i = 0; i < planes; i++) { LCEVC_PicturePlaneDesc plane; - res = LCEVC_GetPictureLockPlaneDesc(decoder, lock, i, &plane); + res = LCEVC_GetPictureLockPlaneDesc(lcevc->decoder, lock, i, &plane); if (res != LCEVC_Success) return AVERROR_EXTERNAL; @@ -96,43 +96,43 @@ static int alloc_base_frame(void *logctx, LCEVC_DecoderHandle decoder, av_image_copy2(data, linesizes, frame->data, frame->linesize, frame->format, frame->width, frame->height); - res = LCEVC_UnlockPicture(decoder, lock); + res = LCEVC_UnlockPicture(lcevc->decoder, lock); if (res != LCEVC_Success) return AVERROR_EXTERNAL; return 0; } -static int alloc_enhanced_frame(void *logctx, LCEVC_DecoderHandle decoder, - const AVFrame *frame, LCEVC_PictureHandle *picture) +static int alloc_enhanced_frame(void *logctx, FFLCEVCFrame *frame_ctx, + LCEVC_PictureHandle *picture) { + FFLCEVCContext *lcevc = frame_ctx->lcevc; LCEVC_PictureDesc desc ; - LCEVC_ColorFormat fmt = map_format(frame->format); + LCEVC_ColorFormat fmt = map_format(frame_ctx->frame->format); LCEVC_PicturePlaneDesc planes[4] = { 0 }; - int width = frame->width * 2 / FFMAX(frame->sample_aspect_ratio.den, 1); - int height = frame->height * 2 / FFMAX(frame->sample_aspect_ratio.num, 1); LCEVC_ReturnCode res; - res = LCEVC_DefaultPictureDesc(&desc, fmt, width, height); + res = LCEVC_DefaultPictureDesc(&desc, fmt, frame_ctx->frame->width, frame_ctx->frame->height); if (res != LCEVC_Success) return AVERROR_EXTERNAL; /* Set plane description */ for (int i = 0; i < 4; i++) { - planes[i].firstSample = frame->data[i]; - planes[i].rowByteStride = frame->linesize[i]; + planes[i].firstSample = frame_ctx->frame->data[i]; + planes[i].rowByteStride = frame_ctx->frame->linesize[i]; } /* Allocate LCEVC Picture */ - res = LCEVC_AllocPictureExternal(decoder, &desc, NULL, planes, picture); + res = LCEVC_AllocPictureExternal(lcevc->decoder, &desc, NULL, planes, picture); if (res != LCEVC_Success) { return AVERROR_EXTERNAL; } return 0; } -static int lcevc_send_frame(void *logctx, FFLCEVCContext *lcevc, const AVFrame *in) +static int lcevc_send_frame(void *logctx, FFLCEVCFrame *frame_ctx, const AVFrame *in) { + FFLCEVCContext *lcevc = frame_ctx->lcevc; const AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_LCEVC); LCEVC_PictureHandle picture; LCEVC_ReturnCode res; @@ -145,7 +145,7 @@ static int lcevc_send_frame(void *logctx, FFLCEVCContext *lcevc, const AVFrame * if (res != LCEVC_Success) return AVERROR_EXTERNAL; - ret = alloc_base_frame(logctx, lcevc->decoder, in, &picture); + ret = alloc_base_frame(logctx, lcevc, in, &picture); if (ret < 0) return ret; @@ -154,7 +154,7 
@@ static int lcevc_send_frame(void *logctx, FFLCEVCContext *lcevc, const AVFrame * return AVERROR_EXTERNAL; memset(&picture, 0, sizeof(picture)); - ret = alloc_enhanced_frame(logctx, lcevc->decoder, in, &picture); + ret = alloc_enhanced_frame(logctx, frame_ctx, &picture); if (ret < 0) return ret; @@ -165,8 +165,9 @@ static int lcevc_send_frame(void *logctx, FFLCEVCContext *lcevc, const AVFrame * return 0; } -static int generate_output(void *logctx, FFLCEVCContext *lcevc, AVFrame *out) +static int generate_output(void *logctx, FFLCEVCFrame *frame_ctx, AVFrame *out) { + FFLCEVCContext *lcevc = frame_ctx->lcevc; LCEVC_PictureDesc desc; LCEVC_DecodeInformation info; LCEVC_PictureHandle picture; @@ -186,6 +187,11 @@ static int generate_output(void *logctx, FFLCEVCContext *lcevc, AVFrame *out) out->crop_right = desc.cropRight; out->sample_aspect_ratio.num = desc.sampleAspectRatioNum; out->sample_aspect_ratio.den = desc.sampleAspectRatioDen; + + av_frame_copy_props(frame_ctx->frame, out); + av_frame_unref(out); + av_frame_move_ref(out, frame_ctx->frame); + out->width = desc.width + out->crop_left + out->crop_right; out->height = desc.height + out->crop_top + out->crop_bottom; @@ -196,13 +202,14 @@ static int generate_output(void *logctx, FFLCEVCContext *lcevc, AVFrame *out) return 0; } -static int lcevc_receive_frame(void *logctx, FFLCEVCContext *lcevc, AVFrame *out) +static int lcevc_receive_frame(void *logctx, FFLCEVCFrame *frame_ctx, AVFrame *out) { + FFLCEVCContext *lcevc = frame_ctx->lcevc; LCEVC_PictureHandle picture; LCEVC_ReturnCode res; int ret; - ret = generate_output(logctx, lcevc, out); + ret = generate_output(logctx, frame_ctx, out); if (ret < 0) return ret; @@ -249,12 +256,7 @@ static int lcevc_init(FFLCEVCContext *lcevc, void *logctx) #if CONFIG_LIBLCEVC_DEC LCEVC_AccelContextHandle dummy = { 0 }; const int32_t event = LCEVC_Log; -#endif - if (lcevc->initialized) - return 0; - -#if CONFIG_LIBLCEVC_DEC if (LCEVC_CreateDecoder(&lcevc->decoder, dummy) != LCEVC_Success) { av_log(logctx, AV_LOG_ERROR, "Failed to create LCEVC decoder\n"); return AVERROR_EXTERNAL; @@ -279,7 +281,8 @@ static int lcevc_init(FFLCEVCContext *lcevc, void *logctx) int ff_lcevc_process(void *logctx, AVFrame *frame) { FrameDecodeData *fdd = frame->private_ref; - FFLCEVCContext *lcevc = fdd->post_process_opaque; + FFLCEVCFrame *frame_ctx = fdd->post_process_opaque; + FFLCEVCContext *lcevc = frame_ctx->lcevc; int ret; if (!lcevc->initialized) { @@ -289,11 +292,14 @@ int ff_lcevc_process(void *logctx, AVFrame *frame) } #if CONFIG_LIBLCEVC_DEC - ret = lcevc_send_frame(logctx, lcevc, frame); + av_assert0(frame_ctx->frame); + + + ret = lcevc_send_frame(logctx, frame_ctx, frame); if (ret) return ret < 0 ? 
ret : 0; - lcevc_receive_frame(logctx, lcevc, frame); + lcevc_receive_frame(logctx, frame_ctx, frame); if (ret < 0) return ret; @@ -317,5 +323,8 @@ int ff_lcevc_alloc(FFLCEVCContext **plcevc) void ff_lcevc_unref(void *opaque) { - av_refstruct_unref(&opaque); + FFLCEVCFrame *lcevc = opaque; + av_refstruct_unref(&lcevc->lcevc); + av_frame_free(&lcevc->frame); + av_free(opaque); } diff --git a/libavcodec/lcevcdec.h b/libavcodec/lcevcdec.h index b21d1073c4af1..62014132d92a8 100644 --- a/libavcodec/lcevcdec.h +++ b/libavcodec/lcevcdec.h @@ -35,6 +35,11 @@ typedef struct FFLCEVCContext { struct AVFrame; +typedef struct FFLCEVCFrame { + FFLCEVCContext *lcevc; + struct AVFrame *frame; +} FFLCEVCFrame; + int ff_lcevc_alloc(FFLCEVCContext **plcevc); int ff_lcevc_process(void *logctx, struct AVFrame *frame); void ff_lcevc_unref(void *opaque); diff --git a/libavcodec/libaomenc.c b/libavcodec/libaomenc.c index 9a384fcc39ccc..903292d164397 100644 --- a/libavcodec/libaomenc.c +++ b/libavcodec/libaomenc.c @@ -681,7 +681,6 @@ static av_cold int aom_init(AVCodecContext *avctx, struct aom_codec_enc_cfg enccfg = { 0 }; aom_codec_flags_t flags = (avctx->flags & AV_CODEC_FLAG_PSNR) ? AOM_CODEC_USE_PSNR : 0; - AVCPBProperties *cpb_props; int res; aom_img_fmt_t img_fmt; aom_codec_caps_t codec_caps = aom_codec_get_caps(iface); @@ -989,10 +988,6 @@ static av_cold int aom_init(AVCodecContext *avctx, if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH) ctx->rawimg.bit_depth = enccfg.g_bit_depth; - cpb_props = ff_encode_add_cpb_side_data(avctx); - if (!cpb_props) - return AVERROR(ENOMEM); - ctx->dovi.logctx = avctx; if ((res = ff_dovi_configure(&ctx->dovi, avctx)) < 0) return res; @@ -1019,6 +1014,10 @@ static av_cold int aom_init(AVCodecContext *avctx, return ret; } + AVCPBProperties *cpb_props = ff_encode_add_cpb_side_data(avctx); + if (!cpb_props) + return AVERROR(ENOMEM); + if (enccfg.rc_end_usage == AOM_CBR || enccfg.g_pass != AOM_RC_ONE_PASS) { cpb_props->max_bitrate = avctx->rc_max_rate; diff --git a/libavcodec/libaribb24.c b/libavcodec/libaribb24.c index 6e062cbffd699..a26e456295418 100644 --- a/libavcodec/libaribb24.c +++ b/libavcodec/libaribb24.c @@ -96,13 +96,13 @@ static int libaribb24_generate_ass_header(AVCodecContext *avctx) font_size = get_profile_font_size(avctx); avctx->subtitle_header = av_asprintf( - "[Script Info]\r\n" - "; Script generated by FFmpeg/Lavc%s\r\n" - "ScriptType: v4.00+\r\n" - "PlayResX: %d\r\n" - "PlayResY: %d\r\n" - "\r\n" - "[V4+ Styles]\r\n" + "[Script Info]\n" + "; Script generated by FFmpeg/Lavc%s\n" + "ScriptType: v4.00+\n" + "PlayResX: %d\n" + "PlayResY: %d\n" + "\n" + "[V4+ Styles]\n" /* ASSv4 header */ "Format: Name, " @@ -113,7 +113,7 @@ static int libaribb24_generate_ass_header(AVCodecContext *avctx) "Spacing, Angle, " "BorderStyle, Outline, Shadow, " "Alignment, MarginL, MarginR, MarginV, " - "Encoding\r\n" + "Encoding\n" "Style: " "Default," /* Name */ @@ -124,11 +124,11 @@ static int libaribb24_generate_ass_header(AVCodecContext *avctx) "0,0," /* Spacing, Angle */ "%d,1,0," /* BorderStyle, Outline, Shadow */ "%d,10,10,10," /* Alignment, Margin[LRV] */ - "0\r\n" /* Encoding */ + "0\n" /* Encoding */ - "\r\n" - "[Events]\r\n" - "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n", + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n", !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? 
AV_STRINGIFY(LIBAVCODEC_VERSION) : "", plane_width, plane_height, ASS_DEFAULT_FONT, font_size, ASS_DEFAULT_COLOR, diff --git a/libavcodec/libaribcaption.c b/libavcodec/libaribcaption.c index 91691f67787df..53d334803fe41 100644 --- a/libavcodec/libaribcaption.c +++ b/libavcodec/libaribcaption.c @@ -522,14 +522,14 @@ static int set_ass_header(ARIBCaptionContext *ctx) av_freep(&avctx->subtitle_header); avctx->subtitle_header = av_asprintf( - "[Script Info]\r\n" - "ScriptType: v4.00+\r\n" - "PlayResX: %d\r\n" - "PlayResY: %d\r\n" - "WrapStyle: 2\r\n" /* 2: no word wrapping */ - "\r\n" - - "[V4+ Styles]\r\n" + "[Script Info]\n" + "ScriptType: v4.00+\n" + "PlayResX: %d\n" + "PlayResY: %d\n" + "WrapStyle: 2\n" /* 2: no word wrapping */ + "\n" + + "[V4+ Styles]\n" "Format: Name, " "Fontname, Fontsize, " "PrimaryColour, SecondaryColour, OutlineColour, BackColour, " @@ -538,7 +538,7 @@ static int set_ass_header(ARIBCaptionContext *ctx) "Spacing, Angle, " "BorderStyle, Outline, Shadow, " "Alignment, MarginL, MarginR, MarginV, " - "Encoding\r\n" + "Encoding\n" "Style: " "Default," /* Name */ @@ -549,11 +549,11 @@ static int set_ass_header(ARIBCaptionContext *ctx) "0,0," /* Spacing, Angle */ "%d,%d,%d," /* BorderStyle, Outline, Shadow */ "%d,10,10,10," /* Alignment, Margin[LRV] */ - "0\r\n" /* Encoding */ - "\r\n" + "0\n" /* Encoding */ + "\n" - "[Events]\r\n" - "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n", + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n", ctx->plane_width, ctx->plane_height, font_name, ctx->font_size, ASS_DEFAULT_COLOR, ASS_DEFAULT_COLOR, diff --git a/libavcodec/libzvbi-teletextdec.c b/libavcodec/libzvbi-teletextdec.c index 68ffe1f76ce1d..e02ecb8b3a3f9 100644 --- a/libavcodec/libzvbi-teletextdec.c +++ b/libavcodec/libzvbi-teletextdec.c @@ -91,7 +91,7 @@ static int my_ass_subtitle_header(AVCodecContext *avctx) if (ret < 0) return ret; - event_pos = strstr(avctx->subtitle_header, "\r\n[Events]\r\n"); + event_pos = strstr(avctx->subtitle_header, "\n[Events]\n"); if (!event_pos) return AVERROR_BUG; @@ -106,7 +106,7 @@ static int my_ass_subtitle_header(AVCodecContext *avctx) "0,0," /* Spacing, Angle */ "3,0.1,0," /* BorderStyle, Outline, Shadow */ "5,1,1,1," /* Alignment, Margin[LRV] */ - "0\r\n" /* Encoding */ + "0\n" /* Encoding */ "Style: " "Subtitle," /* Name */ "Monospace,16," /* Font{name,size} */ @@ -116,7 +116,7 @@ static int my_ass_subtitle_header(AVCodecContext *avctx) "0,0," /* Spacing, Angle */ "1,1,1," /* BorderStyle, Outline, Shadow */ "8,48,48,20," /* Alignment, Margin[LRV] */ - "0\r\n" /* Encoding */ + "0\n" /* Encoding */ , event_pos); if (!new_header) diff --git a/libavcodec/mips/pixblockdsp_init_mips.c b/libavcodec/mips/pixblockdsp_init_mips.c index 00f189d558d7f..acea95d36e74e 100644 --- a/libavcodec/mips/pixblockdsp_init_mips.c +++ b/libavcodec/mips/pixblockdsp_init_mips.c @@ -23,7 +23,7 @@ #include "libavcodec/pixblockdsp.h" #include "pixblockdsp_mips.h" -void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_mips(PixblockDSPContext *c, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); @@ -31,27 +31,13 @@ void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx, if (have_mmi(cpu_flags)) { c->diff_pixels = ff_diff_pixels_mmi; - if (!high_bit_depth || avctx->codec_type != AVMEDIA_TYPE_VIDEO) { + if (!high_bit_depth) c->get_pixels = ff_get_pixels_8_mmi; - } } if (have_msa(cpu_flags)) { c->diff_pixels = 
ff_diff_pixels_msa; - switch (avctx->bits_per_raw_sample) { - case 9: - case 10: - case 12: - case 14: - c->get_pixels = ff_get_pixels_16_msa; - break; - default: - if (avctx->bits_per_raw_sample <= 8 || avctx->codec_type != - AVMEDIA_TYPE_VIDEO) { - c->get_pixels = ff_get_pixels_8_msa; - } - break; - } + c->get_pixels = high_bit_depth ? ff_get_pixels_16_msa : ff_get_pixels_8_msa; } } diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c index e7a4f8f16a67a..21b3b19b9362b 100644 --- a/libavcodec/mjpegenc_common.c +++ b/libavcodec/mjpegenc_common.c @@ -304,7 +304,8 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb, switch (avctx->codec_id) { case AV_CODEC_ID_MJPEG: put_marker(pb, SOF0 ); break; case AV_CODEC_ID_LJPEG: put_marker(pb, SOF3 ); break; - default: av_assert0(0); + default: av_unreachable("ff_mjpeg_encode_picture_header only called by " + "AMV, LJPEG, MJPEG and the former has been ruled out"); } put_bits(pb, 16, 8 + 3 * components); @@ -375,7 +376,7 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb, switch (avctx->codec_id) { case AV_CODEC_ID_MJPEG: put_bits(pb, 8, 63); break; /* Se (not used) */ case AV_CODEC_ID_LJPEG: put_bits(pb, 8, 0); break; /* not used */ - default: av_assert0(0); + default: av_unreachable("Only LJPEG, MJPEG possible here"); } put_bits(pb, 8, 0); /* Ah/Al (not used) */ diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c index fc41c548e204c..9cf1bb9b28eb4 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c @@ -80,7 +80,7 @@ typedef struct Mpeg1Context { int has_afd; int slice_count; unsigned aspect_ratio_info; - int save_width, save_height, save_progressive_seq, save_chroma_format; + int save_progressive_seq, save_chroma_format; AVRational frame_rate_ext; /* MPEG-2 specific framerate modificator */ unsigned frame_rate_index; int sync; /* Did we reach a sync point like a GOP/SEQ/KEYFrame? 
*/ @@ -381,9 +381,6 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s, return 0; } -/******************************************/ -/* decoding */ - static inline int get_dmv(MpegEncContext *s) { if (get_bits1(&s->gb)) @@ -915,8 +912,6 @@ static int mpeg_decode_postinit(AVCodecContext *avctx) if (!s->context_initialized || avctx->coded_width != s->width || avctx->coded_height != s->height || - s1->save_width != s->width || - s1->save_height != s->height || s1->save_chroma_format != s->chroma_format || (s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) || 0) { @@ -934,8 +929,6 @@ static int mpeg_decode_postinit(AVCodecContext *avctx) (s1->bit_rate != 0x3FFFF*400 || s1->vbv_delay != 0xFFFF)) { avctx->bit_rate = s1->bit_rate; } - s1->save_width = s->width; - s1->save_height = s->height; s1->save_progressive_seq = s->progressive_sequence; s1->save_chroma_format = s->chroma_format; @@ -1863,9 +1856,8 @@ static int vcr2_init_sequence(AVCodecContext *avctx) } else { s->codec_id = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO; } - s1->save_width = s->width; - s1->save_height = s->height; s1->save_progressive_seq = s->progressive_sequence; + s1->save_chroma_format = s->chroma_format; return 0; } diff --git a/libavcodec/mpeg12enc.c b/libavcodec/mpeg12enc.c index 9d0a8e41704f4..96957235e9e60 100644 --- a/libavcodec/mpeg12enc.c +++ b/libavcodec/mpeg12enc.c @@ -49,6 +49,7 @@ #include "mpegvideo.h" #include "mpegvideoenc.h" #include "profiles.h" +#include "put_bits.h" #include "rl.h" #if CONFIG_MPEG1VIDEO_ENCODER || CONFIG_MPEG2VIDEO_ENCODER @@ -155,6 +156,8 @@ static void mpeg1_encode_sequence_header(MPEG12EncContext *mpeg12) AVRational aspect_ratio = s->c.avctx->sample_aspect_ratio; int aspect_ratio_info; + put_bits_assume_flushed(&s->pb); + if (!(s->c.cur_pic.ptr->f->flags & AV_FRAME_FLAG_KEY)) return; @@ -339,6 +342,8 @@ static int mpeg1_encode_picture_header(MPVMainEncContext *const m) MPVEncContext *const s = &m->s; const AVFrameSideData *side_data; + put_bits_assume_flushed(&s->pb); + mpeg1_encode_sequence_header(mpeg12); /* MPEG-1 picture header */ @@ -454,8 +459,7 @@ static int mpeg1_encode_picture_header(MPVMainEncContext *const m) put_bits(&s->pb, 1, 1); // reserved_bit put_bits(&s->pb, 7, fpa_type); // S3D_video_format_type - put_bits(&s->pb, 8, 0x04); // reserved_data[0] - put_bits(&s->pb, 8, 0xFF); // reserved_data[1] + put_bits(&s->pb, 16, 0x04FF); // reserved_data } } @@ -1121,7 +1125,7 @@ static av_cold int encode_init(AVCodecContext *avctx) } else { s->min_qcoeff = -2047; s->max_qcoeff = 2047; - s->c.mpeg_quant = 1; + s->mpeg_quant = 1; } if (s->c.intra_vlc_format) { s->intra_ac_vlc_length = diff --git a/libavcodec/mpeg4video.c b/libavcodec/mpeg4video.c index 2c0c1044f2014..3980a3930586c 100644 --- a/libavcodec/mpeg4video.c +++ b/libavcodec/mpeg4video.c @@ -20,25 +20,11 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libavutil/thread.h" - #include "mpegutils.h" #include "mpegvideo.h" #include "mpeg4video.h" #include "mpeg4data.h" -static av_cold void mpeg4_init_rl_intra(void) -{ - static uint8_t mpeg4_rl_intra_table[2][2 * MAX_RUN + MAX_LEVEL + 3]; - ff_rl_init(&ff_mpeg4_rl_intra, mpeg4_rl_intra_table); -} - -av_cold void ff_mpeg4_init_rl_intra(void) -{ - static AVOnce init_static_once = AV_ONCE_INIT; - ff_thread_once(&init_static_once, mpeg4_init_rl_intra); -} - int ff_mpeg4_get_video_packet_prefix_length(enum AVPictureType pict_type, int f_code, int b_code) { diff --git 
a/libavcodec/mpeg4videodata.h b/libavcodec/mpeg4videodata.h index 8aac8a225587c..baca8a0b9a499 100644 --- a/libavcodec/mpeg4videodata.h +++ b/libavcodec/mpeg4videodata.h @@ -35,7 +35,6 @@ extern const int8_t ff_mpeg4_intra_level[102]; extern const int8_t ff_mpeg4_intra_run[102]; extern RLTable ff_mpeg4_rl_intra; -void ff_mpeg4_init_rl_intra(void); /* Note this is identical to the intra rvlc except that it is reordered. */ extern RLTable ff_rvlc_rl_inter; diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c index d5979096ae2d7..313d73157f73b 100644 --- a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c @@ -24,6 +24,7 @@ #include "config_components.h" +#include "libavutil/avassert.h" #include "libavutil/internal.h" #include "libavutil/opt.h" #include "libavutil/thread.h" @@ -47,7 +48,6 @@ #include "profiles.h" #include "qpeldsp.h" #include "threadprogress.h" -#include "xvididct.h" #include "unary.h" #if 0 //3IV1 is quite rare and it slows things down a tiny bit @@ -605,7 +605,8 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g ctx->sprite_shift[1] = alpha + beta + rho - min_ab + 2; break; default: - av_assert0(0); + av_unreachable("num_sprite_warping_points outside of 0..3 results in an error" + "in which num_sprite_warping_points is reset to zero"); } /* try to simplify the situation */ if (sprite_delta[0][0] == a << ctx->sprite_shift[0] && @@ -1398,7 +1399,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block, scan_table = s->intra_scantable.permutated; - if (s->mpeg_quant) { + if (ctx->mpeg_quant) { qmul = 1; qadd = 0; if (rvlc) @@ -2154,7 +2155,7 @@ static int mpeg4_decode_studio_block(MpegEncContext *s, int32_t block[64], int n s->last_dc[cc] += dct_diff; - if (s->mpeg_quant) + if (ctx->mpeg_quant) block[0] = s->last_dc[cc] * (8 >> s->intra_dc_precision); else block[0] = s->last_dc[cc] * (8 >> s->intra_dc_precision) * (8 >> s->dct_precision); @@ -2584,7 +2585,7 @@ static int decode_studio_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb) skip_bits(gb, 15); /* latter_half_vbv_occupancy */ check_marker(s->avctx, gb, "after latter_half_vbv_occupancy"); s->low_delay = get_bits1(gb); - s->mpeg_quant = get_bits1(gb); /* mpeg2_stream */ + ctx->mpeg_quant = get_bits1(gb); /* mpeg2_stream */ next_start_code_studio(gb); extension_and_user_data(s, gb, 2); @@ -2766,7 +2767,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb) // FIXME a bunch of grayscale shape things - if ((s->mpeg_quant = get_bits1(gb))) { /* vol_quant_type */ + if ((ctx->mpeg_quant = get_bits1(gb))) { /* vol_quant_type */ int i, v; mpeg4_load_default_matrices(s); @@ -3414,8 +3415,10 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb, } } - s->dct_unquantize_intra = s->mpeg_quant ? ctx->dct_unquantize_mpeg2_intra - : ctx->dct_unquantize_h263_intra; + s->dct_unquantize_intra = ctx->mpeg_quant ? ctx->dct_unquantize_mpeg2_intra + : ctx->dct_unquantize_h263_intra; + // The following tells ff_mpv_reconstruct_mb() to unquantize iff mpeg_quant + s->dct_unquantize_inter = ctx->mpeg_quant ? 
ctx->dct_unquantize_mpeg2_inter : NULL; end: /* detect buggy encoders which don't set the low_delay flag @@ -3854,6 +3857,7 @@ static int mpeg4_update_thread_context(AVCodecContext *dst, s->sprite_warping_accuracy = s1->sprite_warping_accuracy; s->num_sprite_warping_points = s1->num_sprite_warping_points; s->m.data_partitioning = s1->m.data_partitioning; + s->mpeg_quant = s1->mpeg_quant; s->rvlc = s1->rvlc; s->resync_marker = s1->resync_marker; s->t_frame = s1->t_frame; @@ -3878,9 +3882,6 @@ static int mpeg4_update_thread_context(AVCodecContext *dst, memcpy(s->sprite_shift, s1->sprite_shift, sizeof(s1->sprite_shift)); memcpy(s->sprite_traj, s1->sprite_traj, sizeof(s1->sprite_traj)); - if (!init && s1->xvid_build >= 0) - ff_xvid_idct_init(&s->m.idsp, dst); - return av_buffer_replace(&s->bitstream_buffer, s1->bitstream_buffer); } @@ -3899,7 +3900,6 @@ static int mpeg4_update_thread_context_for_user(AVCodecContext *dst, static av_cold void mpeg4_init_static(void) { - static uint8_t mpeg4_rvlc_rl_tables[2][2][2 * MAX_RUN + MAX_LEVEL + 3]; static VLCElem vlc_buf[6498]; VLCInitState state = VLC_INIT_STATE(vlc_buf); @@ -3921,9 +3921,9 @@ static av_cold void mpeg4_init_static(void) 0, 0); } - ff_mpeg4_init_rl_intra(); - ff_rl_init(&ff_rvlc_rl_inter, mpeg4_rvlc_rl_tables[0]); - ff_rl_init(&ff_rvlc_rl_intra, mpeg4_rvlc_rl_tables[1]); + static uint8_t mpeg4_rl_intra_table[2][2 * MAX_RUN + MAX_LEVEL + 3]; + ff_rl_init(&ff_mpeg4_rl_intra, mpeg4_rl_intra_table); + INIT_FIRST_VLC_RL(ff_mpeg4_rl_intra, 554); VLC_INIT_RL(ff_rvlc_rl_inter, 1072); INIT_FIRST_VLC_RL(ff_rvlc_rl_intra, 1072); @@ -3964,8 +3964,8 @@ static av_cold int decode_init(AVCodecContext *avctx) ctx->dct_unquantize_h263_intra = unquant_dsp_ctx.dct_unquantize_h263_intra; ctx->dct_unquantize_mpeg2_intra = unquant_dsp_ctx.dct_unquantize_mpeg2_intra; // dct_unquantize_inter is only used with MPEG-2 quantizers, - // so we can already set dct_unquantize_inter here once and for all. - s->dct_unquantize_inter = unquant_dsp_ctx.dct_unquantize_mpeg2_inter; + // so that is all we keep. 
+ ctx->dct_unquantize_mpeg2_inter = unquant_dsp_ctx.dct_unquantize_mpeg2_inter; s->y_dc_scale_table = ff_mpeg4_y_dc_scale_table; s->c_dc_scale_table = ff_mpeg4_c_dc_scale_table; diff --git a/libavcodec/mpeg4videodec.h b/libavcodec/mpeg4videodec.h index 57a2f81816a18..ae8428fd2bdbc 100644 --- a/libavcodec/mpeg4videodec.h +++ b/libavcodec/mpeg4videodec.h @@ -52,6 +52,7 @@ typedef struct Mpeg4DecContext { /// sprite shift [isChroma] int sprite_shift[2]; + int mpeg_quant; // reversible vlc int rvlc; /// could this stream contain resync markers @@ -91,15 +92,19 @@ typedef struct Mpeg4DecContext { Mpeg4VideoDSPContext mdsp; + void (*dct_unquantize_mpeg2_inter)(MpegEncContext *s, + int16_t *block, int n, int qscale); void (*dct_unquantize_mpeg2_intra)(MpegEncContext *s, int16_t *block, int n, int qscale); void (*dct_unquantize_h263_intra)(MpegEncContext *s, int16_t *block, int n, int qscale); - DECLARE_ALIGNED(8, int32_t, block32)[12][64]; + union { + DECLARE_ALIGNED(8, int32_t, block32)[12][64]; + int16_t dpcm_macroblock[3][256]; + }; // 0 = DCT, 1 = DPCM top to bottom scan, -1 = DPCM bottom to top scan int dpcm_direction; - int16_t dpcm_macroblock[3][256]; } Mpeg4DecContext; int ff_mpeg4_decode_picture_header(MpegEncContext *s); diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c index 01d5076547f21..0fa8159f181ce 100644 --- a/libavcodec/mpeg4videoenc.c +++ b/libavcodec/mpeg4videoenc.c @@ -35,6 +35,7 @@ #include "mpeg4videoenc.h" #include "mpegvideoenc.h" #include "profiles.h" +#include "put_bits.h" #include "version.h" /** @@ -237,11 +238,10 @@ static inline int decide_ac_pred(MPVEncContext *const s, int16_t block[6][64], */ void ff_clean_mpeg4_qscales(MPVEncContext *const s) { - int8_t *const qscale_table = s->c.cur_pic.qscale_table; - ff_clean_h263_qscales(s); if (s->c.pict_type == AV_PICTURE_TYPE_B) { + int8_t *const qscale_table = s->c.cur_pic.qscale_table; int odd = 0; /* ok, come on, this isn't funny anymore, there's more code for * handling this MPEG-4 mess than for the actual adaptive quantization */ @@ -291,46 +291,19 @@ static inline void mpeg4_encode_dc(PutBitContext *s, int level, int n) } } -static inline int mpeg4_get_dc_length(int level, int n) -{ - if (n < 4) - return uni_DCtab_lum_len[level + 256]; - else - return uni_DCtab_chrom_len[level + 256]; -} - /** - * Encode an 8x8 block. - * @param n block index (0-3 are luma, 4-5 are chroma) + * Encode the AC coefficients of an 8x8 block. 
*/ -static inline void mpeg4_encode_block(const MPVEncContext *const s, - const int16_t *block, int n, int intra_dc, - const uint8_t *scan_table, PutBitContext *dc_pb, - PutBitContext *ac_pb) +static inline void mpeg4_encode_ac_coeffs(const int16_t block[64], + const int last_index, int i, + const uint8_t *const scan_table, + PutBitContext *const ac_pb, + const uint32_t *const bits_tab, + const uint8_t *const len_tab) { - int i, last_non_zero; - const uint32_t *bits_tab; - const uint8_t *len_tab; - const int last_index = s->c.block_last_index[n]; - - if (s->c.mb_intra) { // Note gcc (3.2.1 at least) will optimize this away - /* MPEG-4 based DC predictor */ - mpeg4_encode_dc(dc_pb, intra_dc, n); - if (last_index < 1) - return; - i = 1; - bits_tab = uni_mpeg4_intra_rl_bits; - len_tab = uni_mpeg4_intra_rl_len; - } else { - if (last_index < 0) - return; - i = 0; - bits_tab = uni_mpeg4_inter_rl_bits; - len_tab = uni_mpeg4_inter_rl_len; - } + int last_non_zero = i - 1; /* AC coefs */ - last_non_zero = i - 1; for (; i < last_index; i++) { int level = block[scan_table[i]]; if (level) { @@ -364,93 +337,40 @@ static inline void mpeg4_encode_block(const MPVEncContext *const s, } } -static int mpeg4_get_block_length(MPVEncContext *const s, - const int16_t *block, int n, - int intra_dc, const uint8_t *scan_table) +static void mpeg4_encode_blocks_inter(MPVEncContext *const s, + const int16_t block[6][64], + PutBitContext *ac_pb) { - int i, last_non_zero; - const uint8_t *len_tab; - const int last_index = s->c.block_last_index[n]; - int len = 0; - - if (s->c.mb_intra) { // Note gcc (3.2.1 at least) will optimize this away - /* MPEG-4 based DC predictor */ - len += mpeg4_get_dc_length(intra_dc, n); - if (last_index < 1) - return len; - i = 1; - len_tab = uni_mpeg4_intra_rl_len; - } else { + /* encode each block */ + for (int n = 0; n < 6; ++n) { + const int last_index = s->c.block_last_index[n]; if (last_index < 0) - return 0; - i = 0; - len_tab = uni_mpeg4_inter_rl_len; - } + continue; - /* AC coefs */ - last_non_zero = i - 1; - for (; i < last_index; i++) { - int level = block[scan_table[i]]; - if (level) { - int run = i - last_non_zero - 1; - level += 64; - if ((level & (~127)) == 0) { - const int index = UNI_MPEG4_ENC_INDEX(0, run, level); - len += len_tab[index]; - } else { // ESC3 - len += 7 + 2 + 1 + 6 + 1 + 12 + 1; - } - last_non_zero = i; - } - } - /* if (i <= last_index) */ { - int level = block[scan_table[i]]; - int run = i - last_non_zero - 1; - level += 64; - if ((level & (~127)) == 0) { - const int index = UNI_MPEG4_ENC_INDEX(1, run, level); - len += len_tab[index]; - } else { // ESC3 - len += 7 + 2 + 1 + 6 + 1 + 12 + 1; - } + mpeg4_encode_ac_coeffs(block[n], last_index, 0, + s->c.intra_scantable.permutated, ac_pb, + uni_mpeg4_inter_rl_bits, uni_mpeg4_inter_rl_len); } - - return len; } -static inline void mpeg4_encode_blocks(MPVEncContext *const s, - const int16_t block[6][64], - const int intra_dc[6], - const uint8_t * const *scan_table, - PutBitContext *dc_pb, - PutBitContext *ac_pb) +static void mpeg4_encode_blocks_intra(MPVEncContext *const s, + const int16_t block[6][64], + const int intra_dc[6], + const uint8_t * const *scan_table, + PutBitContext *dc_pb, + PutBitContext *ac_pb) { - int i; + /* encode each block */ + for (int n = 0; n < 6; ++n) { + mpeg4_encode_dc(dc_pb, intra_dc[n], n); - if (scan_table) { - if (s->c.avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT) { - for (i = 0; i < 6; i++) - skip_put_bits(&s->pb, - mpeg4_get_block_length(s, block[i], i, - intra_dc[i], scan_table[i])); - } 
else { - /* encode each block */ - for (i = 0; i < 6; i++) - mpeg4_encode_block(s, block[i], i, - intra_dc[i], scan_table[i], dc_pb, ac_pb); - } - } else { - if (s->c.avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT) { - for (i = 0; i < 6; i++) - skip_put_bits(&s->pb, - mpeg4_get_block_length(s, block[i], i, 0, - s->c.intra_scantable.permutated)); - } else { - /* encode each block */ - for (i = 0; i < 6; i++) - mpeg4_encode_block(s, block[i], i, 0, - s->c.intra_scantable.permutated, dc_pb, ac_pb); - } + const int last_index = s->c.block_last_index[n]; + if (last_index <= 0) + continue; + + mpeg4_encode_ac_coeffs(block[n], last_index, 1, + scan_table[n], ac_pb, + uni_mpeg4_intra_rl_bits, uni_mpeg4_intra_rl_len); } } @@ -640,7 +560,7 @@ static void mpeg4_encode_mb(MPVEncContext *const s, int16_t block[][64], if (interleaved_stats) s->mv_bits += get_bits_diff(s); - mpeg4_encode_blocks(s, block, NULL, NULL, NULL, &s->pb); + mpeg4_encode_blocks_inter(s, block, &s->pb); if (interleaved_stats) s->p_tex_bits += get_bits_diff(s); @@ -803,7 +723,7 @@ static void mpeg4_encode_mb(MPVEncContext *const s, int16_t block[][64], if (interleaved_stats) s->mv_bits += get_bits_diff(s); - mpeg4_encode_blocks(s, block, NULL, NULL, NULL, tex_pb); + mpeg4_encode_blocks_inter(s, block, tex_pb); if (interleaved_stats) s->p_tex_bits += get_bits_diff(s); @@ -865,7 +785,7 @@ static void mpeg4_encode_mb(MPVEncContext *const s, int16_t block[][64], if (interleaved_stats) s->misc_bits += get_bits_diff(s); - mpeg4_encode_blocks(s, block, dc_diff, scan_table, dc_pb, tex_pb); + mpeg4_encode_blocks_intra(s, block, dc_diff, scan_table, dc_pb, tex_pb); if (interleaved_stats) s->i_tex_bits += get_bits_diff(s); @@ -1032,9 +952,9 @@ static void mpeg4_encode_vol_header(Mpeg4EncContext *const m4, put_bits(&s->pb, 2, 0); /* sprite enable */ put_bits(&s->pb, 1, 0); /* not 8 bit == false */ - put_bits(&s->pb, 1, s->c.mpeg_quant); /* quant type = (0 = H.263 style) */ + put_bits(&s->pb, 1, s->mpeg_quant); /* quant type = (0 = H.263 style) */ - if (s->c.mpeg_quant) { + if (s->mpeg_quant) { ff_write_quant_matrix(&s->pb, s->c.avctx->intra_matrix); ff_write_quant_matrix(&s->pb, s->c.avctx->inter_matrix); } @@ -1070,6 +990,8 @@ static int mpeg4_encode_picture_header(MPVMainEncContext *const m) uint64_t time_incr; int64_t time_div, time_mod; + put_bits_assume_flushed(&s->pb); + if (s->c.pict_type == AV_PICTURE_TYPE_I) { if (!(s->c.avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)) { if (s->c.avctx->strict_std_compliance < FF_COMPLIANCE_VERY_STRICT) // HACK, the reference sw is buggy @@ -1181,95 +1103,78 @@ static av_cold void init_uni_dc_tab(void) static av_cold void init_uni_mpeg4_rl_tab(RLTable *rl, uint32_t *bits_tab, uint8_t *len_tab) { - int slevel, run, last; - - av_assert0(MAX_LEVEL >= 64); - av_assert0(MAX_RUN >= 63); + // Type 3 escape method. The escape code is the same for both VLCs + // (0x3, seven bits), so it is hardcoded. 
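For reference, a self-contained sketch (commentary, not patch content) of the 30-bit type-3 escape word that the hardcoded constants and the len_tab value of 30 below correspond to; the helper name is made up, and the field breakdown follows the 7 + 2 + 1 + 6 + 1 + 12 + 1 = 30 bit lengths visible in the removed code above:

/* Illustration only: assembling one complete type-3 escape code word. */
static unsigned mpeg4_esc3_word(unsigned last, unsigned run, int slevel)
{
    return (3u << 23)              |  /* 7-bit VLC escape code 0000011      */
           (3u << 21)              |  /* 2-bit escape mode "11" (type 3)    */
           (last << 20)            |  /* LAST: 1 = final coefficient        */
           (run  << 14)            |  /* RUN, 6 bits                        */
           (1u << 13)              |  /* marker bit                         */
           ((slevel & 0xfff) << 1) |  /* LEVEL, 12 bits, two's complement   */
           1u;                        /* marker bit                         */
}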
+ memset(len_tab, 30, 2 * 2 * 64 * 64); + len_tab += 64; + bits_tab += 64; + for (int run = 0; run < 64; ++run) { + for (int level = 1;; ++level) { + // Escape code type 3 not last run (6 bits) marker marker + unsigned code = (3 << 23) | (3 << 21) | (0 << 20) | (run << 14) | (1 << 13) | 1; + // first the negative levels + bits_tab[UNI_MPEG4_ENC_INDEX(0, run, -level)] = code | (-level & 0xfff) << 1; + bits_tab[UNI_MPEG4_ENC_INDEX(1, run, -level)] = + bits_tab[UNI_MPEG4_ENC_INDEX(0, run, -level)] | (1 << 20) /* last */; + + if (level == 64) // positive levels have a range of 1..63 + break; + bits_tab[UNI_MPEG4_ENC_INDEX(0, run, level)] = code | level << 1; + bits_tab[UNI_MPEG4_ENC_INDEX(1, run, level)] = + bits_tab[UNI_MPEG4_ENC_INDEX(0, run, level)] | (1 << 20) /* last */; + } + // Is this needed at all? + len_tab[UNI_MPEG4_ENC_INDEX(0, run, 0)] = + len_tab[UNI_MPEG4_ENC_INDEX(1, run, 0)] = 0; + } - for (slevel = -64; slevel < 64; slevel++) { - if (slevel == 0) - continue; - for (run = 0; run < 64; run++) { - for (last = 0; last <= 1; last++) { - const int index = UNI_MPEG4_ENC_INDEX(last, run, slevel + 64); - int level = slevel < 0 ? -slevel : slevel; - int sign = slevel < 0 ? 1 : 0; - int bits, len, code; - int level1, run1; - - len_tab[index] = 100; - - /* ESC0 */ - code = get_rl_index(rl, last, run, level); - bits = rl->table_vlc[code][0]; - len = rl->table_vlc[code][1]; - bits = bits * 2 + sign; - len++; - - if (code != rl->n && len < len_tab[index]) { - bits_tab[index] = bits; - len_tab[index] = len; - } - /* ESC1 */ - bits = rl->table_vlc[rl->n][0]; - len = rl->table_vlc[rl->n][1]; - bits = bits * 2; - len++; // esc1 - level1 = level - rl->max_level[last][run]; - if (level1 > 0) { - code = get_rl_index(rl, last, run, level1); - bits <<= rl->table_vlc[code][1]; - len += rl->table_vlc[code][1]; - bits += rl->table_vlc[code][0]; - bits = bits * 2 + sign; - len++; - - if (code != rl->n && len < len_tab[index]) { - bits_tab[index] = bits; - len_tab[index] = len; - } - } - /* ESC2 */ - bits = rl->table_vlc[rl->n][0]; - len = rl->table_vlc[rl->n][1]; - bits = bits * 4 + 2; - len += 2; // esc2 - run1 = run - rl->max_run[last][level] - 1; - if (run1 >= 0) { - code = get_rl_index(rl, last, run1, level); - bits <<= rl->table_vlc[code][1]; - len += rl->table_vlc[code][1]; - bits += rl->table_vlc[code][0]; - bits = bits * 2 + sign; - len++; - - if (code != rl->n && len < len_tab[index]) { - bits_tab[index] = bits; - len_tab[index] = len; - } - } - /* ESC3 */ - bits = rl->table_vlc[rl->n][0]; - len = rl->table_vlc[rl->n][1]; - bits = bits * 4 + 3; - len += 2; // esc3 - bits = bits * 2 + last; - len++; - bits = bits * 64 + run; - len += 6; - bits = bits * 2 + 1; - len++; // marker - bits = bits * 4096 + (slevel & 0xfff); - len += 12; - bits = bits * 2 + 1; - len++; // marker - - if (len < len_tab[index]) { - bits_tab[index] = bits; - len_tab[index] = len; - } - } + uint8_t max_run[2][32] = { 0 }; + +#define VLC_NUM_CODES 102 // excluding the escape + av_assert2(rl->n == VLC_NUM_CODES); + for (int i = VLC_NUM_CODES - 1, max_level, cur_run = 0; i >= 0; --i) { + int run = rl->table_run[i], level = rl->table_level[i]; + int last = i >= rl->last; + unsigned code = rl->table_vlc[i][0] << 1; + int len = rl->table_vlc[i][1] + 1; + + bits_tab[UNI_MPEG4_ENC_INDEX(last, run, level)] = code; + len_tab [UNI_MPEG4_ENC_INDEX(last, run, level)] = len; + bits_tab[UNI_MPEG4_ENC_INDEX(last, run, -level)] = code | 1; + len_tab [UNI_MPEG4_ENC_INDEX(last, run, -level)] = len; + + if (!max_run[last][level]) + 
max_run[last][level] = run + 1; + av_assert2(run + 1 <= max_run[last][level]); + + int run3 = run + max_run[last][level]; + int len3 = len + 7 + 2; + + if (run3 < 64 && len3 < len_tab[UNI_MPEG4_ENC_INDEX(last, run3, level)]) { + unsigned code3 = code | (0x3 << 2 | 0x2) << len; + bits_tab[UNI_MPEG4_ENC_INDEX(last, run3, level)] = code3; + len_tab [UNI_MPEG4_ENC_INDEX(last, run3, level)] = len3; + bits_tab[UNI_MPEG4_ENC_INDEX(last, run3, -level)] = code3 | 1; + len_tab [UNI_MPEG4_ENC_INDEX(last, run3, -level)] = len3; } + // table_run and table_level are ordered so that all the entries + // with the same last and run are consecutive and level is ascending + // among these entries. By traversing downwards we therefore automatically + // encounter max_level of a given run first, needed for escape method 1. + if (run != cur_run) { + max_level = level; + cur_run = run; + } else + av_assert2(max_level > level); + + code |= 0x3 << (len + 1); + len += 7 + 1; + level += max_level; + av_assert2(len_tab [UNI_MPEG4_ENC_INDEX(last, run, level)] >= len); + bits_tab[UNI_MPEG4_ENC_INDEX(last, run, level)] = code; + len_tab [UNI_MPEG4_ENC_INDEX(last, run, level)] = len; + bits_tab[UNI_MPEG4_ENC_INDEX(last, run, -level)] = code | 1; + len_tab [UNI_MPEG4_ENC_INDEX(last, run, -level)] = len; } } @@ -1277,8 +1182,6 @@ static av_cold void mpeg4_encode_init_static(void) { init_uni_dc_tab(); - ff_mpeg4_init_rl_intra(); - init_uni_mpeg4_rl_tab(&ff_mpeg4_rl_intra, uni_mpeg4_intra_rl_bits, uni_mpeg4_intra_rl_len); init_uni_mpeg4_rl_tab(&ff_h263_rl_inter, uni_mpeg4_inter_rl_bits, uni_mpeg4_inter_rl_len); @@ -1401,11 +1304,11 @@ void ff_mpeg4_encode_video_packet_header(MPVEncContext *const s) put_bits(&s->pb, 1, 0); /* no HEC */ } -#define OFFSET(x) offsetof(MPVEncContext, c.x) +#define OFFSET(x) offsetof(MPVEncContext, x) #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM static const AVOption options[] = { - { "data_partitioning", "Use data partitioning.", OFFSET(data_partitioning), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, - { "alternate_scan", "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, + { "data_partitioning", "Use data partitioning.", OFFSET(c.data_partitioning), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, + { "alternate_scan", "Enable alternate scantable.", OFFSET(c.alternate_scan), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, { "mpeg_quant", "Use MPEG quantizers instead of H.263", OFFSET(mpeg_quant), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, VE }, FF_MPV_COMMON_BFRAME_OPTS diff --git a/libavcodec/mpeg4videoenc.h b/libavcodec/mpeg4videoenc.h index 815f16f07379c..4e20b8aaa0d89 100644 --- a/libavcodec/mpeg4videoenc.h +++ b/libavcodec/mpeg4videoenc.h @@ -27,6 +27,18 @@ #include "put_bits.h" +enum { + MAX_PB2_INTRA_SIZE = 1 /* ac_pred */ + 5 /* max cbpy len */ + + 2 /* dquant */ + 1 /* interlaced dct */ + + 4 * (8 /* longest luma dct_dc_size */ + + 9 /* longest dc diff */ + 1 /* marker */) + + 2 * (9 + 9 + 1), + MAX_PB2_INTER_SIZE = 5 /* max cbpy len */ + + 2 /* dquant */ + 1 /* interlaced_dct */ + 1, + MAX_PB2_MB_SIZE = (FFMAX(MAX_PB2_INTER_SIZE, MAX_PB2_INTRA_SIZE) + 7) / 8, + MAX_AC_TEX_MB_SIZE = 64 * 6 * 30 /* longest escape code */ / 8, +}; + typedef struct MPVEncContext MPVEncContext; void ff_set_mpeg4_time(MPVEncContext *s); diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index 55f7178bed2f3..f3e4d4c386a0d 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -335,9 +335,11 @@ av_cold int 
ff_mpv_init_context_frame(MpegEncContext *s) s->coded_block = s->coded_block_base + s->b8_stride + 1; } - if (s->h263_pred || s->h263_plus || !s->encoding) { + if (s->h263_pred || s->h263_aic || !s->encoding) { /* dc values */ // MN: we need these for error resilience of intra-frames + // Allocating them unconditionally for decoders also means + // that we don't need to reinitialize when e.g. h263_aic changes. if (!FF_ALLOCZ_TYPED_ARRAY(s->dc_val_base, yc_size)) return AVERROR(ENOMEM); s->dc_val[0] = s->dc_val_base + s->b8_stride + 1; diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index 20a5759958d94..68d70cc0e36e6 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -90,7 +90,6 @@ typedef struct MpegEncContext { int pb_frame; ///< PB-frame mode (0 = none, 1 = base, 2 = improved) /* the following codec id fields are deprecated in favor of codec_id */ - int h263_plus; ///< H.263+ headers int h263_flv; ///< use flv H.263 header enum AVCodecID codec_id; /* see AV_CODEC_ID_xxx */ @@ -254,7 +253,6 @@ typedef struct MpegEncContext { int data_partitioning; ///< data partitioning flag from header int partitioned_frame; ///< is current frame partitioned int low_delay; ///< no reordering needed / has no B-frames - int mpeg_quant; int padding_bug_score; ///< used to detect the VERY common padding bug in MPEG-4 /* divx specific, used to workaround (many) bugs in divx5 */ diff --git a/libavcodec/mpegvideo_dec.c b/libavcodec/mpegvideo_dec.c index b8b84ffd8da18..f8551b93c8890 100644 --- a/libavcodec/mpegvideo_dec.c +++ b/libavcodec/mpegvideo_dec.c @@ -424,13 +424,6 @@ av_cold void ff_mpeg_flush(AVCodecContext *avctx) s->pp_time = 0; } -void ff_mpv_report_decode_progress(MpegEncContext *s) -{ - if (s->pict_type != AV_PICTURE_TYPE_B && !s->partitioned_frame && !s->er.error_occurred) - ff_thread_progress_report(&s->cur_pic.ptr->progress, s->mb_y); -} - - static inline int hpel_motion_lowres(MpegEncContext *s, uint8_t *dest, const uint8_t *src, int field_based, int field_select, @@ -817,7 +810,7 @@ static inline void MPV_motion_lowres(MpegEncContext *s, } break; default: - av_assert2(0); + av_unreachable("No other mpegvideo MV types exist"); } } @@ -967,8 +960,8 @@ void mpv_reconstruct_mb_internal(MpegEncContext *s, int16_t block[12][64], } /* add dct residue */ - if (!(IS_MPEG12_H261(s) || s->msmpeg4_version != MSMP4_UNUSED || - (s->codec_id == AV_CODEC_ID_MPEG4 && !s->mpeg_quant))) { + if (is_mpeg12 != DEFINITELY_MPEG12_H261 && s->dct_unquantize_inter) { + // H.263, H.263+, H.263I, FLV, RV10, RV20 and MPEG-4 with MPEG-2 quantization add_dequant_dct(s, block[0], 0, dest_y , dct_linesize, s->qscale); add_dequant_dct(s, block[1], 1, dest_y + block_size, dct_linesize, s->qscale); add_dequant_dct(s, block[2], 2, dest_y + dct_offset , dct_linesize, s->qscale); @@ -980,6 +973,10 @@ void mpv_reconstruct_mb_internal(MpegEncContext *s, int16_t block[12][64], add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale); } } else if (is_mpeg12 == DEFINITELY_MPEG12_H261 || lowres_flag || (s->codec_id != AV_CODEC_ID_WMV2)) { + // H.261, MPEG-1, MPEG-2, MPEG-4 with H.263 quantization, + // MSMP4V1-3 and WMV1. + // Also RV30, RV40 and the VC-1 family when performing error resilience, + // but all blocks are skipped in this case. 
add_dct(s, block[0], 0, dest_y , dct_linesize); add_dct(s, block[1], 1, dest_y + block_size, dct_linesize); add_dct(s, block[2], 2, dest_y + dct_offset , dct_linesize); diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 6e9533ebc92b4..46901fc506254 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -313,14 +313,15 @@ av_cold void ff_dct_encode_init(MPVEncContext *const s) s->dct_quantize = dct_quantize_trellis_c; } -static av_cold void init_unquantize(MpegEncContext *const s, AVCodecContext *avctx) +static av_cold void init_unquantize(MPVEncContext *const s2, AVCodecContext *avctx) { + MpegEncContext *const s = &s2->c; MPVUnquantDSPContext unquant_dsp_ctx; ff_mpv_unquantize_init(&unquant_dsp_ctx, avctx->flags & AV_CODEC_FLAG_BITEXACT, s->q_scale_type); - if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG2VIDEO) { + if (s2->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG2VIDEO) { s->dct_unquantize_intra = unquant_dsp_ctx.dct_unquantize_mpeg2_intra; s->dct_unquantize_inter = unquant_dsp_ctx.dct_unquantize_mpeg2_inter; } else if (s->out_format == FMT_H263 || s->out_format == FMT_H261) { @@ -403,7 +404,7 @@ static av_cold int init_matrices(MPVMainEncContext *const m, AVCodecContext *avc } if (CONFIG_MPEG4_ENCODER && s->c.codec_id == AV_CODEC_ID_MPEG4 && - s->c.mpeg_quant) { + s->mpeg_quant) { intra_matrix = ff_mpeg4_default_intra_matrix; inter_matrix = ff_mpeg4_default_non_intra_matrix; } else if (s->c.out_format == FMT_H263 || s->c.out_format == FMT_H261) { @@ -559,9 +560,10 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) case AV_PIX_FMT_YUV422P: s->c.chroma_format = CHROMA_422; break; + default: + av_unreachable("Already checked via CODEC_PIXFMTS"); case AV_PIX_FMT_YUVJ420P: case AV_PIX_FMT_YUV420P: - default: s->c.chroma_format = CHROMA_420; break; } @@ -838,7 +840,7 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) //return -1; } - if (s->c.mpeg_quant || s->c.codec_id == AV_CODEC_ID_MPEG1VIDEO || s->c.codec_id == AV_CODEC_ID_MPEG2VIDEO || s->c.codec_id == AV_CODEC_ID_MJPEG || s->c.codec_id == AV_CODEC_ID_AMV || s->c.codec_id == AV_CODEC_ID_SPEEDHQ) { + if (s->mpeg_quant || s->c.codec_id == AV_CODEC_ID_MPEG1VIDEO || s->c.codec_id == AV_CODEC_ID_MPEG2VIDEO || s->c.codec_id == AV_CODEC_ID_MJPEG || s->c.codec_id == AV_CODEC_ID_AMV || s->c.codec_id == AV_CODEC_ID_SPEEDHQ) { // (a + x * 3 / 8) / x s->intra_quant_bias = 3 << (QUANT_BIAS_SHIFT - 3); s->inter_quant_bias = 0; @@ -906,7 +908,6 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) break; case AV_CODEC_ID_H263P: s->c.out_format = FMT_H263; - s->c.h263_plus = 1; /* Fx */ s->c.h263_aic = (avctx->flags & AV_CODEC_FLAG_AC_PRED) ? 1 : 0; s->c.modified_quant = s->c.h263_aic; @@ -942,8 +943,9 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) avctx->delay = 0; s->c.low_delay = 1; s->c.modified_quant = 1; + // Set here to force allocation of dc_val; + // will be set later on a per-frame basis. s->c.h263_aic = 1; - s->c.h263_plus = 1; s->c.loop_filter = 1; s->c.unrestricted_mv = 0; break; @@ -992,7 +994,7 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) s->c.low_delay = 1; break; default: - return AVERROR(EINVAL); + av_unreachable("List contains all codecs using ff_mpv_encode_init()"); } avctx->has_b_frames = !s->c.low_delay; @@ -1026,10 +1028,10 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) * before calling ff_mpv_common_init(). 
*/ s->parent = m; ff_mpv_idct_init(&s->c); - init_unquantize(&s->c, avctx); + init_unquantize(s, avctx); ff_fdctdsp_init(&s->fdsp, avctx); ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); - ff_pixblockdsp_init(&s->pdsp, avctx); + ff_pixblockdsp_init(&s->pdsp, 8); ret = me_cmp_init(m, avctx); if (ret < 0) return ret; @@ -2978,14 +2980,15 @@ static int encode_thread(AVCodecContext *c, void *arg){ int i; MBBackup best_s = { 0 }, backup_s; uint8_t bit_buf[2][MAX_MB_BYTES]; - uint8_t bit_buf2[2][MAX_MB_BYTES]; - uint8_t bit_buf_tex[2][MAX_MB_BYTES]; + // + 2 because ff_copy_bits() overreads + uint8_t bit_buf2[2][MAX_PB2_MB_SIZE + 2]; + uint8_t bit_buf_tex[2][MAX_AC_TEX_MB_SIZE + 2]; PutBitContext pb[2], pb2[2], tex_pb[2]; for(i=0; i<2; i++){ init_put_bits(&pb [i], bit_buf [i], MAX_MB_BYTES); - init_put_bits(&pb2 [i], bit_buf2 [i], MAX_MB_BYTES); - init_put_bits(&tex_pb[i], bit_buf_tex[i], MAX_MB_BYTES); + init_put_bits(&pb2 [i], bit_buf2 [i], MAX_PB2_MB_SIZE); + init_put_bits(&tex_pb[i], bit_buf_tex[i], MAX_AC_TEX_MB_SIZE); } s->last_bits= put_bits_count(&s->pb); @@ -3006,25 +3009,17 @@ static int encode_thread(AVCodecContext *c, void *arg){ s->c.last_dc[0] = 128 * 8 / 13; s->c.last_dc[1] = 128 * 8 / 14; s->c.last_dc[2] = 128 * 8 / 14; +#if CONFIG_MPEG4_ENCODER + } else if (s->c.partitioned_frame) { + av_assert1(s->c.codec_id == AV_CODEC_ID_MPEG4); + ff_mpeg4_init_partitions(s); +#endif } s->c.mb_skip_run = 0; memset(s->c.last_mv, 0, sizeof(s->c.last_mv)); s->last_mv_dir = 0; - switch (s->c.codec_id) { - case AV_CODEC_ID_H263: - case AV_CODEC_ID_H263P: - case AV_CODEC_ID_FLV1: - if (CONFIG_H263_ENCODER) - s->c.gob_index = H263_GOB_HEIGHT(s->c.height); - break; - case AV_CODEC_ID_MPEG4: - if (CONFIG_MPEG4_ENCODER && s->c.partitioned_frame) - ff_mpeg4_init_partitions(s); - break; - } - s->c.resync_mb_x = 0; s->c.resync_mb_y = 0; s->c.first_slice_line = 1; @@ -3541,7 +3536,10 @@ static int encode_thread(AVCodecContext *c, void *arg){ } break; default: - av_log(s->c.avctx, AV_LOG_ERROR, "illegal MB type\n"); + av_unreachable("There is a case for every CANDIDATE_MB_TYPE_* " + "except CANDIDATE_MB_TYPE_SKIPPED which is never " + "the only candidate (always coupled with INTER) " + "so that it never reaches this switch"); } encode_mb(s, motion_x, motion_y); @@ -4019,7 +4017,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s, last_non_zero = 0; qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale]; matrix = n < 4 ? s->c.intra_matrix : s->c.chroma_intra_matrix; - if (s->c.mpeg_quant || s->c.out_format == FMT_MPEG1 || s->c.out_format == FMT_MJPEG) + if (s->mpeg_quant || s->c.out_format == FMT_MPEG1 || s->c.out_format == FMT_MJPEG) bias= 1<<(QMAT_SHIFT-1); if (n > 3 && s->intra_chroma_ac_vlc_length) { @@ -4334,7 +4332,7 @@ static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise? 
dc= block[0]*q; // block[0] = (block[0] + (q >> 1)) / q; start_i = 1; -// if (s->c.mpeg_quant || s->c.out_format == FMT_MPEG1) +// if (s->mpeg_quant || s->c.out_format == FMT_MPEG1) // bias= 1<<(QMAT_SHIFT-1); if (n > 3 && s->intra_chroma_ac_vlc_length) { length = s->intra_chroma_ac_vlc_length; diff --git a/libavcodec/mpegvideo_motion.c b/libavcodec/mpegvideo_motion.c index edc49310929c6..a48b898dac1b1 100644 --- a/libavcodec/mpegvideo_motion.c +++ b/libavcodec/mpegvideo_motion.c @@ -813,7 +813,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s, } break; } - default: av_assert2(0); + default: + av_unreachable("No other mpegvideo MV types exist"); } } diff --git a/libavcodec/mpegvideodec.h b/libavcodec/mpegvideodec.h index bc4bc905908bb..8bc70b02c0490 100644 --- a/libavcodec/mpegvideodec.h +++ b/libavcodec/mpegvideodec.h @@ -57,7 +57,6 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx); */ int ff_mpv_alloc_dummy_frames(MpegEncContext *s); void ff_mpv_reconstruct_mb(MpegEncContext *s, int16_t block[12][64]); -void ff_mpv_report_decode_progress(MpegEncContext *s); void ff_mpv_frame_end(MpegEncContext *s); int ff_mpv_export_qp_table(const MpegEncContext *s, AVFrame *f, diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h index ec0304c4a0fba..5510b43f86ce5 100644 --- a/libavcodec/mpegvideoenc.h +++ b/libavcodec/mpegvideoenc.h @@ -147,6 +147,7 @@ typedef struct MPVEncContext { int last_mv_dir; ///< last mv_dir, used for B-frame encoding /* MPEG-4 specific */ + int mpeg_quant; PutBitContext tex_pb; ///< used for data partitioned VOPs PutBitContext pb2; ///< used for data partitioned VOPs diff --git a/libavcodec/msmpeg4dec.c b/libavcodec/msmpeg4dec.c index df67d435421b6..df5ab5186eba3 100644 --- a/libavcodec/msmpeg4dec.c +++ b/libavcodec/msmpeg4dec.c @@ -366,6 +366,9 @@ av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx) if (ff_h263_decode_init(avctx) < 0) return -1; + // We unquantize inter blocks as we parse them. 
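+    // This makes mpv_reconstruct_mb_internal() (which now tests
+    // s->dct_unquantize_inter) take the plain add_dct() path instead of
+    // add_dequant_dct() for these blocks.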
+ s->dct_unquantize_inter = NULL; + ff_msmpeg4_common_init(s); switch (s->msmpeg4_version) { @@ -379,6 +382,8 @@ av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx) break; case MSMP4_WMV2: break; + default: + av_unreachable("List contains all cases using ff_msmpeg4_decode_init()"); } s->slice_height= s->mb_height; //to avoid 1/0 if the first frame is not a keyframe @@ -472,6 +477,8 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s) ms->dc_table_index = get_bits1(&s->gb); s->inter_intra_pred= 0; break; + default: + av_unreachable("ff_msmpeg4_decode_picture_header() only used by MSMP4V1-3, WMV1"); } s->no_rounding = 1; if(s->avctx->debug&FF_DEBUG_PICT_INFO) @@ -523,6 +530,8 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s) s->inter_intra_pred = s->width*s->height < 320*240 && ms->bit_rate <= II_BITRATE; break; + default: + av_unreachable("ff_msmpeg4_decode_picture_header() only used by MSMP4V1-3, WMV1"); } if(s->avctx->debug&FF_DEBUG_PICT_INFO) diff --git a/libavcodec/msmpeg4enc.c b/libavcodec/msmpeg4enc.c index 795db6e4de9d4..3449328b3c768 100644 --- a/libavcodec/msmpeg4enc.c +++ b/libavcodec/msmpeg4enc.c @@ -221,7 +221,8 @@ static int msmpeg4_encode_picture_header(MPVMainEncContext *const m) find_best_tables(ms); - align_put_bits(&s->pb); + put_bits_assume_flushed(&s->pb); + put_bits(&s->pb, 2, s->c.pict_type - 1); put_bits(&s->pb, 5, s->c.qscale); diff --git a/libavcodec/nvdec_mpeg4.c b/libavcodec/nvdec_mpeg4.c index 7d158321aec24..827243903104e 100644 --- a/libavcodec/nvdec_mpeg4.c +++ b/libavcodec/nvdec_mpeg4.c @@ -70,7 +70,7 @@ static int nvdec_mpeg4_start_frame(AVCodecContext *avctx, .vop_time_increment_bitcount = m->time_increment_bits, .top_field_first = s->top_field_first, .resync_marker_disable = !m->resync_marker, - .quant_type = s->mpeg_quant, + .quant_type = m->mpeg_quant, .quarter_sample = s->quarter_sample, .short_video_header = avctx->codec->id == AV_CODEC_ID_H263, .divx_flags = s->divx_packed ? 
5 : 0, diff --git a/libavcodec/pcm-dvdenc.c b/libavcodec/pcm-dvdenc.c index b1f01ee323be3..a740f0e381b07 100644 --- a/libavcodec/pcm-dvdenc.c +++ b/libavcodec/pcm-dvdenc.c @@ -19,6 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/avassert.h" #include "libavutil/channel_layout.h" #include "avcodec.h" #include "bytestream.h" @@ -45,7 +46,7 @@ static av_cold int pcm_dvd_encode_init(AVCodecContext *avctx) freq = 1; break; default: - av_assert1(0); + av_unreachable("Already checked via CODEC_SAMPLERATES"); } switch (avctx->sample_fmt) { @@ -58,7 +59,7 @@ static av_cold int pcm_dvd_encode_init(AVCodecContext *avctx) quant = 2; break; default: - av_assert1(0); + av_unreachable("Already checked via CODEC_SAMPLEFMTS"); } avctx->bits_per_coded_sample = 16 + quant * 4; diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c index bff61f2195a0b..68b19451942cc 100644 --- a/libavcodec/pcm.c +++ b/libavcodec/pcm.c @@ -327,6 +327,8 @@ static av_cold av_unused int pcm_lut_decode_init(AVCodecContext *avctx) PCMLUTDecode *s = avctx->priv_data; switch (avctx->codec_id) { + default: + av_unreachable("pcm_lut_decode_init() only used with alaw, mulaw and vidc"); case AV_CODEC_ID_PCM_ALAW: for (int i = 0; i < 256; i++) s->table[i] = alaw2linear(i); diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c index 1fff244511ad8..110a374260d35 100644 --- a/libavcodec/pixblockdsp.c +++ b/libavcodec/pixblockdsp.c @@ -21,7 +21,6 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/intreadwrite.h" -#include "avcodec.h" #include "pixblockdsp.h" static void get_pixels_16_c(int16_t *restrict block, const uint8_t *pixels, @@ -85,40 +84,33 @@ static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1, } } -av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx) +av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, int bits_per_raw_sample) { - av_unused const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8; + const unsigned high_bit_depth = bits_per_raw_sample > 8 && + bits_per_raw_sample <= 16; c->diff_pixels_unaligned = c->diff_pixels = diff_pixels_c; - switch (avctx->bits_per_raw_sample) { - case 9: - case 10: - case 12: - case 14: + if (high_bit_depth) { c->get_pixels_unaligned = get_pixels_unaligned_16_c; - c->get_pixels = get_pixels_16_c; - break; - default: - if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) { - c->get_pixels_unaligned = - c->get_pixels = get_pixels_8_c; - } - break; + c->get_pixels = get_pixels_16_c; + } else { + c->get_pixels_unaligned = + c->get_pixels = get_pixels_8_c; } #if ARCH_AARCH64 - ff_pixblockdsp_init_aarch64(c, avctx, high_bit_depth); + ff_pixblockdsp_init_aarch64(c, high_bit_depth); #elif ARCH_ARM - ff_pixblockdsp_init_arm(c, avctx, high_bit_depth); + ff_pixblockdsp_init_arm(c, high_bit_depth); #elif ARCH_PPC - ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth); + ff_pixblockdsp_init_ppc(c, high_bit_depth); #elif ARCH_RISCV - ff_pixblockdsp_init_riscv(c, avctx, high_bit_depth); + ff_pixblockdsp_init_riscv(c, high_bit_depth); #elif ARCH_X86 - ff_pixblockdsp_init_x86(c, avctx, high_bit_depth); + ff_pixblockdsp_init_x86(c, high_bit_depth); #elif ARCH_MIPS - ff_pixblockdsp_init_mips(c, avctx, high_bit_depth); + ff_pixblockdsp_init_mips(c, high_bit_depth); #endif } diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h index 215b0905d7f07..d493d0e22b74c 100644 --- a/libavcodec/pixblockdsp.h +++ b/libavcodec/pixblockdsp.h @@ 
-19,13 +19,17 @@ #ifndef AVCODEC_PIXBLOCKDSP_H #define AVCODEC_PIXBLOCKDSP_H +#include <stddef.h> #include <stdint.h> -#include "avcodec.h" +#define PIXBLOCKDSP_8BPP_GET_PIXELS_SUPPORTS_UNALIGNED \ + !(ARCH_ARM || ARCH_MIPS || ARCH_PPC || ARCH_RISCV) typedef struct PixblockDSPContext { void (*get_pixels)(int16_t *restrict block /* align 16 */, - const uint8_t *pixels /* align 8 */, + /* align 16 for > 8 bits; align 8 for <= 8 bits + * (or 1 if PIXBLOCKDSP_8BPP_GET_PIXELS_SUPPORTS_UNALIGNED is set) */ + const uint8_t *pixels, ptrdiff_t stride); void (*get_pixels_unaligned)(int16_t *restrict block /* align 16 */, const uint8_t *pixels, @@ -41,18 +45,18 @@ typedef struct PixblockDSPContext { } PixblockDSPContext; -void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx); -void ff_pixblockdsp_init_aarch64(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init(PixblockDSPContext *c, int bits_per_raw_sample); +void ff_pixblockdsp_init_aarch64(PixblockDSPContext *c, unsigned high_bit_depth); -void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_arm(PixblockDSPContext *c, unsigned high_bit_depth); -void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, unsigned high_bit_depth); -void ff_pixblockdsp_init_riscv(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_riscv(PixblockDSPContext *c, unsigned high_bit_depth); -void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_x86(PixblockDSPContext *c, unsigned high_bit_depth); -void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx, +void ff_pixblockdsp_init_mips(PixblockDSPContext *c, unsigned high_bit_depth); #endif /* AVCODEC_PIXBLOCKDSP_H */ diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c index 01d14b4124170..75287b1e85373 100644 --- a/libavcodec/ppc/pixblockdsp.c +++ b/libavcodec/ppc/pixblockdsp.c @@ -27,7 +27,6 @@ #include "libavutil/ppc/cpu.h" #include "libavutil/ppc/util_altivec.h" -#include "libavcodec/avcodec.h" #include "libavcodec/pixblockdsp.h" #if HAVE_ALTIVEC @@ -263,7 +262,6 @@ static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1, #endif /* HAVE_VSX */ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, - AVCodecContext *avctx, unsigned high_bit_depth) { #if HAVE_ALTIVEC diff --git a/libavcodec/proresenc_anatoliy.c b/libavcodec/proresenc_anatoliy.c index fc69b94780d4c..4fc40abaac8a8 100644 --- a/libavcodec/proresenc_anatoliy.c +++ b/libavcodec/proresenc_anatoliy.c @@ -27,6 +27,7 @@ * Known FOURCCs: 'ap4h' (444), 'apch' (HQ), 'apcn' (422), 'apcs' (LT), 'acpo' (Proxy) */ +#include "libavutil/avassert.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" #include "libavutil/opt.h" @@ -845,20 +846,25 @@ static av_cold int prores_encode_init(AVCodecContext *avctx) } if (avctx->profile == AV_PROFILE_UNKNOWN) { - if (avctx->pix_fmt == AV_PIX_FMT_YUV422P10) { + switch (avctx->pix_fmt) { + case AV_PIX_FMT_YUV422P10: avctx->profile = AV_PROFILE_PRORES_STANDARD; av_log(avctx, AV_LOG_INFO, "encoding with ProRes standard (apcn) profile\n"); - } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10) { + break; + case AV_PIX_FMT_YUV444P10: avctx->profile = AV_PROFILE_PRORES_4444; av_log(avctx, AV_LOG_INFO, "encoding with ProRes 4444 (ap4h) profile\n"); - } else if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P10) { + break; + case AV_PIX_FMT_YUVA444P10: avctx->profile =
AV_PROFILE_PRORES_4444; av_log(avctx, AV_LOG_INFO, "encoding with ProRes 4444+ (ap4h) profile\n"); - } else - av_assert0(0); + break; + default: + av_unreachable("Already checked via CODEC_PIXFMTS"); + } } else if (avctx->profile < AV_PROFILE_PRORES_PROXY || avctx->profile > AV_PROFILE_PRORES_XQ) { av_log( diff --git a/libavcodec/put_bits.h b/libavcodec/put_bits.h index 56c3f4cc6d17b..c3eee622d41ee 100644 --- a/libavcodec/put_bits.h +++ b/libavcodec/put_bits.h @@ -74,6 +74,16 @@ static inline void init_put_bits(PutBitContext *s, uint8_t *buffer, s->bit_buf = 0; } +/** + * Inform the compiler that a PutBitContext is flushed (i.e. if it has just + * been initialized or flushed). Undefined behaviour occurs if this is used + * with a PutBitContext for which this is not true. + */ +static inline void put_bits_assume_flushed(const PutBitContext *s) +{ + av_assume(s->bit_left == BUF_BITS); +} + /** * @return the total number of bits written to the bitstream. */ diff --git a/libavcodec/ratecontrol.c b/libavcodec/ratecontrol.c index 06d998efcb61d..3dc44683d01cc 100644 --- a/libavcodec/ratecontrol.c +++ b/libavcodec/ratecontrol.c @@ -699,7 +699,7 @@ av_cold int ff_rate_control_init(MPVMainEncContext *const m) if (s->adaptive_quant) { unsigned mb_array_size = s->c.mb_stride * s->c.mb_height; - rcc->cplx_tab = av_malloc_array(mb_array_size, 2 * sizeof(rcc->cplx_tab)); + rcc->cplx_tab = av_malloc_array(mb_array_size, 2 * sizeof(*rcc->cplx_tab)); if (!rcc->cplx_tab) return AVERROR(ENOMEM); rcc->bits_tab = rcc->cplx_tab + mb_array_size; diff --git a/libavcodec/riscv/pixblockdsp_init.c b/libavcodec/riscv/pixblockdsp_init.c index 28caa99dfff0c..e59fba63cc4e3 100644 --- a/libavcodec/riscv/pixblockdsp_init.c +++ b/libavcodec/riscv/pixblockdsp_init.c @@ -24,7 +24,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/riscv/cpu.h" -#include "libavcodec/avcodec.h" #include "libavcodec/pixblockdsp.h" void ff_get_pixels_8_rvi(int16_t *block, const uint8_t *pixels, @@ -42,7 +41,6 @@ void ff_diff_pixels_unaligned_rvv(int16_t *block, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride); av_cold void ff_pixblockdsp_init_riscv(PixblockDSPContext *c, - AVCodecContext *avctx, unsigned high_bit_depth) { #if HAVE_RV diff --git a/libavcodec/rv10enc.c b/libavcodec/rv10enc.c index 984fe3379dd81..534b93fd81e7d 100644 --- a/libavcodec/rv10enc.c +++ b/libavcodec/rv10enc.c @@ -36,7 +36,7 @@ int ff_rv10_encode_picture_header(MPVMainEncContext *const m) MPVEncContext *const s = &m->s; int full_frame= 0; - align_put_bits(&s->pb); + put_bits_assume_flushed(&s->pb); put_bits(&s->pb, 1, 1); /* marker */ diff --git a/libavcodec/rv20enc.c b/libavcodec/rv20enc.c index ce0d435dcb209..5c3850c12fa24 100644 --- a/libavcodec/rv20enc.c +++ b/libavcodec/rv20enc.c @@ -38,6 +38,8 @@ int ff_rv20_encode_picture_header(MPVMainEncContext *const m) { MPVEncContext *const s = &m->s; + put_bits_assume_flushed(&s->pb); + put_bits(&s->pb, 2, s->c.pict_type); //I 0 vs. 1 ? 
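/* A minimal usage sketch for the new put_bits_assume_flushed() helper, following
 * the pattern of the picture-header writers converted in this patch (buffer name
 * and field values below are illustrative only):
 *
 *     PutBitContext pb;
 *     uint8_t buf[64];
 *
 *     init_put_bits(&pb, buf, sizeof(buf));
 *     put_bits_assume_flushed(&pb);     // undefined behaviour unless pb really is flushed
 *     put_bits(&pb, 2, pict_type - 1);  // first writes may then skip the spill path
 *
 * Unlike the align_put_bits() calls it replaces, this emits no bits at all: it
 * only hands the "no pending bits" invariant to the optimizer via av_assume().
 */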
put_bits(&s->pb, 1, 0); /* unknown bit */ put_bits(&s->pb, 5, s->c.qscale); @@ -48,12 +50,12 @@ int ff_rv20_encode_picture_header(MPVMainEncContext *const m) put_bits(&s->pb, 1, s->c.no_rounding); - av_assert0(s->f_code == 1); - av_assert0(!s->c.unrestricted_mv); - av_assert0(!s->c.alt_inter_vlc); - av_assert0(!s->c.umvplus); - av_assert0(s->c.modified_quant==1); - av_assert0(s->c.loop_filter==1); + av_assert1(s->f_code == 1); + av_assert1(!s->c.unrestricted_mv); + av_assert1(!s->c.alt_inter_vlc); + av_assert1(!s->c.umvplus); + av_assert1(s->c.modified_quant == 1); + av_assert1(s->c.loop_filter == 1); s->c.h263_aic = s->c.pict_type == AV_PICTURE_TYPE_I; if (s->c.h263_aic) { diff --git a/libavcodec/rv60dec.c b/libavcodec/rv60dec.c index d704ae512c2a2..2bbcb1d62093d 100644 --- a/libavcodec/rv60dec.c +++ b/libavcodec/rv60dec.c @@ -82,7 +82,7 @@ enum { }; static const VLCElem * cbp8_vlc[7][4]; -static const VLCElem * cbp16_vlc[7][3][4]; +static const VLCElem * cbp16_vlc[7][4][4]; typedef struct { const VLCElem * l0[2]; @@ -137,12 +137,12 @@ static av_cold void rv60_init_static_data(void) for (int i = 0; i < 7; i++) for (int j = 0; j < 4; j++) - cbp8_vlc[i][j] = gen_vlc(rv60_cbp8_lens[i][j], 64, &state); + cbp16_vlc[i][0][j] = cbp8_vlc[i][j] = gen_vlc(rv60_cbp8_lens[i][j], 64, &state); for (int i = 0; i < 7; i++) for (int j = 0; j < 3; j++) for (int k = 0; k < 4; k++) - cbp16_vlc[i][j][k] = gen_vlc(rv60_cbp16_lens[i][j][k], 64, &state); + cbp16_vlc[i][j + 1][k] = gen_vlc(rv60_cbp16_lens[i][j][k], 64, &state); build_coeff_vlc(rv60_intra_lens, intra_coeff_vlc, 5, &state); build_coeff_vlc(rv60_inter_lens, inter_coeff_vlc, 7, &state); @@ -1650,10 +1650,7 @@ static int decode_super_cbp(GetBitContext * gb, const VLCElem * vlc[4]) static int decode_cbp16(GetBitContext * gb, int subset, int qp) { int cb_set = rv60_qp_to_idx[qp]; - if (!subset) - return decode_super_cbp(gb, cbp8_vlc[cb_set]); - else - return decode_super_cbp(gb, cbp16_vlc[cb_set][subset - 1]); + return decode_super_cbp(gb, cbp16_vlc[cb_set][subset]); } static int decode_cu_r(RV60Context * s, AVFrame * frame, ThreadContext * thread, GetBitContext * gb, int xpos, int ypos, int log_size, int qp, int sel_qp) diff --git a/libavcodec/speedhqenc.c b/libavcodec/speedhqenc.c index ecba2cd840886..23ab86e8e2a92 100644 --- a/libavcodec/speedhqenc.c +++ b/libavcodec/speedhqenc.c @@ -27,6 +27,7 @@ * SpeedHQ encoder. 
*/ +#include "libavutil/avassert.h" #include "libavutil/thread.h" #include "avcodec.h" @@ -36,6 +37,7 @@ #include "mpegvideo.h" #include "mpegvideodata.h" #include "mpegvideoenc.h" +#include "put_bits.h" #include "rl.h" #include "speedhq.h" #include "speedhqenc.h" @@ -100,6 +102,8 @@ static int speedhq_encode_picture_header(MPVMainEncContext *const m) SpeedHQEncContext *const ctx = (SpeedHQEncContext*)m; MPVEncContext *const s = &m->s; + put_bits_assume_flushed(&s->pb); + put_bits_le(&s->pb, 8, 100 - s->c.qscale * 2); /* FIXME why doubled */ put_bits_le(&s->pb, 24, 4); /* no second field */ @@ -259,7 +263,7 @@ static av_cold int speedhq_encode_init(AVCodecContext *avctx) avctx->codec_tag = MKTAG('S','H','Q','4'); break; default: - av_assert0(0); + av_unreachable("Already checked via CODEC_PIXFMTS"); } m->encode_picture_header = speedhq_encode_picture_header; diff --git a/libavcodec/speexdec.c b/libavcodec/speexdec.c index 60daab3b01563..94dce5420cc46 100644 --- a/libavcodec/speexdec.c +++ b/libavcodec/speexdec.c @@ -169,7 +169,7 @@ typedef struct SpeexSubmode { typedef struct SpeexMode { int modeID; /**< ID of the mode */ - int (*decode)(AVCodecContext *avctx, void *dec, GetBitContext *gb, float *out); + int (*decode)(AVCodecContext *avctx, void *dec, GetBitContext *gb, float *out, int packets_left); int frame_size; /**< Size of frames used for decoding */ int subframe_size; /**< Size of sub-frames used for decoding */ int lpc_size; /**< Order of LPC filter */ @@ -521,8 +521,8 @@ static const SpeexSubmode wb_submode4 = { split_cb_shape_sign_unquant, &split_cb_high, -1.f }; -static int nb_decode(AVCodecContext *, void *, GetBitContext *, float *); -static int sb_decode(AVCodecContext *, void *, GetBitContext *, float *); +static int nb_decode(AVCodecContext *, void *, GetBitContext *, float *, int packets_left); +static int sb_decode(AVCodecContext *, void *, GetBitContext *, float *, int packets_left); static const SpeexMode speex_modes[SPEEX_NB_MODES] = { { @@ -867,7 +867,7 @@ static void lsp_to_lpc(const float *freq, float *ak, int lpcrdr) } static int nb_decode(AVCodecContext *avctx, void *ptr_st, - GetBitContext *gb, float *out) + GetBitContext *gb, float *out, int packets_left) { DecoderState *st = ptr_st; float ol_gain = 0, ol_pitch_coef = 0, best_pitch_gain = 0, pitch_average = 0; @@ -1218,7 +1218,7 @@ static void qmf_synth(const float *x1, const float *x2, const float *a, float *y } static int sb_decode(AVCodecContext *avctx, void *ptr_st, - GetBitContext *gb, float *out) + GetBitContext *gb, float *out, int packets_left) { SpeexContext *s = avctx->priv_data; DecoderState *st = ptr_st; @@ -1234,9 +1234,11 @@ static int sb_decode(AVCodecContext *avctx, void *ptr_st, mode = st->mode; if (st->modeID > 0) { + if (packets_left <= 1) + return AVERROR_INVALIDDATA; low_innov_alias = out + st->frame_size; s->st[st->modeID - 1].innov_save = low_innov_alias; - ret = speex_modes[st->modeID - 1].decode(avctx, &s->st[st->modeID - 1], gb, out); + ret = speex_modes[st->modeID - 1].decode(avctx, &s->st[st->modeID - 1], gb, out, packets_left); if (ret < 0) return ret; } @@ -1559,7 +1561,7 @@ static int speex_decode_frame(AVCodecContext *avctx, AVFrame *frame, dst = (float *)frame->extended_data[0]; for (int i = 0; i < frames_per_packet; i++) { - ret = speex_modes[s->mode].decode(avctx, &s->st[s->mode], &s->gb, dst + i * s->frame_size); + ret = speex_modes[s->mode].decode(avctx, &s->st[s->mode], &s->gb, dst + i * s->frame_size, frames_per_packet - i); if (ret < 0) return ret; if 
(avctx->ch_layout.nb_channels == 2) diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c index 6319e9b0216e9..4c4f3018c518c 100644 --- a/libavcodec/svq3.c +++ b/libavcodec/svq3.c @@ -71,13 +71,14 @@ * svq3 decoder. */ +#define NUM_PICS 3 + typedef struct SVQ3Frame { AVFrame *f; - int16_t (*motion_val_buf[2])[2]; int16_t (*motion_val[2])[2]; - uint32_t *mb_type_buf, *mb_type; + uint32_t *mb_type; } SVQ3Frame; typedef struct SVQ3Context { @@ -103,7 +104,6 @@ typedef struct SVQ3Context { int adaptive_quant; int h_edge_pos; int v_edge_pos; - int last_frame_output; int slice_num; int qscale; int cbp; @@ -142,7 +142,10 @@ typedef struct SVQ3Context { DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[15 * 8]; uint32_t dequant4_coeff[QP_MAX_NUM + 1][16]; int block_offset[2 * (16 * 3)]; - SVQ3Frame frames[3]; + SVQ3Frame frames[NUM_PICS]; + + uint32_t *mb_type_buf; + int16_t (*motion_val_buf)[2]; } SVQ3Context; #define FULLPEL_MODE 1 @@ -1114,14 +1117,139 @@ static void init_dequant4_coeff_table(SVQ3Context *s) } } +static av_cold int svq3_decode_extradata(AVCodecContext *avctx, SVQ3Context *s, + int seqh_offset) +{ + const uint8_t *extradata = avctx->extradata + seqh_offset; + unsigned int size = AV_RB32(extradata + 4); + GetBitContext gb; + int ret; + + if (size > avctx->extradata_size - seqh_offset - 8) + return AVERROR_INVALIDDATA; + extradata += 8; + init_get_bits(&gb, extradata, size * 8); + + /* 'frame size code' and optional 'width, height' */ + int frame_size_code = get_bits(&gb, 3); + int w, h; + switch (frame_size_code) { + case 0: + w = 160; + h = 120; + break; + case 1: + w = 128; + h = 96; + break; + case 2: + w = 176; + h = 144; + break; + case 3: + w = 352; + h = 288; + break; + case 4: + w = 704; + h = 576; + break; + case 5: + w = 240; + h = 180; + break; + case 6: + w = 320; + h = 240; + break; + case 7: + w = get_bits(&gb, 12); + h = get_bits(&gb, 12); + break; + } + ret = ff_set_dimensions(avctx, w, h); + if (ret < 0) + return ret; + + s->halfpel_flag = get_bits1(&gb); + s->thirdpel_flag = get_bits1(&gb); + + /* unknown fields */ + int unk0 = get_bits1(&gb); + int unk1 = get_bits1(&gb); + int unk2 = get_bits1(&gb); + int unk3 = get_bits1(&gb); + + s->low_delay = get_bits1(&gb); + avctx->has_b_frames = !s->low_delay; + + /* unknown field */ + int unk4 = get_bits1(&gb); + + av_log(avctx, AV_LOG_DEBUG, "Unknown fields %d %d %d %d %d\n", + unk0, unk1, unk2, unk3, unk4); + + if (skip_1stop_8data_bits(&gb) < 0) + return AVERROR_INVALIDDATA; + + s->has_watermark = get_bits1(&gb); + + if (!s->has_watermark) + return 0; + +#if CONFIG_ZLIB + unsigned watermark_width = get_interleaved_ue_golomb(&gb); + unsigned watermark_height = get_interleaved_ue_golomb(&gb); + int u1 = get_interleaved_ue_golomb(&gb); + int u2 = get_bits(&gb, 8); + int u3 = get_bits(&gb, 2); + int u4 = get_interleaved_ue_golomb(&gb); + unsigned long buf_len = watermark_width * + watermark_height * 4; + int offset = get_bits_count(&gb) + 7 >> 3; + + if (watermark_height <= 0 || + get_bits_left(&gb) <= 0 || + (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height) + return AVERROR_INVALIDDATA; + + av_log(avctx, AV_LOG_DEBUG, "watermark size: %ux%u\n", + watermark_width, watermark_height); + av_log(avctx, AV_LOG_DEBUG, + "u1: %x u2: %x u3: %x compressed data size: %d offset: %d\n", + u1, u2, u3, u4, offset); + + uint8_t *buf = av_malloc(buf_len); + if (!buf) + return AVERROR(ENOMEM); + + if (uncompress(buf, &buf_len, extradata + offset, + size - offset) != Z_OK) { + av_log(avctx, AV_LOG_ERROR, + "could not uncompress 
watermark logo\n"); + av_free(buf); + return AVERROR_EXTERNAL; + } + s->watermark_key = av_bswap16(av_crc(av_crc_get_table(AV_CRC_16_CCITT), 0, buf, buf_len)); + + s->watermark_key = s->watermark_key << 16 | s->watermark_key; + av_log(avctx, AV_LOG_DEBUG, + "watermark key %#"PRIx32"\n", s->watermark_key); + av_free(buf); + + return 0; +#else + av_log(avctx, AV_LOG_ERROR, + "this svq3 file contains watermark which need zlib support compiled in\n"); + return AVERROR(ENOSYS); +#endif +} + static av_cold int svq3_decode_init(AVCodecContext *avctx) { SVQ3Context *s = avctx->priv_data; int m, x, y; unsigned char *extradata; - unsigned char *extradata_end; - unsigned int size; - int marker_found = 0; int ret; s->cur_pic = &s->frames[0]; @@ -1154,147 +1282,55 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx) /* prowl for the "SEQH" marker in the extradata */ extradata = (unsigned char *)avctx->extradata; - extradata_end = avctx->extradata + avctx->extradata_size; if (extradata) { for (m = 0; m + 8 < avctx->extradata_size; m++) { if (!memcmp(extradata, "SEQH", 4)) { - marker_found = 1; + /* if a match was found, parse the extra data */ + ret = svq3_decode_extradata(avctx, s, m); + if (ret < 0) + return ret; break; } extradata++; } } - /* if a match was found, parse the extra data */ - if (marker_found) { - GetBitContext gb; - int frame_size_code; - int unk0, unk1, unk2, unk3, unk4; - int w,h; - - size = AV_RB32(&extradata[4]); - if (size > extradata_end - extradata - 8) - return AVERROR_INVALIDDATA; - init_get_bits(&gb, extradata + 8, size * 8); - - /* 'frame size code' and optional 'width, height' */ - frame_size_code = get_bits(&gb, 3); - switch (frame_size_code) { - case 0: - w = 160; - h = 120; - break; - case 1: - w = 128; - h = 96; - break; - case 2: - w = 176; - h = 144; - break; - case 3: - w = 352; - h = 288; - break; - case 4: - w = 704; - h = 576; - break; - case 5: - w = 240; - h = 180; - break; - case 6: - w = 320; - h = 240; - break; - case 7: - w = get_bits(&gb, 12); - h = get_bits(&gb, 12); - break; - } - ret = ff_set_dimensions(avctx, w, h); - if (ret < 0) - return ret; - - s->halfpel_flag = get_bits1(&gb); - s->thirdpel_flag = get_bits1(&gb); - - /* unknown fields */ - unk0 = get_bits1(&gb); - unk1 = get_bits1(&gb); - unk2 = get_bits1(&gb); - unk3 = get_bits1(&gb); + s->mb_width = (avctx->width + 15) / 16; + s->mb_height = (avctx->height + 15) / 16; + s->mb_stride = s->mb_width + 1; + s->mb_num = s->mb_width * s->mb_height; + s->b_stride = 4 * s->mb_width; + s->h_edge_pos = s->mb_width * 16; + s->v_edge_pos = s->mb_height * 16; - s->low_delay = get_bits1(&gb); + const unsigned big_mb_num = s->mb_stride * (s->mb_height + 2) + 1; - /* unknown field */ - unk4 = get_bits1(&gb); + s->mb_type_buf = av_calloc(big_mb_num, NUM_PICS * sizeof(*s->mb_type_buf)); + if (!s->mb_type_buf) + return AVERROR(ENOMEM); + uint32_t *mb_type_buf = s->mb_type_buf + 2 * s->mb_stride + 1; - av_log(avctx, AV_LOG_DEBUG, "Unknown fields %d %d %d %d %d\n", - unk0, unk1, unk2, unk3, unk4); + const unsigned b4_stride = s->mb_width * 4 + 1; + const unsigned b4_array_size = b4_stride * s->mb_height * 4; + const unsigned motion_val_buf_size = b4_array_size + 4; - if (skip_1stop_8data_bits(&gb) < 0) - return AVERROR_INVALIDDATA; + s->motion_val_buf = av_calloc(motion_val_buf_size, + NUM_PICS * 2 * sizeof(*s->motion_val_buf)); + if (!s->motion_val_buf) + return AVERROR(ENOMEM); + int16_t (*motion_val_buf)[2] = s->motion_val_buf + 4; - s->has_watermark = get_bits1(&gb); - avctx->has_b_frames = 
!s->low_delay; - if (s->has_watermark) { -#if CONFIG_ZLIB - unsigned watermark_width = get_interleaved_ue_golomb(&gb); - unsigned watermark_height = get_interleaved_ue_golomb(&gb); - int u1 = get_interleaved_ue_golomb(&gb); - int u2 = get_bits(&gb, 8); - int u3 = get_bits(&gb, 2); - int u4 = get_interleaved_ue_golomb(&gb); - unsigned long buf_len = watermark_width * - watermark_height * 4; - int offset = get_bits_count(&gb) + 7 >> 3; - uint8_t *buf; - - if (watermark_height <= 0 || - get_bits_left(&gb) <= 0 || - (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height) - return AVERROR_INVALIDDATA; - - buf = av_malloc(buf_len); - if (!buf) - return AVERROR(ENOMEM); - - av_log(avctx, AV_LOG_DEBUG, "watermark size: %ux%u\n", - watermark_width, watermark_height); - av_log(avctx, AV_LOG_DEBUG, - "u1: %x u2: %x u3: %x compressed data size: %d offset: %d\n", - u1, u2, u3, u4, offset); - if (uncompress(buf, &buf_len, extradata + 8 + offset, - size - offset) != Z_OK) { - av_log(avctx, AV_LOG_ERROR, - "could not uncompress watermark logo\n"); - av_free(buf); - return -1; - } - s->watermark_key = av_bswap16(av_crc(av_crc_get_table(AV_CRC_16_CCITT), 0, buf, buf_len)); + for (size_t i = 0; i < NUM_PICS; ++i) { + SVQ3Frame *const pic = &s->frames[i]; - s->watermark_key = s->watermark_key << 16 | s->watermark_key; - av_log(avctx, AV_LOG_DEBUG, - "watermark key %#"PRIx32"\n", s->watermark_key); - av_free(buf); -#else - av_log(avctx, AV_LOG_ERROR, - "this svq3 file contains watermark which need zlib support compiled in\n"); - return AVERROR(ENOSYS); -#endif + pic->mb_type = mb_type_buf; + mb_type_buf += big_mb_num; + for (size_t j = 0; j < FF_ARRAY_ELEMS(pic->motion_val); ++j) { + pic->motion_val[j] = motion_val_buf; + motion_val_buf += motion_val_buf_size; } } - s->mb_width = (avctx->width + 15) / 16; - s->mb_height = (avctx->height + 15) / 16; - s->mb_stride = s->mb_width + 1; - s->mb_num = s->mb_width * s->mb_height; - s->b_stride = 4 * s->mb_width; - s->h_edge_pos = s->mb_width * 16; - s->v_edge_pos = s->mb_height * 16; - s->intra4x4_pred_mode = av_mallocz(s->mb_stride * 2 * 8); if (!s->intra4x4_pred_mode) return AVERROR(ENOMEM); @@ -1316,49 +1352,14 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx) return 0; } -static void free_picture(SVQ3Frame *pic) -{ - int i; - for (i = 0; i < 2; i++) { - av_freep(&pic->motion_val_buf[i]); - } - av_freep(&pic->mb_type_buf); - - av_frame_unref(pic->f); -} - static int get_buffer(AVCodecContext *avctx, SVQ3Frame *pic) { SVQ3Context *s = avctx->priv_data; - const int big_mb_num = s->mb_stride * (s->mb_height + 1) + 1; - const int b4_stride = s->mb_width * 4 + 1; - const int b4_array_size = b4_stride * s->mb_height * 4; - int ret; - - if (!pic->motion_val_buf[0]) { - int i; - - pic->mb_type_buf = av_calloc(big_mb_num + s->mb_stride, sizeof(uint32_t)); - if (!pic->mb_type_buf) - return AVERROR(ENOMEM); - pic->mb_type = pic->mb_type_buf + 2 * s->mb_stride + 1; - - for (i = 0; i < 2; i++) { - pic->motion_val_buf[i] = av_calloc(b4_array_size + 4, 2 * sizeof(int16_t)); - if (!pic->motion_val_buf[i]) { - ret = AVERROR(ENOMEM); - goto fail; - } - - pic->motion_val[i] = pic->motion_val_buf[i] + 4; - } - } - - ret = ff_get_buffer(avctx, pic->f, - (s->pict_type != AV_PICTURE_TYPE_B) ? - AV_GET_BUFFER_FLAG_REF : 0); + int ret = ff_get_buffer(avctx, pic->f, + (s->pict_type != AV_PICTURE_TYPE_B) ? 
+ AV_GET_BUFFER_FLAG_REF : 0); if (ret < 0) - goto fail; + return ret; if (!s->edge_emu_buffer) { s->edge_emu_buffer = av_calloc(pic->f->linesize[0], 17); @@ -1367,9 +1368,23 @@ static int get_buffer(AVCodecContext *avctx, SVQ3Frame *pic) } return 0; -fail: - free_picture(pic); - return ret; +} + +static av_cold int alloc_dummy_frame(AVCodecContext *avctx, SVQ3Frame *pic) +{ + av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n"); + av_frame_unref(pic->f); + int ret = get_buffer(avctx, pic); + if (ret < 0) + return ret; + + memset(pic->f->data[0], 0, avctx->height * pic->f->linesize[0]); + memset(pic->f->data[1], 0x80, (avctx->height / 2) * + pic->f->linesize[1]); + memset(pic->f->data[2], 0x80, (avctx->height / 2) * + pic->f->linesize[2]); + + return 0; } static int svq3_decode_frame(AVCodecContext *avctx, AVFrame *rframe, @@ -1382,11 +1397,8 @@ static int svq3_decode_frame(AVCodecContext *avctx, AVFrame *rframe, /* special case for last picture */ if (buf_size == 0) { - if (s->next_pic->f->data[0] && !s->low_delay && !s->last_frame_output) { - ret = av_frame_ref(rframe, s->next_pic->f); - if (ret < 0) - return ret; - s->last_frame_output = 1; + if (s->next_pic->f->data[0] && !s->low_delay) { + av_frame_move_ref(rframe, s->next_pic->f); *got_frame = 1; } return 0; @@ -1398,8 +1410,9 @@ static int svq3_decode_frame(AVCodecContext *avctx, AVFrame *rframe, if (ret < 0) return ret; - if (svq3_decode_slice_header(avctx)) - return -1; + ret = svq3_decode_slice_header(avctx); + if (ret < 0) + return ret; if (avpkt->size < s->mb_width * s->mb_height / 8) return AVERROR_INVALIDDATA; @@ -1435,29 +1448,15 @@ static int svq3_decode_frame(AVCodecContext *avctx, AVFrame *rframe, if (s->pict_type != AV_PICTURE_TYPE_I) { if (!s->last_pic->f->data[0]) { - av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n"); - av_frame_unref(s->last_pic->f); - ret = get_buffer(avctx, s->last_pic); + ret = alloc_dummy_frame(avctx, s->last_pic); if (ret < 0) return ret; - memset(s->last_pic->f->data[0], 0, avctx->height * s->last_pic->f->linesize[0]); - memset(s->last_pic->f->data[1], 0x80, (avctx->height / 2) * - s->last_pic->f->linesize[1]); - memset(s->last_pic->f->data[2], 0x80, (avctx->height / 2) * - s->last_pic->f->linesize[2]); } if (s->pict_type == AV_PICTURE_TYPE_B && !s->next_pic->f->data[0]) { - av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n"); - av_frame_unref(s->next_pic->f); - ret = get_buffer(avctx, s->next_pic); + ret = alloc_dummy_frame(avctx, s->next_pic); if (ret < 0) return ret; - memset(s->next_pic->f->data[0], 0, avctx->height * s->next_pic->f->linesize[0]); - memset(s->next_pic->f->data[1], 0x80, (avctx->height / 2) * - s->next_pic->f->linesize[1]); - memset(s->next_pic->f->data[2], 0x80, (avctx->height / 2) * - s->next_pic->f->linesize[2]); } } @@ -1512,8 +1511,9 @@ static int svq3_decode_frame(AVCodecContext *avctx, AVFrame *rframe, if (((get_bits_count(&s->gb_slice) & 7) == 0 || show_bits(&s->gb_slice, get_bits_left(&s->gb_slice) & 7) == 0)) { - if (svq3_decode_slice_header(avctx)) - return -1; + ret = svq3_decode_slice_header(avctx); + if (ret < 0) + return ret; } if (s->slice_type != s->pict_type) { avpriv_request_sample(avctx, "non constant slice type"); @@ -1583,10 +1583,10 @@ static av_cold int svq3_decode_end(AVCodecContext *avctx) { SVQ3Context *s = avctx->priv_data; - for (int i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) { - free_picture(&s->frames[i]); + for (int i = 0; i < NUM_PICS; i++) av_frame_free(&s->frames[i].f); - } + av_freep(&s->motion_val_buf); + 
av_freep(&s->mb_type_buf); av_freep(&s->slice_buf); av_freep(&s->intra4x4_pred_mode); av_freep(&s->edge_emu_buffer); diff --git a/libavcodec/tests/.gitignore b/libavcodec/tests/.gitignore index 0df4ae10a0285..2c5bbec7f9d77 100644 --- a/libavcodec/tests/.gitignore +++ b/libavcodec/tests/.gitignore @@ -1,3 +1,4 @@ +/apv /av1_levels /avcodec /avpacket diff --git a/libavcodec/tests/avcodec.c b/libavcodec/tests/avcodec.c index 83a5d0531636e..dde8226384082 100644 --- a/libavcodec/tests/avcodec.c +++ b/libavcodec/tests/avcodec.c @@ -167,6 +167,9 @@ FF_ENABLE_DEPRECATION_WARNINGS !(codec->capabilities & AV_CODEC_CAP_DELAY)) ERR("EOF_FLUSH encoder %s is not marked as having delay\n"); } else { + if ((codec2->update_thread_context || codec2->update_thread_context_for_user) && + !(codec->capabilities & AV_CODEC_CAP_FRAME_THREADS)) + ERR("Non-frame-threaded decoder %s has update_thread_context set"); if ((codec->type == AVMEDIA_TYPE_SUBTITLE) != (codec2->cb_type == FF_CODEC_CB_TYPE_DECODE_SUB)) ERR("Subtitle decoder %s does not implement decode_sub callback\n"); if (codec->type == AVMEDIA_TYPE_SUBTITLE && codec2->bsfs) diff --git a/libavcodec/tests/hashtable.c b/libavcodec/tests/hashtable.c new file mode 100644 index 0000000000000..02c0ac8afa025 --- /dev/null +++ b/libavcodec/tests/hashtable.c @@ -0,0 +1,110 @@ +/* + * Generic hashtable tests + * Copyright (C) 2024 Emma Worley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/avassert.h" +#include "libavcodec/hashtable.h" + +int main(void) +{ + struct FFHashtableContext *ctx; + uint8_t k; + uint64_t v; + + // impossibly large allocation should fail gracefully + av_assert0(ff_hashtable_alloc(&ctx, -1, -1, -1) < 0); + + // hashtable can store up to 3 uint8_t->uint64_t entries + av_assert0(!ff_hashtable_alloc(&ctx, sizeof(k), sizeof(v), 3)); + + // unsuccessful deletes return 0 + k = 1; + av_assert0(!ff_hashtable_delete(ctx, &k)); + + // unsuccessful gets return 0 + k = 1; + av_assert0(!ff_hashtable_get(ctx, &k, &v)); + + // successful sets returns 1 + k = 1; + v = 1; + av_assert0(ff_hashtable_set(ctx, &k, &v)); + + // get should now contain 1 + k = 1; + v = 0; + av_assert0(ff_hashtable_get(ctx, &k, &v)); + av_assert0(v == 1); + + // updating sets should return 1 + k = 1; + v = 2; + av_assert0(ff_hashtable_set(ctx, &k, &v)); + + // get should now contain 2 + k = 1; + v = 0; + av_assert0(ff_hashtable_get(ctx, &k, &v)); + av_assert0(v == 2); + + // fill the table + k = 2; + v = 2; + av_assert0(ff_hashtable_set(ctx, &k, &v)); + k = 3; + v = 3; + av_assert0(ff_hashtable_set(ctx, &k, &v)); + + // inserting sets on a full table should return 0 + k = 4; + v = 4; + av_assert0(!ff_hashtable_set(ctx, &k, &v)); + + // updating sets on a full table should return 1 + k = 1; + v = 4; + av_assert0(ff_hashtable_set(ctx, &k, &v)); + v = 0; + av_assert0(ff_hashtable_get(ctx, &k, &v)); + av_assert0(v == 4); + + // successful deletes should return 1 + k = 1; + av_assert0(ff_hashtable_delete(ctx, &k)); + + // get should now return 0 + av_assert0(!ff_hashtable_get(ctx, &k, &v)); + + // sanity check remaining keys + k = 2; + v = 0; + av_assert0(ff_hashtable_get(ctx, &k, &v)); + av_assert0(v == 2); + k = 3; + v = 0; + av_assert0(ff_hashtable_get(ctx, &k, &v)); + av_assert0(v == 3); + + ff_hashtable_freep(&ctx); + + return 0; +} diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c index be503d78c6a3c..7cefca79bc6fd 100644 --- a/libavcodec/utvideoenc.c +++ b/libavcodec/utvideoenc.c @@ -24,6 +24,7 @@ * Ut Video encoder */ +#include "libavutil/avassert.h" #include "libavutil/imgutils.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem.h" @@ -143,9 +144,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx) original_format = UTVIDEO_444; break; default: - av_log(avctx, AV_LOG_ERROR, "Unknown pixel format: %d\n", - avctx->pix_fmt); - return AVERROR_INVALIDDATA; + av_unreachable("Already checked via CODEC_PIXFMTS"); } ff_bswapdsp_init(&c->bdsp); @@ -153,7 +152,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx) if (c->frame_pred == PRED_GRADIENT) { av_log(avctx, AV_LOG_ERROR, "Gradient prediction is not supported.\n"); - return AVERROR_OPTION_NOT_FOUND; + return AVERROR_PATCHWELCOME; } /* @@ -646,7 +645,6 @@ static const AVOption options[] = { { "pred", "Prediction method", OFFSET(frame_pred), AV_OPT_TYPE_INT, { .i64 = PRED_LEFT }, PRED_NONE, PRED_MEDIAN, VE, .unit = "pred" }, { "none", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRED_NONE }, INT_MIN, INT_MAX, VE, .unit = "pred" }, { "left", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRED_LEFT }, INT_MIN, INT_MAX, VE, .unit = "pred" }, - { "gradient", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRED_GRADIENT }, INT_MIN, INT_MAX, VE, .unit = "pred" }, { "median",
NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRED_MEDIAN }, INT_MIN, INT_MAX, VE, .unit = "pred" }, { NULL}, diff --git a/libavcodec/vaapi_mpeg4.c b/libavcodec/vaapi_mpeg4.c index 8338c0732d333..533e6750a1c55 100644 --- a/libavcodec/vaapi_mpeg4.c +++ b/libavcodec/vaapi_mpeg4.c @@ -70,7 +70,7 @@ static int vaapi_mpeg4_start_frame(AVCodecContext *avctx, .obmc_disable = 1, .sprite_enable = ctx->vol_sprite_usage, .sprite_warping_accuracy = ctx->sprite_warping_accuracy, - .quant_type = s->mpeg_quant, + .quant_type = ctx->mpeg_quant, .quarter_sample = s->quarter_sample, .data_partitioned = s->data_partitioning, .reversible_vlc = ctx->rvlc, diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h index e3b90d2b62601..b018537af3387 100644 --- a/libavcodec/vc1dsp.h +++ b/libavcodec/vc1dsp.h @@ -30,7 +30,9 @@ #include "hpeldsp.h" #include "h264chroma.h" -typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h); +typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, + const uint8_t *pixels/*align 1*/, + ptrdiff_t line_size, int round); typedef struct VC1DSPContext { /* vc1 functions */ diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c index 99ca95c40a099..b0588f6b58426 100644 --- a/libavcodec/vc2enc.c +++ b/libavcodec/vc2enc.c @@ -193,6 +193,8 @@ static uint16_t interleaved_ue_golomb_tab[256]; static uint16_t top_interleaved_ue_golomb_tab[256]; /// 1 x_{k-1} ... x_0 -> 2 * k static uint8_t golomb_len_tab[256]; +/// quant -> av_log2(ff_dirac_qscale_tab[quant]) + 32 +static uint8_t qscale_len_tab[FF_ARRAY_ELEMS(ff_dirac_qscale_tab)]; static av_cold void vc2_init_static_data(void) { @@ -202,6 +204,8 @@ static av_cold void vc2_init_static_data(void) interleaved_ue_golomb_tab[i] = (interleaved_ue_golomb_tab[i >> 1] << 2) | (i & 1); top_interleaved_ue_golomb_tab[i] = interleaved_ue_golomb_tab[i] ^ (1 << golomb_len_tab[i]); } + for (size_t i = 0; i < FF_ARRAY_ELEMS(qscale_len_tab); ++i) + qscale_len_tab[i] = av_log2(ff_dirac_qscale_tab[i]) + 32; } static av_always_inline void put_vc2_ue_uint_inline(PutBitContext *pb, uint32_t val) @@ -545,7 +549,7 @@ static void encode_subband(const VC2EncContext *s, PutBitContext *pb, dwtcoef *coeff = b->buf + top * b->stride; const uint64_t q_m = ((uint64_t)(s->qmagic_lut[quant][0])) << 2; const uint64_t q_a = s->qmagic_lut[quant][1]; - const int q_s = av_log2(ff_dirac_qscale_tab[quant]) + 32; + const int q_s = qscale_len_tab[quant]; for (y = top; y < bottom; y++) { for (x = left; x < right; x++) { @@ -586,7 +590,7 @@ static int count_hq_slice(SliceArgs *slice, int quant_idx) const int q_idx = quants[level][orientation]; const uint64_t q_m = ((uint64_t)s->qmagic_lut[q_idx][0]) << 2; const uint64_t q_a = s->qmagic_lut[q_idx][1]; - const int q_s = av_log2(ff_dirac_qscale_tab[q_idx]) + 32; + const int q_s = qscale_len_tab[q_idx]; const int left = b->width * slice->x / s->num_x; const int right = b->width *(slice->x+1) / s->num_x; diff --git a/libavcodec/vdpau_mpeg4.c b/libavcodec/vdpau_mpeg4.c index 91981935f5ba9..7ec7a74ad1d40 100644 --- a/libavcodec/vdpau_mpeg4.c +++ b/libavcodec/vdpau_mpeg4.c @@ -68,7 +68,7 @@ static int vdpau_mpeg4_start_frame(AVCodecContext *avctx, info->vop_fcode_backward = ctx->b_code; info->resync_marker_disable = !ctx->resync_marker; info->interlaced = !s->progressive_sequence; - info->quant_type = s->mpeg_quant; + info->quant_type = ctx->mpeg_quant; info->quarter_sample = s->quarter_sample; info->short_video_header = avctx->codec->id == AV_CODEC_ID_H263; 
info->rounding_control = s->no_rounding; diff --git a/libavcodec/vlc.c b/libavcodec/vlc.c index c49c801181979..3aa198a2778e0 100644 --- a/libavcodec/vlc.c +++ b/libavcodec/vlc.c @@ -42,6 +42,8 @@ { \ const uint8_t *ptr = (const uint8_t *)table + i * wrap; \ switch(size) { \ + default: \ + av_unreachable("Only uint8/16/32_t are used"); \ case 1: \ v = *(const uint8_t *)ptr; \ break; \ @@ -49,8 +51,6 @@ v = *(const uint16_t *)ptr; \ break; \ case 4: \ - default: \ - av_assert1(size == 4); \ v = *(const uint32_t *)ptr; \ break; \ } \ @@ -260,7 +260,7 @@ int ff_vlc_init_sparse(VLC *vlc, int nb_bits, int nb_codes, if (ret < 0) return ret; - av_assert0(symbols_size <= 2 || !symbols); + av_assert0(symbols_size <= 2U); j = 0; #define COPY(condition)\ for (int i = 0; i < nb_codes; i++) { \ diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c index fd416eed3a5ab..141f0941b402f 100644 --- a/libavcodec/vp9.c +++ b/libavcodec/vp9.c @@ -1140,7 +1140,8 @@ static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl, uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1); break; default: - av_assert0(0); + av_unreachable("ff_vp9_partition_tree only has " + "the four PARTITION_* terminal codes"); } } else if (vpx_rac_get_prob_branchy(td->c, p[1])) { bp = PARTITION_SPLIT; diff --git a/libavcodec/vp9dec.h b/libavcodec/vp9dec.h index 851ee9f6dde1b..e41f47a82a524 100644 --- a/libavcodec/vp9dec.h +++ b/libavcodec/vp9dec.h @@ -220,8 +220,8 @@ struct VP9TileData { DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8]; DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8]; // block reconstruction intermediates - DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2]; - DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2]; + DECLARE_ALIGNED(64, uint8_t, tmp_y)[64 * 64 * 2]; + DECLARE_ALIGNED(64, uint8_t, tmp_uv)[2][64 * 64 * 2]; struct { int x, y; } min_mv, max_mv; int16_t *block_base, *block, *uvblock_base[2], *uvblock[2]; uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2]; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index feb5d2ea5136c..729cb4f15c5da 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -6,10 +6,8 @@ clean:: OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \ vulkan/rangecoder.o vulkan/ffv1_vlc.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ - vulkan/ffv1_enc_common.o \ vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \ - vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \ - vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o + vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/rangecoder.o vulkan/ffv1_vlc.o \ diff --git a/libavcodec/vulkan/ffv1_common.comp b/libavcodec/vulkan/ffv1_common.comp index 64c1c2ce801be..3d40592739b5c 100644 --- a/libavcodec/vulkan/ffv1_common.comp +++ b/libavcodec/vulkan/ffv1_common.comp @@ -92,3 +92,90 @@ uint slice_coord(uint width, uint sx, uint num_h_slices, uint chroma_shift) return sx; } + +#ifdef RGB +#define RGB_LBUF (RGB_LINECACHE - 1) +#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF))) + +ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, + int comp, int sw, uint8_t quant_table_idx, bool extend_lookup) +{ + const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? 
off + ivec2(1, -1) : off; + + /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */ + VTYPE3 top = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(yoff_border1 + ivec2(-1, -1)))[comp]), + TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -1)))[comp]), + TYPE(imageLoad(pred, sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[comp])); + + /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must + * return zero. However, ivec2(-1, 0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous + * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */ + TYPE cur = TYPE(imageLoad(pred, sp + LADDR(yoff_border1 + ivec2(-1, 0)))[comp]); + + int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK]; + + if (expectEXT(extend_lookup, false)) { + TYPE cur2 = TYPE(0); + if (expectEXT(off.x > 0, true)) { + const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0); + cur2 = TYPE(imageLoad(pred, sp + LADDR(off + yoff_border2))[comp]); + } + base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; + + /* top-2 became current upon swap */ + TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]); + base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; + } + + /* context, prediction */ + return ivec2(base, predict(cur, VTYPE2(top))); +} + +#else /* RGB */ + +#define LADDR(p) (p) + +ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, + int comp, int sw, uint8_t quant_table_idx, bool extend_lookup) +{ + const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0); + sp += off; + + VTYPE3 top = VTYPE3(TYPE(0), + TYPE(0), + TYPE(0)); + if (off.y > 0 && off != ivec2(0, 1)) + top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]); + if (off.y > 0) { + top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]); + top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]); + } + + TYPE cur = TYPE(0); + if (off != ivec2(0, 0)) + cur = TYPE(imageLoad(pred, sp + ivec2(-1, 0) + yoff_border1)[comp]); + + int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK]; + + if (expectEXT(extend_lookup, false)) { + TYPE cur2 = TYPE(0); + if (off.x > 0 && off != ivec2(1, 0)) { + const ivec2 yoff_border2 = off.x == 1 ? 
ivec2(1, -1) : ivec2(0, 0); + cur2 = TYPE(imageLoad(pred, sp + ivec2(-2, 0) + yoff_border2)[comp]); + } + base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; + + TYPE top2 = TYPE(0); + if (off.y > 1) + top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]); + base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; + } + + /* context, prediction */ + return ivec2(base, predict(cur, VTYPE2(top))); +} +#endif diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp index fc0175c715723..eb795dcba4526 100644 --- a/libavcodec/vulkan/ffv1_dec.comp +++ b/libavcodec/vulkan/ffv1_dec.comp @@ -20,93 +20,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef RGB -#define LADDR(p) (p) -#else -#define RGB_LINECACHE 2 -#define RGB_LBUF (RGB_LINECACHE - 1) -#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF))) -#endif - -#ifdef RGB -ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) -{ - const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0); - - /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */ - VTYPE3 top = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]), - TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(0, -1)))[0]), - TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[0])); - - /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must - * return zero. However, ivec2(-1, 0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous - * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */ - TYPE cur = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, 0) + yoff_border1))[0]); - - int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + - quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + - quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK]; - - if (expectEXT(extend_lookup[quant_table_idx] > 0, false)) { - TYPE cur2 = TYPE(0); - if (expectEXT(off.x > 0, true)) { - const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0); - cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + yoff_border2))[0]); - } - base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; - - /* top-2 became current upon swap */ - TYPE top2 = TYPE(imageLoad(dec[p], sp + LADDR(off))[0]); - base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; - } - - /* context, prediction */ - return ivec2(base, predict(cur, VTYPE2(top))); -} -#else -ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) -{ - const ivec2 yoff_border1 = off.x == 0 ? 
ivec2(1, -1) : ivec2(0, 0); - sp += off; - - VTYPE3 top = VTYPE3(TYPE(0), - TYPE(0), - TYPE(0)); - if (off.y > 0 && off != ivec2(0, 1)) - top[0] = TYPE(imageLoad(dec[p], sp + ivec2(-1, -1) + yoff_border1)[0]); - if (off.y > 0) { - top[1] = TYPE(imageLoad(dec[p], sp + ivec2(0, -1))[0]); - top[2] = TYPE(imageLoad(dec[p], sp + ivec2(min(1, sw - off.x - 1), -1))[0]); - } - - TYPE cur = TYPE(0); - if (off != ivec2(0, 0)) - cur = TYPE(imageLoad(dec[p], sp + ivec2(-1, 0) + yoff_border1)[0]); - - int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + - quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + - quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK]; - - if ((quant_table[quant_table_idx][3][127] != 0) || - (quant_table[quant_table_idx][4][127] != 0)) { - TYPE cur2 = TYPE(0); - if (off.x > 0 && off != ivec2(1, 0)) { - const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0); - cur2 = TYPE(imageLoad(dec[p], sp + ivec2(-2, 0) + yoff_border2)[0]); - } - base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; - - TYPE top2 = TYPE(0); - if (off.y > 1) - top2 = TYPE(imageLoad(dec[p], sp + ivec2(0, -2))[0]); - base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; - } - - /* context, prediction */ - return ivec2(base, predict(cur, VTYPE2(top))); -} -#endif - #ifndef GOLOMB #ifdef CACHED_SYMBOL_READER shared uint8_t state[CONTEXT_SIZE]; @@ -143,6 +56,11 @@ int get_isymbol(inout RangeCoder c, uint state_off) void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits) { +#ifdef CACHED_SYMBOL_READER + if (gl_LocalInvocationID.x > 0) + return; +#endif + #ifndef RGB if (p > 0 && p < 3) { w >>= chroma_shift.x; @@ -171,8 +89,8 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, #endif for (int x = 0; x < w; x++) { - ivec2 pr = get_pred(sp, ivec2(x, y), p, w, - quant_table_idx); + ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w, + quant_table_idx, extend_lookup[quant_table_idx] > 0); uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]); #ifdef CACHED_SYMBOL_READER @@ -192,6 +110,8 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, #ifdef CACHED_SYMBOL_READER } + + barrier(); sb.v = state[gl_LocalInvocationID.x]; #endif } @@ -216,10 +136,11 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, for (int x = 0; x < w; x++) { ivec2 pos = sp + ivec2(x, y); int diff; - ivec2 pr = get_pred(sp, ivec2(x, y), p, w, - quant_table_idx); + ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w, + quant_table_idx, extend_lookup[quant_table_idx] > 0); - VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0])); + uint context_off = state_off + VLC_STATE_SIZE*abs(pr[0]); + VlcState sb = VlcState(uint64_t(slice_state) + context_off); if (pr[0] == 0 && run_mode == 0) run_mode = 1; @@ -305,7 +226,6 @@ void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct) void decode_slice(inout SliceContext sc, const uint slice_idx) { - int run_index = 0; int w = sc.slice_dim.x; ivec2 sp = sc.slice_pos; @@ -322,8 +242,6 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) /* PCM coding */ #ifndef GOLOMB if (sc.slice_coding_mode == 1) { - if (gl_LocalInvocationID.x > 0) - return; #ifndef RGB for (int p = 0; p < planes; p++) { int h = sc.slice_dim.y; @@ -355,11 +273,13 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) if (p > 0 && p < 3) h >>= chroma_shift.y; + int run_index = 0; for (int 
y = 0; y < h; y++) decode_line(sc, sp, w, y, p, bits, slice_state_off[p], quant_table_idx[p], run_index); } #else + int run_index = 0; for (int y = 0; y < sc.slice_dim.y; y++) { for (int p = 0; p < color_planes; p++) decode_line(sc, sp, w, y, p, bits, @@ -375,4 +295,8 @@ void main(void) { const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; decode_slice(slice_ctx[slice_idx], slice_idx); + + uint32_t status = corrupt ? uint32_t(corrupt) : overread; + if (status != 0) + slice_status[2*slice_idx + 1] = status; } diff --git a/libavcodec/vulkan/ffv1_dec_setup.comp b/libavcodec/vulkan/ffv1_dec_setup.comp index a27a878927a72..671f28e7e75d5 100644 --- a/libavcodec/vulkan/ffv1_dec_setup.comp +++ b/libavcodec/vulkan/ffv1_dec_setup.comp @@ -133,6 +133,8 @@ void main(void) for (int i = 0; i < slice_size; i++) crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8); - slice_crc_mismatch[slice_idx] = crc; + slice_status[2*slice_idx + 0] = crc; } + + slice_status[2*slice_idx + 1] = corrupt ? uint32_t(corrupt) : overread; } diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp index 4b851fd7116bf..78372f5b3a76d 100644 --- a/libavcodec/vulkan/ffv1_enc.comp +++ b/libavcodec/vulkan/ffv1_enc.comp @@ -20,12 +20,226 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef GOLOMB +#ifdef CACHED_SYMBOL_READER +shared uint8_t state[CONTEXT_SIZE]; +#define WRITE(c, off, val) put_rac_direct(c, state[off], val) +#else +#define WRITE(c, off, val) put_rac(c, uint64_t(slice_state) + (state_off + off), val) +#endif + +/* Note - only handles signed values */ +void put_symbol(inout RangeCoder c, uint state_off, int v) +{ + bool is_nil = (v == 0); + WRITE(c, 0, is_nil); + if (is_nil) + return; + + const int a = abs(v); + const int e = findMSB(a); + + for (int i = 0; i < e; i++) + WRITE(c, 1 + min(i, 9), true); + WRITE(c, 1 + min(e, 9), false); + + for (int i = e - 1; i >= 0; i--) + WRITE(c, 22 + min(i, 9), bool(bitfieldExtract(a, i, 1))); + + WRITE(c, 22 - 11 + min(e, 10), v < 0); +} + +void encode_line_pcm(inout SliceContext sc, readonly uimage2D img, + ivec2 sp, int y, int p, int comp, int bits) +{ + int w = sc.slice_dim.x; + +#ifdef CACHED_SYMBOL_READER + if (gl_LocalInvocationID.x > 0) + return; +#endif + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + for (int x = 0; x < w; x++) { + uint v = imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]; + for (int i = (bits - 1); i >= 0; i--) + put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1))); + } +} + +void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off, + ivec2 sp, int y, int p, int comp, int bits, + uint8_t quant_table_idx, const int run_index) +{ + int w = sc.slice_dim.x; + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + for (int x = 0; x < w; x++) { + ivec2 d = get_pred(img, sp, ivec2(x, y), comp, w, + quant_table_idx, extend_lookup[quant_table_idx] > 0); + d[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - d[1]; + + if (d[0] < 0) + d = -d; + + d[1] = fold(d[1], bits); + + uint context_off = state_off + CONTEXT_SIZE*d[0]; +#ifdef CACHED_SYMBOL_READER + u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x); + state[gl_LocalInvocationID.x] = sb.v; + barrier(); + if (gl_LocalInvocationID.x == 0) +#endif + + put_symbol(sc.c, context_off, d[1]); + +#ifdef CACHED_SYMBOL_READER + barrier(); + sb.v = state[gl_LocalInvocationID.x]; 
+#endif + } +} + +#else /* GOLOMB */ + +void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off, + ivec2 sp, int y, int p, int comp, int bits, + uint8_t quant_table_idx, inout int run_index) +{ + int w = sc.slice_dim.x; + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + int run_count = 0; + bool run_mode = false; + + for (int x = 0; x < w; x++) { + ivec2 d = get_pred(img, sp, ivec2(x, y), comp, w, + quant_table_idx, extend_lookup[quant_table_idx] > 0); + d[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - d[1]; + + if (d[0] < 0) + d = -d; + + d[1] = fold(d[1], bits); + + if (d[0] == 0) + run_mode = true; + + if (run_mode) { + if (d[1] != 0) { + /* A very unlikely loop */ + while (run_count >= 1 << log2_run[run_index]) { + run_count -= 1 << log2_run[run_index]; + run_index++; + put_bits(sc.pb, 1, 1); + } + + put_bits(sc.pb, 1 + log2_run[run_index], run_count); + if (run_index != 0) + run_index--; + run_count = 0; + run_mode = false; + if (d[1] > 0) + d[1]--; + } else { + run_count++; + } + } + + if (!run_mode) { + VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*d[0]); + Symbol sym = get_vlc_symbol(sb, d[1], bits); + put_bits(sc.pb, sym.bits, sym.val); + } + } + + if (run_mode) { + while (run_count >= (1 << log2_run[run_index])) { + run_count -= 1 << log2_run[run_index]; + run_index++; + put_bits(sc.pb, 1, 1); + } + + if (run_count > 0) + put_bits(sc.pb, 1, 1); + } +} +#endif + +#ifdef RGB +ivec4 load_components(ivec2 pos) +{ + ivec4 pix = ivec4(imageLoad(src[0], pos)); + if (planar_rgb != 0) { + for (int i = 1; i < (3 + transparency); i++) + pix[i] = int(imageLoad(src[i], pos)[0]); + } + + return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]], + pix[fmt_lut[2]], pix[fmt_lut[3]]); +} + +void transform_sample(inout ivec4 pix, ivec2 rct_coef) +{ + pix.b -= pix.g; + pix.r -= pix.g; + pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2; + pix.b += rct_offset; + pix.r += rct_offset; +} + +void preload_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct) +{ + for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) { + ivec2 lpos = sp + LADDR(ivec2(x, y)); + ivec2 pos = sc.slice_pos + ivec2(x, y); + + ivec4 pix = load_components(pos); + + if (expectEXT(apply_rct, true)) + transform_sample(pix, sc.slice_rct_coef); + + imageStore(tmp, lpos, pix); + } +} +#endif + void encode_slice(inout SliceContext sc, const uint slice_idx) { + ivec2 sp = sc.slice_pos; + +#ifndef RGB int bits = bits_per_raw_sample; +#else + int bits = 9; + if (bits != 8 || sc.slice_coding_mode != 0) + bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1); + + sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE; +#endif #ifndef GOLOMB if (sc.slice_coding_mode == 1) { +#ifndef RGB for (int c = 0; c < components; c++) { int h = sc.slice_dim.y; @@ -37,14 +251,26 @@ void encode_slice(inout SliceContext sc, const uint slice_idx) int comp = c - p; for (int y = 0; y < h; y++) - encode_line_pcm(sc, y, p, comp, bits); + encode_line_pcm(sc, src[p], sp, y, p, comp, bits); + } +#else + for (int y = 0; y < sc.slice_dim.y; y++) { + preload_rgb(sc, sp, sc.slice_dim.x, y, false); + + encode_line_pcm(sc, tmp, sp, y, 0, 1, bits); + encode_line_pcm(sc, tmp, sp, y, 0, 2, bits); + encode_line_pcm(sc, tmp, sp, y, 0, 0, bits); + if (transparency == 1) + encode_line_pcm(sc, tmp, sp, y, 0, 3, bits); } +#endif } else #endif { - uint64_t slice_state_off = uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes; + u8vec4 
quant_table_idx = sc.quant_table_idx.xyyz; + u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size; +#ifndef RGB for (int c = 0; c < components; c++) { int run_index = 0; @@ -56,19 +282,77 @@ void encode_slice(inout SliceContext sc, const uint slice_idx) int comp = c - p; for (int y = 0; y < h; y++) - encode_line(sc, slice_state_off, y, p, comp, bits, run_index); + encode_line(sc, src[p], slice_state_off[c], sp, y, p, + comp, bits, quant_table_idx[c], run_index); + } +#else + int run_index = 0; + for (int y = 0; y < sc.slice_dim.y; y++) { + preload_rgb(sc, sp, sc.slice_dim.x, y, true); - /* For the second chroma plane, reuse the first plane's state */ - if (c != 1) - slice_state_off += plane_state_size; + encode_line(sc, tmp, slice_state_off[0], + sp, y, 0, 1, bits, quant_table_idx[0], run_index); + encode_line(sc, tmp, slice_state_off[1], + sp, y, 0, 2, bits, quant_table_idx[1], run_index); + encode_line(sc, tmp, slice_state_off[2], + sp, y, 0, 0, bits, quant_table_idx[2], run_index); + if (transparency == 1) + encode_line(sc, tmp, slice_state_off[3], + sp, y, 0, 3, bits, quant_table_idx[3], run_index); } +#endif + } +} + +void finalize_slice(inout SliceContext sc, const uint slice_idx) +{ +#ifdef CACHED_SYMBOL_READER + if (gl_LocalInvocationID.x > 0) + return; +#endif + +#ifdef GOLOMB + uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb); +#else + uint32_t enc_len = rac_terminate(sc.c); +#endif + + u8buf bs = u8buf(sc.c.bytestream_start); + + /* Append slice length */ + u8vec4 enc_len_p = unpack8(enc_len); + bs[enc_len + 0].v = enc_len_p.z; + bs[enc_len + 1].v = enc_len_p.y; + bs[enc_len + 2].v = enc_len_p.x; + enc_len += 3; + + /* Calculate and write CRC */ + if (ec != 0) { + bs[enc_len].v = uint8_t(0); + enc_len++; + + uint32_t crc = crcref; + for (int i = 0; i < enc_len; i++) + crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8); + + if (crcref != 0x00000000) + crc ^= 0x8CD88196; + + u8vec4 crc_p = unpack8(crc); + bs[enc_len + 0].v = crc_p.x; + bs[enc_len + 1].v = crc_p.y; + bs[enc_len + 2].v = crc_p.z; + bs[enc_len + 3].v = crc_p.w; + enc_len += 4; } - finalize_slice(sc, slice_idx); + slice_results[slice_idx*2 + 0] = enc_len; + slice_results[slice_idx*2 + 1] = uint64_t(bs) - uint64_t(out_data); } void main(void) { const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; encode_slice(slice_ctx[slice_idx], slice_idx); + finalize_slice(slice_ctx[slice_idx], slice_idx); } diff --git a/libavcodec/vulkan/ffv1_enc_ac.comp b/libavcodec/vulkan/ffv1_enc_ac.comp deleted file mode 100644 index 0bbf58c5ddd04..0000000000000 --- a/libavcodec/vulkan/ffv1_enc_ac.comp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2024 Lynne - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
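/*
 * A minimal C sketch, not part of the patch itself: the slice footer that
 * finalize_slice() emits is the coded payload followed by a 3-byte
 * big-endian payload size and, when error correction is enabled, one status
 * byte plus a little-endian CRC-32 computed over everything preceding it.
 * parse_slice_footer() is an illustrative helper, not an FFmpeg API.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct SliceFooter {
    uint32_t payload_size; /* bytes of coded data, footer excluded */
    uint8_t  status;       /* written as 0 by the encoder */
    uint32_t crc;          /* only present when ec != 0 */
} SliceFooter;

static SliceFooter parse_slice_footer(const uint8_t *slice, size_t total, int ec)
{
    SliceFooter f = { 0 };
    size_t tail = total - 3 - (ec ? 5 : 0);

    f.payload_size = (uint32_t)slice[tail]     << 16 |
                     (uint32_t)slice[tail + 1] <<  8 |
                      slice[tail + 2];
    if (ec) {
        f.status = slice[tail + 3];
        f.crc    = (uint32_t)slice[tail + 4]       |
                   (uint32_t)slice[tail + 5] <<  8 |
                   (uint32_t)slice[tail + 6] << 16 |
                   (uint32_t)slice[tail + 7] << 24;
    }
    return f;
}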
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void put_rac(inout RangeCoder c, uint64_t state, bool bit) -{ - put_rac_norenorm(c, state, bit); - if (c.range < 0x100) - renorm_encoder(c); -} - -/* Note - only handles signed values */ -void put_symbol(inout RangeCoder c, uint64_t state, int v) -{ - bool is_nil = (v == 0); - put_rac(c, state, is_nil); - if (is_nil) - return; - - const int a = abs(v); - const int e = findMSB(a); - - state += 1; - for (int i = 0; i < e; i++) - put_rac(c, state + min(i, 9), true); - put_rac(c, state + min(e, 9), false); - - state += 21; - for (int i = e - 1; i >= 0; i--) - put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1))); - - put_rac(c, state - 11 + min(e, 10), v < 0); -} - -void encode_line_pcm(inout SliceContext sc, int y, int p, int comp, - int bits) -{ - ivec2 sp = sc.slice_pos; - int w = sc.slice_dim.x; - if (p > 0 && p < 3) { - w >>= chroma_shift.x; - sp >>= chroma_shift; - } - - for (int x = 0; x < w; x++) { - uint v = imageLoad(src[p], (sp + ivec2(x, y)))[comp]; - for (int i = (bits - 1); i >= 0; i--) - put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1))); - } -} - -void encode_line(inout SliceContext sc, uint64_t state, - int y, int p, int comp, int bits, const int run_index) -{ - ivec2 sp = sc.slice_pos; - - int w = sc.slice_dim.x; - if (p > 0 && p < 3) { - w >>= chroma_shift.x; - sp >>= chroma_shift; - } - - for (int x = 0; x < w; x++) { - const ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits); - put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]); - } -} diff --git a/libavcodec/vulkan/ffv1_enc_common.comp b/libavcodec/vulkan/ffv1_enc_common.comp deleted file mode 100644 index 62c0624b0e1f0..0000000000000 --- a/libavcodec/vulkan/ffv1_enc_common.comp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * FFv1 codec - * - * Copyright (c) 2024 Lynne - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -ivec2 get_diff(ivec2 pos, ivec2 off, int p, int comp, int sw, int bits) -{ - const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0); - const ivec2 yoff_border2 = off.x == 1 ? 
ivec2(1, -1) : ivec2(0, 0); - - TYPE top2 = TYPE(0); - if (off.y > 1) - top2 = TYPE(imageLoad(src[p], pos + ivec2(0, -2))[comp]); - - VTYPE3 top = VTYPE3(TYPE(0), - TYPE(0), - TYPE(0)); - if (off.y > 0 && off != ivec2(0, 1)) - top[0] = TYPE(imageLoad(src[p], pos + ivec2(-1, -1) + yoff_border1)[comp]); - if (off.y > 0) { - top[1] = TYPE(imageLoad(src[p], pos + ivec2(0, -1))[comp]); - top[2] = TYPE(imageLoad(src[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]); - } - - VTYPE3 cur = VTYPE3(TYPE(0), - TYPE(0), - imageLoad(src[p], pos)[comp]); - if (off.x > 0 && off != ivec2(1, 0)) - cur[0] = TYPE(imageLoad(src[p], pos + ivec2(-2, 0) + yoff_border2)[comp]); - if (off != ivec2(0, 0)) - cur[1] = TYPE(imageLoad(src[p], pos + ivec2(-1, 0) + yoff_border1)[comp]); - - /* context, diff */ - ivec2 d = ivec2(get_context(VTYPE2(cur), top, top2, context_model), - cur[2] - predict(cur[1], VTYPE2(top))); - - if (d[0] < 0) - d = -d; - - d[1] = fold(d[1], bits); - - return d; -} - -void finalize_slice(inout SliceContext sc, const uint slice_idx) -{ -#ifdef GOLOMB - uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb); -#else - uint32_t enc_len = rac_terminate(sc.c); -#endif - - u8buf bs = u8buf(sc.c.bytestream_start); - - /* Append slice length */ - u8vec4 enc_len_p = unpack8(enc_len); - bs[enc_len + 0].v = enc_len_p.z; - bs[enc_len + 1].v = enc_len_p.y; - bs[enc_len + 2].v = enc_len_p.x; - enc_len += 3; - - /* Calculate and write CRC */ - if (ec != 0) { - bs[enc_len].v = uint8_t(0); - enc_len++; - - uint32_t crc = crcref; - for (int i = 0; i < enc_len; i++) - crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8); - - if (crcref != 0x00000000) - crc ^= 0x8CD88196; - - u8vec4 crc_p = unpack8(crc); - bs[enc_len + 0].v = crc_p.x; - bs[enc_len + 1].v = crc_p.y; - bs[enc_len + 2].v = crc_p.z; - bs[enc_len + 3].v = crc_p.w; - enc_len += 4; - } - - slice_results[slice_idx*2 + 0] = enc_len; - slice_results[slice_idx*2 + 1] = uint64_t(bs) - uint64_t(out_data); -} diff --git a/libavcodec/vulkan/ffv1_enc_rgb.comp b/libavcodec/vulkan/ffv1_enc_rgb.comp deleted file mode 100644 index c176d94e8b246..0000000000000 --- a/libavcodec/vulkan/ffv1_enc_rgb.comp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * FFv1 codec - * - * Copyright (c) 2024 Lynne - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void encode_slice_rgb(inout SliceContext sc, const uint slice_idx) -{ - int bits = 9; - if (bits != 8 || sc.slice_coding_mode != 0) - bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1); - - int run_index = 0; - -#ifndef GOLOMB - if (sc.slice_coding_mode == 1) { - if (transparency == 1) { - for (int y = 0; y < sc.slice_dim.y; y++) { - encode_line_pcm(sc, y, 0, 1, bits); - encode_line_pcm(sc, y, 0, 2, bits); - encode_line_pcm(sc, y, 0, 0, bits); - encode_line_pcm(sc, y, 0, 3, bits); - } - } else { - for (int y = 0; y < sc.slice_dim.y; y++) { - encode_line_pcm(sc, y, 0, 1, bits); - encode_line_pcm(sc, y, 0, 2, bits); - encode_line_pcm(sc, y, 0, 0, bits); - } - } - } else -#endif - { - uint64_t slice_state_off = uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes; - - if (transparency == 1) { - for (int y = 0; y < sc.slice_dim.y; y++) { - encode_line(sc, slice_state_off + plane_state_size*0, - y, 0, 1, bits, run_index); - encode_line(sc, slice_state_off + plane_state_size*1, - y, 0, 2, bits, run_index); - encode_line(sc, slice_state_off + plane_state_size*1, - y, 0, 0, bits, run_index); - encode_line(sc, slice_state_off + plane_state_size*2, - y, 0, 3, bits, run_index); - } - } else { - for (int y = 0; y < sc.slice_dim.y; y++) { - encode_line(sc, slice_state_off + plane_state_size*0, - y, 0, 1, bits, run_index); - encode_line(sc, slice_state_off + plane_state_size*1, - y, 0, 2, bits, run_index); - encode_line(sc, slice_state_off + plane_state_size*1, - y, 0, 0, bits, run_index); - } - } - } - - finalize_slice(sc, slice_idx); -} - -void main(void) -{ - const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; - encode_slice_rgb(slice_ctx[slice_idx], slice_idx); -} diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp index 44c13404d8545..5f8e6704b0aaa 100644 --- a/libavcodec/vulkan/ffv1_enc_setup.comp +++ b/libavcodec/vulkan/ffv1_enc_setup.comp @@ -20,7 +20,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -void init_slice(out SliceContext sc, const uint slice_idx) +uint8_t state[CONTEXT_SIZE]; + +void init_slice(inout SliceContext sc, const uint slice_idx) { /* Set coordinates */ uvec2 img_size = imageSize(src[0]); @@ -35,77 +37,66 @@ void init_slice(out SliceContext sc, const uint slice_idx) sc.slice_pos = ivec2(sxs, sys); sc.slice_dim = ivec2(sxe - sxs, sye - sys); - sc.slice_rct_coef = ivec2(1, 1); sc.slice_coding_mode = int(force_pcm == 1); sc.slice_reset_contexts = sc.slice_coding_mode == 1; sc.quant_table_idx = u8vec3(context_model); + if ((rct_search == 0) || (sc.slice_coding_mode == 1)) + sc.slice_rct_coef = ivec2(1, 1); + rac_init(sc.c, OFFBUF(u8buf, out_data, slice_idx * slice_size_max), slice_size_max); } -void put_rac_full(inout RangeCoder c, uint64_t state, bool bit) -{ - put_rac_norenorm(c, state, bit); - if (c.range < 0x100) - renorm_encoder_full(c); -} - -void put_symbol_unsigned(inout RangeCoder c, uint64_t state, uint v) +void put_usymbol(inout RangeCoder c, uint v) { bool is_nil = (v == 0); - put_rac_full(c, state, is_nil); + put_rac_direct(c, state[0], is_nil); if (is_nil) return; const int e = findMSB(v); - state += 1; for (int i = 0; i < e; i++) - put_rac_full(c, state + min(i, 9), true); - put_rac_full(c, state + min(e, 9), false); 
+ put_rac_direct(c, state[1 + min(i, 9)], true); + put_rac_direct(c, state[1 + min(e, 9)], false); - state += 21; for (int i = e - 1; i >= 0; i--) - put_rac_full(c, state + min(i, 9), bool(bitfieldExtract(v, i, 1))); + put_rac_direct(c, state[22 + min(i, 9)], bool(bitfieldExtract(v, i, 1))); } -void write_slice_header(inout SliceContext sc, uint64_t state) +void write_slice_header(inout SliceContext sc) { - u8buf sb = u8buf(state); - [[unroll]] for (int i = 0; i < CONTEXT_SIZE; i++) - sb[i].v = uint8_t(128); + state[i] = uint8_t(128); - put_symbol_unsigned(sc.c, state, gl_WorkGroupID.x); - put_symbol_unsigned(sc.c, state, gl_WorkGroupID.y); - put_symbol_unsigned(sc.c, state, 0); - put_symbol_unsigned(sc.c, state, 0); + put_usymbol(sc.c, gl_WorkGroupID.x); + put_usymbol(sc.c, gl_WorkGroupID.y); + put_usymbol(sc.c, 0); + put_usymbol(sc.c, 0); for (int i = 0; i < codec_planes; i++) - put_symbol_unsigned(sc.c, state, sc.quant_table_idx[i]); + put_usymbol(sc.c, sc.quant_table_idx[i]); - put_symbol_unsigned(sc.c, state, pic_mode); - put_symbol_unsigned(sc.c, state, sar.x); - put_symbol_unsigned(sc.c, state, sar.y); + put_usymbol(sc.c, pic_mode); + put_usymbol(sc.c, sar.x); + put_usymbol(sc.c, sar.y); if (version >= 4) { - put_rac_full(sc.c, state, sc.slice_reset_contexts); - put_symbol_unsigned(sc.c, state, sc.slice_coding_mode); + put_rac_direct(sc.c, state[0], sc.slice_reset_contexts); + put_usymbol(sc.c, sc.slice_coding_mode); if (sc.slice_coding_mode != 1 && colorspace == 1) { - put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.y); - put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.x); + put_usymbol(sc.c, sc.slice_rct_coef.y); + put_usymbol(sc.c, sc.slice_rct_coef.x); } } } -void write_frame_header(inout SliceContext sc, uint64_t state) +void write_frame_header(inout SliceContext sc) { - u8buf sb = u8buf(state); - sb.v = uint8_t(128); - put_rac_full(sc.c, state, bool(key_frame)); + put_rac_equi(sc.c, bool(key_frame)); } #ifdef GOLOMB @@ -122,16 +113,12 @@ void main(void) { const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; - /* Write slice data */ - uint64_t scratch_state = uint64_t(scratch_data) + slice_idx*CONTEXT_SIZE; - u8buf sb = u8buf(scratch_state); - init_slice(slice_ctx[slice_idx], slice_idx); if (slice_idx == 0) - write_frame_header(slice_ctx[slice_idx], scratch_state); + write_frame_header(slice_ctx[slice_idx]); - write_slice_header(slice_ctx[slice_idx], scratch_state); + write_slice_header(slice_ctx[slice_idx]); #ifdef GOLOMB init_golomb(slice_ctx[slice_idx]); diff --git a/libavcodec/vulkan/ffv1_enc_vlc.comp b/libavcodec/vulkan/ffv1_enc_vlc.comp deleted file mode 100644 index 7a4d39e307f28..0000000000000 --- a/libavcodec/vulkan/ffv1_enc_vlc.comp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * FFv1 codec - * - * Copyright (c) 2024 Lynne - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -struct RLEState { - int count; - int diff; - int index; - bool mode; -}; - -void calc_new_state(inout RLEState state, int context) -{ - if (context == 0) - state.mode = false; - - if (!state.mode) - return; - - if (state.diff > 0) { - while (state.count >= (1 << log2_run[state.index])) { - state.count -= 1 << log2_run[state.index]; - state.index++; - } - if (state.index > 0) - state.index--; - state.count = 0; - state.mode = false; - if (state.diff > 0) - state.diff--; - } else { - state.count++; - } -} - -void encode_line(inout SliceContext sc, uint64_t state, - int y, int p, int comp, int bits, inout int run_index) -{ - ivec2 sp = sc.slice_pos; - - int w = sc.slice_dim.x; - if (p > 0 && p < 3) { - w >>= chroma_shift.x; - sp >>= chroma_shift; - } - - int run_count = 0; - bool run_mode = false; - - for (int x = 0; x < w; x++) { - ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits); - - if (d[0] == 0) - run_mode = true; - - if (run_mode) { - if (d[1] != 0) { - /* A very unlikely loop */ - while (run_count >= 1 << log2_run[run_index]) { - run_count -= 1 << log2_run[run_index]; - run_index++; - put_bits(sc.pb, 1, 1); - } - - put_bits(sc.pb, 1 + log2_run[run_index], run_count); - if (run_index != 0) - run_index--; - run_count = 0; - run_mode = false; - if (d[1] > 0) - d[1]--; - } else { - run_count++; - } - } - - if (!run_mode) { - VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]); - Symbol sym = get_vlc_symbol(sb, d[1], bits); - put_bits(sc.pb, sym.bits, sym.val); - } - } - - if (run_mode) { - while (run_count >= (1 << log2_run[run_index])) { - run_count -= 1 << log2_run[run_index]; - run_index++; - put_bits(sc.pb, 1, 1); - } - - if (run_count > 0) - put_bits(sc.pb, 1, 1); - } -} diff --git a/libavcodec/vulkan/ffv1_rct_search.comp b/libavcodec/vulkan/ffv1_rct_search.comp new file mode 100644 index 0000000000000..055bde46c40a5 --- /dev/null +++ b/libavcodec/vulkan/ffv1_rct_search.comp @@ -0,0 +1,139 @@ +/* + * FFv1 codec + * + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +ivec3 load_components(ivec2 pos) +{ + ivec3 pix = ivec3(imageLoad(src[0], pos)); + if (planar_rgb != 0) { + for (int i = 1; i < 3; i++) + pix[i] = int(imageLoad(src[i], pos)[0]); + } + + return ivec3(pix[fmt_lut[0]], pix[fmt_lut[1]], pix[fmt_lut[2]]); +} + +#define NUM_CHECKS 15 +const ivec2 rct_y_coeff[NUM_CHECKS] = { + ivec2(0, 0), // 4G + + ivec2(0, 1), // 3G + B + ivec2(1, 0), // R + 3G + ivec2(1, 1), // R + 2G + B + + ivec2(0, 2), // 2G + 2B + ivec2(2, 0), // 2R + 2G + ivec2(2, 2), // 2R + 2B + + ivec2(0, 3), // 1G + 3B + ivec2(3, 0), // 3R + 1G + + ivec2(0, 4), // 4B + ivec2(4, 0), // 4R + + ivec2(1, 2), // R + G + 2B + ivec2(2, 1), // 2R + G + B + + ivec2(3, 1), // 3R + B + ivec2(1, 3), // R + 3B +}; + +shared ivec3 pix_buf[gl_WorkGroupSize.x + 1][gl_WorkGroupSize.y + 1] = { }; + +ivec3 transform_sample(ivec3 pix, ivec2 rct_coef) +{ + pix.b -= pix.g; + pix.r -= pix.g; + pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2; + pix.b += rct_offset; + pix.r += rct_offset; + return pix; +} + +uint get_dist(ivec3 cur) +{ + ivec3 LL = pix_buf[gl_LocalInvocationID.x + 0][gl_LocalInvocationID.y + 1]; + ivec3 TL = pix_buf[gl_LocalInvocationID.x + 0][gl_LocalInvocationID.y + 0]; + ivec3 TT = pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 0]; + + ivec3 pred = ivec3(predict(LL.r, ivec2(TL.r, TT.r)), + predict(LL.g, ivec2(TL.g, TT.g)), + predict(LL.b, ivec2(TL.b, TT.b))); + + uvec3 c = abs(pred - cur); + return mid_pred(c.r, c.g, c.b); +} + +shared uint score_cols[gl_WorkGroupSize.y] = { }; +shared uint score_mode[16] = { }; + +void process(ivec2 pos) +{ + ivec3 pix = load_components(pos); + + for (int i = 0; i < NUM_CHECKS; i++) { + ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]); + pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix; + memoryBarrierShared(); + + uint dist = get_dist(tx_pix); + atomicAdd(score_mode[i], dist); + } +} + +void coeff_search(inout SliceContext sc) +{ + uvec2 img_size = imageSize(src[0]); + uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0, + gl_NumWorkGroups.x, 0); + uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1, + gl_NumWorkGroups.x, 0); + uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0, + gl_NumWorkGroups.y, 0); + uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1, + gl_NumWorkGroups.y, 0); + + for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += gl_WorkGroupSize.y) { + for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += gl_WorkGroupSize.x) { + process(ivec2(x, y)); + } + } + + if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) { + uint min_score = 0xFFFFFFFF; + uint min_idx = 3; + for (int i = 0; i < NUM_CHECKS; i++) { + if (score_mode[i] < min_score) { + min_score = score_mode[i]; + min_idx = i; + } + } + sc.slice_rct_coef = rct_y_coeff[min_idx]; + } +} + +void main(void) +{ + if (force_pcm == 1) + return; + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + coeff_search(slice_ctx[slice_idx]); +} diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp index 256b5f0e79742..8687b8bc3cf60 100644 --- a/libavcodec/vulkan/rangecoder.comp +++ b/libavcodec/vulkan/rangecoder.comp @@ -31,8 +31,9 @@ struct RangeCoder { uint8_t outstanding_byte; }; +#ifdef FULL_RENORM /* Full renorm version that can 
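/*
 * A minimal C sketch, not part of the patch itself: the parametrized
 * reversible colour transform that the search shader scores. With
 * coefficients (cx, cy) the luma plane becomes
 * G + ((cx*(R - G) + cy*(B - G)) >> 2); (1, 1) is the classic RCT and
 * (0, 0) keeps plain green. rct_offset is the bias that keeps the two
 * difference planes non-negative, and the search keeps the pair whose
 * median-predictor residuals are smallest.
 */
typedef struct { int r, g, b; } PixRCT;

static PixRCT rct_forward(PixRCT p, int cx, int cy, int rct_offset)
{
    p.b -= p.g;
    p.r -= p.g;
    p.g += (p.r * cx + p.b * cy) >> 2;
    p.b += rct_offset;
    p.r += rct_offset;
    return p;
}

static PixRCT rct_inverse(PixRCT p, int cx, int cy, int rct_offset)
{
    p.b -= rct_offset;
    p.r -= rct_offset;
    p.g -= (p.r * cx + p.b * cy) >> 2;
    p.r += p.g;
    p.b += p.g;
    return p;
}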
handle outstanding_byte == 0xFF */ -void renorm_encoder_full(inout RangeCoder c) +void renorm_encoder(inout RangeCoder c) { int bs_cnt = 0; u8buf bytestream = u8buf(c.bytestream); @@ -62,6 +63,8 @@ void renorm_encoder_full(inout RangeCoder c) c.low = bitfieldInsert(0, c.low, 8, 8); } +#else + /* Cannot deal with outstanding_byte == -1 in the name of speed */ void renorm_encoder(inout RangeCoder c) { @@ -90,59 +93,40 @@ void renorm_encoder(inout RangeCoder c) for (int i = 1; i < oc; i++) bs[i].v = fill; } +#endif -void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit) +void put_rac_internal(inout RangeCoder c, const int range1, bool bit) { - u8buf sb = u8buf(state); - uint val = uint(sb.v); - int range1 = uint16_t((c.range * val) >> 8); - #ifdef DEBUG - if (val == 0) - debugPrintfEXT("Error: state is zero (addr: 0x%lx)", uint64_t(sb)); if (range1 >= c.range) debugPrintfEXT("Error: range1 >= c.range"); if (range1 <= 0) debugPrintfEXT("Error: range1 <= 0"); #endif - int diff = c.range - range1; - if (bit) { - c.low += diff; - c.range = range1; - } else { - c.range = diff; - } + int ranged = c.range - range1; + c.low += bit ? ranged : 0; + c.range = bit ? range1 : ranged; - sb.v = zero_one_state[(uint(bit) << 8) + val]; + if (expectEXT(c.range < 0x100, false)) + renorm_encoder(c); +} -#ifdef DEBUG - if (sb.v == 0) - debugPrintfEXT("Error: inserted zero state from tab %i idx %i", bit, val); -#endif +void put_rac_direct(inout RangeCoder c, inout uint8_t state, bool bit) +{ + put_rac_internal(c, (c.range * state) >> 8, bit); + state = zero_one_state[(uint(bit) << 8) + state]; +} + +void put_rac(inout RangeCoder c, uint64_t state, bool bit) +{ + put_rac_direct(c, u8buf(state).v, bit); } /* Equiprobable bit */ void put_rac_equi(inout RangeCoder c, bool bit) { - int range1 = c.range >> 1; - -#ifdef DEBUG - if (range1 >= c.range) - debugPrintfEXT("Error: range1 >= c.range"); - if (range1 <= 0) - debugPrintfEXT("Error: range1 <= 0"); -#endif - - if (bit) { - c.low += c.range - range1; - c.range = range1; - } else { - c.range -= range1; - } - - if (expectEXT(c.range < 0x100, false)) - renorm_encoder(c); + put_rac_internal(c, c.range >> 1, bit); } void put_rac_terminate(inout RangeCoder c) @@ -226,11 +210,9 @@ void refill(inout RangeCoder c) } } -bool get_rac_direct(inout RangeCoder c, inout uint8_t state) +bool get_rac_internal(inout RangeCoder c, const int range1) { - int range1 = c.range * state >> 8; int ranged = c.range - range1; - bool bit = c.low >= ranged; c.low -= bit ? ranged : 0; c.range = (bit ? 0 : ranged) + (bit ? range1 : 0); @@ -238,6 +220,12 @@ bool get_rac_direct(inout RangeCoder c, inout uint8_t state) if (expectEXT(c.range < 0x100, false)) refill(c); + return bit; +} + +bool get_rac_direct(inout RangeCoder c, inout uint8_t state) +{ + bool bit = get_rac_internal(c, c.range * state >> 8); state = zero_one_state[state + (bit ? 
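/*
 * A minimal C sketch, not part of the patch itself: the binary range-coder
 * step that put_rac_internal() now shares between put_rac_direct() and
 * put_rac_equi(). The 8-bit probability state splits the current range, the
 * sub-range selected by the coded bit becomes the new range, and the coder
 * renormalizes once the range drops below one byte. The zero_one_state
 * adaptation and the bytestream handling are omitted here.
 */
#include <stdint.h>

typedef struct {
    uint32_t low;
    uint32_t range;
} RangeCoderSketch;

static void put_rac_sketch(RangeCoderSketch *c, uint8_t state, int bit)
{
    uint32_t range1 = (c->range * state) >> 8;  /* sub-range for the set bit */
    uint32_t ranged = c->range - range1;

    if (bit) {
        c->low  += ranged;
        c->range = range1;
    } else {
        c->range = ranged;
    }

    if (c->range < 0x100) {
        /* renorm_encoder() runs here: it emits the top byte of low,
         * handles carries/outstanding bytes, and scales low and range
         * back up by 256 */
    }
}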
256 : 0)]; return bit; } @@ -249,18 +237,5 @@ bool get_rac(inout RangeCoder c, uint64_t state) bool get_rac_equi(inout RangeCoder c) { - int range1 = c.range >> 1; - - c.range -= range1; - - bool bit = c.low >= c.range; - if (bit) { - c.low -= c.range; - c.range = range1; - } - - if (expectEXT(c.range < 0x100, false)) - refill(c); - - return bit; + return get_rac_internal(c, c.range >> 1); } diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index f1313c840950c..7310ba1547960 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -142,6 +142,7 @@ static void init_frame(FFVulkanDecodeContext *dec, FFVulkanDecodePicture *vkpic) vkpic->destroy_image_view = vk->DestroyImageView; vkpic->wait_semaphores = vk->WaitSemaphores; + vkpic->invalidate_memory_ranges = vk->InvalidateMappedMemoryRanges; } int ff_vk_decode_prepare_frame(FFVulkanDecodeContext *dec, AVFrame *pic, diff --git a/libavcodec/vulkan_decode.h b/libavcodec/vulkan_decode.h index cbd22b3591128..bf6506f280e6b 100644 --- a/libavcodec/vulkan_decode.h +++ b/libavcodec/vulkan_decode.h @@ -114,6 +114,7 @@ typedef struct FFVulkanDecodePicture { /* Vulkan functions needed for destruction, as no other context is guaranteed to exist */ PFN_vkWaitSemaphores wait_semaphores; PFN_vkDestroyImageView destroy_image_view; + PFN_vkInvalidateMappedMemoryRanges invalidate_memory_ranges; } FFVulkanDecodePicture; /** diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c index cbde2f319a375..c839f4c3879e1 100644 --- a/libavcodec/vulkan_ffv1.c +++ b/libavcodec/vulkan_ffv1.c @@ -26,6 +26,8 @@ #include "libavutil/vulkan_spirv.h" #include "libavutil/mem.h" +#define RGB_LINECACHE 2 + extern const char *ff_source_common_comp; extern const char *ff_source_rangecoder_comp; extern const char *ff_source_ffv1_vlc_comp; @@ -219,7 +221,7 @@ static int vk_ffv1_start_frame(AVCodecContext *avctx, &fp->slice_status_buf, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, - NULL, f->slice_count*sizeof(uint32_t), + NULL, 2*f->slice_count*sizeof(uint32_t), VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); if (err < 0) @@ -406,7 +408,7 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) ff_vk_shader_update_desc_buffer(&ctx->s, exec, &fv->setup, 1, 2, 0, slice_status, - 0, f->slice_count*sizeof(uint32_t), + 0, 2*f->slice_count*sizeof(uint32_t), VK_FORMAT_UNDEFINED); ff_vk_exec_bind_shader(&ctx->s, exec, &fv->setup); @@ -536,10 +538,15 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) 1, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, decode_shader, + 1, 2, 0, + slice_status, + 0, 2*f->slice_count*sizeof(uint32_t), + VK_FORMAT_UNDEFINED); if (is_rgb) ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader, f->picture.f, vp->view.out, - 1, 2, + 1, 3, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); @@ -610,6 +617,7 @@ static void define_shared_code(FFVulkanShader *shd, int use32bit) GLSLC(0, #define DECODE ); + av_bprintf(&shd->src, "#define RGB_LINECACHE %i\n" ,RGB_LINECACHE); av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n" ,CONTEXT_SIZE); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n" ,MAX_QUANT_TABLE_MASK); @@ -697,8 +705,8 @@ static int init_setup_shader(FFV1Context *f, FFVulkanContext *s, .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .stages = VK_SHADER_STAGE_COMPUTE_BIT, .mem_quali = "writeonly", - .buf_content = "uint32_t slice_crc_mismatch", - .buf_elems = f->max_slice_count, + .buf_content = "uint32_t 
slice_status", + .buf_elems = 2*f->max_slice_count, }, }; RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0)); @@ -892,6 +900,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format), .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, + { + .name = "slice_status_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "writeonly", + .buf_content = "uint32_t slice_status", + .buf_elems = 2*f->max_slice_count, + }, { .name = "dst", .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, @@ -903,7 +919,7 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, }; - RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2 + rgb, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3 + rgb, 0, 0)); GLSLD(ff_source_ffv1_dec_comp); @@ -936,7 +952,7 @@ static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s, frames_ctx->format = AV_PIX_FMT_VULKAN; frames_ctx->sw_format = sw_format; frames_ctx->width = s->frames->width; - frames_ctx->height = f->num_v_slices*2; + frames_ctx->height = f->num_v_slices*RGB_LINECACHE; vk_frames = frames_ctx->hwctx; vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; @@ -1111,22 +1127,35 @@ static int vk_decode_ffv1_init(AVCodecContext *avctx) static void vk_ffv1_free_frame_priv(AVRefStructOpaque _hwctx, void *data) { - AVHWDeviceContext *hwctx = _hwctx.nc; + AVHWDeviceContext *dev_ctx = _hwctx.nc; + AVVulkanDeviceContext *hwctx = dev_ctx->hwctx; FFv1VulkanDecodePicture *fp = data; FFVulkanDecodePicture *vp = &fp->vp; + FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data; - ff_vk_decode_free_frame(hwctx, vp); + ff_vk_decode_free_frame(dev_ctx, vp); + + /* Invalidate slice/output data if needed */ + if (!(slice_status->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + VkMappedMemoryRange invalidate_data = { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = slice_status->mem, + .offset = 0, + .size = 2*fp->slice_num*sizeof(uint32_t), + }; + vp->invalidate_memory_ranges(hwctx->act_dev, + 1, &invalidate_data); + } - if (fp->crc_checked) { - FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data; - for (int i = 0; i < fp->slice_num; i++) { - uint32_t crc_res; - crc_res = AV_RN32(slice_status->mapped_mem + i*sizeof(uint32_t)); - if (crc_res != 0) - av_log(hwctx, AV_LOG_ERROR, "CRC mismatch in slice %i, res: 0x%x\n", - i, crc_res); - } + for (int i = 0; i < fp->slice_num; i++) { + uint32_t crc_res = 0; + if (fp->crc_checked) + crc_res = AV_RN32(slice_status->mapped_mem + 2*i*sizeof(uint32_t) + 0); + uint32_t status = AV_RN32(slice_status->mapped_mem + 2*i*sizeof(uint32_t) + 4); + if (status || crc_res) + av_log(dev_ctx, AV_LOG_ERROR, "Slice %i status: 0x%x, CRC 0x%x\n", + i, status, crc_res); } av_buffer_unref(&vp->slices_buf); diff --git a/libavcodec/vvc.h b/libavcodec/vvc.h index 92639779c1db4..5490ddb4c81d1 100644 --- a/libavcodec/vvc.h +++ b/libavcodec/vvc.h @@ -154,6 +154,9 @@ enum { // {sps, ph}_num_{ver, hor}_virtual_boundaries should in [0, 3] VVC_MAX_VBS = 3, + + // 8.4.5.3 Decoding process for palette mode - maxNumPalettePredictorSize + VVC_MAX_NUM_PALETTE_PREDICTOR_SIZE = 63 }; #endif /* AVCODEC_VVC_H */ diff --git a/libavcodec/vvc/Makefile b/libavcodec/vvc/Makefile index 6a28d32bc25cc..10125ffc2d3bc 100644 --- a/libavcodec/vvc/Makefile +++ b/libavcodec/vvc/Makefile @@ -14,4 +14,5 @@ OBJS-$(CONFIG_VVC_DECODER) += vvc/dec.o \ vvc/mvs.o \ vvc/ps.o 
\ vvc/refs.o \ + vvc/sei.o \ vvc/thread.o \ diff --git a/libavcodec/vvc/cabac.c b/libavcodec/vvc/cabac.c index 55101448931b2..6847ce59aff10 100644 --- a/libavcodec/vvc/cabac.c +++ b/libavcodec/vvc/cabac.c @@ -928,6 +928,27 @@ static int truncated_binary_decode(VVCLocalContext *lc, const int c_max) return v; } +// 9.3.3.5 k-th order Exp - Golomb binarization process +static int kth_order_egk_decode(CABACContext *c, int k) +{ + int bit = 1; + int value = 0; + int symbol = 0; + + while (bit) { + bit = get_cabac_bypass(c); + value += bit << k++; + } + + if (--k) { + for (int i = 0; i < k; i++) + symbol = (symbol << 1) | get_cabac_bypass(c); + value += symbol; + } + + return value; +} + // 9.3.3.6 Limited k-th order Exp-Golomb binarization process static int limited_kth_order_egk_decode(CABACContext *c, const int k, const int max_pre_ext_len, const int trunc_suffix_len) { @@ -947,6 +968,17 @@ static int limited_kth_order_egk_decode(CABACContext *c, const int k, const int return val; } +// 9.3.3.7 Fixed-length binarization process +static int fixed_length_decode(CABACContext* c, const int len) +{ + int value = 0; + + for (int i = 0; i < len; i++) + value = (value << 1) | get_cabac_bypass(c); + + return value; +} + static av_always_inline void get_left_top(const VVCLocalContext *lc, uint8_t *left, uint8_t *top, const int x0, const int y0, const uint8_t *left_ctx, const uint8_t *top_ctx) @@ -990,11 +1022,7 @@ int ff_vvc_sao_type_idx_decode(VVCLocalContext *lc) int ff_vvc_sao_band_position_decode(VVCLocalContext *lc) { - int value = get_cabac_bypass(&lc->ep->cc); - - for (int i = 0; i < 4; i++) - value = (value << 1) | get_cabac_bypass(&lc->ep->cc); - return value; + return fixed_length_decode(&lc->ep->cc, 5); } int ff_vvc_sao_offset_abs_decode(VVCLocalContext *lc) @@ -1014,9 +1042,7 @@ int ff_vvc_sao_offset_sign_decode(VVCLocalContext *lc) int ff_vvc_sao_eo_class_decode(VVCLocalContext *lc) { - int ret = get_cabac_bypass(&lc->ep->cc) << 1; - ret |= get_cabac_bypass(&lc->ep->cc); - return ret; + return (get_cabac_bypass(&lc->ep->cc) << 1) | get_cabac_bypass(&lc->ep->cc); } int ff_vvc_alf_ctb_flag(VVCLocalContext *lc, const int rx, const int ry, const int c_idx) @@ -1351,6 +1377,58 @@ int ff_vvc_intra_chroma_pred_mode(VVCLocalContext *lc) return (get_cabac_bypass(&lc->ep->cc) << 1) | get_cabac_bypass(&lc->ep->cc); } +int ff_vvc_palette_predictor_run(VVCLocalContext *lc) +{ + return kth_order_egk_decode(&lc->ep->cc, 0); +} + +int ff_vvc_num_signalled_palette_entries(VVCLocalContext *lc) +{ + return kth_order_egk_decode(&lc->ep->cc, 0); +} + +int ff_vvc_new_palette_entries(VVCLocalContext *lc, const int bit_depth) +{ + return fixed_length_decode(&lc->ep->cc, bit_depth); +} + +bool ff_vvc_palette_escape_val_present_flag(VVCLocalContext *lc) +{ + return get_cabac_bypass(&lc->ep->cc); +} + +bool ff_vvc_palette_transpose_flag(VVCLocalContext *lc) +{ + return GET_CABAC(PALETTE_TRANSPOSE_FLAG); +} + +bool ff_vvc_run_copy_flag(VVCLocalContext *lc, const int prev_run_type, const int prev_run_position, const int cur_pos) +{ + uint8_t run_left_lut[] = { 0, 1, 2, 3, 4 }; + uint8_t run_top_lut[] = { 5, 6, 6, 7, 7 }; + + int bin_dist = cur_pos - prev_run_position - 1; + uint8_t *run_lut = prev_run_type == 1 ? run_top_lut : run_left_lut; + uint8_t ctx_inc = bin_dist <= 4 ? 
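/*
 * A minimal C sketch, not part of the patch itself: the k-th order
 * Exp-Golomb bypass binarization read by kth_order_egk_decode() above
 * (clause 9.3.3.5). A unary prefix of n ones contributes (2^n - 1) << k,
 * then n + k suffix bits are appended. read_bypass_bit() stands in for
 * get_cabac_bypass() and is an assumed callback, not a real API.
 */
static int egk_decode(int k, int (*read_bypass_bit)(void *opaque), void *opaque)
{
    int value = 0;
    int bit   = 1;
    int sym   = 0;

    while (bit) {                 /* unary prefix terminated by a 0 */
        bit = read_bypass_bit(opaque);
        value += bit << k++;
    }

    if (--k) {                    /* n + k fixed suffix bits */
        for (int i = 0; i < k; i++)
            sym = (sym << 1) | read_bypass_bit(opaque);
        value += sym;
    }

    return value;
}

/* Example with k = 0: the bypass bits 1 1 0 1 0 decode to
 * (2^2 - 1) + 0b10 = 5. */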
run_lut[bin_dist] : run_lut[4]; + + return GET_CABAC(RUN_COPY_FLAG + ctx_inc); +} + +bool ff_vvc_copy_above_palette_indices_flag(VVCLocalContext *lc) +{ + return GET_CABAC(COPY_ABOVE_PALETTE_INDICES_FLAG); +} + +int ff_vvc_palette_idx_idc(VVCLocalContext *lc, const int max_palette_index, const bool adjust) +{ + return truncated_binary_decode(lc, max_palette_index - adjust); +} + +int ff_vvc_palette_escape_val(VVCLocalContext *lc) +{ + return kth_order_egk_decode(&lc->ep->cc, 5); +} + int ff_vvc_general_merge_flag(VVCLocalContext *lc) { return GET_CABAC(GENERAL_MERGE_FLAG); @@ -1458,12 +1536,7 @@ int ff_vvc_merge_idx(VVCLocalContext *lc) int ff_vvc_merge_gpm_partition_idx(VVCLocalContext *lc) { - int i = 0; - - for (int j = 0; j < 6; j++) - i = (i << 1) | get_cabac_bypass(&lc->ep->cc); - - return i; + return fixed_length_decode(&lc->ep->cc, 6); } int ff_vvc_merge_gpm_idx(VVCLocalContext *lc, const int idx) @@ -1630,6 +1703,11 @@ int ff_vvc_tu_y_coded_flag(VVCLocalContext *lc) return lc->parse.prev_tu_cbf_y; } +int ff_vvc_cu_act_enabled_flag(VVCLocalContext *lc) +{ + return GET_CABAC(CU_ACT_ENABLED_FLAG); +} + int ff_vvc_cu_qp_delta_abs(VVCLocalContext *lc) { int v, i, k; diff --git a/libavcodec/vvc/cabac.h b/libavcodec/vvc/cabac.h index e9bc98e23a510..972890317edfb 100644 --- a/libavcodec/vvc/cabac.h +++ b/libavcodec/vvc/cabac.h @@ -81,6 +81,15 @@ int ff_vvc_intra_luma_mpm_remainder(VVCLocalContext *lc); int ff_vvc_cclm_mode_flag(VVCLocalContext *lc); int ff_vvc_cclm_mode_idx(VVCLocalContext *lc); int ff_vvc_intra_chroma_pred_mode(VVCLocalContext *lc); +int ff_vvc_palette_predictor_run(VVCLocalContext *lc); +int ff_vvc_num_signalled_palette_entries(VVCLocalContext *lc); +int ff_vvc_new_palette_entries(VVCLocalContext *lc, int bit_dpeth); +bool ff_vvc_palette_escape_val_present_flag(VVCLocalContext *lc); +bool ff_vvc_palette_transpose_flag(VVCLocalContext *lc); +bool ff_vvc_run_copy_flag(VVCLocalContext *lc, int prev_run_type, int prev_run_position, int cur_pos); +bool ff_vvc_copy_above_palette_indices_flag(VVCLocalContext *lc); +int ff_vvc_palette_idx_idc(VVCLocalContext *lc, int max_palette_index, bool adjust); +int ff_vvc_palette_escape_val(VVCLocalContext *lc); //inter int ff_vvc_general_merge_flag(VVCLocalContext *lc); @@ -111,6 +120,7 @@ int ff_vvc_bcw_idx(VVCLocalContext *lc, int no_backward_pred_flag); int ff_vvc_tu_cb_coded_flag(VVCLocalContext *lc); int ff_vvc_tu_cr_coded_flag(VVCLocalContext *lc, int tu_cb_coded_flag); int ff_vvc_tu_y_coded_flag(VVCLocalContext *lc); +int ff_vvc_cu_act_enabled_flag(VVCLocalContext *lc); int ff_vvc_cu_chroma_qp_offset_flag(VVCLocalContext *lc); int ff_vvc_cu_chroma_qp_offset_idx(VVCLocalContext *lc); int ff_vvc_tu_joint_cbcr_residual_flag(VVCLocalContext *lc, int tu_cb_coded_flag, int tu_cr_coded_flag); diff --git a/libavcodec/vvc/ctu.c b/libavcodec/vvc/ctu.c index 080b740cc68d7..ba4c89b1d1b14 100644 --- a/libavcodec/vvc/ctu.c +++ b/libavcodec/vvc/ctu.c @@ -25,6 +25,7 @@ #include "cabac.h" #include "ctu.h" #include "inter.h" +#include "intra.h" #include "mvs.h" #define PROF_TEMP_SIZE (PROF_BLOCK_SIZE) * sizeof(int16_t) @@ -391,6 +392,8 @@ static int hls_transform_unit(VVCLocalContext *lc, int x0, int y0,int tu_width, if (ret < 0) return ret; set_tb_tab(fc->tab.tu_coded_flag[tb->c_idx], tu->coded_flag[tb->c_idx], fc, tb); + } else if (cu->act_enabled_flag) { + memset(tb->coeffs, 0, tb->tb_width * tb->tb_height * sizeof(*tb->coeffs)); } if (tb->c_idx != CR) set_tb_size(fc, tb); @@ -501,13 +504,12 @@ static int skipped_transform_tree(VVCLocalContext 
*lc, int x0, int y0,int tu_wid SKIPPED_TRANSFORM_TREE(x0, y0 + trafo_height); } else { TransformUnit *tu = add_tu(fc, lc->cu, x0, y0, tu_width, tu_height); - const int has_chroma = sps->r->sps_chroma_format_idc && cu->tree_type != DUAL_TREE_LUMA; - const int c_start = cu->tree_type == DUAL_TREE_CHROMA ? CB : LUMA; - const int c_end = has_chroma ? VVC_MAX_SAMPLE_ARRAYS : CB; + int start, end; if (!tu) return AVERROR_INVALIDDATA; - for (int i = c_start; i < c_end; i++) { + ff_vvc_channel_range(&start, &end, cu->tree_type, sps->r->sps_chroma_format_idc); + for (int i = start; i < end; i++) { TransformBlock *tb = add_tb(tu, lc, x0, y0, tu_width >> sps->hshift[i], tu_height >> sps->vshift[i], i); if (i != CR) set_tb_size(fc, tb); @@ -895,7 +897,7 @@ static void derive_chroma_intra_pred_mode(VVCLocalContext *lc, enum IntraPredMode luma_intra_pred_mode = SAMPLE_CTB(fc->tab.ipm, x_cb, y_cb); if (cu->tree_type == SINGLE_TREE && sps->r->sps_chroma_format_idc == CHROMA_FORMAT_444 && - intra_chroma_pred_mode == 4 && intra_mip_flag) { + (intra_chroma_pred_mode == 4 || cu->act_enabled_flag) && intra_mip_flag) { cu->mip_chroma_direct_flag = 1; cu->intra_pred_mode_c = luma_intra_pred_mode; return; @@ -1007,34 +1009,38 @@ static void intra_luma_pred_modes(VVCLocalContext *lc) static void intra_chroma_pred_modes(VVCLocalContext *lc) { - const VVCSPS *sps = lc->fc->ps.sps; - CodingUnit *cu = lc->cu; - const int hs = sps->hshift[CHROMA]; - const int vs = sps->vshift[CHROMA]; + const VVCSPS *sps = lc->fc->ps.sps; + CodingUnit *cu = lc->cu; + const int hs = sps->hshift[CHROMA]; + const int vs = sps->vshift[CHROMA]; + int cclm_mode_flag = 0; + int cclm_mode_idx = 0; + int intra_chroma_pred_mode = 0; + + if (!cu->act_enabled_flag) { + cu->mip_chroma_direct_flag = 0; + if (sps->r->sps_bdpcm_enabled_flag && + (cu->cb_width >> hs) <= sps->max_ts_size && + (cu->cb_height >> vs) <= sps->max_ts_size) { + cu->bdpcm_flag[CB] = cu->bdpcm_flag[CR] = ff_vvc_intra_bdpcm_chroma_flag(lc); + } + if (cu->bdpcm_flag[CHROMA]) { + cu->intra_pred_mode_c = ff_vvc_intra_bdpcm_chroma_dir_flag(lc) ? INTRA_VERT : INTRA_HORZ; + } else { + const int cclm_enabled = get_cclm_enabled(lc, cu->x0, cu->y0); - cu->mip_chroma_direct_flag = 0; - if (sps->r->sps_bdpcm_enabled_flag && - (cu->cb_width >> hs) <= sps->max_ts_size && - (cu->cb_height >> vs) <= sps->max_ts_size) { - cu->bdpcm_flag[CB] = cu->bdpcm_flag[CR] = ff_vvc_intra_bdpcm_chroma_flag(lc); - } - if (cu->bdpcm_flag[CHROMA]) { - cu->intra_pred_mode_c = ff_vvc_intra_bdpcm_chroma_dir_flag(lc) ? INTRA_VERT : INTRA_HORZ; - } else { - const int cclm_enabled = get_cclm_enabled(lc, cu->x0, cu->y0); - int cclm_mode_flag = 0; - int cclm_mode_idx = 0; - int intra_chroma_pred_mode = 0; + if (cclm_enabled) + cclm_mode_flag = ff_vvc_cclm_mode_flag(lc); - if (cclm_enabled) - cclm_mode_flag = ff_vvc_cclm_mode_flag(lc); + if (cclm_mode_flag) + cclm_mode_idx = ff_vvc_cclm_mode_idx(lc); + else + intra_chroma_pred_mode = ff_vvc_intra_chroma_pred_mode(lc); + } + } - if (cclm_mode_flag) - cclm_mode_idx = ff_vvc_cclm_mode_idx(lc); - else - intra_chroma_pred_mode = ff_vvc_intra_chroma_pred_mode(lc); + if (!cu->bdpcm_flag[CHROMA]) derive_chroma_intra_pred_mode(lc, cclm_mode_flag, cclm_mode_idx, intra_chroma_pred_mode); - } } static PredMode pred_mode_decode(VVCLocalContext *lc, @@ -1047,13 +1053,15 @@ static PredMode pred_mode_decode(VVCLocalContext *lc, const H266RawSliceHeader *rsh = lc->sc->sh.r; const int ch_type = tree_type == DUAL_TREE_CHROMA ? 
1 : 0; const int is_4x4 = cu->cb_width == 4 && cu->cb_height == 4; + const int is_128 = cu->cb_width == 128 || cu->cb_height == 128; + const int hs = sps->hshift[CHROMA]; + const int vs = sps->vshift[CHROMA]; int pred_mode_flag; int pred_mode_ibc_flag; PredMode pred_mode; cu->skip_flag = 0; if (!IS_I(rsh) || sps->r->sps_ibc_enabled_flag) { - const int is_128 = cu->cb_width == 128 || cu->cb_height == 128; if (tree_type != DUAL_TREE_CHROMA && ((!is_4x4 && mode_type != MODE_TYPE_INTRA) || (sps->r->sps_ibc_enabled_flag && !is_128))) { @@ -1088,6 +1096,14 @@ static PredMode pred_mode_decode(VVCLocalContext *lc, pred_mode = MODE_INTRA; } + if (pred_mode == MODE_INTRA && sps->r->sps_palette_enabled_flag && !is_128 && !cu->skip_flag && + mode_type != MODE_TYPE_INTER && ((cu->cb_width * cu->cb_height) > + (tree_type != DUAL_TREE_CHROMA ? 16 : (16 << hs << vs))) && + (mode_type != MODE_TYPE_INTRA || tree_type != DUAL_TREE_CHROMA)) { + if (ff_vvc_pred_mode_plt_flag(lc)) + pred_mode = MODE_PLT; + } + set_cb_tab(lc, fc->tab.cpm[cu->ch_type], pred_mode); if (tree_type == SINGLE_TREE) set_cb_tab(lc, fc->tab.cpm[CHROMA], pred_mode); @@ -1756,8 +1772,8 @@ static void fill_dmvr_info(const VVCLocalContext *lc) const VVCFrameContext *fc = lc->fc; const CodingUnit *cu = lc->cu; - if (cu->pred_mode == MODE_IBC) { - ff_vvc_set_intra_mvf(lc, 1); + if (cu->pred_mode == MODE_IBC || cu->pred_mode == MODE_PLT) { + ff_vvc_set_intra_mvf(lc, true, cu->pred_mode == MODE_IBC ? PF_IBC : PF_PLT, false); } else { const VVCPPS *pps = fc->ps.pps; const int w = cu->cb_width >> MIN_PU_LOG2; @@ -1806,17 +1822,345 @@ static int inter_data(VVCLocalContext *lc) return ret; } +static TransformUnit* palette_add_tu(VVCLocalContext *lc, const int start, const int end, const VVCTreeType tree_type) +{ + CodingUnit *cu = lc->cu; + const VVCSPS *sps = lc->fc->ps.sps; + TransformUnit *tu = add_tu(lc->fc, cu, cu->x0, cu->y0, cu->cb_width, cu->cb_height); + + if (!tu) + return NULL; + + for (int c = start; c < end; c++) { + const int w = tu->width >> sps->hshift[c]; + const int h = tu->height >> sps->vshift[c]; + TransformBlock *tb = add_tb(tu, lc, tu->x0, tu->y0, w, h, c); + if (c != CR) + set_tb_size(lc->fc, tb); + } + + for (int i = 0; i < FF_ARRAY_ELEMS(cu->plt); i++) + cu->plt[i].size = 0; + + return tu; +} + +static int palette_predicted(VVCLocalContext *lc, const bool local_dual_tree, int start, int end, + bool *predictor_reused, const int predictor_size, const int max_entries) +{ + CodingUnit *cu = lc->cu; + int nb_predicted = 0; + + if (local_dual_tree) { + start = LUMA; + end = VVC_MAX_SAMPLE_ARRAYS; + } + + for (int i = 0; i < predictor_size && nb_predicted < max_entries; i++) { + const int run = ff_vvc_palette_predictor_run(lc); + if (run == 1) + break; + + if (run > 1) + i += run - 1; + + if (i >= predictor_size) + return AVERROR_INVALIDDATA; + + predictor_reused[i] = true; + for (int c = start; c < end; c++) + cu->plt[c].entries[nb_predicted] = lc->ep->pp[c].entries[i]; + nb_predicted++; + } + + for (int c = start; c < end; c++) + cu->plt[c].size = nb_predicted; + + return 0; +} + +static int palette_signaled(VVCLocalContext *lc, const bool local_dual_tree, + const int start, const int end, const int max_entries) +{ + const VVCSPS *sps = lc->fc->ps.sps; + CodingUnit *cu = lc->cu; + const int nb_predicted = cu->plt[start].size; + const int nb_signaled = nb_predicted < max_entries ? 
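/*
 * A minimal C sketch, not part of the patch itself: the predictor-reuse
 * signalling that palette_predicted() consumes. A palette_predictor_run of
 * 0 reuses the next predictor entry, 1 terminates the list, and any r > 1
 * skips r - 1 entries before reusing one. mark_reused() is an illustrative
 * helper, not decoder API, and ignores the max_entries cap handled above.
 */
static int mark_reused(const int *runs, int nb_runs,
                       unsigned char *reused, int predictor_size)
{
    int i = 0;

    for (int n = 0; n < nb_runs && i < predictor_size; n++) {
        int run = runs[n];

        if (run == 1)
            return 0;            /* end of the reuse list */
        if (run > 1)
            i += run - 1;        /* skip run - 1 predictor entries */
        if (i >= predictor_size)
            return -1;           /* invalid bitstream */

        reused[i++] = 1;         /* this predictor entry enters the palette */
    }
    return 0;
}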
ff_vvc_num_signalled_palette_entries(lc) : 0; + const int size = nb_predicted + nb_signaled; + const bool dual_tree_luma = local_dual_tree && cu->tree_type == DUAL_TREE_LUMA; + + if (size > max_entries) + return AVERROR_INVALIDDATA; + + for (int c = start; c < end; c++) { + Palette *plt = cu->plt + c; + for (int i = nb_predicted; i < size; i++) { + plt->entries[i] = ff_vvc_new_palette_entries(lc, sps->bit_depth); + if (dual_tree_luma) { + plt[CB].entries[i] = 1 << (sps->bit_depth - 1); + plt[CR].entries[i] = 1 << (sps->bit_depth - 1); + } + } + plt->size = size; + } + + return 0; +} + +static void palette_update_predictor(VVCLocalContext *lc, const bool local_dual_tree, int start, int end, + bool *predictor_reused, const int predictor_size) +{ + CodingUnit *cu = lc->cu; + const int max_predictor = VVC_MAX_NUM_PALETTE_PREDICTOR_SIZE >> (cu->tree_type != SINGLE_TREE && !local_dual_tree); + + if (local_dual_tree) { + start = LUMA; + end = VVC_MAX_SAMPLE_ARRAYS; + } + + for (int c = start; c < end; c++) { + Palette *pp = lc->ep->pp + c; + Palette *plt = cu->plt + c; + int i = cu->plt[start].size;; + + // copy unused predictors to the end of plt + for (int j = 0; j < predictor_size && i < max_predictor; j++) { + if (!predictor_reused[j]) { + plt->entries[i] = pp->entries[j]; + i++; + } + } + + memcpy(pp->entries, plt->entries, i * sizeof(pp->entries[0])); + pp->size = i; + } +} + +static void palette_qp(VVCLocalContext *lc, VVCTreeType tree_type, const bool escape_present) +{ + const VVCFrameContext *fc = lc->fc; + const VVCPPS *pps = fc->ps.pps; + const H266RawSliceHeader *rsh = lc->sc->sh.r; + const CodingUnit *cu = lc->cu; + + if (tree_type != DUAL_TREE_CHROMA) { + const bool has_qp_delta = escape_present && + pps->r->pps_cu_qp_delta_enabled_flag && !lc->parse.is_cu_qp_delta_coded; + set_qp_y(lc, cu->x0, cu->y0, has_qp_delta); + } + + if (tree_type != DUAL_TREE_LUMA) { + if (rsh->sh_cu_chroma_qp_offset_enabled_flag && !lc->parse.is_cu_chroma_qp_offset_coded) + chroma_qp_offset_decode(lc, 0, 1); + set_qp_c(lc); + } +} + +#define PALETTE_SET_PIXEL(xc, yc, pix) \ + do { \ + const int off = ((xc) >> hs) + ((yc) >> vs) * tb->tb_width; \ + if (sps->bit_depth == 8) \ + u8[off] = pix; \ + else \ + u16[off] = pix; \ + } while (0) + +#define PALETTE_INDEX(x, y) index[(y) * cu->cb_width + (x)] + +// 6.5.3 Horizontal and vertical traverse scan order array initialization process +// The hTravScan and vTravScan tables require approximately 576 KB of memory. +// To save space, we use a macro to achieve the same functionality. +#define TRAV_COL(p, wlog, mask) ((p & mask) ^ (-((p >> wlog) & 1) & mask)) +#define TRAV_ROW(p, hlog) (p >> hlog) +#define TRAV(trans, p, wlog, hlog, mask) (trans ? 
TRAV_ROW((p), hlog) : TRAV_COL((p), wlog, mask)) +#define TRAV_X(pos) TRAV(transpose, pos, wlog2, hlog2, wmask) +#define TRAV_Y(pos) TRAV(!transpose, pos, hlog2, wlog2, hmask) + +static int palette_subblock_data(VVCLocalContext *lc, + const int max_index, const int subset_id, const bool transpose, + uint8_t *run_type, uint8_t *index, int *prev_run_pos, bool *adjust) +{ + const CodingUnit *cu = lc->cu; + TransformUnit *tu = cu->tus.head; + const VVCSPS *sps = lc->fc->ps.sps; + const int min_pos = subset_id << 4; + const int max_pos = FFMIN(min_pos + 16, cu->cb_width * cu->cb_height); + const int wmask = cu->cb_width - 1; + const int hmask = cu->cb_height - 1; + const int wlog2 = av_log2(cu->cb_width); + const int hlog2 = av_log2(cu->cb_height); + const uint8_t esc = cu->plt[tu->tbs[0].c_idx].size; + uint8_t run_copy[16] = { 0 }; + + for (int i = min_pos; i < max_pos; i++) { + const int xc = TRAV_X(i); + const int yc = TRAV_Y(i); + + if (i > 0 && max_index > 0) + run_copy[i - min_pos] = ff_vvc_run_copy_flag(lc, run_type[i - 1], *prev_run_pos, i); + + run_type[i] = 0; + if (max_index > 0 && !run_copy[i - min_pos]) { + if (((!transpose && yc > 0) || (transpose && xc > 0)) + && i > 0 && !run_type[i - 1]) { + run_type[i] = ff_vvc_copy_above_palette_indices_flag(lc); + } + *prev_run_pos = i; + } else if (i > 0) { + run_type[i] = run_type[i - 1]; + } + } + + for (int i = min_pos; i < max_pos; i++) { + const int xc = TRAV_X(i); + const int yc = TRAV_Y(i); + const int prev_xc = i > 0 ? TRAV_X(i - 1) : 0; + const int prev_yc = i > 0 ? TRAV_Y(i - 1) : 0; + + int idx = 0; + if (max_index > 0 && !run_copy[i - min_pos] && !run_type[i]) { + if (max_index - *adjust > 0) + idx = ff_vvc_palette_idx_idc(lc, max_index, *adjust); + if (i > 0) { + const int ref_idx = !run_type[i - 1] ? + PALETTE_INDEX(prev_xc, prev_yc) : PALETTE_INDEX(xc - transpose, yc - !transpose); + idx += (idx >= ref_idx); + } + *adjust = true; + } else { + idx = PALETTE_INDEX(prev_xc, prev_yc); + } + + if (!run_type[i]) + PALETTE_INDEX(xc, yc) = idx; + else + PALETTE_INDEX(xc, yc) = PALETTE_INDEX(xc - transpose, yc - !transpose); + } + + for (int c = 0; c < tu->nb_tbs; c++) { + TransformBlock *tb = &tu->tbs[c]; + const Palette *plt = cu->plt + tb->c_idx; + const int scale = ff_vvc_palette_derive_scale(lc, tu, tb); + const int hs = sps->hshift[c]; + const int vs = sps->vshift[c]; + uint8_t *u8 = (uint8_t *)tb->coeffs; + uint16_t *u16 = (uint16_t *)tb->coeffs; + + for (int i = min_pos; i < max_pos; i++) { + const int xc = TRAV_X(i); + const int yc = TRAV_Y(i); + if (!(xc & hs) && !(yc & vs)) { + const int v = PALETTE_INDEX(xc, yc); + if (v == esc) { + const int coeff = ff_vvc_palette_escape_val(lc); + const int pixel = av_clip_intp2(RSHIFT(coeff * scale, 6), sps->bit_depth); + PALETTE_SET_PIXEL(xc, yc, pixel); + } else { + PALETTE_SET_PIXEL(xc, yc, plt->entries[v]); + } + } + } + } + + return 0; +} + +static int hls_palette_coding(VVCLocalContext *lc, const VVCTreeType tree_type) +{ + const VVCFrameContext *fc = lc->fc; + const VVCSPS *sps = fc->ps.sps; + const H266RawSliceHeader *rsh = lc->sc->sh.r; + CodingUnit *cu = lc->cu; + Palette *pp = lc->ep->pp; + const int max_entries = tree_type == SINGLE_TREE ? 
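/*
 * A minimal C sketch, not part of the patch itself: what the TRAV_* macros
 * compute for the non-transposed case. Within each row the x coordinate
 * runs left-to-right on even rows and is mirrored on odd rows, i.e. the
 * horizontal traverse (snake) scan, so no hTravScan/vTravScan tables are
 * needed.
 */
#include <stdio.h>

static void trav_xy(int pos, int wlog2, int *x, int *y)
{
    int wmask = (1 << wlog2) - 1;
    int row   = pos >> wlog2;

    *y = row;
    *x = (pos & wmask) ^ (-(row & 1) & wmask);  /* mirror x on odd rows */
}

int main(void)
{
    /* 4x2 block: prints (0,0)(1,0)(2,0)(3,0)(3,1)(2,1)(1,1)(0,1) */
    for (int pos = 0; pos < 8; pos++) {
        int x, y;
        trav_xy(pos, 2, &x, &y);
        printf("(%d,%d)", x, y);
    }
    putchar('\n');
    return 0;
}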
31 : 15; + const bool local_dual_tree = tree_type != SINGLE_TREE && + (!IS_I(rsh) || (IS_I(rsh) && !sps->r->sps_qtbtt_dual_tree_intra_flag)); + bool escape_present = false; + bool transpose = false; + bool adjust = false; + int max_index = 0; + int prev_run_pos = 0; + + int predictor_size, start, end, ret; + bool reused[VVC_MAX_NUM_PALETTE_PREDICTOR_SIZE]; + uint8_t run_type[MAX_PALETTE_CU_SIZE * MAX_PALETTE_CU_SIZE]; + uint8_t index[MAX_PALETTE_CU_SIZE * MAX_PALETTE_CU_SIZE]; + + ff_vvc_channel_range(&start, &end, tree_type, sps->r->sps_chroma_format_idc); + + if (!palette_add_tu(lc, start, end, tree_type)) + return AVERROR(ENOMEM); + + predictor_size = pp[start].size; + memset(reused, 0, sizeof(reused[0]) * predictor_size); + + ret = palette_predicted(lc, local_dual_tree, start, end, reused, predictor_size, max_entries); + if (ret < 0) + return ret; + + ret = palette_signaled(lc, local_dual_tree, start, end, max_entries); + if (ret < 0) + return ret; + + palette_update_predictor(lc, local_dual_tree, start, end, reused, predictor_size); + + if (cu->plt[start].size > 0) + escape_present = ff_vvc_palette_escape_val_present_flag(lc); + + max_index = cu->plt[start].size - 1 + escape_present; + if (max_index > 0) { + adjust = false; + transpose = ff_vvc_palette_transpose_flag(lc); + } + + palette_qp(lc, tree_type, escape_present); + + index[0] = 0; + for (int i = 0; i <= (cu->cb_width * cu->cb_height - 1) >> 4; i++) + palette_subblock_data(lc, max_index, i, transpose, + run_type, index, &prev_run_pos, &adjust); + + return 0; +} + +static int intra_data(VVCLocalContext *lc) +{ + const VVCSPS *sps = lc->fc->ps.sps; + const CodingUnit *cu = lc->cu; + const VVCTreeType tree_type = cu->tree_type; + const bool pred_mode_plt_flag = cu->pred_mode == MODE_PLT; + int ret = 0; + + if (tree_type == SINGLE_TREE || tree_type == DUAL_TREE_LUMA) { + if (pred_mode_plt_flag) { + if ((ret = hls_palette_coding(lc, tree_type)) < 0) + return ret; + ff_vvc_set_intra_mvf(lc, false, PF_PLT, false); + } else { + intra_luma_pred_modes(lc); + ff_vvc_set_intra_mvf(lc, false, PF_INTRA, cu->ciip_flag); + } + } + if ((tree_type == SINGLE_TREE || tree_type == DUAL_TREE_CHROMA) && sps->r->sps_chroma_format_idc) { + if (pred_mode_plt_flag && tree_type == DUAL_TREE_CHROMA) { + if ((ret = hls_palette_coding(lc, tree_type)) < 0) + return ret; + } else if (!pred_mode_plt_flag) { + intra_chroma_pred_modes(lc); + } + } + + return ret; +} + static int hls_coding_unit(VVCLocalContext *lc, int x0, int y0, int cb_width, int cb_height, int cqt_depth, const VVCTreeType tree_type, VVCModeType mode_type) { - const VVCFrameContext *fc = lc->fc; - const VVCSPS *sps = fc->ps.sps; - const H266RawSliceHeader *rsh = lc->sc->sh.r; - const int hs = sps->hshift[CHROMA]; - const int vs = sps->vshift[CHROMA]; - const int is_128 = cb_width > 64 || cb_height > 64; - int pred_mode_plt_flag = 0; - int ret; + const VVCFrameContext *fc = lc->fc; + const VVCSPS *sps = fc->ps.sps; + const H266RawSliceHeader *rsh = lc->sc->sh.r; + const int is_128 = cb_width > 64 || cb_height > 64; + int ret = 0; CodingUnit *cu = add_cu(lc, x0, y0, cb_width, cb_height, cqt_depth, tree_type); @@ -1829,54 +2173,26 @@ static int hls_coding_unit(VVCLocalContext *lc, int x0, int y0, int cb_width, in mode_type = MODE_TYPE_INTRA; cu->pred_mode = pred_mode_decode(lc, tree_type, mode_type); - if (cu->pred_mode == MODE_INTRA && sps->r->sps_palette_enabled_flag && !is_128 && !cu->skip_flag && - mode_type != MODE_TYPE_INTER && ((cb_width * cb_height) > - (tree_type != DUAL_TREE_CHROMA ? 
16 : (16 << hs << vs))) && - (mode_type != MODE_TYPE_INTRA || tree_type != DUAL_TREE_CHROMA)) { - pred_mode_plt_flag = ff_vvc_pred_mode_plt_flag(lc); - if (pred_mode_plt_flag) { - avpriv_report_missing_feature(fc->log_ctx, "Palette"); - return AVERROR_PATCHWELCOME; - } - } - if (cu->pred_mode == MODE_INTRA && sps->r->sps_act_enabled_flag && tree_type == SINGLE_TREE) { - avpriv_report_missing_feature(fc->log_ctx, "Adaptive Color Transform"); - return AVERROR_PATCHWELCOME; - } - if (cu->pred_mode == MODE_INTRA || cu->pred_mode == MODE_PLT) { - if (tree_type == SINGLE_TREE || tree_type == DUAL_TREE_LUMA) { - if (pred_mode_plt_flag) { - avpriv_report_missing_feature(fc->log_ctx, "Palette"); - return AVERROR_PATCHWELCOME; - } else { - intra_luma_pred_modes(lc); - } - ff_vvc_set_intra_mvf(lc, 0); - } - if ((tree_type == SINGLE_TREE || tree_type == DUAL_TREE_CHROMA) && sps->r->sps_chroma_format_idc) { - if (pred_mode_plt_flag && tree_type == DUAL_TREE_CHROMA) { - avpriv_report_missing_feature(fc->log_ctx, "Palette"); - return AVERROR_PATCHWELCOME; - } else if (!pred_mode_plt_flag) { - if (!cu->act_enabled_flag) - intra_chroma_pred_modes(lc); - } - } - } else if (tree_type != DUAL_TREE_CHROMA) { /* MODE_INTER or MODE_IBC */ - if ((ret = inter_data(lc)) < 0) - return ret; - } - if (cu->pred_mode != MODE_INTRA && !pred_mode_plt_flag && !lc->cu->pu.general_merge_flag) + if (cu->pred_mode == MODE_INTRA && sps->r->sps_act_enabled_flag && tree_type == SINGLE_TREE) + cu->act_enabled_flag = ff_vvc_cu_act_enabled_flag(lc); + + if (cu->pred_mode == MODE_INTRA || cu->pred_mode == MODE_PLT) + ret = intra_data(lc); + else if (tree_type != DUAL_TREE_CHROMA) /* MODE_INTER or MODE_IBC */ + ret = inter_data(lc); + + if (ret < 0) + return ret; + + if (cu->pred_mode != MODE_INTRA && cu->pred_mode != MODE_PLT && !lc->cu->pu.general_merge_flag) cu->coded_flag = ff_vvc_cu_coded_flag(lc); else - cu->coded_flag = !(cu->skip_flag || pred_mode_plt_flag); + cu->coded_flag = !(cu->skip_flag || cu->pred_mode == MODE_PLT); if (cu->coded_flag) { sbt_info(lc, sps); - if (sps->r->sps_act_enabled_flag && cu->pred_mode != MODE_INTRA && tree_type == SINGLE_TREE) { - avpriv_report_missing_feature(fc->log_ctx, "Adaptive Color Transform"); - return AVERROR_PATCHWELCOME; - } + if (sps->r->sps_act_enabled_flag && cu->pred_mode != MODE_INTRA && tree_type == SINGLE_TREE) + cu->act_enabled_flag = ff_vvc_cu_act_enabled_flag(lc); lc->parse.lfnst_dc_only = 1; lc->parse.lfnst_zero_out_sig_coeff_flag = 1; lc->parse.mts_dc_only = 1; @@ -1887,7 +2203,7 @@ static int hls_coding_unit(VVCLocalContext *lc, int x0, int y0, int cb_width, in cu->lfnst_idx = lfnst_idx_decode(lc); cu->mts_idx = mts_idx_decode(lc); set_qp_c(lc); - } else { + } else if (cu->pred_mode != MODE_PLT) { ret = skipped_transform_tree_unit(lc); if (ret < 0) return ret; @@ -2580,3 +2896,12 @@ void ff_vvc_ep_init_stat_coeff(EntryPoint *ep, persistent_rice_adaptation_enabled_flag ? 2 * (av_log2(bit_depth - 10)) : 0; } } + +void ff_vvc_channel_range(int *start, int *end, const VVCTreeType tree_type, const uint8_t chroma_format_idc) +{ + const bool has_chroma = chroma_format_idc && tree_type != DUAL_TREE_LUMA; + const bool has_luma = tree_type != DUAL_TREE_CHROMA; + + *start = has_luma ? LUMA : CB; + *end = has_chroma ? 
VVC_MAX_SAMPLE_ARRAYS : CB; +} diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h index c5533c1ad086f..e37bacf9ddb02 100644 --- a/libavcodec/vvc/ctu.h +++ b/libavcodec/vvc/ctu.h @@ -36,6 +36,7 @@ #define MIN_CU_SIZE 4 #define MIN_CU_LOG2 2 #define MAX_CU_DEPTH 7 +#define MAX_PALETTE_CU_SIZE 64 #define MAX_PARTS_IN_CTU ((MAX_CTU_SIZE >> MIN_CU_LOG2) * (MAX_CTU_SIZE >> MIN_CU_LOG2)) @@ -224,6 +225,7 @@ typedef enum PredFlag { PF_L1 = 0x2, PF_BI = 0x3, PF_IBC = PF_L0 | 0x4, + PF_PLT = 0x8, } PredFlag; typedef enum IntraPredMode { @@ -277,6 +279,11 @@ typedef struct PredictionUnit { int cb_prof_flag[2]; } PredictionUnit; +typedef struct Palette { + uint8_t size; + uint16_t entries[VVC_MAX_NUM_PALETTE_PREDICTOR_SIZE]; +} Palette; + typedef struct CodingUnit { VVCTreeType tree_type; int x0; @@ -326,6 +333,8 @@ typedef struct CodingUnit { int8_t qp[4]; ///< QpY, Qp′Cb, Qp′Cr, Qp′CbCr + Palette plt[VVC_MAX_SAMPLE_ARRAYS]; + PredictionUnit pu; struct CodingUnit *next; ///< RefStruct reference @@ -356,6 +365,8 @@ typedef struct EntryPoint { int stat_coeff[VVC_MAX_SAMPLE_ARRAYS]; ///< StatCoeff + Palette pp[VVC_MAX_SAMPLE_ARRAYS]; // PalettePredictor + VVCCabacState cabac_state[VVC_CONTEXTS]; CABACContext cc; @@ -489,5 +500,6 @@ void ff_vvc_decode_neighbour(VVCLocalContext *lc, int x_ctb, int y_ctb, int rx, void ff_vvc_ctu_free_cus(CodingUnit **cus); int ff_vvc_get_qPy(const VVCFrameContext *fc, int xc, int yc); void ff_vvc_ep_init_stat_coeff(EntryPoint *ep, int bit_depth, int persistent_rice_adaptation_enabled_flag); +void ff_vvc_channel_range(int *start, int *end, VVCTreeType tree_type, uint8_t chroma_format_idc); #endif // AVCODEC_VVC_CTU_H diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c index 0b6443a112f9c..381b42c421b3a 100644 --- a/libavcodec/vvc/dec.c +++ b/libavcodec/vvc/dec.c @@ -26,9 +26,12 @@ #include "libavcodec/hwconfig.h" #include "libavcodec/profiles.h" #include "libavutil/refstruct.h" +#include "libavcodec/aom_film_grain.h" +#include "libavcodec/thread.h" #include "libavutil/cpu.h" #include "libavutil/mem.h" #include "libavutil/thread.h" +#include "libavutil/film_grain_params.h" #include "dec.h" #include "ctu.h" @@ -506,23 +509,18 @@ static int slices_realloc(VVCFrameContext *fc) return 0; } -static int ep_init_cabac_decoder(SliceContext *sc, const int index, - const H2645NAL *nal, GetBitContext *gb, const CodedBitstreamUnit *unit) +static int get_ep_size(const H266RawSliceHeader *rsh, GetBitContext *gb, const H2645NAL *nal, const int header_size, const int ep_index) { - const H266RawSlice *slice = unit->content_ref; - const H266RawSliceHeader *rsh = sc->sh.r; - EntryPoint *ep = sc->eps + index; int size; - int ret; - if (index < rsh->num_entry_points) { + if (ep_index < rsh->num_entry_points) { int skipped = 0; int64_t start = (gb->index >> 3); - int64_t end = start + rsh->sh_entry_point_offset_minus1[index] + 1; - while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= start + slice->header_size) { + int64_t end = start + rsh->sh_entry_point_offset_minus1[ep_index] + 1; + while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= start + header_size) { skipped++; } - while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= end + slice->header_size) { + while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= end + header_size) { end--; skipped++; } @@ -531,6 +529,13 @@ static int ep_init_cabac_decoder(SliceContext *sc, const int index, } else { size = get_bits_left(gb) / 8; } + return size; +} + 
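/*
 * Illustrative sketch (assumptions noted, not part of the patch): get_ep_size()
 * above derives the number of CABAC bytes belonging to one entry point.  The
 * patch treats the offsets signalled via sh_entry_point_offset_minus1[] as
 * counting emulation-prevention bytes as well, while GetBitContext reads the
 * de-escaped payload, so every escape byte recorded in nal->skipped_bytes_pos[]
 * that falls inside the window shrinks the readable size by one byte.  The
 * helper below (ep_size_sketch) is hypothetical and only mirrors that
 * adjustment in isolation.
 */
#include <stddef.h>

static size_t ep_size_sketch(size_t start, size_t offset_minus1,
                             const size_t *skipped_pos, size_t nb_skipped,
                             size_t header_size)
{
    size_t end = start + offset_minus1 + 1;
    size_t i   = 0;

    /* ignore escape bytes located before the current entry point */
    while (i < nb_skipped && skipped_pos[i] <= start + header_size)
        i++;

    /* each escape byte inside (start, end] costs one payload byte */
    while (i < nb_skipped && skipped_pos[i] <= end + header_size) {
        end--;
        i++;
    }

    return end - start;
}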
+static int ep_init_cabac_decoder(EntryPoint *ep, GetBitContext *gb, const int size) +{ + int ret; + av_assert0(gb->buffer + get_bits_count(gb) / 8 + size <= gb->buffer_end); ret = ff_init_cabac_decoder (&ep->cc, gb->buffer + get_bits_count(gb) / 8, size); if (ret < 0) @@ -539,6 +544,22 @@ static int ep_init_cabac_decoder(SliceContext *sc, const int index, return 0; } +static int ep_init(EntryPoint *ep, const int ctu_addr, const int ctu_end, GetBitContext *gb, const int size) +{ + const int ret = ep_init_cabac_decoder(ep, gb, size); + + if (ret < 0) + return ret; + + ep->ctu_start = ctu_addr; + ep->ctu_end = ctu_end; + + for (int c_idx = LUMA; c_idx <= CR; c_idx++) + ep->pp[c_idx].size = 0; + + return 0; +} + static int slice_init_entry_points(SliceContext *sc, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit) { @@ -562,20 +583,19 @@ static int slice_init_entry_points(SliceContext *sc, return ret; for (int i = 0; i < sc->nb_eps; i++) { - EntryPoint *ep = sc->eps + i; + const int size = get_ep_size(sc->sh.r, &gb, nal, slice->header_size, i); + const int ctu_end = (i + 1 == sc->nb_eps ? sh->num_ctus_in_curr_slice : sh->entry_point_start_ctu[i]); + EntryPoint *ep = sc->eps + i; - ep->ctu_start = ctu_addr; - ep->ctu_end = (i + 1 == sc->nb_eps ? sh->num_ctus_in_curr_slice : sh->entry_point_start_ctu[i]); + ret = ep_init(ep, ctu_addr, ctu_end, &gb, size); + if (ret < 0) + return ret; for (int j = ep->ctu_start; j < ep->ctu_end; j++) { const int rs = sc->sh.ctb_addr_in_curr_slice[j]; fc->tab.slice_idx[rs] = sc->slice_idx; } - ret = ep_init_cabac_decoder(sc, i, nal, &gb, unit); - if (ret < 0) - return ret; - if (i + 1 < sc->nb_eps) ctu_addr = sh->entry_point_start_ctu[i]; } @@ -601,6 +621,14 @@ static int ref_frame(VVCFrame *dst, const VVCFrame *src) av_refstruct_replace(&dst->sps, src->sps); av_refstruct_replace(&dst->pps, src->pps); + if (src->needs_fg) { + ret = av_frame_ref(dst->frame_grain, src->frame_grain); + if (ret < 0) + return ret; + + dst->needs_fg = src->needs_fg; + } + av_refstruct_replace(&dst->progress, src->progress); av_refstruct_replace(&dst->tab_dmvr_mvf, src->tab_dmvr_mvf); @@ -634,12 +662,14 @@ static av_cold void frame_context_free(VVCFrameContext *fc) for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) { ff_vvc_unref_frame(fc, &fc->DPB[i], ~0); av_frame_free(&fc->DPB[i].frame); + av_frame_free(&fc->DPB[i].frame_grain); } ff_vvc_frame_thread_free(fc); pic_arrays_free(fc); av_frame_free(&fc->output_frame); ff_vvc_frame_ps_free(&fc->ps); + ff_vvc_sei_reset(&fc->sei); } static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx) @@ -655,6 +685,10 @@ static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx fc->DPB[j].frame = av_frame_alloc(); if (!fc->DPB[j].frame) return AVERROR(ENOMEM); + + fc->DPB[j].frame_grain = av_frame_alloc(); + if (!fc->DPB[j].frame_grain) + return AVERROR(ENOMEM); } fc->cu_pool = av_refstruct_pool_alloc(sizeof(CodingUnit), 0); if (!fc->cu_pool) @@ -682,6 +716,10 @@ static int frame_context_setup(VVCFrameContext *fc, VVCContext *s) return ret; } } + + ret = ff_vvc_sei_replace(&fc->sei, &prev->sei); + if (ret < 0) + return ret; } if (IS_IDR(s)) { @@ -697,6 +735,65 @@ static int frame_context_setup(VVCFrameContext *fc, VVCContext *s) return 0; } +/* SEI does not affect decoding, so we ignore the return value */ +static void decode_prefix_sei(VVCFrameContext *fc, VVCContext *s) +{ + CodedBitstreamFragment *frame = &s->current_frame; + + for (int i = 0; i < frame->nb_units; i++) { + const 
CodedBitstreamUnit *unit = frame->units + i; + + if (unit->type == VVC_PREFIX_SEI_NUT) { + int ret = ff_vvc_sei_decode(&fc->sei, unit->content_ref, fc); + if (ret < 0) + return; + } + } +} + +static int set_side_data(VVCContext *s, VVCFrameContext *fc) +{ + AVFrame *out = fc->ref->frame; + + return ff_h2645_sei_to_frame(out, &fc->sei.common, AV_CODEC_ID_VVC, s->avctx, + NULL, fc->ps.sps->bit_depth, fc->ps.sps->bit_depth, fc->ref->poc); +} + +static int check_film_grain(VVCContext *s, VVCFrameContext *fc) +{ + int ret; + + fc->ref->needs_fg = (fc->sei.common.film_grain_characteristics && + fc->sei.common.film_grain_characteristics->present || + fc->sei.common.aom_film_grain.enable) && + !(s->avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && + !s->avctx->hwaccel; + + if (fc->ref->needs_fg && + (fc->sei.common.film_grain_characteristics->present && + !ff_h274_film_grain_params_supported(fc->sei.common.film_grain_characteristics->model_id, + fc->ref->frame->format) || + !av_film_grain_params_select(fc->ref->frame))) { + av_log_once(s->avctx, AV_LOG_WARNING, AV_LOG_DEBUG, &s->film_grain_warning_shown, + "Unsupported film grain parameters. Ignoring film grain.\n"); + fc->ref->needs_fg = 0; + } + + if (fc->ref->needs_fg) { + fc->ref->frame_grain->format = fc->ref->frame->format; + fc->ref->frame_grain->width = fc->ref->frame->width; + fc->ref->frame_grain->height = fc->ref->frame->height; + + ret = ff_thread_get_buffer(s->avctx, fc->ref->frame_grain, 0); + if (ret < 0) + return ret; + + return av_frame_copy_props(fc->ref->frame_grain, fc->ref->frame); + } + + return 0; +} + static int frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext *sc) { const VVCPH *ph = &fc->ps.ph; @@ -710,6 +807,16 @@ static int frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext *sc) if ((ret = ff_vvc_set_new_ref(s, fc, &fc->frame)) < 0) goto fail; + decode_prefix_sei(fc, s); + + ret = set_side_data(s, fc); + if (ret < 0) + goto fail; + + ret = check_film_grain(s, fc); + if (ret < 0) + goto fail; + if (!IS_IDR(s)) ff_vvc_bump_frame(s, fc); @@ -914,6 +1021,15 @@ static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, AVBufferRef *buf_ if (ret < 0) return ret; break; + case VVC_PREFIX_SEI_NUT: + /* handle by decode_prefix_sei() */ + break; + + case VVC_SUFFIX_SEI_NUT: + /* SEI does not affect decoding, so we ignore the return value*/ + if (fc) + ff_vvc_sei_decode(&fc->sei, unit->content_ref, fc); + break; } return 0; @@ -958,14 +1074,60 @@ static int decode_nal_units(VVCContext *s, VVCFrameContext *fc, AVPacket *avpkt) return ret; } +static int frame_end(VVCContext *s, VVCFrameContext *fc) +{ + const AVFilmGrainParams *fgp; + int ret = 0; + + if (fc->ref->needs_fg) { + av_assert0(fc->ref->frame_grain->buf[0]); + fgp = av_film_grain_params_select(fc->ref->frame); + switch (fgp->type) { + case AV_FILM_GRAIN_PARAMS_NONE: + av_assert0(0); + return AVERROR_BUG; + case AV_FILM_GRAIN_PARAMS_H274: + ret = ff_h274_apply_film_grain(fc->ref->frame_grain, fc->ref->frame, + &s->h274db, fgp); + break; + case AV_FILM_GRAIN_PARAMS_AV1: + ret = ff_aom_apply_film_grain(fc->ref->frame_grain, fc->ref->frame, fgp); + break; + } + } + + if (!s->avctx->hwaccel && s->avctx->err_recognition & AV_EF_CRCCHECK) { + VVCSEI *sei = &fc->sei; + if (sei->picture_hash.present) { + int ret = ff_h274_hash_init(&s->hash_ctx, sei->picture_hash.hash_type); + if (ret < 0) + return ret; + + ret = ff_h274_hash_verify(s->hash_ctx, &sei->picture_hash, fc->ref->frame, fc->ps.pps->width, fc->ps.pps->height); + if (ret < 0) { + 
av_log(s->avctx, AV_LOG_ERROR, + "Verifying checksum for frame with decoder_order %d: failed\n", + (int)fc->decode_order); + if (s->avctx->err_recognition & AV_EF_EXPLODE) + return ret; + } + } + } + + return 0; +} + static int wait_delayed_frame(VVCContext *s, AVFrame *output, int *got_output) { VVCFrameContext *delayed = get_frame_context(s, s->fcs, s->nb_frames - s->nb_delayed); int ret = ff_vvc_frame_wait(s, delayed); - if (!ret && delayed->output_frame->buf[0] && output) { - av_frame_move_ref(output, delayed->output_frame); - *got_output = 1; + if (!ret) { + ret = frame_end(s, delayed); + if (ret >= 0 && delayed->output_frame->buf[0] && output) { + av_frame_move_ref(output, delayed->output_frame); + *got_output = 1; + } } s->nb_delayed--; @@ -1080,6 +1242,7 @@ static av_cold int vvc_decode_free(AVCodecContext *avctx) frame_context_free(s->fcs + i); av_free(s->fcs); } + ff_h274_hash_freep(&s->hash_ctx); ff_vvc_ps_uninit(&s->ps); ff_cbs_close(&s->cbc); diff --git a/libavcodec/vvc/dec.h b/libavcodec/vvc/dec.h index 6aa31215505cf..5f8065b38bad8 100644 --- a/libavcodec/vvc/dec.h +++ b/libavcodec/vvc/dec.h @@ -26,9 +26,11 @@ #include "libavcodec/videodsp.h" #include "libavcodec/vvc.h" +#include "libavcodec/h274.h" #include "ps.h" #include "dsp.h" +#include "sei.h" #define LUMA 0 #define CHROMA 1 @@ -70,12 +72,15 @@ typedef struct VVCWindow { typedef struct VVCFrame { struct AVFrame *frame; - + struct AVFrame *frame_grain; const VVCSPS *sps; ///< RefStruct reference const VVCPPS *pps; ///< RefStruct reference struct MvField *tab_dmvr_mvf; ///< RefStruct reference RefPicListTab **rpl_tab; ///< RefStruct reference RefPicListTab *rpl; ///< RefStruct reference + + int needs_fg; ///< 1 if grain needs to be applied by the decoder + int nb_rpl_elems; int ctb_count; @@ -124,6 +129,7 @@ typedef struct VVCFrameContext { struct AVFrame *output_frame; VVCFrameParamSets ps; + VVCSEI sei; SliceContext **slices; int nb_slices; @@ -216,6 +222,7 @@ typedef struct VVCContext { CodedBitstreamFragment current_frame; VVCParamSets ps; + H274FilmGrainDatabase h274db; int temporal_id; ///< temporal_id_plus1 - 1 int poc_tid0; @@ -226,6 +233,7 @@ typedef struct VVCContext { enum VVCNALUnitType vcl_unit_type; int no_output_before_recovery_flag; ///< NoOutputBeforeRecoveryFlag int gdr_recovery_point_poc; ///< recoveryPointPocVal + int film_grain_warning_shown; /** * Sequence counters for decoded and output frames, so that old @@ -241,6 +249,8 @@ typedef struct VVCContext { uint64_t nb_frames; ///< processed frames int nb_delayed; ///< delayed frames + + H274HashContext *hash_ctx; } VVCContext ; #endif /* AVCODEC_VVC_DEC_H */ diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index fc4c3a679909c..ae22900931799 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -106,7 +106,7 @@ struct VVCLocalContext; typedef struct VVCIntraDSPContext { void (*intra_cclm_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h); - void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int *dst, const int *coeff, int w, int h, int x0_cu, int y0_cu); + void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int *coeff, int w, int h, int x0_cu, int y0_cu); void (*intra_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h, int c_idx); void (*pred_planar)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride); void (*pred_mip)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride, @@ -122,11 +122,12 @@ typedef struct VVCIntraDSPContext 
{ typedef struct VVCItxDSPContext { void (*add_residual)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride); - void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); - void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); + void (*pred_residual_joint)(int *dst, const int *src, int width, int height, int c_sign, int shift); void (*itx[VVC_N_TX_TYPE][VVC_N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); + + void (*adaptive_color_transform)(int *y, int *u, int *v, int width, int height); } VVCItxDSPContext; typedef struct VVCLMCSDSPContext { diff --git a/libavcodec/vvc/dsp_template.c b/libavcodec/vvc/dsp_template.c index 1aa1e027bdd17..13bd8cd4a161b 100644 --- a/libavcodec/vvc/dsp_template.c +++ b/libavcodec/vvc/dsp_template.c @@ -45,32 +45,12 @@ static void FUNC(add_residual)(uint8_t *_dst, const int *res, } } -static void FUNC(add_residual_joint)(uint8_t *_dst, const int *res, - const int w, const int h, const ptrdiff_t _stride, const int c_sign, const int shift) -{ - pixel *dst = (pixel *)_dst; - - const int stride = _stride / sizeof(pixel); - - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - const int r = ((*res) * c_sign) >> shift; - dst[x] = av_clip_pixel(dst[x] + r); - res++; - } - dst += stride; - } -} - -static void FUNC(pred_residual_joint)(int *buf, const int w, const int h, +static void FUNC(pred_residual_joint)(int *dst, const int *src, const int w, const int h, const int c_sign, const int shift) { - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - *buf = ((*buf) * c_sign) >> shift; - buf++; - } - } + const int size = w * h; + for (int i = 0; i < size; i++) + dst[i] = (src[i] * c_sign) >> shift; } static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height, @@ -94,6 +74,24 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height } } +// 8.7.4.6 Residual modification process for blocks using colour space conversion +static void FUNC(adaptive_color_transform)(int *y, int *u, int *v, const int width, const int height) +{ + const int size = width * height; + const int bits = BIT_DEPTH + 1; + + for (int i = 0; i < size; i++) { + const int y0 = av_clip_intp2(y[i], bits); + const int cg = av_clip_intp2(u[i], bits); + const int co = av_clip_intp2(v[i], bits); + const int t = y0 - (cg >> 1); + + y[i] = cg + t; + u[i] = t - (co >> 1); + v[i] = co + u[i]; + } +} + static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) { #define VVC_ITX(TYPE, type, s) \ @@ -106,7 +104,6 @@ static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) VVC_ITX(TYPE, type, 32); itx->add_residual = FUNC(add_residual); - itx->add_residual_joint = FUNC(add_residual_joint); itx->pred_residual_joint = FUNC(pred_residual_joint); itx->transform_bdpcm = FUNC(transform_bdpcm); VVC_ITX(DCT2, dct2, 2) @@ -115,6 +112,8 @@ static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) VVC_ITX_COMMON(DCT8, dct8) VVC_ITX_COMMON(DST7, dst7) + itx->adaptive_color_transform = FUNC(adaptive_color_transform); + #undef VVC_ITX #undef VVC_ITX_COMMON } diff --git a/libavcodec/vvc/filter.c b/libavcodec/vvc/filter.c index a7f102bc643a3..3815668bcf5f0 100644 --- a/libavcodec/vvc/filter.c +++ b/libavcodec/vvc/filter.c @@ -385,6 +385,9 @@ static int boundary_strength(const VVCLocalContext *lc, const MvField *curr, con { 
RefPicList *rpl = lc->sc->rpl; + if (curr->pred_flag == PF_PLT) + return 0; + if (curr->pred_flag == PF_IBC) return FFABS(neigh->mv[0].x - curr->mv[0].x) >= 8 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 8; @@ -772,17 +775,15 @@ static int get_qp(const VVCFrameContext *fc, const uint8_t *src, const int x, co static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs, const int vertical) { - VVCFrameContext *fc = lc->fc; - const VVCSPS *sps = fc->ps.sps; - const int c_end = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1; - const int ctb_size = fc->ps.sps->ctb_size_y; - const DBParams *params = fc->tab.deblock + rs; - int x_end = FFMIN(x0 + ctb_size, fc->ps.pps->width); - int y_end = FFMIN(y0 + ctb_size, fc->ps.pps->height); - - //not use this yet, may needed by plt. - const uint8_t no_p[4] = { 0 }; - const uint8_t no_q[4] = { 0 } ; + VVCFrameContext *fc = lc->fc; + const VVCSPS *sps = fc->ps.sps; + const int c_end = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1; + const int ctb_size = fc->ps.sps->ctb_size_y; + const DBParams *params = fc->tab.deblock + rs; + int x_end = FFMIN(x0 + ctb_size, fc->ps.pps->width); + int y_end = FFMIN(y0 + ctb_size, fc->ps.pps->height); + const int log2_min_cb_size = fc->ps.sps->min_cb_log2_size_y; + const int min_cb_width = fc->ps.pps->min_cb_width; if (!vertical) { FFSWAP(int, x_end, y_end); @@ -802,6 +803,8 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs, const uint8_t horizontal_ctu_edge = !vertical && !(x % ctb_size); int32_t bs[4], beta[4], tc[4] = { 0 }, all_zero_bs = 1; uint8_t max_len_p[4], max_len_q[4]; + uint8_t no_p[4] = { 0 }; + uint8_t no_q[4] = { 0 }; for (int i = 0; i < DEBLOCK_STEP >> (2 - vs); i++) { int tx = x; @@ -818,6 +821,13 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs, tc[i] = TC_CALC(qp, bs[i]) ; max_filter_length(fc, tx, ty, c_idx, vertical, horizontal_ctu_edge, bs[i], &max_len_p[i], &max_len_q[i]); all_zero_bs = 0; + + if (sps->r->sps_palette_enabled_flag) { + const int cu_q = (ty >> log2_min_cb_size) * min_cb_width + (tx >> log2_min_cb_size); + const int cu_p = (ty - !vertical >> log2_min_cb_size) * min_cb_width + (tx - vertical >> log2_min_cb_size); + no_q[i] = fc->tab.cpm[!!c_idx][cu_q] == MODE_PLT; + no_p[i] = cu_p >= 0 && fc->tab.cpm[!!c_idx][cu_p] == MODE_PLT; + } } } diff --git a/libavcodec/vvc/intra.c b/libavcodec/vvc/intra.c index 41ed89c94623b..f56b43be66eeb 100644 --- a/libavcodec/vvc/intra.c +++ b/libavcodec/vvc/intra.c @@ -27,6 +27,10 @@ #include "intra.h" #include "itx_1d.h" +#define POS(c_idx, x, y) \ + &fc->frame->data[c_idx][((y) >> fc->ps.sps->vshift[c_idx]) * fc->frame->linesize[c_idx] + \ + (((x) >> fc->ps.sps->hshift[c_idx]) << fc->ps.sps->pixel_shift)] + static int is_cclm(enum IntraPredMode mode) { return mode == INTRA_LT_CCLM || mode == INTRA_L_CCLM || mode == INTRA_T_CCLM; @@ -164,28 +168,6 @@ static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalConte *trv = mts_to_trv[cu->mts_idx]; } -static void add_residual_for_joint_coding_chroma(VVCLocalContext *lc, - const TransformUnit *tu, TransformBlock *tb, const int chroma_scale) -{ - const VVCFrameContext *fc = lc->fc; - const CodingUnit *cu = lc->cu; - const int c_sign = 1 - 2 * fc->ps.ph.r->ph_joint_cbcr_sign_flag; - const int shift = tu->coded_flag[1] ^ tu->coded_flag[2]; - const int c_idx = 1 + tu->coded_flag[1]; - const ptrdiff_t stride = fc->frame->linesize[c_idx]; - const int hs = fc->ps.sps->hshift[c_idx]; - const int vs = 
fc->ps.sps->vshift[c_idx]; - uint8_t *dst = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride + - ((tb->x0 >> hs) << fc->ps.sps->pixel_shift)]; - if (chroma_scale) { - fc->vvcdsp.itx.pred_residual_joint(tb->coeffs, tb->tb_width, tb->tb_height, c_sign, shift); - fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->coeffs, tb->coeffs, tb->tb_width, tb->tb_height, cu->x0, cu->y0); - fc->vvcdsp.itx.add_residual(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride); - } else { - fc->vvcdsp.itx.add_residual_joint(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride, c_sign, shift); - } -} - static int add_reconstructed_area(VVCLocalContext *lc, const int ch_type, const int x0, const int y0, const int w, const int h) { const VVCSPS *sps = lc->fc->ps.sps; @@ -303,21 +285,15 @@ static void scale(int *out, const int *in, const int w, const int h, const int s // part of 8.7.3 Scaling process for transform coefficients static void derive_qp(const VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb) { - const VVCSPS *sps = lc->fc->ps.sps; - const H266RawSliceHeader *rsh = lc->sc->sh.r; - const CodingUnit *cu = lc->cu; - int qp, qp_act_offset; + const VVCSPS *sps = lc->fc->ps.sps; + const H266RawSliceHeader *rsh = lc->sc->sh.r; + const CodingUnit *cu = lc->cu; + const bool is_jcbcr = tb->c_idx && tu->joint_cbcr_residual_flag && tu->coded_flag[CB] && tu->coded_flag[CR]; + const int idx = is_jcbcr ? JCBCR : tb->c_idx; + const int qp = cu->qp[idx] + (idx ? 0 : sps->qp_bd_offset); + const int act_offset[] = { -5, 1, 3, 1 }; + const int qp_act_offset = cu->act_enabled_flag ? act_offset[idx] : 0; - if (tb->c_idx == 0) { - //fix me - qp = cu->qp[LUMA] + sps->qp_bd_offset; - qp_act_offset = cu->act_enabled_flag ? -5 : 0; - } else { - const int is_jcbcr = tu->joint_cbcr_residual_flag && tu->coded_flag[CB] && tu->coded_flag[CR]; - const int idx = is_jcbcr ? JCBCR : tb->c_idx; - qp = cu->qp[idx]; - qp_act_offset = cu->act_enabled_flag ? 
1 : 0; - } if (tb->ts) { const int qp_prime_ts_min = 4 + 6 * sps->r->sps_min_qp_prime_ts; @@ -336,29 +312,30 @@ static void derive_qp(const VVCLocalContext *lc, const TransformUnit *tu, Transf tb->bd_offset = (1 << tb->bd_shift) >> 1; } +static const uint8_t rem6[63 + 8 * 6 + 1] = { + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, +}; + +static const uint8_t div6[63 + 8 * 6 + 1] = { + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, +}; + +const static int level_scale[2][6] = { + { 40, 45, 51, 57, 64, 72 }, + { 57, 64, 72, 80, 90, 102 } +}; + //8.7.3 Scaling process for transform coefficients static av_always_inline int derive_scale(const TransformBlock *tb, const int sh_dep_quant_used_flag) { - static const uint8_t rem6[63 + 8 * 6 + 1] = { - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, - }; - - static const uint8_t div6[63 + 8 * 6 + 1] = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, - 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, - 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, - }; - - const static int level_scale[2][6] = { - { 40, 45, 51, 57, 64, 72 }, - { 57, 64, 72, 80, 90, 102 } - }; const int addin = sh_dep_quant_used_flag && !tb->ts; const int qp = tb->qp + addin; @@ -515,29 +492,67 @@ static void transform_bdpcm(TransformBlock *tb, const VVCLocalContext *lc, const tb->max_scan_x = tb->tb_width - 1; } -static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, const int target_ch_type) +static void lmcs_scale_chroma(VVCLocalContext *lc, TransformUnit *tu, TransformBlock *tb, const int target_ch_type) { - const VVCFrameContext *fc = lc->fc; - const VVCSPS *sps = fc->ps.sps; - const VVCSH *sh = &lc->sc->sh; - const CodingUnit *cu = lc->cu; - const int ps = fc->ps.sps->pixel_shift; - DECLARE_ALIGNED(32, int, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; + const VVCFrameContext *fc = lc->fc; + const VVCSH *sh = &lc->sc->sh; + const CodingUnit *cu = lc->cu; + const int c_idx = tb->c_idx; + const int ch_type = c_idx > 0; + const int w = tb->tb_width; + const int h = tb->tb_height; + const int chroma_scale = ch_type && sh->r->sh_lmcs_used_flag && fc->ps.ph.r->ph_chroma_residual_scale_flag && (w * h > 4); + const int has_jcbcr = tu->joint_cbcr_residual_flag && c_idx; + + for (int j = 0; j < 1 + has_jcbcr; j++) { + const bool is_jcbcr = j > 0; + const int jcbcr_idx = CB + tu->coded_flag[CB]; + 
TransformBlock *jcbcr = &tu->tbs[jcbcr_idx - tu->tbs[0].c_idx]; + int *coeffs = is_jcbcr ? jcbcr->coeffs : tb->coeffs; + + if (!j && has_jcbcr) { + const int c_sign = 1 - 2 * fc->ps.ph.r->ph_joint_cbcr_sign_flag; + const int shift = tu->coded_flag[CB] ^ tu->coded_flag[CR]; + fc->vvcdsp.itx.pred_residual_joint(jcbcr->coeffs, tb->coeffs, w, h, c_sign, shift); + } + if (chroma_scale) + fc->vvcdsp.intra.lmcs_scale_chroma(lc, coeffs, w, h, cu->x0, cu->y0); + } +} + +static void add_residual(const VVCLocalContext *lc, TransformUnit *tu, const int target_ch_type) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; for (int i = 0; i < tu->nb_tbs; i++) { - TransformBlock *tb = &tu->tbs[i]; - const int c_idx = tb->c_idx; - const int ch_type = c_idx > 0; - - if (ch_type == target_ch_type && tb->has_coeffs) { - const int w = tb->tb_width; - const int h = tb->tb_height; - const int chroma_scale = ch_type && sh->r->sh_lmcs_used_flag && fc->ps.ph.r->ph_chroma_residual_scale_flag && (w * h > 4); - const ptrdiff_t stride = fc->frame->linesize[c_idx]; - const int hs = sps->hshift[c_idx]; - const int vs = sps->vshift[c_idx]; - uint8_t *dst = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride + ((tb->x0 >> hs) << ps)]; + TransformBlock *tb = tu->tbs + i; + const int c_idx = tb->c_idx; + const int ch_type = c_idx > 0; + const ptrdiff_t stride = fc->frame->linesize[c_idx]; + const bool has_residual = tb->has_coeffs || cu->act_enabled_flag || + (c_idx && tu->joint_cbcr_residual_flag); + uint8_t *dst = POS(c_idx, tb->x0, tb->y0); + + if (ch_type == target_ch_type && has_residual) + fc->vvcdsp.itx.add_residual(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride); + } +} +static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int target_ch_type) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + TransformBlock *tbs = tu->tbs; + const bool is_act_luma = cu->act_enabled_flag && target_ch_type == LUMA; + + for (int i = 0; i < tu->nb_tbs; i++) { + TransformBlock *tb = tbs + i; + const int c_idx = tb->c_idx; + const int ch_type = c_idx > 0; + const bool do_itx = is_act_luma || !cu->act_enabled_flag && ch_type == target_ch_type; + + if (tb->has_coeffs && do_itx) { if (cu->bdpcm_flag[tb->c_idx]) transform_bdpcm(tb, lc, cu); dequant(lc, tu, tb); @@ -547,22 +562,22 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, if (cu->apply_lfnst_flag[c_idx]) ilfnst_transform(lc, tb); derive_transform_type(fc, lc, tb, &trh, &trv); - if (w > 1 && h > 1) + if (tb->tb_width > 1 && tb->tb_height > 1) itx_2d(fc, tb, trh, trv); else itx_1d(fc, tb, trh, trv); } - - if (chroma_scale) - fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, tb->coeffs, w, h, cu->x0, cu->y0); - // TODO: Address performance issue here by combining transform, lmcs_scale_chroma, and add_residual into one function. - // Complete this task before implementing ASM code. - fc->vvcdsp.itx.add_residual(dst, chroma_scale ? 
temp : tb->coeffs, w, h, stride); - - if (tu->joint_cbcr_residual_flag && tb->c_idx) - add_residual_for_joint_coding_chroma(lc, tu, tb, chroma_scale); + lmcs_scale_chroma(lc, tu, tb, target_ch_type); } } + + if (is_act_luma) { + fc->vvcdsp.itx.adaptive_color_transform( + tbs[LUMA].coeffs, tbs[CB].coeffs, tbs[CR].coeffs, + tbs[LUMA].tb_width, tbs[LUMA].tb_height); + } + + add_residual(lc, tu, target_ch_type); } static int reconstruct(VVCLocalContext *lc) @@ -576,17 +591,13 @@ static int reconstruct(VVCLocalContext *lc) TransformUnit *tu = cu->tus.head; for (int i = 0; tu; i++) { predict_intra(lc, tu, i, ch_type); - itransform(lc, tu, i, ch_type); + itransform(lc, tu, ch_type); tu = tu->next; } } return 0; } -#define POS(c_idx, x, y) \ - &fc->frame->data[c_idx][((y) >> fc->ps.sps->vshift[c_idx]) * fc->frame->linesize[c_idx] + \ - (((x) >> fc->ps.sps->hshift[c_idx]) << fc->ps.sps->pixel_shift)] - #define IBC_POS(c_idx, x, y) \ (fc->tab.ibc_vir_buf[c_idx] + \ (x << ps) + (y + ((cu->y0 & ~(sps->ctb_size_y - 1)) >> vs)) * ibc_stride) @@ -639,11 +650,11 @@ static void ibc_fill_vir_buf(const VVCLocalContext *lc, const CodingUnit *cu) { const VVCFrameContext *fc = lc->fc; const VVCSPS *sps = fc->ps.sps; - const int has_chroma = sps->r->sps_chroma_format_idc && cu->tree_type != DUAL_TREE_LUMA; - const int start = cu->tree_type == DUAL_TREE_CHROMA; - const int end = has_chroma ? CR : LUMA; + int start, end; - for (int c_idx = start; c_idx <= end; c_idx++) { + ff_vvc_channel_range(&start, &end, cu->tree_type, sps->r->sps_chroma_format_idc); + + for (int c_idx = start; c_idx < end; c_idx++) { const int hs = sps->hshift[c_idx]; const int vs = sps->vshift[c_idx]; const int ps = sps->pixel_shift; @@ -658,6 +669,38 @@ static void ibc_fill_vir_buf(const VVCLocalContext *lc, const CodingUnit *cu) } } +int ff_vvc_palette_derive_scale(VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb) +{ + const VVCSPS *sps = lc->fc->ps.sps; + const int qp_prime_ts_min = 4 + 6 * sps->r->sps_min_qp_prime_ts; + int qp; + + derive_qp(lc, tu, tb); + qp = FFMAX(qp_prime_ts_min, tb->qp); + return level_scale[0][rem6[qp]] << div6[qp]; +} + +// 8.4.5.3 Decoding process for palette mode +static void vvc_predict_palette(VVCLocalContext *lc) +{ + const VVCFrameContext *fc = lc->fc; + const CodingUnit *cu = lc->cu; + TransformUnit *tu = cu->tus.head; + const VVCSPS *sps = fc->ps.sps; + const int ps = sps->pixel_shift; + + for (int i = 0; i < tu->nb_tbs; i++) { + TransformBlock *tb = &tu->tbs[i]; + const int c_idx = tb->c_idx; + const int w = tb->tb_width; + const int h = tb->tb_height; + const ptrdiff_t stride = fc->frame->linesize[c_idx]; + uint8_t *dst = POS(c_idx, cu->x0, cu->y0); + + av_image_copy_plane(dst, stride, (uint8_t*)tb->coeffs, w << ps, w << ps, h); + } +} + int ff_vvc_reconstruct(VVCLocalContext *lc, const int rs, const int rx, const int ry) { const VVCFrameContext *fc = lc->fc; @@ -678,6 +721,8 @@ int ff_vvc_reconstruct(VVCLocalContext *lc, const int rs, const int rx, const in ff_vvc_predict_ciip(lc); else if (cu->pred_mode == MODE_IBC) vvc_predict_ibc(lc); + else if (cu->pred_mode == MODE_PLT) + vvc_predict_palette(lc); if (cu->coded_flag) { ret = reconstruct(lc); } else { diff --git a/libavcodec/vvc/intra.h b/libavcodec/vvc/intra.h index 8a02699135486..1201c70836fbd 100644 --- a/libavcodec/vvc/intra.h +++ b/libavcodec/vvc/intra.h @@ -45,5 +45,6 @@ int ff_vvc_intra_pred_angle_derive(int pred_mode); int ff_vvc_intra_inv_angle_derive(int pred_mode); int ff_vvc_wide_angle_mode_mapping(const CodingUnit *cu, int 
tb_width, int tb_height, int c_idx, int pred_mode_intra); +int ff_vvc_palette_derive_scale(VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb); #endif // AVCODEC_VVC_INTRA_H diff --git a/libavcodec/vvc/intra_template.c b/libavcodec/vvc/intra_template.c index 440ac5b6cccc6..3ec6c72213e7f 100644 --- a/libavcodec/vvc/intra_template.c +++ b/libavcodec/vvc/intra_template.c @@ -428,7 +428,7 @@ static int FUNC(lmcs_derive_chroma_scale)(VVCLocalContext *lc, const int x0, con } // 8.7.5.3 Picture reconstruction with luma dependent chroma residual scaling process for chroma samples -static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *dst, const int *coeff, +static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *coeff, const int width, const int height, const int x0_cu, const int y0_cu) { const int chroma_scale = FUNC(lmcs_derive_chroma_scale)(lc, x0_cu, y0_cu); @@ -438,11 +438,10 @@ static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *dst, const int *co const int c = av_clip_intp2(*coeff, BIT_DEPTH); if (c > 0) - *dst = (c * chroma_scale + (1 << 10)) >> 11; + *coeff = (c * chroma_scale + (1 << 10)) >> 11; else - *dst = -((-c * chroma_scale + (1 << 10)) >> 11); + *coeff = -((-c * chroma_scale + (1 << 10)) >> 11); coeff++; - dst++; } } } diff --git a/libavcodec/vvc/mvs.c b/libavcodec/vvc/mvs.c index 566df158a8aac..2cf67def7bed0 100644 --- a/libavcodec/vvc/mvs.c +++ b/libavcodec/vvc/mvs.c @@ -144,7 +144,9 @@ static int derive_temporal_colocated_mvs(const VVCLocalContext *lc, MvField temp const SliceContext *sc = lc->sc; RefPicList* refPicList = sc->rpl; - if (temp_col.pred_flag == PF_INTRA) + if (temp_col.pred_flag == PF_INTRA || + temp_col.pred_flag == PF_IBC || + temp_col.pred_flag == PF_PLT) return 0; if (sb_flag){ @@ -266,7 +268,7 @@ void ff_vvc_set_mvf(const VVCLocalContext *lc, const int x0, const int y0, const } } -void ff_vvc_set_intra_mvf(const VVCLocalContext *lc, const int dmvr) +void ff_vvc_set_intra_mvf(const VVCLocalContext *lc, const bool dmvr, const PredFlag pf, const bool ciip_flag) { const VVCFrameContext *fc = lc->fc; const CodingUnit *cu = lc->cu; @@ -277,7 +279,10 @@ void ff_vvc_set_intra_mvf(const VVCLocalContext *lc, const int dmvr) for (int dx = 0; dx < cu->cb_width; dx += min_pu_size) { const int x = cu->x0 + dx; const int y = cu->y0 + dy; - TAB_MVF(x, y).pred_flag = PF_INTRA; + MvField *mv = &TAB_MVF(x, y); + + mv->pred_flag = pf; + mv->ciip_flag = ciip_flag; } } } @@ -599,7 +604,19 @@ static void init_neighbour_context(NeighbourContext *ctx, const VVCLocalContext static av_always_inline PredMode pred_flag_to_mode(PredFlag pred) { - return pred == PF_IBC ? MODE_IBC : (pred == PF_INTRA ? 
MODE_INTRA : MODE_INTER); + static const PredMode lut[] = { + MODE_INTRA, // PF_INTRA + MODE_INTER, // PF_L0 + MODE_INTER, // PF_L1 + MODE_INTER, // PF_BI + 0, // invalid + MODE_IBC, // PF_IBC + 0, // invalid + 0, // invalid + MODE_PLT, // PF_PLT + }; + + return lut[pred]; } static int check_available(Neighbour *n, const VVCLocalContext *lc, const int check_mer) diff --git a/libavcodec/vvc/mvs.h b/libavcodec/vvc/mvs.h index b2242b2a4d9ea..7150c0b8cf26c 100644 --- a/libavcodec/vvc/mvs.h +++ b/libavcodec/vvc/mvs.h @@ -43,6 +43,6 @@ void ff_vvc_update_hmvp(VVCLocalContext *lc, const MotionInfo *mi); int ff_vvc_no_backward_pred_flag(const VVCLocalContext *lc); MvField* ff_vvc_get_mvf(const VVCFrameContext *fc, const int x0, const int y0); void ff_vvc_set_mvf(const VVCLocalContext *lc, const int x0, const int y0, const int w, const int h, const MvField *mvf); -void ff_vvc_set_intra_mvf(const VVCLocalContext *lc, int dmvr); +void ff_vvc_set_intra_mvf(const VVCLocalContext *lc, bool dmvr, PredFlag pf, bool ciip_flag); #endif //AVCODEC_VVC_MVS_H diff --git a/libavcodec/vvc/ps.c b/libavcodec/vvc/ps.c index e8c312d8ac05d..d9f46b219af90 100644 --- a/libavcodec/vvc/ps.c +++ b/libavcodec/vvc/ps.c @@ -408,6 +408,8 @@ static int pps_add_ctus(VVCPPS *pps, int *off, const int rx, const int ry, int start = *off; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { + if (*off >= pps->ctb_count) + return AVERROR_INVALIDDATA; pps->ctb_addr_in_slice[*off] = ctu_rs(rx + x, ry + y, pps); (*off)++; } @@ -415,16 +417,21 @@ static int pps_add_ctus(VVCPPS *pps, int *off, const int rx, const int ry, return *off - start; } -static void pps_single_slice_picture(VVCPPS *pps, int *off) +static int pps_single_slice_picture(VVCPPS *pps, int *off) { pps->num_ctus_in_slice[0] = 0; for (int j = 0; j < pps->r->num_tile_rows; j++) { for (int i = 0; i < pps->r->num_tile_columns; i++) { - pps->num_ctus_in_slice[0] += pps_add_ctus(pps, off, + const int ret = pps_add_ctus(pps, off, pps->col_bd[i], pps->row_bd[j], pps->r->col_width_val[i], pps->r->row_height_val[j]); + if (ret < 0) + return ret; + pps->num_ctus_in_slice[0] += ret; } } + + return 0; } static void subpic_tiles(int *tile_x, int *tile_y, int *tile_x_end, int *tile_y_end, @@ -451,50 +458,36 @@ static void subpic_tiles(int *tile_x, int *tile_y, int *tile_x_end, int *tile_y_ (*tile_y_end)++; } -static bool mark_tile_as_used(bool *tile_in_subpic, const int tx, const int ty, const int tile_columns) +static int pps_subpic_less_than_one_tile_slice(VVCPPS *pps, const VVCSPS *sps, const int i, const int tx, const int ty, int *off) { - const size_t tile_idx = ty * tile_columns + tx; - if (tile_in_subpic[tile_idx]) { - /* the tile is covered by other subpictures */ - return false; - } - tile_in_subpic[tile_idx] = true; - return true; -} - -static int pps_subpic_less_than_one_tile_slice(VVCPPS *pps, const VVCSPS *sps, const int i, const int tx, const int ty, int *off, bool *tile_in_subpic) -{ - const int subpic_bottom = sps->r->sps_subpic_ctu_top_left_y[i] + sps->r->sps_subpic_height_minus1[i]; - const int tile_bottom = pps->row_bd[ty] + pps->r->row_height_val[ty] - 1; - const bool is_final_subpic_in_tile = subpic_bottom == tile_bottom; - - if (is_final_subpic_in_tile && !mark_tile_as_used(tile_in_subpic, tx, ty, pps->r->num_tile_columns)) - return AVERROR_INVALIDDATA; - - pps->num_ctus_in_slice[i] = pps_add_ctus(pps, off, + const int ret = pps_add_ctus(pps, off, sps->r->sps_subpic_ctu_top_left_x[i], sps->r->sps_subpic_ctu_top_left_y[i], sps->r->sps_subpic_width_minus1[i] + 
1, sps->r->sps_subpic_height_minus1[i] + 1); + if (ret < 0) + return ret; + pps->num_ctus_in_slice[i] = ret; return 0; } static int pps_subpic_one_or_more_tiles_slice(VVCPPS *pps, const int tile_x, const int tile_y, const int x_end, const int y_end, - const int i, int *off, bool *tile_in_subpic) + const int i, int *off) { for (int ty = tile_y; ty < y_end; ty++) { for (int tx = tile_x; tx < x_end; tx++) { - if (!mark_tile_as_used(tile_in_subpic, tx, ty, pps->r->num_tile_columns)) - return AVERROR_INVALIDDATA; - - pps->num_ctus_in_slice[i] += pps_add_ctus(pps, off, + const int ret = pps_add_ctus(pps, off, pps->col_bd[tx], pps->row_bd[ty], pps->r->col_width_val[tx], pps->r->row_height_val[ty]); + if (ret < 0) + return ret; + + pps->num_ctus_in_slice[i] += ret; } } return 0; } -static int pps_subpic_slice(VVCPPS *pps, const VVCSPS *sps, const int i, int *off, bool *tile_in_subpic) +static int pps_subpic_slice(VVCPPS *pps, const VVCSPS *sps, const int i, int *off) { int tx, ty, x_end, y_end; @@ -503,28 +496,25 @@ static int pps_subpic_slice(VVCPPS *pps, const VVCSPS *sps, const int i, int *of subpic_tiles(&tx, &ty, &x_end, &y_end, sps, pps, i); if (ty + 1 == y_end && sps->r->sps_subpic_height_minus1[i] + 1 < pps->r->row_height_val[ty]) - return pps_subpic_less_than_one_tile_slice(pps, sps, i, tx, ty, off, tile_in_subpic); + return pps_subpic_less_than_one_tile_slice(pps, sps, i, tx, ty, off); else - return pps_subpic_one_or_more_tiles_slice(pps, tx, ty, x_end, y_end, i, off, tile_in_subpic); + return pps_subpic_one_or_more_tiles_slice(pps, tx, ty, x_end, y_end, i, off); } static int pps_single_slice_per_subpic(VVCPPS *pps, const VVCSPS *sps, int *off) { + int ret; + if (!sps->r->sps_subpic_info_present_flag) { - pps_single_slice_picture(pps, off); + ret = pps_single_slice_picture(pps, off); + if (ret < 0) + return ret; } else { - bool tile_in_subpic[VVC_MAX_TILES_PER_AU] = {0}; for (int i = 0; i < pps->r->pps_num_slices_in_pic_minus1 + 1; i++) { - const int ret = pps_subpic_slice(pps, sps, i, off, tile_in_subpic); + const int ret = pps_subpic_slice(pps, sps, i, off); if (ret < 0) return ret; } - - // We only use tile_in_subpic to check that the subpictures don't overlap - // here; we don't use tile_in_subpic to check that the subpictures cover - // every tile. It is possible to avoid doing this work here because the - // covering property of subpictures is already guaranteed by the mechanisms - // which check every CTU belongs to a slice. 
} return 0; } @@ -538,9 +528,13 @@ static int pps_one_tile_slices(VVCPPS *pps, const int tile_idx, int i, int *off) ctu_xy(&rx, &ry, tile_x, tile_y, pps); ctu_y_end = ry + r->row_height_val[tile_y]; while (ry < ctu_y_end) { + int ret; pps->slice_start_offset[i] = *off; - pps->num_ctus_in_slice[i] = pps_add_ctus(pps, off, rx, ry, + ret = pps_add_ctus(pps, off, rx, ry, r->col_width_val[tile_x], r->slice_height_in_ctus[i]); + if (ret < 0) + return ret; + pps->num_ctus_in_slice[i] = ret; ry += r->slice_height_in_ctus[i++]; } i--; @@ -557,13 +551,17 @@ static int pps_multi_tiles_slice(VVCPPS *pps, const int tile_idx, const int i, i pps->num_ctus_in_slice[i] = 0; for (int ty = tile_y; ty <= tile_y + r->pps_slice_height_in_tiles_minus1[i]; ty++) { for (int tx = tile_x; tx <= tile_x + r->pps_slice_width_in_tiles_minus1[i]; tx++) { + int ret; const int idx = ty * r->num_tile_columns + tx; if (tile_in_slice[idx]) return AVERROR_INVALIDDATA; tile_in_slice[idx] = true; ctu_xy(&rx, &ry, tx, ty, pps); - pps->num_ctus_in_slice[i] += pps_add_ctus(pps, off, rx, ry, + ret = pps_add_ctus(pps, off, rx, ry, r->col_width_val[tx], r->row_height_val[ty]); + if (ret < 0) + return ret; + pps->num_ctus_in_slice[i] += ret; } } @@ -574,7 +572,7 @@ static int pps_rect_slice(VVCPPS *pps, const VVCSPS *sps) { const H266RawPPS *r = pps->r; bool tile_in_slice[VVC_MAX_TILES_PER_AU] = {false}; - int tile_idx = 0, off = 0; + int tile_idx = 0, off = 0, ret; if (r->pps_single_slice_per_subpic_flag) { return pps_single_slice_per_subpic(pps, sps, &off); @@ -586,9 +584,12 @@ static int pps_rect_slice(VVCPPS *pps, const VVCSPS *sps) if (tile_in_slice[tile_idx]) return AVERROR_INVALIDDATA; tile_in_slice[tile_idx] = true; - i = pps_one_tile_slices(pps, tile_idx, i, &off); + ret = pps_one_tile_slices(pps, tile_idx, i, &off); + if (ret < 0) + return ret; + i = ret; } else { - const int ret = pps_multi_tiles_slice(pps, tile_idx, i, &off, tile_in_slice); + ret = pps_multi_tiles_slice(pps, tile_idx, i, &off, tile_in_slice); if (ret < 0) return ret; } @@ -603,21 +604,28 @@ static int pps_rect_slice(VVCPPS *pps, const VVCSPS *sps) return 0; } -static void pps_no_rect_slice(VVCPPS* pps) +static int pps_no_rect_slice(VVCPPS* pps) { const H266RawPPS* r = pps->r; int rx, ry, off = 0; for (int tile_y = 0; tile_y < r->num_tile_rows; tile_y++) { for (int tile_x = 0; tile_x < r->num_tile_columns; tile_x++) { + int ret; ctu_xy(&rx, &ry, tile_x, tile_y, pps); - pps_add_ctus(pps, &off, rx, ry, r->col_width_val[tile_x], r->row_height_val[tile_y]); + ret = pps_add_ctus(pps, &off, rx, ry, r->col_width_val[tile_x], r->row_height_val[tile_y]); + if (ret < 0) + return ret; } } + + return 0; } static int pps_slice_map(VVCPPS *pps, const VVCSPS *sps) { + int ret; + pps->ctb_addr_in_slice = av_calloc(pps->ctb_count, sizeof(*pps->ctb_addr_in_slice)); if (!pps->ctb_addr_in_slice) return AVERROR(ENOMEM); @@ -625,7 +633,9 @@ static int pps_slice_map(VVCPPS *pps, const VVCSPS *sps) if (pps->r->pps_rect_slice_flag) return pps_rect_slice(pps, sps); - pps_no_rect_slice(pps); + ret = pps_no_rect_slice(pps); + if (ret < 0) + return ret; return 0; } @@ -839,7 +849,7 @@ static int lmcs_derive_lut(VVCLMCS *lmcs, const H266RawAPS *rlmcs, const H266Raw uint16_t input_pivot[LMCS_MAX_BIN_SIZE]; uint16_t scale_coeff[LMCS_MAX_BIN_SIZE]; uint16_t inv_scale_coeff[LMCS_MAX_BIN_SIZE]; - int i, delta_crs; + int i, delta_crs, sum_cw = 0; if (bit_depth > LMCS_MAX_BIT_DEPTH) return AVERROR_PATCHWELCOME; @@ -850,8 +860,12 @@ static int lmcs_derive_lut(VVCLMCS *lmcs, const H266RawAPS 
*rlmcs, const H266Raw lmcs->max_bin_idx = LMCS_MAX_BIN_SIZE - 1 - rlmcs->lmcs_delta_max_bin_idx; memset(cw, 0, sizeof(cw)); - for (int i = lmcs->min_bin_idx; i <= lmcs->max_bin_idx; i++) + for (int i = lmcs->min_bin_idx; i <= lmcs->max_bin_idx; i++) { cw[i] = org_cw + (1 - 2 * rlmcs->lmcs_delta_sign_cw_flag[i]) * rlmcs->lmcs_delta_abs_cw[i]; + sum_cw += cw[i]; + } + if (sum_cw > (1 << bit_depth) - 1) + return AVERROR_INVALIDDATA; delta_crs = (1 - 2 * rlmcs->lmcs_delta_sign_crs_flag) * rlmcs->lmcs_delta_abs_crs; @@ -859,13 +873,20 @@ static int lmcs_derive_lut(VVCLMCS *lmcs, const H266RawAPS *rlmcs, const H266Raw for (i = 0; i < LMCS_MAX_BIN_SIZE; i++) { input_pivot[i] = i * org_cw; lmcs->pivot[i + 1] = lmcs->pivot[i] + cw[i]; + if (i >= lmcs->min_bin_idx && i <= lmcs->max_bin_idx && + lmcs->pivot[i] % (1 << (bit_depth - 5)) != 0 && + lmcs->pivot[i] >> (bit_depth - 5) == lmcs->pivot[i + 1] >> (bit_depth - 5)) + return AVERROR_INVALIDDATA; scale_coeff[i] = (cw[i] * (1 << 11) + off) >> shift; if (cw[i] == 0) { inv_scale_coeff[i] = 0; lmcs->chroma_scale_coeff[i] = (1 << 11); } else { + const int cw_plus_d = cw[i] + delta_crs; + if (cw_plus_d < (org_cw >> 3) || cw_plus_d > ((org_cw << 3) - 1)) + return AVERROR_INVALIDDATA; inv_scale_coeff[i] = org_cw * (1 << 11) / cw[i]; - lmcs->chroma_scale_coeff[i] = org_cw * (1 << 11) / (cw[i] + delta_crs); + lmcs->chroma_scale_coeff[i] = org_cw * (1 << 11) / cw_plus_d; } } diff --git a/libavcodec/vvc/refs.c b/libavcodec/vvc/refs.c index 1cfca4820477d..79967b77d3fa5 100644 --- a/libavcodec/vvc/refs.c +++ b/libavcodec/vvc/refs.c @@ -52,6 +52,12 @@ void ff_vvc_unref_frame(VVCFrameContext *fc, VVCFrame *frame, int flags) frame->flags = 0; if (!frame->flags) { av_frame_unref(frame->frame); + + if (frame->needs_fg) { + av_frame_unref(frame->frame_grain); + frame->needs_fg = 0; + } + av_refstruct_unref(&frame->sps); av_refstruct_unref(&frame->pps); av_refstruct_unref(&frame->progress); @@ -154,6 +160,14 @@ static VVCFrame *alloc_frame(VVCContext *s, VVCFrameContext *fc) frame->ref_width = pps->r->pps_pic_width_in_luma_samples - win->left_offset - win->right_offset; frame->ref_height = pps->r->pps_pic_height_in_luma_samples - win->bottom_offset - win->top_offset; + if (fc->sei.frame_field_info.present) { + if (fc->sei.frame_field_info.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) + frame->frame->flags |= AV_FRAME_FLAG_TOP_FIELD_FIRST; + if (fc->sei.frame_field_info.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD || + fc->sei.frame_field_info.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD) + frame->frame->flags |= AV_FRAME_FLAG_INTERLACED; + } + frame->progress = alloc_progress(); if (!frame->progress) goto fail; @@ -285,7 +299,13 @@ int ff_vvc_output_frame(VVCContext *s, VVCFrameContext *fc, AVFrame *out, const if (frame->flags & VVC_FRAME_FLAG_CORRUPT) frame->frame->flags |= AV_FRAME_FLAG_CORRUPT; - ret = av_frame_ref(out, frame->frame); + ret = av_frame_ref(out, frame->needs_fg ? 
frame->frame_grain : frame->frame); + if (ret < 0) + return ret; + + if (!(s->avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN)) + av_frame_remove_side_data(out, AV_FRAME_DATA_FILM_GRAIN_PARAMS); + if (frame->flags & VVC_FRAME_FLAG_BUMPING) ff_vvc_unref_frame(fc, frame, VVC_FRAME_FLAG_OUTPUT | VVC_FRAME_FLAG_BUMPING); else diff --git a/libavcodec/vvc/sei.c b/libavcodec/vvc/sei.c new file mode 100644 index 0000000000000..d8ab2bf245a18 --- /dev/null +++ b/libavcodec/vvc/sei.c @@ -0,0 +1,239 @@ +/* + * VVC Supplementary Enhancement Information messages + * + * copyright (c) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "sei.h" +#include "dec.h" +#include "libavutil/refstruct.h" + +static int decode_film_grain_characteristics(H2645SEIFilmGrainCharacteristics *h, const SEIRawFilmGrainCharacteristics *s, const VVCFrameContext *fc) +{ + const VVCSPS *sps = fc->ps.sps; + + h->present = !s->fg_characteristics_cancel_flag; + if (h->present) { + h->model_id = s->fg_model_id; + h->separate_colour_description_present_flag = s->fg_separate_colour_description_present_flag; + if (h->separate_colour_description_present_flag) { + h->bit_depth_luma = s->fg_bit_depth_luma_minus8 + 8; + h->bit_depth_chroma = s->fg_bit_depth_chroma_minus8 + 8; + h->full_range = s->fg_full_range_flag; + h->color_primaries = s->fg_colour_primaries; + h->transfer_characteristics = s->fg_transfer_characteristics; + h->matrix_coeffs = s->fg_matrix_coeffs; + } else { + if (!sps) { + av_log(fc->log_ctx, AV_LOG_ERROR, + "No active SPS for film_grain_characteristics.\n"); + return AVERROR_INVALIDDATA; + } + h->bit_depth_luma = sps->bit_depth; + h->bit_depth_chroma = sps->bit_depth; + h->full_range = sps->r->vui.vui_full_range_flag; + h->color_primaries = sps->r->vui.vui_colour_primaries; + h->transfer_characteristics = sps->r->vui.vui_transfer_characteristics; + h->matrix_coeffs = sps->r->vui.vui_matrix_coeffs ; + } + + h->blending_mode_id = s->fg_blending_mode_id; + h->log2_scale_factor = s->fg_log2_scale_factor; + + for (int c = 0; c < 3; c++) { + h->comp_model_present_flag[c] = s->fg_comp_model_present_flag[c]; + if (h->comp_model_present_flag[c]) { + h->num_intensity_intervals[c] = s->fg_num_intensity_intervals_minus1[c] + 1; + h->num_model_values[c] = s->fg_num_model_values_minus1[c] + 1; + + if (h->num_model_values[c] > 6) + return AVERROR_INVALIDDATA; + + for (int i = 0; i < h->num_intensity_intervals[c]; i++) { + h->intensity_interval_lower_bound[c][i] = s->fg_intensity_interval_lower_bound[c][i]; + h->intensity_interval_upper_bound[c][i] = s->fg_intensity_interval_upper_bound[c][i]; + for (int j = 0; j < h->num_model_values[c]; j++) + h->comp_model_value[c][i][j] = s->fg_comp_model_value[c][i][j]; + } + } + } + + h->persistence_flag = 
s->fg_characteristics_persistence_flag; + } + + return 0; +} + +static int decode_decoded_picture_hash(H274SEIPictureHash *h, const SEIRawDecodedPictureHash *s) +{ + h->present = 1; + h->hash_type = s->dph_sei_hash_type; + if (h->hash_type == 0) + memcpy(h->md5, s->dph_sei_picture_md5, sizeof(h->md5)); + else if (h->hash_type == 1) + memcpy(h->crc, s->dph_sei_picture_crc, sizeof(h->crc)); + else if (h->hash_type == 2) + memcpy(h->checksum, s->dph_sei_picture_checksum, sizeof(h->checksum)); + + return 0; +} + +static int decode_display_orientation(H2645SEIDisplayOrientation *h, const SEIRawDisplayOrientation *s) +{ + int degrees[] = { 0, 0x8000, 0x4000, 0xC000 }; + + h->present = !s->display_orientation_cancel_flag; + if (h->present) { + if (s->display_orientation_transform_type > 7) + return AVERROR_INVALIDDATA; + + h->vflip = 0; + if (s->display_orientation_transform_type == 1 || + s->display_orientation_transform_type == 3 || + s->display_orientation_transform_type == 4 || + s->display_orientation_transform_type == 6) { + h->hflip = 1; + } else { + h->hflip = 0; + } + h->anticlockwise_rotation = degrees[s->display_orientation_transform_type >> 1]; + } + + return 0; +} + +static int decode_content_light_level_info(H2645SEIContentLight *h, const SEIRawContentLightLevelInfo *s) +{ + h->present = 1; + h->max_content_light_level = s->max_content_light_level; + h->max_pic_average_light_level = s->max_pic_average_light_level; + + return 0; +} + +static int decode_frame_field_info(H274SEIFrameFieldInfo *h, const SEIRawFrameFieldInformation *s) +{ + if (s->ffi_source_scan_type > 3) + return AVERROR_INVALIDDATA; + + h->present = 1; + if (s->ffi_field_pic_flag) { + if (s->ffi_bottom_field_flag) + h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; + else + h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; + } else { + h->display_elemental_periods = s->ffi_display_elemental_periods_minus1 + 1; + } + + h->source_scan_type = s->ffi_source_scan_type; + h->duplicate_flag = s->ffi_duplicate_flag; + + return 0; +} + +static int decode_ambient_viewing_environment(H2645SEIAmbientViewingEnvironment *h, const SEIRawAmbientViewingEnvironment *s) +{ + h->present = 1; + h->ambient_illuminance = s->ambient_illuminance; + h->ambient_light_x = s->ambient_light_x; + h->ambient_light_y = s->ambient_light_y; + + return 0; +} + +static int decode_mastering_display_colour_volume(H2645SEIMasteringDisplay *h, const SEIRawMasteringDisplayColourVolume *s) +{ + h->present = 1; + + for (int c = 0; c < 3; c++) { + h->display_primaries[c][0] = s->display_primaries_x[c]; + h->display_primaries[c][1] = s->display_primaries_y[c]; + } + + h->white_point[0] = s->white_point_x; + h->white_point[1] = s->white_point_y; + + h->max_luminance = s->max_display_mastering_luminance; + h->min_luminance = s->min_display_mastering_luminance; + + return 0; +} + +int ff_vvc_sei_decode(VVCSEI *s, const H266RawSEI *sei, const struct VVCFrameContext *fc) +{ + H2645SEI *c = &s->common; + + if (!sei) + return AVERROR_INVALIDDATA; + + for (int i = 0; i < sei->message_list.nb_messages; i++) { + SEIRawMessage *message = &sei->message_list.messages[i]; + void *payload = message->payload; + + switch (message->payload_type) { + case SEI_TYPE_FILM_GRAIN_CHARACTERISTICS: + av_refstruct_unref(&c->film_grain_characteristics); + c->film_grain_characteristics = av_refstruct_allocz(sizeof(*c->film_grain_characteristics)); + if (!c->film_grain_characteristics) + return AVERROR(ENOMEM); + return decode_film_grain_characteristics(c->film_grain_characteristics, 
payload, fc); + + case SEI_TYPE_DECODED_PICTURE_HASH: + return decode_decoded_picture_hash(&s->picture_hash, payload); + + case SEI_TYPE_DISPLAY_ORIENTATION: + return decode_display_orientation(&s->common.display_orientation, payload); + + case SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: + return decode_content_light_level_info(&s->common.content_light, payload); + + case SEI_TYPE_FRAME_FIELD_INFO: + return decode_frame_field_info(&s->frame_field_info, payload); + + case SEI_TYPE_AMBIENT_VIEWING_ENVIRONMENT: + return decode_ambient_viewing_environment(&s->common.ambient_viewing_environment, payload); + + case SEI_TYPE_MASTERING_DISPLAY_COLOUR_VOLUME: + return decode_mastering_display_colour_volume(&s->common.mastering_display, payload); + + default: + av_log(fc->log_ctx, AV_LOG_DEBUG, "Skipped %s SEI %d\n", + sei->nal_unit_header.nal_unit_type == VVC_PREFIX_SEI_NUT ? + "PREFIX" : "SUFFIX", message->payload_type); + return FF_H2645_SEI_MESSAGE_UNHANDLED; + } + } + + return 0; +} + +int ff_vvc_sei_replace(VVCSEI *dst, const VVCSEI *src) +{ + dst->picture_hash.present = 0; // drop hash + dst->frame_field_info.present = 0; // drop field info + return ff_h2645_sei_ctx_replace(&dst->common, &src->common); +} + +void ff_vvc_sei_reset(VVCSEI *s) +{ + ff_h2645_sei_reset(&s->common); + s->picture_hash.present = 0; + s->frame_field_info.present = 0; +} diff --git a/libavcodec/vvc/sei.h b/libavcodec/vvc/sei.h new file mode 100644 index 0000000000000..578b48a0e4382 --- /dev/null +++ b/libavcodec/vvc/sei.h @@ -0,0 +1,48 @@ +/* + * VVC Supplementary Enhancement Information messages + * + * copyright (c) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VVC_SEI_H +#define AVCODEC_VVC_SEI_H + +#include + +#include "libavcodec/get_bits.h" +#include "libavcodec/cbs.h" +#include "libavcodec/cbs_h266.h" +#include "libavcodec/h2645_sei.h" +#include "libavcodec/sei.h" +#include "libavcodec/vvc.h" +#include "libavcodec/h274.h" + +typedef struct VVCSEI { + H2645SEI common; + H274SEIPictureHash picture_hash; + H274SEIFrameFieldInfo frame_field_info; +} VVCSEI; + +struct VVCFrameContext; + +int ff_vvc_sei_decode(VVCSEI *s, const H266RawSEI *sei, const struct VVCFrameContext *fc); +int ff_vvc_sei_replace(VVCSEI *dst, const VVCSEI *src); +void ff_vvc_sei_reset(VVCSEI *s); + +#endif /* AVCODEC_VVC_SEI_H */ diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c index 6194416e14a95..2138341b0fe83 100644 --- a/libavcodec/vvc/thread.c +++ b/libavcodec/vvc/thread.c @@ -283,6 +283,13 @@ static void add_progress_listener(VVCFrame *ref, ProgressListener *l, ff_vvc_add_progress_listener(ref, (VVCProgressListener*)l); } +static void ep_init_wpp(EntryPoint *next, const EntryPoint *ep, const VVCSPS *sps) +{ + memcpy(next->cabac_state, ep->cabac_state, sizeof(next->cabac_state)); + memcpy(next->pp, ep->pp, sizeof(next->pp)); + ff_vvc_ep_init_stat_coeff(next, sps->bit_depth, sps->r->sps_persistent_rice_adaptation_enabled_flag); +} + static void schedule_next_parse(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, const VVCTask *t) { VVCFrameThread *ft = fc->ft; @@ -292,10 +299,8 @@ static void schedule_next_parse(VVCContext *s, VVCFrameContext *fc, const SliceC if (sps->r->sps_entropy_coding_sync_enabled_flag) { if (t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]) { EntryPoint *next = ep + 1; - if (next < sc->eps + sc->nb_eps && !is_first_row(fc, t->rx, t->ry + 1)) { - memcpy(next->cabac_state, ep->cabac_state, sizeof(next->cabac_state)); - ff_vvc_ep_init_stat_coeff(next, sps->bit_depth, sps->r->sps_persistent_rice_adaptation_enabled_flag); - } + if (next < sc->eps + sc->nb_eps && !is_first_row(fc, t->rx, t->ry + 1)) + ep_init_wpp(next, ep, sps); } if (t->ry + 1 < ft->ctu_height && !is_first_row(fc, t->rx, t->ry + 1)) frame_thread_add_score(s, ft, t->rx, t->ry + 1, VVC_TASK_STAGE_PARSE); diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c index 889306aebd6fd..51487b72b5a0d 100644 --- a/libavcodec/wmaenc.c +++ b/libavcodec/wmaenc.c @@ -79,7 +79,7 @@ static av_cold int encode_init(AVCodecContext *avctx) AV_WL32(extradata, flags1); AV_WL16(extradata + 4, flags2); } else { - av_assert0(0); + av_unreachable("This function is only used with WMAV1/2 encoders"); } avctx->extradata = extradata; s->use_exp_vlc = flags2 & 0x0001; @@ -206,7 +206,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE], // FIXME remove duplication relative to decoder if (s->use_variable_block_len) { - av_assert0(0); // FIXME not implemented + av_unreachable("use_variable_block_len unimplemented, set to 0 during init"); } else { /* fixed block len */ s->next_block_len_bits = s->frame_len_bits; @@ -306,7 +306,8 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE], if (s->use_exp_vlc) { encode_exp_vlc(s, ch, fixed_exp); } else { - av_assert0(0); // FIXME not implemented + av_unreachable("use_exp_vlc always set to 1 during init"); + // FIXME not implemented // encode_exp_lsp(s, ch); } } @@ 
-365,7 +366,7 @@ static int encode_frame(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE], init_put_bits(&s->pb, buf, buf_size); if (s->use_bit_reservoir) - av_assert0(0); // FIXME not implemented + av_unreachable("use_bit_reseroir unimplemented, set to 0 during init"); else if (encode_block(s, src_coefs, total_gain) < 0) return INT_MAX; diff --git a/libavcodec/wmv2enc.c b/libavcodec/wmv2enc.c index f9fd918dbf6f6..592d1060d3973 100644 --- a/libavcodec/wmv2enc.c +++ b/libavcodec/wmv2enc.c @@ -28,6 +28,7 @@ #include "msmpeg4enc.h" #include "msmpeg4data.h" #include "msmpeg4_vc1_data.h" +#include "put_bits.h" #include "wmv2.h" #define WMV2_EXTRADATA_SIZE 4 @@ -78,6 +79,8 @@ static int wmv2_encode_picture_header(MPVMainEncContext *const m) MSMPEG4EncContext *const ms = &w->msmpeg4; MPVEncContext *const s = &m->s; + put_bits_assume_flushed(&s->pb); + put_bits(&s->pb, 1, s->c.pict_type - 1); if (s->c.pict_type == AV_PICTURE_TYPE_I) put_bits(&s->pb, 7, 0); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 821c410a0f654..89ee8dc726ec9 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -184,7 +184,9 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ x86/vp9intrapred_16bpp.o \ x86/vp9itxfm.o \ + x86/vp9itxfm_avx512.o \ x86/vp9itxfm_16bpp.o \ + x86/vp9itxfm_16bpp_avx512.o \ x86/vp9lpf.o \ x86/vp9lpf_16bpp.o \ x86/vp9mc.o \ diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 4a0513d06d5dd..6b2ad4494b918 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -22,6 +22,9 @@ * MMX optimization by Nick Kurshev */ +#include +#include + #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" @@ -74,19 +77,263 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, /* MMX no rounding */ #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx #define SET_RND MOVQ_WONE -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) #define STATIC static #include "rnd_template.c" -#include "hpeldsp_rnd_template.c" #undef DEF #undef SET_RND -#undef PAVGBP -#undef PAVGB #undef STATIC +// this routine is 'slightly' suboptimal but mostly unused +static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + MOVQ_ZERO(mm7); + MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" + "add %3, %1 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + 
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) + "movq %%mm5, (%2, %%"FF_REG_a") \n\t" + "add %3, %%"FF_REG_a" \n\t" + + "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) + "movq %%mm1, (%2, %%"FF_REG_a") \n\t" + "add %3, %%"FF_REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ".p2align 3 \n\t" + 
"1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"),%%mm2\n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"),%%mm0\n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%2), %%mm3 \n\t" + PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%2), %%mm3 \n\t" + PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8(%2) \n\t" + "add %3, %1 \n\t" + "add %3, %2 \n\t" + "subl $1, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :"memory"); +} + +static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + #if HAVE_MMX CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8) CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8) @@ -101,7 +348,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8) #define SET_RND MOVQ_WTWO #define DEF(x, y) ff_ ## x ## _ ## y ## _mmx #define STATIC -#define NO_AVG #include "rnd_template.c" @@ -122,7 +368,6 @@ CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8) CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \ CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \ - CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \ @@ -170,7 +415,7 @@ static 
void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; - c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; + c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c deleted file mode 100644 index 2bff2d27660cf..0000000000000 --- a/libavcodec/x86/hpeldsp_rnd_template.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2003-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev - * mostly rewritten by Michael Niedermayer - * and improved by Zdenek Kabelac - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include - -// put_pixels -av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq 
%%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"),%%mm2\n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"),%%mm0\n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%2), %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8(%2) \n\t" - "add %3, %1 \n\t" - "add %3, %2 \n\t" - "subl $1, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :"memory"); -} - -av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index 85e9159f91077..dbb21871218ac 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -109,7 +109,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, qmat = s->q_inter_matrix16[qscale][0]; } - if ((s->c.out_format == FMT_H263 || s->c.out_format == FMT_H261) 
&& !s->c.mpeg_quant) { + if ((s->c.out_format == FMT_H263 || s->c.out_format == FMT_H261) && !s->mpeg_quant) { __asm__ volatile( "movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1 SPREADW("%%xmm3") diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c index 51f2a0033a45b..f105775c2b1dc 100644 --- a/libavcodec/x86/pixblockdsp_init.c +++ b/libavcodec/x86/pixblockdsp_init.c @@ -28,7 +28,6 @@ void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride); av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c, - AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c index b825eeba6e032..4590aeddf014b 100644 --- a/libavcodec/x86/rnd_template.c +++ b/libavcodec/x86/rnd_template.c @@ -96,82 +96,3 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel :"D"(block), "r"((x86_reg)line_size) :FF_REG_a, "memory"); } - -#ifndef NO_AVG -// avg_pixels -// this routine is 'slightly' suboptimal but mostly unused -av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} -#endif diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 8d11dbc348022..4373fa3f04e64 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -114,7 +114,9 @@ itxfm_func(idct, 
idct, 32, ssse3); itxfm_func(idct, idct, 32, avx); itxfm_func(iwht, iwht, 4, mmx); itxfm_funcs(16, avx2); +itxfm_funcs(16, avx512icl); itxfm_func(idct, idct, 32, avx2); +itxfm_func(idct, idct, 32, avx512icl); #undef itxfm_func #undef itxfm_funcs @@ -406,6 +408,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) init_ipred(32, avx2, tm, TM_VP8); } +#if ARCH_X86_64 + if (EXTERNAL_AVX512ICL(cpu_flags)) { + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl; + } +#endif + #undef init_fpel #undef init_subpel1 #undef init_subpel2 diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index f93ea2468ea59..db775f7c1a403 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -127,6 +127,8 @@ decl_itxfm_func(iwht, iwht, 4, BPC, mmxext); #if BPC == 10 decl_itxfm_func(idct, idct, 4, BPC, mmxext); decl_itxfm_funcs(4, BPC, ssse3); +decl_itxfm_funcs(16, BPC, avx512icl); +decl_itxfm_func(idct, idct, 32, BPC, avx512icl); #else decl_itxfm_func(idct, idct, 4, BPC, sse2); #endif @@ -233,6 +235,12 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) #endif } +#if ARCH_X86_64 && BPC == 10 + if (EXTERNAL_AVX512ICL(cpu_flags)) { + init_itx_funcs(TX_16X16, 16, BPC, avx512icl); + init_itx_func_one(TX_32X32, idct, idct, 32, BPC, avx512icl); + } +#endif #endif /* HAVE_X86ASM */ ff_vp9dsp_init_16bpp_x86(dsp); diff --git a/libavcodec/x86/vp9itxfm_16bpp_avx512.asm b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm new file mode 100644 index 0000000000000..11d1e453a70a9 --- /dev/null +++ b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm @@ -0,0 +1,1165 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2025 Two Orioles, LLC +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL + +SECTION_RODATA 64 + +; Thw following set of constants are ordered to form the +; qword shuffle mask { 0, 2, 4, 6, 1, 3, 5, 7 } +%define deintq_perm pd_5520 +pd_5520: dd 5520 +pd_9760: dd 9760 +pd_10394: dd 10394 +pd_15426: dd 15426 +pd_804: dd 804 +pd_2404: dd 2404 +pd_6270: dd 6270 +pd_9102: dd 9102 +pd_11585: dd 11585 +pd_12665: dd 12665 +pd_7723: dd 7723 +pd_14811: dd 14811 +pd_7005: dd 7005 +pd_14053: dd 14053 +pd_8423: dd 8423 +pd_13623: dd 13623 + +pixel_clip: times 2 dw 0x7c00 +pixel_clip6: dd 2031648 ; 32 + (pixel_clip << 6) +pd_532480: dd 532480 ; 8192 + (32 << 14) +pd_8192: dd 8192 + +pd_1606: dd 1606 +pd_3196: dd 3196 +pd_3981: dd 3981 +pd_4756: dd 4756 +pd_11003: dd 11003 +pd_12140: dd 12140 +pd_13160: dd 13160 +pd_14449: dd 14449 +pd_15137: dd 15137 +pd_15679: dd 15679 +pd_15893: dd 15893 +pd_16069: dd 16069 +pd_16207: dd 16207 +pd_16305: dd 16305 +pd_16364: dd 16364 + +SECTION .text + +%define o_base (deintq_perm+128) +%define o(x) (r5 - o_base + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], inv_dst2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + vpbroadcastd m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + vpbroadcastd m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%ifnum %6 + psrad m%2, 14 + psrad m%1, 14 +%endif +%endmacro + +%macro WRAP_YMM 1+ + INIT_YMM cpuname + %1 + INIT_ZMM cpuname +%endmacro + +%macro TRANSPOSE_4D 5 ; in[1-4], tmp + punpckhdq m%5, m%3, m%4 ; c2 d2 c3 d3 + punpckldq m%3, m%4 ; c0 d0 c1 d1 + punpckhdq m%4, m%1, m%2 ; a2 b2 a3 b3 + punpckldq m%1, m%2 ; a0 b0 a1 b1 + punpckhqdq m%2, m%1, m%3 ; a1 b1 c1 d1 + punpcklqdq m%1, m%3 ; a0 b0 c0 d0 + punpcklqdq m%3, m%4, m%5 ; a2 b2 c2 d2 + punpckhqdq m%4, m%5 ; a3 b3 c3 d3 +%endmacro + +%macro TRANSPOSE_4DQ 5 ; in[1-4], tmp + vshufi32x4 m%5, m%3, m%4, q3232 ; c2 c3 d2 d3 + vinserti32x8 m%3, ym%4, 1 ; c0 c1 d0 d1 + vshufi32x4 m%4, m%1, m%2, q3232 ; a2 a3 b2 b3 + vinserti32x8 m%1, ym%2, 1 ; a0 a1 b0 b1 + vshufi32x4 m%2, m%1, m%3, q3131 ; a1 b1 c1 d1 + vshufi32x4 m%1, m%3, q2020 ; a0 b0 c0 d0 + vshufi32x4 m%3, m%4, m%5, q2020 ; a2 b2 c2 d2 + vshufi32x4 m%4, m%5, q3131 ; a3 b3 c3 d3 +%endmacro + +%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset +cglobal vp9_i%1_i%2_%3_add_10, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(vp9_i%1_%3_internal_10) + lea r5, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [m(vp9_i%2_%3_internal_10).pass2] +%ifidn %1_%2, dct_dct + dec eobd + jnz %%p1 +%else +%if %4 + add eobd, %4 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, 16x16, %3 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 11585 + vpbroadcastd ym3, [o(pixel_clip)] + mov [cq], r3d + add r6d, 8192 + sar r6d, 14 + imul r6d, 11585 + or r3d, 8 + add r6d, 532480 + sar r6d, 20 + vpbroadcastw ym2, r6d + paddsw ym2, ym3 +.dconly_loop: + paddsw ym0, ym2, [dstq+strideq*0] + paddsw ym1, ym2, [dstq+strideq*1] + psubusw ym0, ym3 + psubusw ym1, ym3 + mova [dstq+strideq*0], ym0 + mova [dstq+strideq*1], ym1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +%endif +%endmacro + +%macro IDCT16_PART1 0 +%if mmsize == 64 +.main_part1_fast: +%endif + pmulld m15, m1, [o(pd_16305)] {bcstd} ; t15a + pmulld m1, [o(pd_1606)] {bcstd} ; t8a + pmulld m9, m7, [o(pd_10394)] {bcstd} ; t9a + pmulld m7, [o(pd_12665)] {bcstd} ; t14a + pmulld m11, m5, [o(pd_14449)] {bcstd} ; t13a + pmulld m5, [o(pd_7723)] {bcstd} ; t10a + pmulld m13, m3, [o(pd_4756)] {bcstd} ; t11a + pmulld m3, [o(pd_15679)] {bcstd} ; t12a + pmulld m10, m6, [o(pd_9102)] {bcstd} ; t5a + pmulld m6, [o(pd_13623)] {bcstd} ; t6a + pmulld m14, m2, [o(pd_16069)] {bcstd} ; t7a + pmulld m2, [o(pd_3196)] {bcstd} ; t4a + pmulld m12, m4, [o(pd_15137)] {bcstd} ; t3 + pmulld m4, [o(pd_6270)] {bcstd} ; t2 + pmulld m0, m21 + REPX {psubd x, m20, x}, m9, m13, m10 + paddd m0, m20 + mova m18, m0 +%if mmsize == 64 ; for the ymm variant we only ever use the fast path + jmp %%main_part1b +.main_part1: + ITX_MULSUB_2D 1, 15, 16, 17, 18, _, 1606, 16305 ; t8a, t15a + ITX_MULSUB_2D 9, 7, 16, 17, 18, _, 12665, 10394 ; t9a, t14a + ITX_MULSUB_2D 5, 11, 16, 17, 18, _, 7723, 14449 ; t10a, t13a + ITX_MULSUB_2D 13, 3, 16, 17, 18, _, 15679, 4756 ; t11a, t12a + ITX_MULSUB_2D 10, 6, 16, 17, 18, _, 13623, 9102 ; t5a, t6a + ITX_MULSUB_2D 2, 14, 16, 17, 18, _, 3196, 16069 ; t4a, t7a + ITX_MULSUB_2D 4, 12, 16, 17, 18, _, 6270, 15137 ; t2, t3 + pmulld m0, m21 + pmulld m8, m21 + REPX {paddd x, m20}, m0, m9, m13, m10 + psubd m18, m0, m8 ; t1 + paddd m0, m8 ; t0 +%%main_part1b: +%endif + vpbroadcastd m19, [o(pd_15137)] + vpbroadcastd m16, [o(pd_6270)] + REPX {paddd x, m20}, m15, m7, m1, m11, m3, m5 + REPX {psrad x, 14 }, m15, m7, m1, m9, m11, m3, m5, m13 + paddd m17, m15, m7 ; t15 + psubd m15, m7 ; t14 + psubd m7, m3, m11 ; t13 + paddd m3, m11 ; t12 + psubd m11, m13, m5 ; t10 + paddd m5, m13 ; t11 + psubd m13, m1, m9 ; t9 + paddd m1, m9 ; t8 + ITX_MULSUB_2D 15, 13, 8, 9, _, 20, 16, 19 ; t9a, t14a + ITX_MULSUB_2D 7, 11, 8, 9, _, 20, 16, 19, 2 ; t13a, t10a + paddd m16, m1, m5 ; t8a + psubd m1, m5 ; t11a + paddd m8, m15, m11 ; t9 + psubd m15, m11 ; t10 + psubd m11, m17, m3 ; t12a + paddd m17, m3 ; t15a + psubd m9, m13, m7 ; t13 + paddd m13, m7 ; t14 + REPX {pmulld x, m21}, m11, m9, m1, m15 + REPX {paddd x, m20}, m2, m6, m14 + REPX {psrad x, 14 }, m10, m2, m6, m14 + psubd m3, m2, m10 ; t5a + paddd m10, m2 ; t4 + paddd m11, m20 + psubd m5, m11, m1 ; t11 + paddd m11, m1 ; t12 + psubd m1, m14, m6 ; t6a + paddd m14, m6 ; t7 + pmulld m1, m21 + pmulld m3, m21 + paddd m4, m20 + paddd m12, m20 + REPX {psrad x, 14 }, m4, m12, m0, m18 + paddd m9, m20 + paddd m2, m9, m15 ; t13a + psubd m9, m15 ; t10a + paddd m1, m20 + psubd m6, m1, m3 ; t5 + paddd m1, m3 ; t6 + REPX {psrad x, 14}, m6, m1, m11, m5, m2, 
m9 +%endmacro + +%macro IDCT16_PART2 0 + psubd m3, m0, m12 ; t3 + paddd m0, m12 ; t0 + psubd m12, m18, m4 ; t2 + paddd m18, m4 ; t1 + psubd m4, m3, m10 ; t4 + paddd m3, m10 ; t3 + psubd m10, m12, m6 ; t5 + paddd m12, m6 ; t2 + psubd m6, m18, m1 ; t6 + paddd m1, m18 ; t1 + psubd m7, m0, m14 ; t7 + paddd m0, m14 ; t0 + psubd m15, m0, m17 ; out15 + paddd m0, m17 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m12, m2 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m5 ; out11 + paddd m4, m5 ; out4 + paddd m5, m10, m9 ; out5 + psubd m10, m9 ; out10 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, m16 ; out8 + paddd m7, m16 ; out7 +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst, 39-23-1 + +cglobal vp9_idct_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m6, [cq+64* 6] + mova m7, [cq+64* 7] + vpbroadcastd m20, [o(pd_8192)] + vpbroadcastd m21, [o(pd_11585)] + sub eobd, 38 + jl .pass1_fast + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + mova m15, [cq+64*15] + call .main_part1 + call .main_part2 +.pass1_end: + TRANSPOSE_4DQ 0, 4, 8, 12, 16 + TRANSPOSE_4DQ 1, 5, 9, 13, 16 + TRANSPOSE_4DQ 2, 6, 10, 14, 16 + TRANSPOSE_4DQ 3, 7, 11, 15, 16 + TRANSPOSE_4D 8, 9, 10, 11, 16 + TRANSPOSE_4D 12, 13, 14, 15, 16 + mov r6d, 64*12 + jmp .pass1_transpose_end +.pass1_fast: + WRAP_YMM IDCT16_PART1 + WRAP_YMM IDCT16_PART2 +.pass1_fast_end: + vinserti32x8 m0, ym4, 1 + vinserti32x8 m8, ym12, 1 + vinserti32x8 m1, ym5, 1 + vinserti32x8 m9, ym13, 1 + vinserti32x8 m2, ym6, 1 + vinserti32x8 m10, ym14, 1 + vinserti32x8 m3, ym7, 1 + vinserti32x8 m11, ym15, 1 + vshufi32x4 m4, m0, m8, q3131 + vshufi32x4 m0, m8, q2020 + vshufi32x4 m5, m1, m9, q3131 + vshufi32x4 m1, m9, q2020 + vshufi32x4 m6, m2, m10, q3131 + vshufi32x4 m2, m10, q2020 + vshufi32x4 m7, m3, m11, q3131 + vshufi32x4 m3, m11, q2020 + mov r6d, 64*4 +.pass1_transpose_end: + pxor m16, m16 +.zero_loop: + mova [cq+r6+64*0], m16 + mova [cq+r6+64*1], m16 + mova [cq+r6+64*2], m16 + mova [cq+r6+64*3], m16 + sub r6d, 64*4 + jge .zero_loop + TRANSPOSE_4D 0, 1, 2, 3, 16 + TRANSPOSE_4D 4, 5, 6, 7, 16 + jmp tx2q +.pass2: + test eobd, eobd + jl .pass2_fast + call .main_part1 + jmp .pass2_end +.pass2_fast: + call .main_part1_fast +.pass2_end: + vpbroadcastd m3, [o(pixel_clip6)] + paddd m0, m3 + paddd m18, m3 + call .main_part2 + REPX {psrad x, 6}, m0, m1, m2, m3 + packssdw m0, m1 + lea r6, [strideq*3] + packssdw m1, m2, m3 + mova m2, [o(deintq_perm)] + vpbroadcastd m3, [o(pixel_clip)] + REPX {psrad x, 6}, m4, m5, m6, m7 + call .write_16x4 + packssdw m0, m4, m5 + packssdw m1, m6, m7 + REPX {psrad x, 6}, m8, m9, m10, m11 + call .write_16x4 + packssdw m0, m8, m9 + packssdw m1, m10, m11 +.pass2_end2: + REPX {psrad x, 6}, m12, m13, m14, m15 + call .write_16x4 + packssdw m0, m12, m13 + packssdw m1, m14, m15 + call .write_16x4 + RET +ALIGN function_align +.write_16x4: + mova ym16, [dstq+strideq*0] + vinserti32x8 m16, [dstq+strideq*1], 1 + mova ym17, [dstq+strideq*2] + vinserti32x8 m17, [dstq+r6 ], 1 + vpermq m0, m2, m0 + vpermq m1, m2, m1 + paddsw m16, m0 + paddsw m17, m1 + psubusw m16, m3 + psubusw m17, m3 + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 + mova [dstq+strideq*2], ym17 
+ vextracti32x8 [dstq+r6 ], m17, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align + IDCT16_PART1 + ret +ALIGN function_align +.main_part2: + IDCT16_PART2 + ret + +%macro IADST16_PART1 0 +%if mmsize == 64 +.main_part1_fast: +%endif + pmulld m15, m0, [o(pd_16364)] {bcstd} ; t1 + pmulld m0, [o(pd_804)] {bcstd} ; t0 + pmulld m13, m2, [o(pd_15893)] {bcstd} ; t3 + pmulld m2, [o(pd_3981)] {bcstd} ; t2 + pmulld m11, m4, [o(pd_14811)] {bcstd} ; t5 + pmulld m4, [o(pd_7005)] {bcstd} ; t4 + pmulld m9, m6, [o(pd_13160)] {bcstd} ; t7 + pmulld m6, [o(pd_9760)] {bcstd} ; t6 + pmulld m8, m7, [o(pd_11003)] {bcstd} ; t8 + pmulld m7, [o(pd_12140)] {bcstd} ; t9 + pmulld m10, m5, [o(pd_8423)] {bcstd} ; t10 + pmulld m5, [o(pd_14053)] {bcstd} ; t11 + pmulld m12, m3, [o(pd_5520)] {bcstd} ; t12 + pmulld m3, [o(pd_15426)] {bcstd} ; t13 + pmulld m14, m1, [o(pd_2404)] {bcstd} ; t14 + pmulld m1, [o(pd_16207)] {bcstd} ; t15 + REPX {psubd x, m20, x}, m15, m13, m11, m9 +%if mmsize == 64 ; for the ymm variant we only ever use the fast path + jmp %%main_part1b +ALIGN function_align +.main_part1: + ITX_MULSUB_2D 15, 0, 16, 17, 18, _, 804, 16364 ; t1, t0 + ITX_MULSUB_2D 13, 2, 16, 17, 18, _, 3981, 15893 ; t3, t2 + ITX_MULSUB_2D 11, 4, 16, 17, 18, _, 7005, 14811 ; t5, t4 + ITX_MULSUB_2D 9, 6, 16, 17, 18, _, 9760, 13160 ; t7, t6 + ITX_MULSUB_2D 7, 8, 16, 17, 18, _, 12140, 11003 ; t9, t8 + ITX_MULSUB_2D 5, 10, 16, 17, 18, _, 14053, 8423 ; t11, t10 + ITX_MULSUB_2D 3, 12, 16, 17, 18, _, 15426, 5520 ; t13, t12 + ITX_MULSUB_2D 1, 14, 16, 17, 18, _, 16207, 2404 ; t15, t14 + REPX {paddd x, m20}, m15, m13, m11, m9 +%%main_part1b: +%endif + REPX {paddd x, m20}, m0, m2, m4, m6 + psubd m16, m2, m10 ; t10a + paddd m2, m10 ; t2a + psubd m10, m9, m1 ; t15a + paddd m9, m1 ; t7a + psubd m1, m13, m5 ; t11a + paddd m13, m5 ; t3a + psubd m5, m6, m14 ; t14a + paddd m6, m14 ; t6a + REPX {psrad x, 14}, m16, m10, m1, m5 + psubd m14, m0, m8 ; t8a + paddd m0, m8 ; t0a + psubd m8, m15, m7 ; t9a + paddd m15, m7 ; t1a + psubd m7, m4, m12 ; t12a + paddd m4, m12 ; t4a + paddd m12, m11, m3 ; t5a + psubd m11, m3 ; t13a + REPX {psrad x, 14}, m14, m8, m7, m11 + vpbroadcastd m19, [o(pd_9102)] + vpbroadcastd m18, [o(pd_13623)] + ITX_MULSUB_2D 16, 1, 3, 17, _, _, 18, 19 ; t11, t10 + ITX_MULSUB_2D 10, 5, 3, 17, _, _, 19, 18 ; t14, t15 + vpbroadcastd m19, [o(pd_16069)] + vpbroadcastd m18, [o(pd_3196)] + ITX_MULSUB_2D 14, 8, 3, 17, _, _, 18, 19 ; t9, t8 + ITX_MULSUB_2D 11, 7, 3, 17, _, _, 19, 18 ; t12, t13 + vpbroadcastd m19, [o(pd_6270)] + vpbroadcastd m18, [o(pd_15137)] + REPX {psrad x, 14}, m15, m12, m0, m4 + psubd m3, m15, m12 ; t5 + paddd m15, m12 ; t1 + psubd m12, m0, m4 ; t4 + paddd m0, m4 ; t0 + REPX {psrad x, 14}, m2, m6, m13, m9 + psubd m4, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m13, m9 ; t7 + paddd m9, m13 ; t3 + REPX {paddd x, m20}, m8, m14, m1, m16 + psubd m13, m8, m11 ; t12a + paddd m8, m11 ; t8a + psubd m11, m14, m7 ; t13a + paddd m14, m7 ; t9a + psubd m7, m1, m10 ; t14a + paddd m1, m10 ; t10a + psubd m10, m16, m5 ; t15a + paddd m16, m5 ; t11a + REPX {psrad x, 14}, m13, m11, m7, m10 + ITX_MULSUB_2D 12, 3, 5, 17, _, _, 19, 18 ; t5a, t4a + ITX_MULSUB_2D 6, 4, 5, 17, _, _, 18, 19 ; t6a, t7a + ITX_MULSUB_2D 13, 11, 5, 17, _, _, 19, 18 ; t13, t12 + ITX_MULSUB_2D 10, 7, 5, 17, _, _, 18, 19 ; t14, t15 + REPX {psrad x, 14}, m8, m1, m14, m16 + psubd m5, m8, m1 ; t10 + paddd m1, m8 ; -out1 + psubd m8, m15, m9 ; t3a + paddd m15, m9 ; -out15 + psubd m9, m14, m16 ; t11 + paddd m14, m16 ; out14 + psubd m16, m0, m2 ; t2a + paddd m0, m2 ; out0 + REPX 
{paddd x, m20}, m11, m13, m12, m3 + paddd m2, m11, m10 ; out2 + psubd m11, m10 ; t14a + psubd m10, m13, m7 ; t15a + paddd m13, m7 ; -out13 + psubd m7, m12, m4 ; t7 + paddd m12, m4 ; out12 + psubd m4, m3, m6 ; t6 + paddd m3, m6 ; -out3 + REPX {psrad x, 14}, m10, m7, m11, m4 + REPX {pmulld x, m21}, m9, m10, m7, m8, m5, m11, m4, m16 + REPX {psrad x, 14}, m2, m13, m12, m3 +%endmacro + +%macro IADST16_PART2 0 + paddd m9, m20 + psubd m10, m20, m10 + paddd m7, m20 + psubd m8, m20, m8 + paddd m6, m9, m5 ; out6 + psubd m9, m5 ; out9 + psubd m5, m10, m11 ; out5 + paddd m10, m11 ; out10 + psubd m11, m7, m4 ; out11 + paddd m4, m7 ; out4 + psubd m7, m8, m16 ; out7 + paddd m8, m16 ; out8 +%endmacro + +%macro IADST16_PASS1_END 0 + pxor m16, m16 + psubd m1, m16, m1 + psubd m3, m16, m3 + psubd m13, m16, m13 + psubd m15, m16, m15 + REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 +%endmacro + +INV_TXFM_16X16_FN adst, dct, 39-18 +INV_TXFM_16X16_FN adst, adst + +cglobal vp9_iadst_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m6, [cq+64* 6] + mova m7, [cq+64* 7] + vpbroadcastd m20, [o(pd_8192)] + vpbroadcastd m21, [o(pd_11585)] + sub eobd, 39 + jl .pass1_fast + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + mova m15, [cq+64*15] + call .main_part1 + call .main_part2 + IADST16_PASS1_END + jmp m(vp9_idct_16x16_internal_10).pass1_end +.pass1_fast: + WRAP_YMM IADST16_PART1 + WRAP_YMM IADST16_PART2 + WRAP_YMM IADST16_PASS1_END + jmp m(vp9_idct_16x16_internal_10).pass1_fast_end +.pass2: + test eobd, eobd + jl .pass2_fast + call .main_part1 + jmp .pass2_end +.pass2_fast: + call .main_part1_fast +.pass2_end: + vpbroadcastd m20, [o(pd_532480)] + call .main_part2 + vpbroadcastd m16, [o(pixel_clip6)] + REPX {paddd x, m16}, m0, m2, m12, m14 + REPX {psubd x, m16, x}, m1, m3, m13, m15 + REPX {psrad x, 6}, m0, m1, m2, m3 + packssdw m0, m1 + lea r6, [strideq*3] + packssdw m1, m2, m3 + mova m2, [o(deintq_perm)] + vpbroadcastd m3, [o(pixel_clip)] + REPX {psrad x, 20}, m4, m5, m6, m7 + call m(vp9_idct_16x16_internal_10).write_16x4 + packssdw m0, m4, m5 + packssdw m1, m6, m7 + paddsw m0, m3 + paddsw m1, m3 + REPX {psrad x, 20}, m8, m9, m10, m11 + call m(vp9_idct_16x16_internal_10).write_16x4 + packssdw m0, m8, m9 + packssdw m1, m10, m11 + paddsw m0, m3 + paddsw m1, m3 + jmp m(vp9_idct_16x16_internal_10).pass2_end2 +ALIGN function_align + IADST16_PART1 + ret +ALIGN function_align +.main_part2: + IADST16_PART2 + ret + +cglobal vp9_idct_idct_32x32_add_10, 4, 7, 23, 64*64, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + dec eobd + jnz .pass1 + imul r6d, [cq], 11585 + vpbroadcastd m3, [o(pixel_clip)] + mov [cq], r3d + add r6d, 8192 + sar r6d, 14 + imul r6d, 11585 + or r3d, 16 + add r6d, 532480 + sar r6d, 20 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+strideq*0] + paddsw m1, m2, [dstq+strideq*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +.pass1: + vpbroadcastd m20, [o(pd_8192)] + vpbroadcastd m21, [o(pd_11585)] + cmp eobd, 135 + jl .pass1_fast + add cq, 64 + lea r4, [rsp+64*8] + cmp eobd, 579 + jl .pass1_right_fast + mov r6d, 128*28 + call .pass1_main + jmp .pass1_right_end +.pass1_right_fast: ; bottomright quadrant is zero + mova 
m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call .main_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(vp9_idct_16x16_internal_10).main_part1_fast + mov r6d, 128*12 + call .pass1_main_end +.pass1_right_end: + mova [r4+64* 8], m0 + mova [r4+64* 9], m1 + mova [r4+64*10], m2 + mova [r4+64*11], m3 + mova [r4+64*12], m4 + mova [r4+64*13], m5 + mova [r4+64*14], m6 + mova [r4+64*15], m7 + mova [r4+64*16], m16 + mova [r4+64*17], m17 + mova [r4+64*18], m18 + mova [r4+64*19], m19 + mova [r4+64*20], m8 + mova [r4+64*21], m9 + mova [r4+64*22], m10 + mova [r4+64*23], m11 + sub cq, 64 + sub r4, 64*8 + mov r6d, 128*28 + call .pass1_main + mova m12, [r4+64*20] + mova m13, [r4+64*21] + mova m14, [r4+64*22] + mova m15, [r4+64*23] + mova [r4+64*20], m8 + mova [r4+64*21], m9 + mova [r4+64*22], m10 + mova [r4+64*23], m11 + mova m8, [r4+64*16] + mova m9, [r4+64*17] + mova m10, [r4+64*18] + mova m11, [r4+64*19] + mova [r4+64*16], m16 + mova [r4+64*17], m17 + mova [r4+64*18], m18 + mova [r4+64*19], m19 + call .main + mova m0, [r4+64*16] + mova m1, [r4+64*17] + mova m2, [r4+64*18] + mova m3, [r4+64*19] + mova m4, [r4+64*20] + mova m5, [r4+64*21] + mova m6, [r4+64*22] + mova m7, [r4+64*23] + mova m8, [r4+64*24] + mova m9, [r4+64*25] + mova m10, [r4+64*26] + mova m11, [r4+64*27] + mova m12, [r4+64*28] + mova m13, [r4+64*29] + mova m14, [r4+64*30] + mova m15, [r4+64*31] + call m(vp9_idct_16x16_internal_10).main_part1 + call .pass2_main_left + mova m8, [r4+64* 8] + mova m9, [r4+64* 9] + mova m10, [r4+64*10] + mova m11, [r4+64*11] + mova m12, [r4+64*12] + mova m13, [r4+64*13] + mova m14, [r4+64*14] + mova m15, [r4+64*15] + TRANSPOSE_4DQ 8, 10, 12, 14, 16 + TRANSPOSE_4DQ 9, 11, 13, 15, 16 + call .main + call .pass2_main_right + mova m8, [r4+64*24] + mova m9, [r4+64*25] + mova m10, [r4+64*26] + mova m11, [r4+64*27] + mova m12, [r4+64*28] + mova m13, [r4+64*29] + mova m14, [r4+64*30] + mova m15, [r4+64*31] + TRANSPOSE_4DQ 8, 10, 12, 14, 16 + TRANSPOSE_4DQ 9, 11, 13, 15, 16 + call m(vp9_idct_16x16_internal_10).main_part1 + jmp .pass2_end +.pass1_fast: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + mov r4, rsp + call .main_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(vp9_idct_16x16_internal_10).main_part1_fast + call m(vp9_idct_16x16_internal_10).main_part2 + mov r6d, 128*12 + call .pass1_main_end2 + mova [r4+64*16], m16 + mova [r4+64*17], m17 + mova [r4+64*18], m18 + mova [r4+64*19], m19 + mova [r4+64*20], m8 + mova [r4+64*21], m9 + mova [r4+64*22], m10 + mova [r4+64*23], m11 + call .main_fast + mova m0, [r4+64*16] + mova m1, [r4+64*17] + mova m2, [r4+64*18] + mova m3, [r4+64*19] + mova m4, [r4+64*20] + mova m5, [r4+64*21] + mova m6, [r4+64*22] + mova m7, [r4+64*23] + call m(vp9_idct_16x16_internal_10).main_part1_fast + call .pass2_main_left + call .main_fast + call .pass2_main_right + call m(vp9_idct_16x16_internal_10).main_part1_fast +.pass2_end: + paddd m0, m22 + paddd m18, m22 + call m(vp9_idct_16x16_internal_10).main_part2 + mova m20, [o(deintq_perm)] 
+ rorx r2, strideq, 59 ; strideq*32 + vpbroadcastd m21, [o(pixel_clip)] + add r2, dstq +%assign i 0 +%rep 16 + mova m16, [r4+64*(15-i)] + mova m17, [r4+64*(i-16)] + mova m18, [r4-64*(17+i)] + paddd m19, m %+ i, m16 + psubd m0, m %+ i, m16 + call .write_32x2 + %assign i i+1 +%endrep + RET +ALIGN function_align +.write_32x2: + paddd m16, m17, m18 + psubd m17, m18 + REPX {psrad x, 6}, m19, m16, m0, m17 + packssdw m16, m19 + packssdw m17, m0 + sub r2, strideq + vpermq m16, m20, m16 + vpermq m17, m20, m17 + paddsw m16, [dstq] + paddsw m17, [r2 ] + psubusw m16, m21 + psubusw m17, m21 + mova [dstq], m16 + mova [r2 ], m17 + add dstq, strideq + ret +ALIGN function_align +.pass1_main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + mova m8, [cq+128*17] + mova m9, [cq+128*19] + mova m10, [cq+128*21] + mova m11, [cq+128*23] + mova m12, [cq+128*25] + mova m13, [cq+128*27] + mova m14, [cq+128*29] + mova m15, [cq+128*31] + call .main + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + mova m8, [cq+128*16] + mova m9, [cq+128*18] + mova m10, [cq+128*20] + mova m11, [cq+128*22] + mova m12, [cq+128*24] + mova m13, [cq+128*26] + mova m14, [cq+128*28] + mova m15, [cq+128*30] + call m(vp9_idct_16x16_internal_10).main_part1 +.pass1_main_end: + call m(vp9_idct_16x16_internal_10).main_part2 +.pass1_main_end2: + pxor m16, m16 +.pass1_zero_loop: + mova [cq+r6+128*0], m16 + mova [cq+r6+128*1], m16 + mova [cq+r6+128*2], m16 + mova [cq+r6+128*3], m16 + sub r6d, 128*4 + jge .pass1_zero_loop + mova m16, [r4+64*15] + mova m19, [r4+64*14] + mova m22, [r4+64*13] + mova m17, [r4+64*12] + psubd m18, m0, m16 + paddd m16, m0 + paddd m0, m19, m1 + psubd m19, m1, m19 + paddd m1, m17, m3 + psubd m3, m17 + paddd m17, m2, m22 + psubd m2, m22 + TRANSPOSE_4D 3, 2, 19, 18, 22 ; 28 29 30 31 + TRANSPOSE_4D 16, 0, 17, 1, 22 ; 0 1 2 3 + mova [r4+64*54], m3 + mova [r4+64*55], m19 + mova [r4+64*38], m2 + mova [r4+64*39], m18 + mova m2, [r4+64*11] + mova m19, [r4+64*10] + mova m3, [r4+64* 9] + mova m22, [r4+64* 8] + paddd m18, m4, m2 + psubd m4, m2 + paddd m2, m5, m19 + psubd m5, m19 + paddd m19, m6, m3 + psubd m6, m3 + paddd m3, m7, m22 + psubd m7, m22 + TRANSPOSE_4D 7, 6, 5, 4, 22 ; 24 25 26 27 + TRANSPOSE_4D 18, 2, 19, 3, 22 ; 4 5 6 7 + mova [r4+64*52], m7 + mova [r4+64*53], m5 + mova [r4+64*36], m6 + mova [r4+64*37], m4 + mova m7, [r4+64* 7] + mova m4, [r4+64* 6] + mova m5, [r4+64* 5] + mova m22, [r4+64* 4] + psubd m6, m8, m7 + paddd m8, m7 + psubd m7, m9, m4 + paddd m4, m9 + paddd m9, m10, m5 + psubd m10, m5 + paddd m5, m11, m22 + psubd m11, m22 + TRANSPOSE_4D 11, 10, 7, 6, 22 ; 20 21 22 23 + TRANSPOSE_4D 8, 4, 9, 5, 22 ; 8 9 10 11 + mova [r4+64*50], m11 + mova [r4+64*51], m7 + mova [r4+64*34], m10 + mova [r4+64*35], m6 + mova m6, [r4+64* 3] + mova m11, [r4+64* 2] + mova m7, [r4+64* 1] + mova m22, [r4+64* 0] + paddd m10, m12, m6 + psubd m12, m6 + paddd m6, m13, m11 + psubd m13, m11 + paddd m11, m14, m7 + psubd m14, m7 + paddd m7, m15, m22 + psubd m15, m22 + TRANSPOSE_4D 15, 14, 13, 12, 22 ; 16 17 18 19 + TRANSPOSE_4D 10, 6, 11, 7, 22 ; 12 13 14 15 + mova [r4+64*48], m15 + mova [r4+64*49], m13 + mova [r4+64*32], m14 + mova [r4+64*33], m12 + TRANSPOSE_4DQ 0, 2, 4, 6, 22 + TRANSPOSE_4DQ 1, 3, 5, 7, 22 + TRANSPOSE_4DQ 16, 18, 8, 10, 22 + TRANSPOSE_4DQ 17, 19, 9, 11, 22 + ret 
+ALIGN function_align +.pass2_main_left: + vpbroadcastd m22, [o(pixel_clip6)] + paddd m0, m22 + paddd m18, m22 + call m(vp9_idct_16x16_internal_10).main_part2 + mova [r4+64*16], m0 + mova [r4+64*17], m1 + mova [r4+64*18], m2 + mova [r4+64*19], m3 + mova [r4+64*20], m4 + mova [r4+64*21], m5 + mova [r4+64*22], m6 + mova [r4+64*23], m7 + mova [r4+64*24], m8 + mova [r4+64*25], m9 + mova [r4+64*26], m10 + mova [r4+64*27], m11 + mova [r4+64*28], m12 + mova [r4+64*29], m13 + mova [r4+64*30], m14 + mova [r4+64*31], m15 + add r4, 64*32 + mova m0, [r4+64* 0] + mova m1, [r4+64* 1] + mova m2, [r4+64* 2] + mova m3, [r4+64* 3] + mova m4, [r4+64* 4] + mova m5, [r4+64* 5] + mova m6, [r4+64* 6] + mova m7, [r4+64* 7] + jmp .pass2_main_transpose +ALIGN function_align +.pass2_main_right: + mova m0, [r4+64*16] + mova m1, [r4+64*17] + mova m2, [r4+64*18] + mova m3, [r4+64*19] + mova m4, [r4+64*20] + mova m5, [r4+64*21] + mova m6, [r4+64*22] + mova m7, [r4+64*23] +.pass2_main_transpose: + TRANSPOSE_4DQ 0, 2, 4, 6, 8 + TRANSPOSE_4DQ 1, 3, 5, 7, 8 + ret +ALIGN function_align +.main_fast: + pmulld m15, m0, [o(pd_16364)] {1to16} ; t31a + pmulld m0, [o(pd_804)] {1to16} ; t16a + pmulld m8, m7, [o(pd_11003)] {1to16} ; t17a + pmulld m7, [o(pd_12140)] {1to16} ; t30a + pmulld m11, m4, [o(pd_14811)] {1to16} ; t29a + pmulld m4, [o(pd_7005)] {1to16} ; t18a + pmulld m12, m3, [o(pd_5520)] {1to16} ; t19a + pmulld m3, [o(pd_15426)] {1to16} ; t28a + pmulld m13, m2, [o(pd_15893)] {1to16} ; t27a + pmulld m2, [o(pd_3981)] {1to16} ; t20a + pmulld m10, m5, [o(pd_8423)] {1to16} ; t21a + pmulld m5, [o(pd_14053)] {1to16} ; t26a + pmulld m9, m6, [o(pd_13160)] {1to16} ; t25a + pmulld m6, [o(pd_9760)] {1to16} ; t22a + pmulld m14, m1, [o(pd_2404)] {1to16} ; t23a + pmulld m1, [o(pd_16207)] {1to16} ; t24a + REPX {psubd x, m20, x}, m8, m12, m10, m14 + jmp .main2 +ALIGN function_align +.main: + ITX_MULSUB_2D 0, 15, 16, 17, 18, _, 804, 16364 ; t16a, t31a + ITX_MULSUB_2D 8, 7, 16, 17, 18, _, 12140, 11003 ; t17a, t30a + ITX_MULSUB_2D 4, 11, 16, 17, 18, _, 7005, 14811 ; t18a, t29a + ITX_MULSUB_2D 12, 3, 16, 17, 18, _, 15426, 5520 ; t19a, t28a + ITX_MULSUB_2D 2, 13, 16, 17, 18, _, 3981, 15893 ; t20a, t27a + ITX_MULSUB_2D 10, 5, 16, 17, 18, _, 14053, 8423 ; t21a, t26a + ITX_MULSUB_2D 6, 9, 16, 17, 18, _, 9760, 13160 ; t22a, t25a + ITX_MULSUB_2D 14, 1, 16, 17, 18, _, 16207, 2404 ; t23a, t24a + REPX {paddd x, m20}, m8, m12, m10, m14 +.main2: + REPX {paddd x, m20}, m0, m15, m7, m4, m3, m11 + REPX {psrad x, 14 }, m8, m0, m15, m7, m12, m4, m3, m11 + psubd m16, m0, m8 ; t17 + paddd m0, m8 ; t16 + psubd m8, m15, m7 ; t30 + paddd m15, m7 ; t31 + paddd m7, m12, m4 ; t19 + psubd m12, m4 ; t18 + paddd m4, m3, m11 ; t28 + psubd m3, m11 ; t29 + REPX {paddd x, m20}, m2, m13, m5, m6, m1, m9 + REPX {psrad x, 14 }, m10, m2, m13, m5, m14, m6, m1, m9 + psubd m11, m2, m10 ; t21 + paddd m2, m10 ; t20 + psubd m10, m13, m5 ; t26 + paddd m13, m5 ; t27 + psubd m5, m14, m6 ; t22 + paddd m6, m14 ; t23 + psubd m14, m1, m9 ; t25 + paddd m9, m1 ; t24 + vpbroadcastd m19, [o(pd_16069)] + vpbroadcastd m18, [o(pd_3196)] + ITX_MULSUB_2D 8, 16, 1, 17, _, 20, 18, 19 ; t17a, t30a + ITX_MULSUB_2D 3, 12, 1, 17, _, 20, 18, 19, 1 ; t29a, t18a + vpbroadcastd m19, [o(pd_9102)] + vpbroadcastd m18, [o(pd_13623)] + ITX_MULSUB_2D 10, 11, 1, 17, _, 20, 18, 19 ; t21a, t26a + ITX_MULSUB_2D 14, 5, 1, 17, _, 20, 18, 19, 1 ; t25a, t22a + paddd m1, m6, m2 ; t23a + psubd m6, m2 ; t20a + psubd m2, m9, m13 ; t27a + paddd m9, m13 ; t24a + psubd m13, m15, m4 ; t28a + paddd m15, m4 ; t31a + psubd m4, m8, m12 ; 
t18 + paddd m8, m12 ; t17 + psubd m12, m0, m7 ; t19a + paddd m0, m7 ; t16a + psubd m7, m16, m3 ; t29 + paddd m3, m16 ; t30 + paddd m16, m5, m10 ; t22 + psubd m5, m10 ; t21 + psubd m10, m14, m11 ; t26 + paddd m14, m11 ; t25 + vpbroadcastd m19, [o(pd_15137)] + vpbroadcastd m18, [o(pd_6270)] + ITX_MULSUB_2D 13, 12, 11, 17, _, 20, 18, 19 ; t19, t28 + ITX_MULSUB_2D 2, 6, 11, 17, _, 20, 18, 19, 1 ; t27, t20 + ITX_MULSUB_2D 7, 4, 11, 17, _, 20, 18, 19 ; t18a, t29a + ITX_MULSUB_2D 10, 5, 11, 17, _, 20, 18, 19, 1 ; t26a, t21a + psubd m11, m0, m1 ; t23 + paddd m0, m1 ; t16 + paddd m1, m16, m8 ; t17a + psubd m16, m8, m16 ; t22a + psubd m8, m15, m9 ; t24 + paddd m15, m9 ; t31 + psubd m9, m3, m14 ; t25a + paddd m14, m3 ; t30a + paddd m3, m6, m13 ; t19a + psubd m6, m13, m6 ; t20a + paddd m13, m10, m4 ; t29 + psubd m10, m4, m10 ; t26 + psubd m4, m12, m2 ; t27a + paddd m12, m2 ; t28a + paddd m2, m7, m5 ; t18 + psubd m7, m5 ; t21 + REPX {pmulld x, m21}, m10, m8, m4, m9, m7, m11, m6, m16 + mova [r4+64* 0], m0 + mova [r4+64* 1], m1 + mova [r4+64* 2], m2 + mova [r4+64* 3], m3 + mova [r4+64*12], m12 + mova [r4+64*13], m13 + mova [r4+64*14], m14 + mova [r4+64*15], m15 + REPX {paddd x, m20}, m10, m8, m4, m9 + psubd m5, m10, m7 ; t21a + paddd m10, m7 ; t26a + psubd m7, m8, m11 ; t23a + paddd m8, m11 ; t24a + REPX {psrad x, 14 }, m5, m10, m7, m8 + paddd m11, m4, m6 ; t27 + psubd m4, m6 ; t20 + psubd m6, m9, m16 ; t22 + paddd m9, m16 ; t25 + REPX {psrad x, 14 }, m11, m4, m6, m9 + mova [r4+64* 4], m4 + mova [r4+64* 5], m5 + mova [r4+64* 6], m6 + mova [r4+64* 7], m7 + mova [r4+64* 8], m8 + mova [r4+64* 9], m9 + mova [r4+64*10], m10 + mova [r4+64*11], m11 + ret + +%endif diff --git a/libavcodec/x86/vp9itxfm_avx512.asm b/libavcodec/x86/vp9itxfm_avx512.asm new file mode 100644 index 0000000000000..d51c50756d58f --- /dev/null +++ b/libavcodec/x86/vp9itxfm_avx512.asm @@ -0,0 +1,1629 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2025 Two Orioles, LLC +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL + +SECTION_RODATA 64 + +dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 + db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 + db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 + db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 +itx_perm: dq 0x0000000820150440, 0x0000000231372604 + dq 0x0000000ca8041551, 0x00000006b9263715 + dq 0x00000001ec9d8c62, 0x0000000bfdbfae26 + dq 0x00000005648c9d73, 0x0000000f75aebf37 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 +pw_512: times 4 dw 512 +pw_m512: times 4 dw -512 +pw_15137_6270x2x4: times 4 dw 15137*2 + times 4 dw 6270*2 +pw_11585_m11585x2x4: times 4 dw 11585*2 +pw_m11585_11585x2x4: times 4 dw -11585*2 +pw_11585_11585x2: times 4 dw 11585*2 +int_mshift: db 142, 150, 0, 0, 174, 182, 0, 0 +pd_8192: dd 8192 +pw_804x2: times 2 dw 804*2 +pw_1606x2: times 2 dw 1606*2 +pw_3196x2: times 2 dw 3196*2 +pw_3981x2: times 2 dw 3981*2 +pw_6270x2: times 2 dw 6270*2 +pw_7005x2: times 2 dw 7005*2 +pw_7723x2: times 2 dw 7723*2 +pw_9760x2: times 2 dw 9760*2 +pw_12140x2: times 2 dw 12140*2 +pw_12665x2: times 2 dw 12665*2 +pw_13160x2: times 2 dw 13160*2 +pw_13623x2: times 2 dw 13623*2 +pw_14053x2: times 2 dw 14053*2 +pw_14449x2: times 2 dw 14449*2 +pw_14811x2: times 2 dw 14811*2 +pw_15137x2: times 2 dw 15137*2 +pw_15426x2: times 2 dw 15426*2 +pw_15679x2: times 2 dw 15679*2 +pw_15893x2: times 2 dw 15893*2 +pw_16069x2: times 2 dw 16069*2 +pw_16207x2: times 2 dw 16207*2 +pw_16305x2: times 2 dw 16305*2 +pw_16364x2: times 2 dw 16364*2 +pw_m2404x2: times 2 dw -2404*2 +pw_m4756x2: times 2 dw -4756*2 +pw_m5520x2: times 2 dw -5520*2 +pw_m8423x2: times 2 dw -8423*2 +pw_m9102x2: times 2 dw -9102*2 +pw_m10394x2: times 2 dw -10394*2 +pw_m11003x2: times 2 dw -11003*2 +pw_804_16364x2: dw 804*2, 16364*2 +pw_1606_16305x2: dw 1606*2, 16305*2 +pw_3196_16069x2: dw 3196*2, 16069*2 +pw_3981_15893x2: dw 3981*2, 15893*2 +pw_7005_14811x2: dw 7005*2, 14811*2 +pw_7723_14449x2: dw 7723*2, 14449*2 +pw_9760_13160x2: dw 9760*2, 13160*2 +pw_m2404_16207x2: dw -2404*2, 16207*2 +pw_m4756_15679x2: dw -4756*2, 15679*2 +pw_m5520_15426x2: dw -5520*2, 15426*2 +pw_m8423_14053x2: dw -8423*2, 14053*2 +pw_m9102_13623x2: dw -9102*2, 13623*2 +pw_m10394_12665x2: dw -10394*2, 12665*2 +pw_m11003_12140x2: dw -11003*2, 12140*2 + +%macro COEF_PAIR 2-3 0 +%if %3 & 4 +pw_%1_m%2: dw %1, -%2 +%else +pw_%1_%2: dw %1, %2 +%if %3 & 2 +pw_m%1_%2: dw -%1, %2 +%else +pw_m%2_%1: dw -%2, %1 +%endif +%endif +%if %3 & 1 +pw_m%1_m%2: dw -%1, -%2 +%endif +%endmacro + +COEF_PAIR 804, 16364 +COEF_PAIR 1606, 16305 +COEF_PAIR 3196, 16069, 1 +COEF_PAIR 3981, 15893 +COEF_PAIR 6270, 15137, 1 +COEF_PAIR 7005, 14811 +COEF_PAIR 7723, 14449 +COEF_PAIR 9102, 13623 +COEF_PAIR 9760, 13160 +COEF_PAIR 11585, 11585, 1 +COEF_PAIR 12140, 11003 +COEF_PAIR 12665, 10394 +COEF_PAIR 13623, 9102, 1 +COEF_PAIR 14053, 8423 +COEF_PAIR 15137, 6270 +COEF_PAIR 15426, 5520 +COEF_PAIR 15679, 4756 +COEF_PAIR 16069, 3196 +COEF_PAIR 16207, 2404 + +; ADST16-only: +COEF_PAIR 2404, 
9760, 2
+COEF_PAIR 5520, 7005, 2
+COEF_PAIR 8423, 3981, 2
+COEF_PAIR 11003, 804, 2
+COEF_PAIR 12140, 16364, 5
+COEF_PAIR 14053, 15893, 5
+COEF_PAIR 15426, 14811, 5
+COEF_PAIR 16207, 13160, 5
+pw_11585_m11585: dw 11585, -11585
+pw_16069_m3196: dw 16069, -3196
+pw_9102_m13623: dw 9102, -13623
+pw_15137_m6270: dw 15137, -6270
+pw_6270_m15137: dw 6270, -15137
+
+%define pw_11585x2 pw_11585_11585x2
+%define pw_m11585x2 pw_m11585_11585x2x4
+
+SECTION .text
+
+%define o_base pw_512 + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2, 64 = dst_in_tmp1
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+    mova m%2, m%4
+%if %7 & 16
+    vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+    mova m%3, m%4
+%if %7 & 32
+    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+    vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+    vpdpwssd m%2, m%1, m%5
+    mova m%3, m%4
+    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+    vpdpwssd m%2, m%1, m%5
+    mova m%3, m%4
+    vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+    vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+    mova m%3, m%4
+    vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+    vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+    mova m%3, m%4
+    vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+    psrld m%2, 14
+    pslld m%3, 2
+    vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+    ; compared to using shifts (as above) this has better throughput,
+    ; but worse latency and requires setting up the opmask/index
+    ; registers, so only use this method for the larger transforms
+%if %7 & 64
+    pslld m%2, 2
+    vpmultishiftqb m%2{k7}, m13, m%3
+%else
+    pslld m%1, m%2, 2
+    vpmultishiftqb m%1{k7}, m13, m%3
+%endif
+%else
+    psrad m%2, 14
+    psrad m%3, 14
+%if %7 & 8 == 0
+    packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+    punpcklwd m%3, m%2, m%1
+    punpckhwd m%2, m%1
+%if %7 < 32
+    mova m%1, m%5
+    vpdpwssd m%1, m%3, m%7
+    mova m%4, m%5
+    vpdpwssd m%4, m%2, m%7
+%else
+    mova m%1, m%5
+    vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+    mova m%4, m%5
+    vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+    psrad m%1, 14
+    psrad m%4, 14
+    packssdw m%1, m%4
+    mova m%4, m%5
+%if %7 < 32
+    vpdpwssd m%4, m%2, m%6
+    mova m%2, m%5
+    vpdpwssd m%2, m%3, m%6
+%else
+    vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+    mova m%2, m%5
+    vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+    psrad m%4, 14
+    psrad m%2, 14
+    packssdw m%2, m%4
+%endmacro
+
+; flags: 1 = swap, 2 = invert2, 4 = invert1
+%macro ADST_MULSUB_4W 10-11 0 ; dst1/src1, src2, dst2, tmp[1-2], rnd, coef[1-4], flags
+    mova m%3, m%6
+%if %11 & 1
+    vpdpwssd m%3, m%1, [o(pw_m%8_%7)] {bcstd}
+%else
+    vpdpwssd m%3, m%1, [o(pw_%7_%8)] {bcstd}
+%endif
+%if %11 & 4
+    vpbroadcastd m%4, [o(pw_m%9_%10)]
+%elif %11 & 2
+    vpbroadcastd m%4, [o(pw_%9_m%10)]
+%elif %11 & 1
+    vpbroadcastd m%4, [o(pw_%10_%9)]
+%else
+    vpbroadcastd m%4, [o(pw_%9_%10)]
+%endif
+    pmaddwd m%4, m%2
+    mova m%5, m%6
+%if %11 & 4
+    vpdpwssd m%5, m%1, [o(pw_%8_m%7)] {bcstd}
+%elif %11 & 1
+    vpdpwssd m%5, m%1, [o(pw_%7_%8)] {bcstd}
+%else
+    vpdpwssd m%5, m%1, [o(pw_m%8_%7)] {bcstd}
+%endif
+%if %11 & 2
+    vpbroadcastd m%1, [o(pw_%10_%9)]
+%elif %11 & 1
+    vpbroadcastd m%1, [o(pw_%9_m%10)]
+%else
+    vpbroadcastd m%1, [o(pw_m%10_%9)]
+%endif
+    pmaddwd m%2, m%1
+    paddd m%1, m%3, m%4
+    psubd m%3, m%4
+ paddd m%4, m%5, m%2 + psubd m%5, m%2 + pslld m%1, 2 + pslld m%3, 2 + vpmultishiftqb m%1{k7}, m13, m%4 + vpmultishiftqb m%3{k7}, m13, m%5 +%endmacro + +%macro WRAP_YMM 1+ + INIT_YMM cpuname + %1 + INIT_ZMM cpuname +%endmacro + +%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset +cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2 + %undef cmp + %define %%p1 m(vp9_i%1_%3_internal) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(vp9_i%2_%3_internal).pass2] +%ifidn %1_%2, dct_dct + cmp eobd, 1 + jne %%p1 +%else +%if %4 + add eobd, %4 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, 16x16, %3 +%ifidn %1_%2, dct_dct + movd xmm0, [o(pw_11585x2)] + pmulhrsw xmm3, xmm0, [cq] + pxor ym2, ym2 + pmulhrsw xmm3, xmm0 + pmulhrsw xmm3, [o(pw_512)] + mova [cq], xm2 + add r3d, 7 + vpbroadcastw ym3, xmm3 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + paddw ym0, ym3 + paddw ym1, ym3 + packuswb ym0, ym1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +%endif +%endmacro + +%macro IDCT16_MAIN 0-1 0 ; idct32 +%if mmsize == 64 && %1 == 0 +.main_fast: +%endif + vpbroadcastd m2, [o(pw_1606_16305x2)] + vpbroadcastd m4, [o(pw_m10394_12665x2)] + vpbroadcastd m11, [o(pw_7723_14449x2)] + vpbroadcastd m12, [o(pw_m4756_15679x2)] + pmulhrsw m8, m2 ; t8a t15a + vpbroadcastd m2, [o(pw_3196_16069x2)] + pmulhrsw m0, m4 ; t9a t14a + vpbroadcastd m4, [o(pw_m9102_13623x2)] + pmulhrsw m5, m11 ; t10a t13a + vpbroadcastd m11, [o(pw_11585_11585x2)] + pmulhrsw m1, m12 ; t11a t12a + vbroadcasti32x4 m12, [o(pw_15137_6270x2x4)] + pmulhrsw m7, m2 ; t4a t7a + pmulhrsw m3, m4 ; t5a t6a + pmulhrsw m9, m11 ; t0 t1 + pmulhrsw m6, m12 ; t3 t2 +%if mmsize == 64 && %1 == 0 + jmp %%main2 +ALIGN function_align +.main: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 1606, 16305, 5 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 12665, 10394, 5 ; t9a t14a + ITX_MUL2X_PACK 5, 2, 4, 10, 7723, 14449, 5 ; t10a t13a + ITX_MUL2X_PACK 1, 2, 4, 10, 15679, 4756, 5 ; t11a t12a + ITX_MUL2X_PACK 7, 2, 4, 10, 3196, 16069, 5 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 13623, 9102, 5 ; t5a t6a + ITX_MUL2X_PACK 9, 2, 4, 10, 11585, 11585 ; t0 t1 + ITX_MUL2X_PACK 6, 2, 4, 10, 6270, 15137 ; t3 t2 +%%main2: +%endif + psubw m2, m8, m0 ; t9 t14 + paddw m8, m0 ; t8 t15 + psubw m4, m1, m5 ; t10 t13 + paddw m1, m5 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 5, 10, 6270, 15137, (1|%1*4) ; t9a t14a + ITX_MUL2X_PACK 4, 0, 5, 10, m15137, 6270, (1|%1*4) ; t10a t13a + vbroadcasti32x4 m5, [o(deint_shuf)] + psubw m0, m8, m1 ; t11a t12a + paddw m8, m1 ; t8a t15a + psubw m1, m7, m3 ; t5a t6a + paddw m7, m3 ; t4 t7 + pshufb m8, m5 + pshufb m7, m5 + paddw m3, m2, m4 ; t9 t14 + psubw m2, m4 ; t10 t13 +%if %1 + vpbroadcastd m12, [o(pw_11585_11585)] + vpbroadcastd m11, [o(pw_m11585_11585)] + pshufb m3, m5 + ITX_MUL2X_PACK 1, 4, 5, 
10, 12, 11 ; t5 t6 + ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a + packssdw m5, m11 ; t12 t13a + packssdw m4, m0 ; t11 t10a +%else + pshufb m0, m5 + ITX_MUL2X_PACK 1, 4, 5, 10, 11585_11585, m11585_11585, 48 ; t5 t6 + vpbroadcastd m11, [o(pw_11585x2)] + punpckhqdq m5, m0, m2 ; t12a t13 + punpcklqdq m0, m2 ; t11a t10 + psubw m4, m5, m0 + paddw m5, m0 + pmulhrsw m4, m11 ; t11 t10a + pmulhrsw m5, m11 ; t12 t13a +%endif + punpckhqdq m2, m7, m1 ; t7 t6 + punpcklqdq m7, m1 ; t4 t5 + psubw m1, m9, m6 ; t3 t2 + paddw m9, m6 ; t0 t1 + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + psubw m3, m9, m2 ; t7 t6 + paddw m9, m2 ; t0 t1 + psubw m2, m1, m7 ; t4 t5 + paddw m1, m7 ; t3 t2 + psubw m7, m9, m0 ; out15 out14 + paddw m0, m9 ; out0 out1 + psubw m6, m1, m5 ; out12 out13 + paddw m1, m5 ; out3 out2 + psubw m5, m2, m4 ; out11 out10 + paddw m2, m4 ; out4 out5 + psubw m4, m3, m8 ; out8 out9 + paddw m3, m8 ; out7 out6 +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst, 39-23 + +cglobal vp9_idct_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2 + mova m15, [o(itx_perm)] + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 + sub eobd, 39 + jl .pass1_fast + vpermq m0, m15, [cq+64*0] + vpermq m1, m15, [cq+64*1] + vpermq m2, m15, [cq+64*2] + vpermq m3, m15, [cq+64*3] + vpermq m4, m15, [cq+64*4] + vpermq m5, m15, [cq+64*5] + vpermq m6, m15, [cq+64*6] + vpermq m7, m15, [cq+64*7] + call .main + vbroadcasti32x4 m12, [o(int_shuf1)] + vbroadcasti32x4 m11, [o(int_shuf2)] + pshufb m0, m12 + pshufb m8, m1, m11 + pshufb m2, m12 + pshufb m9, m3, m11 + pshufb m4, m12 + pshufb m14, m5, m11 + pshufb m6, m12 + pshufb m11, m7, m11 + punpckhdq m1, m0, m8 + punpckldq m0, m8 + punpckhdq m3, m2, m9 + punpckldq m2, m9 + punpckhdq m5, m4, m14 + punpckldq m4, m14 + punpckhdq m7, m6, m11 + punpckldq m6, m11 +.pass1_end: + vshufi32x4 m8, m4, m6, q3232 + vinserti32x8 m4, ym6, 1 + vshufi32x4 m6, m0, m2, q3232 + vinserti32x8 m0, ym2, 1 + vshufi32x4 m9, m5, m7, q3232 + vinserti32x8 m5, ym7, 1 + vshufi32x4 m7, m1, m3, q3232 + vinserti32x8 m1, ym3, 1 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 1 + jmp tx2q +.pass1_fast: + mova ym3, [o(dup16_perm)] + vbroadcasti32x4 ym9, [cq+32*0] + vbroadcasti32x4 ym6, [cq+32*4] + vpermb ym8, ym3, [cq+32*1] + vpermb ym0, ym3, [cq+32*7] + vpermb ym5, ym3, [cq+32*5] + vpermb ym1, ym3, [cq+32*3] + vpermb ym7, ym3, [cq+32*2] + vpermb ym3, ym3, [cq+32*6] + shufpd ym9, ym9, 0x0c + shufpd ym6, ym6, 0x0c + WRAP_YMM IDCT16_MAIN + vbroadcasti32x4 m8, [o(int_shuf1)] + vbroadcasti32x4 m9, [o(int_shuf2)] + vinserti32x8 m0, ym2, 1 ; 0 1 | 4 5 + vinserti32x8 m4, ym6, 1 ; 8 9 | 12 13 + vinserti32x8 m1, ym3, 1 ; 3 2 | 7 6 + vinserti32x8 m5, ym7, 1 ; 11 10 | 15 14 + vshufi32x4 m2, m0, m4, q3131 + vshufi32x4 m0, m4, q2020 + vshufi32x4 m4, m1, m5, q2020 + vshufi32x4 m1, m5, q3131 + pshufb m2, m8 + pshufb m0, m8 + pshufb m4, m9 + pshufb m1, m9 + punpckhdq m3, m2, m1 ; 6-7 + punpckldq m2, m1 ; 4-5 + punpckhdq m1, m0, m4 ; 2-3 + punpckldq m0, m4 ; 0-1 + jmp tx2q +.pass2: + test eobd, eobd + jl .pass2_fast + call .main + jmp .pass2_end +.pass2_fast: + punpcklqdq m9, m0, m0 + punpckhwd m8, m0, m0 + punpcklwd m7, m1, m1 + punpckhwd m1, m1 + 
punpcklqdq m6, m2, m2 + punpckhwd m5, m2, m2 + punpckhwd m0, m3, m3 + punpcklwd m3, m3 + call .main_fast +.pass2_end: + psrldq m8, m15, 1 + psrlq m12, m15, 12 + psrldq m9, m15, 2 + psrlq m13, m15, 20 + mova m10, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + mova m11, m9 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m13, m3 + vpbroadcastd m2, [o(pw_512)] + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m12, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + REPX {pmulhrsw x, m2}, m0, m1, m4, m5, m8, m9, m10, m11 +.pass2_end2: + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + lea r5, [dstq+strideq*8] + lea r6, [r4 +strideq*8] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r3 ], 1 + vinserti32x4 m3, [r4+strideq*0], 2 + vinserti32x4 m6, [r4+strideq*2], 2 + vinserti32x4 m3, [r4+strideq*1], 3 + vinserti32x4 m6, [r4+r3 ], 3 + mova xm12, [r5+strideq*0] + mova xm13, [r5+strideq*2] + vinserti32x4 ym12, [r5+strideq*1], 1 + vinserti32x4 ym13, [r5+r3 ], 1 + vinserti32x4 m12, [r6+strideq*0], 2 + vinserti32x4 m13, [r6+strideq*2], 2 + vinserti32x4 m12, [r6+strideq*1], 3 + vinserti32x4 m13, [r6+r3 ], 3 + pxor m7, m7 + REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m8, m3 + packuswb m0, m8 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + paddw m1, m2 + paddw m9, m6 + packuswb m1, m9 + punpcklbw m2, m12, m7 + punpckhbw m12, m7 + paddw m2, m4 + paddw m10, m12 + packuswb m2, m10 + punpcklbw m3, m13, m7 + punpckhbw m13, m7 + paddw m3, m5 + paddw m11, m13 + packuswb m3, m11 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r3 ], ym1, 1 + vextracti32x4 [r4+strideq*0], m0, 2 + vextracti32x4 [r4+strideq*1], m0, 3 + vextracti32x4 [r4+strideq*2], m1, 2 + vextracti32x4 [r4+r3 ], m1, 3 + mova [r5+strideq*0], xm2 + vextracti32x4 [r5+strideq*1], ym2, 1 + mova [r5+strideq*2], xm3 + vextracti32x4 [r5+r3 ], ym3, 1 + vextracti32x4 [r6+strideq*0], m2, 2 + vextracti32x4 [r6+strideq*1], m2, 3 + vextracti32x4 [r6+strideq*2], m3, 2 + vextracti32x4 [r6+r3 ], m3, 3 + RET +ALIGN function_align + IDCT16_MAIN + ret + +%macro IADST16_MAIN 0 +%if mmsize == 64 +.main_fast: +%endif + punpcklwd m4, m3, m0 ; in7 in0 + punpcklwd m11, m1, m2 ; in3 in4 + punpckhwd m9, m2, m1 ; in5 in2 + punpckhwd m7, m0, m3 ; in1 in6 + ITX_MUL2X_PACK 4, 0, 6, 10, 11003_804, 12140_m16364, 116 ; t1a t0a + ITX_MUL2X_PACK 4, 5, 6, 10, m11003_804, m12140_m16364, 52 ; t9a t8a + ITX_MUL2X_PACK 11, 2, 6, 10, 5520_7005, 15426_m14811, 116 ; t5a t4a + ITX_MUL2X_PACK 11, 5, 6, 10, m5520_7005, m15426_m14811, 52 ; t13a t12a + ITX_MUL2X_PACK 9, 1, 6, 10, 8423_3981, 14053_m15893, 116 ; t3a t2a + ITX_MUL2X_PACK 9, 5, 6, 10, m8423_3981, m14053_m15893, 52 ; t11a t10a + ITX_MUL2X_PACK 7, 3, 6, 10, 2404_9760, 16207_m13160, 116 ; t7a t6a + ITX_MUL2X_PACK 7, 5, 6, 10, m2404_9760, m16207_m13160, 52 ; t15a t14a +%if mmsize == 64 ; for the ymm variant we only ever use the fast path + jmp %%main2 +ALIGN function_align +.main: + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + ADST_MULSUB_4W 0, 5, 4, 9, 11, 10, 804, 16364, 12140, 11003 ; t1a t0a, t9a t8a + ADST_MULSUB_4W 2, 7, 11, 5, 9, 10, 7005, 14811, 15426, 5520 ; t5a t4a, t13a t12a + 
ADST_MULSUB_4W 1, 6, 9, 5, 7, 10, 3981, 15893, 14053, 8423 ; t3a t2a, t11a t10a + ADST_MULSUB_4W 3, 8, 7, 5, 6, 10, 9760, 13160, 16207, 2404 ; t7a t6a, t15a t14a +%%main2: +%endif + psubw m5, m1, m3 ; t7 t6 + paddw m6, m1, m3 ; t3 t2 + psubw m1, m0, m2 ; t5 t4 + paddw m2, m0 ; t1 t0 + ADST_MULSUB_4W 4, 11, 8, 3, 0, 10, 3196, 16069, 16069, 3196, 1 ; t8a t9a, t12a t13a + ADST_MULSUB_4W 9, 7, 0, 3, 11, 10, 13623, 9102, 9102, 13623, 1 ; t10a t11a, t14a t15a + ADST_MULSUB_4W 1, 5, 11, 3, 7, 10, 6270, 15137, 15137, 6270, 2 ; out12 -out3, t7 t6 + psubw m3, m2, m6 ; t3a t2a + paddw m2, m6 ; -out15 out0 + ADST_MULSUB_4W 8, 0, 5, 6, 7, 10, 15137, 6270, 6270, 15137, 6 ; -out13 out2, t15a t14 + vbroadcasti32x4 m12, [o(deint_shuf)] + paddw m0, m4, m9 ; -out1 out14 + psubw m4, m9 ; t10 t11 + pshufb m2, m12 + pshufb m1, m12 + pshufb m8, m12 + pshufb m0, m12 + punpcklqdq m6, m1, m8 ; out12 -out13 + shufps m7, m0, m2, q1032 ; out14 -out15 +%endmacro + +%macro IADST16_PASS1_END 0 + shufps m0, m2, m0, q1032 ; out0 -out1 + punpckhqdq m1, m8, m1 ; out2 -out3 + mova m2, m10 + vpdpwssd m2, m5, [o(pw_m11585_m11585)] {bcstd} ; out5 + mova m8, m10 + vpdpwssd m8, m11, [o(pw_11585_11585)] {bcstd} ; out4 + mova m9, m10 + vpdpwssd m9, m5, [o(pw_m11585_11585)] {bcstd} ; out10 + mova m5, m10 + vpdpwssd m5, m11, [o(pw_11585_m11585)] {bcstd} ; out11 + mova m11, m10 + vpdpwssd m11, m3, [o(pw_m11585_m11585)] {bcstd} ; out7 + mova m14, m10 + vpdpwssd m14, m4, [o(pw_11585_11585)] {bcstd} ; out6 + mova m12, m10 + vpdpwssd m12, m3, [o(pw_m11585_11585)] {bcstd} ; out8 + mova m3, m10 + vpdpwssd m3, m4, [o(pw_m11585_11585)] {bcstd} ; out9 +%endmacro + +INV_TXFM_16X16_FN adst, dct, 39-18 +INV_TXFM_16X16_FN adst, adst + +cglobal vp9_iadst_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2 + mova m15, [o(itx_perm)] + psrlq m7, m15, 4 + vpermq m0, m15, [cq+64*0] ; 0 1 + vpermq m1, m7, [cq+64*1] ; 3 2 + vpermq m2, m15, [cq+64*2] ; 4 5 + vpermq m3, m7, [cq+64*3] ; 7 6 + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 + sub eobd, 39 + jl .pass1_fast + vpermq m4, m15, [cq+64*4] ; 8 9 + vpermq m5, m7, [cq+64*5] ; 11 10 + vpermq m6, m15, [cq+64*6] ; 12 13 + vpermq m7, m7, [cq+64*7] ; 15 14 + call .main + IADST16_PASS1_END + REPX {psrad x, 14}, m2, m8, m9, m5, m11, m14, m12, m3 + packssdw m2, m8, m2 ; out4 out5 + packssdw m5, m9, m5 ; out10 out11 + packssdw m4, m12, m3 ; out8 out9 + packssdw m3, m14, m11 ; out6 out7 + pxor m9, m9 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + psubw m8, m9, m8 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m2, m8 + punpcklwd m2, m8 + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m4, m8 + punpcklwd m4, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + psubw m8, m9, m8 + punpckhwd m7, m6, m8 + punpcklwd m6, m8 + jmp m(vp9_idct_16x16_internal).pass1_end +.pass1_fast: + WRAP_YMM IADST16_MAIN + WRAP_YMM IADST16_PASS1_END + vinserti32x8 m0, ym6, 1 + vinserti32x8 m1, ym7, 1 + vinserti32x8 m8, ym12, 1 + vinserti32x8 m2, ym3, 1 + vinserti32x8 m14, ym9, 1 + vinserti32x8 m11, ym5, 1 + pslld m14, 2 + pslld m11, 2 + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + vpmultishiftqb m14{k7}, m13, m8 + vpmultishiftqb m11{k7}, m13, m2 + psrlq m1, m15, 24 + pxor m2, m2 + psubw m2, m4 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + psrlq m2, m15, 28 + punpckhwd m4, m14, m11 + punpcklwd m14, m11 + mova m5, m2 + vpermi2q m2, m0, m14 + vpermt2q m0, m1, m14 + vpermi2q m1, m3, m4 + vpermt2q m3, m5, m4 + jmp tx2q +.pass2: + pshufd m1, m1, q1032 + 
pshufd m3, m3, q1032 + test eobd, eobd + jl .pass2_fast + pshufd m5, m5, q1032 + pshufd m7, m7, q1032 + call .main + jmp .pass2_end +.pass2_fast: + call .main_fast +.pass2_end: + vbroadcasti32x4 m9, [o(pw_11585_m11585x2x4)] + vbroadcasti32x4 m10, [o(pw_m11585_11585x2x4)] + punpckhqdq m1, m8 ; -out3 out2 + shufps m0, m2, q3210 ; -out1 out0 + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + vbroadcasti32x4 m11, [o(pw_512)] + vpbroadcastd m12, [o(pw_512)] + punpcklqdq m8, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + shufps m3, m4, q3210 ; t3a t11 + psubsw m4, m2, m3 + paddsw m3, m2 + paddsw m2, m5, m8 + psubsw m5, m8 + pmulhrsw m4, m9 ; out8 out9 + pmulhrsw m3, m10 ; out7 out6 + pmulhrsw m2, m10 ; out5 out4 + pmulhrsw m5, m9 ; out10 out11 + pmulhrsw m6, m11 + pmulhrsw m7, m11 + pshufd m11, m11, q1032 + pmulhrsw m0, m11 + pmulhrsw m1, m11 + REPX {pmulhrsw x, m12}, m2, m3, m4, m5 + psrldq m8, m15, 2 + psrlq m12, m15, 20 + psrldq m10, m15, 1 + psrlq m13, m15, 12 + mova m9, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m12, m3 + mova m11, m10 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m13, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + jmp m(vp9_idct_16x16_internal).pass2_end2 +ALIGN function_align + IADST16_MAIN + ret + +%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] + pmovzxbw m10, [dstq+%3] + pmovzxbw m11, [r3 +%4] +%if %2 < 8 + paddw m8, m%2, m%1 + psubw m9, m%2, m%1 +%else + mova m9, [rsp+64*(%2-8)] + paddw m8, m9, m%1 + psubw m9, m%1 +%endif + pmulhrsw m8, m12 + pmulhrsw m9, m12 + paddw m8, m10 + paddw m9, m11 + packuswb m8, m9 + vpermq m8, m13, m8 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + +cglobal vp9_idct_idct_32x32_add, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + cmp eobd, 1 + jne .pass1 + movd xmm0, [o(pw_11585x2)] + pmulhrsw xmm3, xmm0, [cq] + pxor m2, m2 + pmulhrsw xmm3, xmm0 + pmulhrsw xmm3, [o(pw_512)] + movd [cq], xm2 + add r3d, 15 + vpbroadcastw m3, xmm3 +.dconly_loop: + mova ym1, [dstq+strideq*0] + vinserti32x8 m1, [dstq+strideq*1], 1 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +.pass1: + PROLOGUE 0, 7, 30, 64*16, dst, stride, c, eob + sub eobd, 135 + jl .fast + mova m0, [cq+64* 0] + mova m14, [cq+64* 2] + mova m1, [cq+64* 4] + mova m15, [cq+64* 6] + mova m2, [cq+64* 8] + mova m16, [cq+64*10] + mova m3, [cq+64*12] + mova m17, [cq+64*14] + mova m4, [cq+64*16] + mova m18, [cq+64*18] + mova m5, [cq+64*20] + mova m19, [cq+64*22] + mova m6, [cq+64*24] + mova m20, [cq+64*26] + mova m7, [cq+64*28] + mova m21, [cq+64*30] + call .idct16 + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + mova m22, [cq+64* 1] + mova m23, [cq+64* 3] + mova m24, [cq+64* 5] + mova m25, [cq+64* 7] + mova m26, [cq+64* 9] + mova m27, [cq+64*11] + mova m28, [cq+64*13] + mova m29, [cq+64*15] + mova m14, [cq+64*17] + mova m15, [cq+64*19] + mova m16, [cq+64*21] + mova m17, [cq+64*23] + mova m18, [cq+64*25] + mova m19, [cq+64*27] + mova m20, [cq+64*29] + mova m21, [cq+64*31] + call .main + psubw m13, m0, m29 ; 31 + paddw m0, m29 ; 0 + psubw m29, m1, 
m28 ; 30 + paddw m1, m28 ; 1 + psubw m28, m2, m27 ; 29 + paddw m2, m27 ; 2 + psubw m27, m3, m26 ; 28 + paddw m3, m26 ; 3 + psubw m26, m4, m25 ; 27 + paddw m4, m25 ; 4 + psubw m25, m5, m24 ; 26 + paddw m5, m24 ; 5 + psubw m24, m6, m23 ; 25 + paddw m6, m23 ; 6 + psubw m23, m7, m22 ; 24 + paddw m7, m22 ; 7 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 + punpckhqdq m23, m27 ; d01 d09 d17 d25 + punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 + punpcklqdq m13, m25 ; d02 d10 d18 d26 + punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 + punpcklqdq m3, m26 ; d04 d12 d20 d28 + punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 + punpcklqdq m9, m24 ; d06 d14 d22 d30 + mova [rsp+64*12], m23 + mova [rsp+64*13], m27 + mova [rsp+64*14], m25 + mova [rsp+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova m2, [rsp+64*0] + mova m11, [rsp+64*1] + mova m12, [rsp+64*2] + mova m29, [rsp+64*3] + mova m27, [rsp+64*4] + mova m26, [rsp+64*5] + mova m4, [rsp+64*6] + mova m28, [rsp+64*7] + psubw m1, m2, m21 ; 23 + paddw m2, m21 ; 8 + psubw m21, m11, m20 ; 22 + paddw m11, m20 ; 9 + psubw m20, m12, m19 ; 21 + paddw m12, m19 ; 10 + psubw m19, m29, m18 ; 20 + paddw m29, m18 ; 11 + psubw m18, m27, m17 ; 19 + paddw m27, m17 ; 12 + psubw m17, m26, m16 ; 18 + paddw m26, m16 ; 13 + paddw m16, m4, m15 ; 14 + psubw m4, m15 ; 17 + mova m15, m6 + psubw m6, m28, m14 ; 16 + paddw m28, m14 ; 15 + mova m14, m7 + punpcklwd m7, m6, m4 + punpckhwd m6, m4 + punpckhwd m4, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m6, m4 + punpckldq m6, m4 + punpckhdq m4, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m7, m17 + punpckldq m7, m17 + punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 + punpckhqdq m28, m12 ; b03 b11 b19 b27 + punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 + punpcklqdq m1, m29 ; b04 b12 b20 b28 + punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 + punpcklqdq m16, m11 ; b06 b14 b22 b30 + mova [rsp+64* 8], m12 + mova [rsp+64* 9], m28 + mova [rsp+64*10], m27 + mova [rsp+64*11], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 + punpcklqdq m7, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 + punpcklqdq m6, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 + punpcklqdq m21, m4 ; c06 c14 c22 c30 + mov r3d, 64*28 + pxor m4, m4 +.zero_loop: + mova [cq+r3+64*0], m4 + mova [cq+r3+64*1], m4 + mova [cq+r3+64*2], m4 + mova [cq+r3+64*3], m4 + sub r3d, 64*4 + jge .zero_loop + vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 + vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 + vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 + vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 + vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 + vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 + vshufi32x4 m3, m1, m6, q3131 ; 12 + vshufi32x4 m1, m6, q2020 ; 4 + vshufi32x4 m6, m4, m2, q3131 ; 24 + vshufi32x4 m4, m2, q2020 ; 16 + vshufi32x4 m2, m0, m7, q3131 ; 8 + vshufi32x4 m0, m7, q2020 ; 0 + vshufi32x4 m7, m5, m8, q3131 ; 28 + vshufi32x4 m5, m8, q2020 ; 20 + vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 + vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 + vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 + vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 + vshufi32x4 m13, m21, m9, q3232 ; c22 c30 d22 d30 + vinserti32x8 m21, ym9, 1 ; c06 c14 d06 d14 + vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 + vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 + vshufi32x4 m16, m14, m20, q3131 ; 10 + vshufi32x4 m14, m20, q2020 ; 2 + vshufi32x4 m20, m18, m17, q3131 ; 26 + vshufi32x4 m18, m17, q2020 ; 18 + vshufi32x4 m17, m15, m21, q3131 ; 14 + vshufi32x4 m15, m21, q2020 ; 6 + vshufi32x4 m21, m19, m13, q3131 ; 30 + vshufi32x4 m19, m13, q2020 ; 22 + call .idct16 + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + mova m15, [rsp+64* 8] + mova m16, [rsp+64* 9] + mova m17, [rsp+64*10] + mova m19, [rsp+64*11] + mova m20, [rsp+64*12] + mova m21, [rsp+64*13] + mova m13, [rsp+64*14] + mova m18, [rsp+64*15] + vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 + vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 + vinserti32x8 m9, m27, ym21, 1 ; 
c03 c11 d03 d11 + vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 + vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 + vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 + vshufi32x4 m18, m14, m26, q3131 ; 25 + vshufi32x4 m14, m26, q2020 ; 17 + vshufi32x4 m19, m15, m27, q3131 ; 27 + vshufi32x4 m15, m27, q2020 ; 19 + vshufi32x4 m20, m16, m28, q3131 ; 29 + vshufi32x4 m16, m28, q2020 ; 21 + vshufi32x4 m21, m17, m29, q3131 ; 31 + vshufi32x4 m17, m29, q2020 ; 23 + vshufi32x4 m26, m22, m8, q3131 ; 9 + vshufi32x4 m22, m8, q2020 ; 1 + vshufi32x4 m27, m23, m9, q3131 ; 11 + vshufi32x4 m23, m9, q2020 ; 3 + vshufi32x4 m28, m24, m11, q3131 ; 13 + vshufi32x4 m24, m11, q2020 ; 5 + vshufi32x4 m29, m25, m12, q3131 ; 15 + vshufi32x4 m25, m12, q2020 ; 7 + call .main + jmp .end +.fast: + mova m14, [o(dup16_perm)] + pmovzxbw m9, [cq+64*0] + pmovzxbw m6, [cq+64*8] + vpermb m8, m14, [cq+64* 2] + vpermb m0, m14, [cq+64*14] + vpermb m5, m14, [cq+64*10] + vpermb m1, m14, [cq+64* 6] + vpermb m7, m14, [cq+64* 4] + vpermb m3, m14, [cq+64*12] + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + packuswb m9, m9 + packuswb m6, m6 + vpcmpub k7, m13, m10, 6 + IDCT16_MAIN 1 + vpermb m21, m14, [cq+64* 1] + vpermb m17, m14, [cq+64*15] + vpermb m20, m14, [cq+64* 9] + vpermb m15, m14, [cq+64* 7] + vpermb m18, m14, [cq+64* 5] + vpermb m16, m14, [cq+64*11] + vpermb m19, m14, [cq+64*13] + vpermb m14, m14, [cq+64* 3] + call .main_packed_fast + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m14, m16 + punpckhwd m14, m16 + punpcklwd m16, m15, m17 + punpckhwd m15, m17 + punpcklwd m17, m19, m21 + punpckhwd m19, m21 + punpckhwd m21, m18, m20 + punpcklwd m18, m20 + punpcklwd m20, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m15 + punpckhwd m7, m15 + punpcklwd m15, m14, m16 + punpckhwd m14, m16 + punpckhwd m16, m18, m19 + punpcklwd m18, m19 + punpcklwd m19, m21, m17 + punpckhwd m21, m17 + punpcklwd m17, m8, m0 ; a2 a6 aa ae + punpckhwd m8, m0 ; a3 a7 ab af + punpcklwd m0, m20, m1 ; a0 a4 a8 ac + punpckhwd m20, m1 ; a1 a5 a9 ad + punpcklwd m1, m2, m5 ; b0 b4 b8 bc + punpckhwd m2, m5 ; b1 b5 b9 bd + punpcklwd m5, m3, m4 ; b2 b6 ba be + punpckhwd m3, m4 ; b3 b7 bb bf + punpcklwd m4, m6, m15 ; c0 c4 c8 cc + punpckhwd m6, m15 ; c1 c5 c9 cd + punpcklwd m15, m7, m14 ; c2 c6 ca ce + punpckhwd m7, m14 ; c3 c7 cb cf + punpcklwd m14, m18, m19 ; d0 d4 d8 dc + punpckhwd m18, m19 ; d1 d5 d9 dd + punpcklwd m9, m16, m21 ; d2 d6 da de + punpckhwd m16, m21 ; d3 d7 db df + mov r3d, 64*12 + pxor ym21, ym21 +.fast_zero_loop: + mova [cq+r3+64*0], ym21 + mova [cq+r3+64*1], ym21 + mova [cq+r3+64*2], ym21 + mova [cq+r3+64*3], ym21 + sub r3d, 64*4 + jge .fast_zero_loop + vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc + vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 + vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 + vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be + vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 + vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf + vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 + vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc + vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 + vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd + vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 + vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd + 
vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 + vshufi32x4 m15, m9, q3232 ; ca ce da de + vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 + vshufi32x4 m7, m16, q3232 ; cb cf db df + vshufi32x4 m22, m14, m2, q2020 ; 1 + vshufi32x4 m24, m14, m2, q3131 ; 5 + vshufi32x4 m23, m17, m9, q2020 ; 3 + vshufi32x4 m25, m17, m9, q3131 ; 7 + vshufi32x4 m16, m5, m15, q2020 ; 10 + vshufi32x4 m17, m5, m15, q3131 ; 14 + vshufi32x4 m14, m1, m18, q2020 ; 2 + vshufi32x4 m15, m1, m18, q3131 ; 6 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m21, m4, q3131 ; 12 + vshufi32x4 m2, m21, m4, q2020 ; 8 + vshufi32x4 m26, m20, m6, q2020 ; 9 + vshufi32x4 m28, m20, m6, q3131 ; 13 + vshufi32x4 m27, m19, m7, q2020 ; 11 + vshufi32x4 m29, m19, m7, q3131 ; 15 + call .idct16_fast + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + call .main_fast +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_512)] + movshdup m13, [o(itx_perm)] + lea r3, [dstq+r4*8] + lea r5, [strideq+r4] ; stride*4 + add r3, r5 ; dst+stride*28 + IDCT_32x32_END 29, 0, strideq*0, r4 + IDCT_32x32_END 28, 1, strideq*1, strideq*2 + IDCT_32x32_END 27, 2, strideq*2, strideq*1 + IDCT_32x32_END 26, 3, r4 , strideq*0 + IDCT_32x32_END 25, 4, strideq*0, r4 + IDCT_32x32_END 24, 5, strideq*1, strideq*2 + IDCT_32x32_END 23, 6, strideq*2, strideq*1 + IDCT_32x32_END 22, 7, r4 , strideq*0 + IDCT_32x32_END 21, 8, strideq*0, r4 + IDCT_32x32_END 20, 9, strideq*1, strideq*2 + IDCT_32x32_END 19, 10, strideq*2, strideq*1 + IDCT_32x32_END 18, 11, r4 , strideq*0 + IDCT_32x32_END 17, 12, strideq*0, r4 + IDCT_32x32_END 16, 13, strideq*1, strideq*2 + IDCT_32x32_END 15, 14, strideq*2, strideq*1 + IDCT_32x32_END 14, 15, r4 , strideq*0 + RET +ALIGN function_align +.idct16_fast: + vpbroadcastd m21, [o(pw_16305x2)] + vpbroadcastd m8, [o(pw_1606x2)] + vpbroadcastd m18, [o(pw_m10394x2)] + vpbroadcastd m9, [o(pw_12665x2)] + pmulhrsw m21, m14 ; t15a + vpbroadcastd m19, [o(pw_14449x2)] + pmulhrsw m14, m8 ; t8a + vpbroadcastd m8, [o(pw_7723x2)] + pmulhrsw m18, m17 ; t9a + vpbroadcastd m20, [o(pw_m4756x2)] + pmulhrsw m17, m9 ; t14a + vpbroadcastd m9, [o(pw_15679x2)] + pmulhrsw m19, m16 ; t13a + vpbroadcastd m5, [o(pw_m9102x2)] + pmulhrsw m16, m8 ; t10a + vpbroadcastd m8, [o(pw_13623x2)] + pmulhrsw m20, m15 ; t11a + vpbroadcastd m7, [o(pw_16069x2)] + pmulhrsw m15, m9 ; t12a + vpbroadcastd m9, [o(pw_3196x2)] + pmulhrsw m5, m3 ; t5a + vpbroadcastd m6, [o(pw_15137x2)] + pmulhrsw m3, m8 ; t6a + vpbroadcastd m8, [o(pw_6270x2)] + pmulhrsw m7, m1 ; t7a + vpbroadcastd m4, [o(pw_11585x2)] + pmulhrsw m1, m9 ; t4 + vpbroadcastd m10, [o(pd_8192)] + pmulhrsw m6, m2 ; t3 + pmulhrsw m2, m8 ; t2 + pmulhrsw m4, m0 ; t0 + mova m0, m4 ; t1 + jmp .idct16b +ALIGN function_align +.idct16: + vpbroadcastd m10, [o(pd_8192)] + ITX_MULSUB_2W 14, 21, 8, 9, 10, 1606, 16305 ; t8a, t15a + ITX_MULSUB_2W 18, 17, 8, 9, 10, 12665, 10394 ; t9a, t14a + ITX_MULSUB_2W 16, 19, 8, 9, 10, 7723, 14449 ; t10a, t13a + ITX_MULSUB_2W 20, 15, 8, 9, 10, 15679, 4756 ; t11a, t12 + ITX_MULSUB_2W 5, 3, 8, 9, 10, 13623, 9102 ; t5a, t6a + ITX_MULSUB_2W 1, 7, 8, 9, 10, 3196, 16069 ; t4a, t7a + ITX_MULSUB_2W 2, 6, 8, 9, 10, 6270, 15137 ; t2, t3 + ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585 ; t1, t0 +.idct16b: + paddw m8, m20, m16 ; t11 + psubw m20, m16 ; t10 + paddw m16, m15, m19 ; t12 + psubw m15, m19 ; t13 + psubw m19, m14, m18 ; t9 + paddw m14, m18 ; t8 + psubw m18, m21, m17 ; t14 + 
paddw m21, m17 ; t15 + vpbroadcastd m11, [o(pw_6270_15137)] + vpbroadcastd m12, [o(pw_m15137_6270)] + ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a + vpbroadcastd m11, [o(pw_m6270_m15137)] + ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a + vpbroadcastd m11, [o(pw_11585_11585)] + vpbroadcastd m12, [o(pw_m11585_11585)] + paddw m9, m7, m3 ; t7 + psubw m3, m7, m3 ; t6a + paddw m7, m1, m5 ; t4 + psubw m1, m5 ; t5a + psubw m17, m14, m8 ; t11a + paddw m8, m14 ; t8a + paddw m14, m18, m15 ; t9 + psubw m18, m15 ; t10 + psubw m15, m19, m20 ; t13 + paddw m19, m20 ; t14 + paddw m20, m21, m16 ; t15a + psubw m16, m21, m16 ; t12a + ITX_MULSUB_2W 3, 1, 5, 21, 10, 11, 12 ; t5, t6 + ITX_MULSUB_2W 15, 18, 5, 21, 10, 11, 12 ; t10a, t13a + ITX_MULSUB_2W 16, 17, 5, 21, 10, 11, 12 ; t11, t12 + psubw m5, m0, m2 ; t2 + paddw m2, m0 ; t1 + paddw m0, m4, m6 ; t0 + psubw m4, m6 ; t3 + psubw m6, m2, m1 ; t6 + paddw m1, m2 ; t1 + paddw m2, m5, m3 ; t2 + psubw m5, m3 ; t5 + paddw m3, m4, m7 ; t3 + psubw m4, m7 ; t4 + psubw m7, m0, m9 ; t7 + paddw m0, m9 ; t0 + psubw m21, m0, m20 ; out15 + paddw m0, m20 ; out0 + psubw m20, m1, m19 ; out14 + paddw m1, m19 ; out1 + psubw m19, m2, m18 ; out13 + paddw m2, m18 ; out2 + psubw m18, m3, m17 ; out12 + paddw m3, m17 ; out3 + psubw m17, m4, m16 ; out11 + paddw m4, m16 ; out4 + psubw m16, m5, m15 ; out10 + paddw m5, m15 ; out5 + psubw m15, m6, m14 ; out9 + paddw m6, m14 ; out6 + psubw m14, m7, m8 ; out8 + paddw m7, m8 ; out7 + ret +ALIGN function_align +.main_fast: + vpbroadcastd m21, [o(pw_16364x2)] + vpbroadcastd m8, [o(pw_804x2)] + vpbroadcastd m14, [o(pw_m11003x2)] + vpbroadcastd m9, [o(pw_12140x2)] + pmulhrsw m21, m22 ; t31a + vpbroadcastd m17, [o(pw_14811x2)] + pmulhrsw m22, m8 ; t16a + vpbroadcastd m8, [o(pw_7005x2)] + pmulhrsw m14, m29 ; t30a + vpbroadcastd m18, [o(pw_m5520x2)] + pmulhrsw m29, m9 ; t17a + vpbroadcastd m9, [o(pw_15426x2)] + pmulhrsw m17, m26 ; t29a + vpbroadcastd m19, [o(pw_15893x2)] + pmulhrsw m26, m8 ; t18a + vpbroadcastd m8, [o(pw_3981x2)] + pmulhrsw m18, m25 ; t19a + vpbroadcastd m16, [o(pw_m8423x2)] + pmulhrsw m25, m9 ; t28a + vpbroadcastd m9, [o(pw_14053x2)] + pmulhrsw m19, m24 ; t27a + vpbroadcastd m15, [o(pw_13160x2)] + pmulhrsw m24, m8 ; t20a + vpbroadcastd m8, [o(pw_9760x2)] + pmulhrsw m16, m27 ; t21a + vpbroadcastd m20, [o(pw_m2404x2)] + pmulhrsw m27, m9 ; t26a + vpbroadcastd m9, [o(pw_16207x2)] + pmulhrsw m15, m28 ; t25a + pmulhrsw m28, m8 ; t22a + pmulhrsw m20, m23 ; t23a + pmulhrsw m23, m9 ; t24a + jmp .main2 +ALIGN function_align +.main: + ITX_MULSUB_2W 22, 21, 8, 9, 10, 804, 16364 ; t16a, t31a + ITX_MULSUB_2W 14, 29, 8, 9, 10, 12140, 11003 ; t17a, t30a + ITX_MULSUB_2W 26, 17, 8, 9, 10, 7005, 14811 ; t18a, t29a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 15426, 5520 ; t19a, t28a + ITX_MULSUB_2W 24, 19, 8, 9, 10, 3981, 15893 ; t20a, t27a + ITX_MULSUB_2W 16, 27, 8, 9, 10, 14053, 8423 ; t21a, t26a + ITX_MULSUB_2W 28, 15, 8, 9, 10, 9760, 13160 ; t22a, t25a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 16207, 2404 ; t23a, t24a +.main2: + psubw m8, m22, m14 ; t17 + paddw m22, m14 ; t16 + paddw m14, m18, m26 ; t19 + psubw m18, m26 ; t18 + psubw m26, m24, m16 ; t21 + paddw m24, m16 ; t20 + psubw m16, m20, m28 ; t22 + paddw m28, m20 ; t23 + psubw m20, m23, m15 ; t25 + paddw m23, m15 ; t24 + psubw m15, m21, m29 ; t30 + paddw m21, m29 ; t31 + psubw m29, m19, m27 ; t26 + paddw m19, m27 ; t27 + paddw m27, m25, m17 ; t28 + psubw m25, m17 ; t29 + ITX_MULSUB_2W 15, 8, 9, 17, 10, 3196, 16069 ; t17a, t30a + ITX_MULSUB_2W 25, 18, 9, 17, 10, m16069, 3196 ; t18a, t29a + 
ITX_MULSUB_2W 29, 26, 9, 17, 10, 13623, 9102 ; t21a, t26a + ITX_MULSUB_2W 20, 16, 9, 17, 10, m9102, 13623 ; t22a, t25a + psubw m17, m21, m27 ; t28a + paddw m21, m27 ; t31a + psubw m27, m15, m25 ; t18 + paddw m15, m25 ; t17 + psubw m25, m20, m29 ; t21 + paddw m20, m29 ; t22 + psubw m29, m8, m18 ; t29 + paddw m8, m18 ; t30 + psubw m18, m22, m14 ; t19a + paddw m22, m14 ; t16a + psubw m14, m28, m24 ; t20a + paddw m24, m28 ; t23a + paddw m28, m16, m26 ; t25 + psubw m16, m26 ; t26 + psubw m26, m23, m19 ; t27a + paddw m23, m19 ; t24a + vpbroadcastd m12, [o(pw_m15137_6270)] + vpbroadcastd m11, [o(pw_6270_15137)] + ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a + ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 + vpbroadcastd m11, [o(pw_m6270_m15137)] + ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a + ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 + vpbroadcastd m12, [o(pw_m11585_11585)] + vpbroadcastd m11, [o(pw_11585_11585)] + psubw m19, m27, m25 ; t26 + paddw m27, m25 ; t29 + psubw m25, m17, m26 ; t20a + paddw m17, m26 ; t19a + paddw m26, m18, m14 ; t28a + psubw m18, m14 ; t27a + paddw m14, m22, m24 ; t16 + psubw m22, m24 ; t23 + psubw m24, m29, m16 ; t21 + paddw m16, m29 ; t18 + paddw m29, m21, m23 ; t31 + psubw m21, m23 ; t24 + psubw m23, m15, m20 ; t22a + paddw m15, m20 ; t17a + psubw m20, m8, m28 ; t25a + paddw m28, m8 ; t30a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 + ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a + ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 + ret +ALIGN function_align +.main_packed_fast: + vpbroadcastd m8, [o(pw_804_16364x2)] + vpbroadcastd m9, [o(pw_m11003_12140x2)] + vpbroadcastd m11, [o(pw_7005_14811x2)] + vpbroadcastd m12, [o(pw_m5520_15426x2)] + pmulhrsw m21, m8 ; t16a, t31a + vpbroadcastd m8, [o(pw_3981_15893x2)] + pmulhrsw m17, m9 ; t17a, t30a + vpbroadcastd m9, [o(pw_m8423_14053x2)] + pmulhrsw m20, m11 ; t18a, t29a + vpbroadcastd m11, [o(pw_9760_13160x2)] + pmulhrsw m15, m12 ; t19a, t28a + vpbroadcastd m12, [o(pw_m2404_16207x2)] + pmulhrsw m18, m8 ; t20a, t27a + pmulhrsw m16, m9 ; t21a, t26a + pmulhrsw m19, m11 ; t22a, t25a + pmulhrsw m14, m12 ; t23a, t24a + psubw m8, m21, m17 ; t17 t30 + paddw m21, m17 ; t16 t31 + psubw m17, m15, m20 ; t18 t29 + paddw m20, m15 ; t19 t28 + psubw m15, m18, m16 ; t21 t26 + paddw m18, m16 ; t20 t27 + psubw m16, m14, m19 ; t22 t25 + paddw m14, m19 ; t23 t24 + ITX_MUL2X_PACK 8, 9, 19, 10, 3196, 16069, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 9, 19, 10, m16069, 3196, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 9, 19, 10, 13623, 9102, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 9, 19, 10, m9102, 13623, 5 ; t22a t25a + vpbroadcastd m11, [o(pw_m15137_6270)] + psubw m19, m21, m20 ; t19a t28a + paddw m21, m20 ; t16a t31a + psubw m20, m14, m18 ; t20a t27a + paddw m14, m18 ; t23a t24a + psubw m18, m8, m17 ; t18 t29 + paddw m8, m17 ; t17 t30 + psubw m17, m16, m15 ; t21 t26 + paddw m15, m16 ; t22 t25 + ITX_MUL2X_PACK 18, 9, 16, 10, 6270_15137, 11, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 9, 16, 10, 6270_15137, 11, 20 ; t19 t28 + ITX_MUL2X_PACK 20, 9, 16, 10, 11, m6270_m15137, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 9, 16, 10, 11, m6270_m15137, 36 ; t21a t26a + vbroadcasti32x4 m9, [o(deint_shuf)] + psubw m16, m21, m14 ; t23 t24 + paddw m14, m21 ; t16 t31 + psubw m21, m8, m15 ; t22a t25a + paddw m15, m8 ; t17a t30a + psubw m8, m18, m17 ; t21 t26 + paddw m18, m17 ; t18 t29 + paddw m17, m19, m20 ; t19a t28a + psubw m19, m20 ; t20a t27a + vpbroadcastd m11, [o(pw_m11585_11585)] + vpbroadcastd 
m12, [o(pw_11585_11585)]
+    REPX {pshufb x, m9}, m14, m15, m18, m17
+    mova m9, m10
+    vpdpwssd m9, m16, m11
+    mova m20, m10
+    vpdpwssd m20, m21, m11
+    psrad m9, 14
+    psrad m20, 14
+    packssdw m9, m20 ; t23a t22
+    mova m20, m10
+    vpdpwssd m20, m16, m12
+    mova m16, m10
+    vpdpwssd m16, m21, m12
+    psrad m20, 14
+    psrad m16, 14
+    packssdw m16, m20, m16 ; t24a t25
+    ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+    ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+    packssdw m11, m20 ; t27 t26a
+    packssdw m8, m21 ; t20 t21a
+    punpcklqdq m20, m14, m15 ; t16 t17a
+    punpckhqdq m14, m15 ; t31 t30a
+    punpckhqdq m15, m17, m18 ; t28a t29
+    punpcklqdq m17, m18 ; t19a t18
+    psubw m21, m0, m14 ; out31 out30
+    paddw m0, m14 ; out0 out1
+    psubw m14, m7, m20 ; out16 out17
+    paddw m7, m20 ; out15 out14
+    psubw m20, m1, m15 ; out28 out29
+    paddw m1, m15 ; out3 out2
+    psubw m15, m6, m17 ; out19 out18
+    paddw m6, m17 ; out12 out13
+    psubw m17, m4, m9 ; out23 out22
+    paddw m4, m9 ; out8 out9
+    psubw m18, m3, m16 ; out24 out25
+    paddw m3, m16 ; out7 out6
+    psubw m16, m5, m8 ; out20 out21
+    paddw m5, m8 ; out11 out10
+    psubw m19, m2, m11 ; out27 out26
+    paddw m2, m11 ; out4 out5
+    ret
+
+%endif
diff --git a/libavcodec/xvididct.c b/libavcodec/xvididct.c
index 2eddc5978c544..317e4e82cdaf2 100644
--- a/libavcodec/xvididct.c
+++ b/libavcodec/xvididct.c
@@ -32,7 +32,6 @@
 #include "config.h"
 #include "libavutil/attributes.h"
-#include "avcodec.h"
 #include "idctdsp.h"
 #include "xvididct.h"
@@ -330,27 +329,16 @@ static void xvid_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     ff_add_pixels_clamped_c(block, dest, line_size);
 }
-av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
+av_cold void ff_xvid_idct_init(IDCTDSPContext *c)
 {
-    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-    if (high_bit_depth || avctx->lowres ||
-        !(avctx->idct_algo == FF_IDCT_AUTO ||
-          avctx->idct_algo == FF_IDCT_XVID))
-        return;
-
-    if (avctx->idct_algo == FF_IDCT_XVID) {
-        c->idct_put  = xvid_idct_put;
-        c->idct_add  = xvid_idct_add;
-        c->idct      = ff_xvid_idct;
-        c->perm_type = FF_IDCT_PERM_NONE;
-    }
+    c->idct_put  = xvid_idct_put;
+    c->idct_add  = xvid_idct_add;
+    c->idct      = ff_xvid_idct;
+    c->perm_type = FF_IDCT_PERM_NONE;
 #if ARCH_X86
     ff_xvid_idct_init_x86(c);
 #elif ARCH_MIPS
     ff_xvid_idct_init_mips(c);
 #endif
-
-    ff_init_scantable_permutation(c->idct_permutation, c->perm_type);
 }
diff --git a/libavcodec/xvididct.h b/libavcodec/xvididct.h
index 1395cfd8e1e73..496071a034f90 100644
--- a/libavcodec/xvididct.h
+++ b/libavcodec/xvididct.h
@@ -21,12 +21,11 @@
 #include <stdint.h>
-#include "avcodec.h"
 #include "idctdsp.h"
 void ff_xvid_idct(int16_t *const in);
-void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx);
+void ff_xvid_idct_init(IDCTDSPContext *c);
 void ff_xvid_idct_init_x86(IDCTDSPContext *c);
 void ff_xvid_idct_init_mips(IDCTDSPContext *c);
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 0effe4127ffd3..97f8f1727203e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -208,6 +208,7 @@ OBJS-$(CONFIG_BILATERAL_FILTER) += vf_bilateral.o
 OBJS-$(CONFIG_BILATERAL_CUDA_FILTER) += vf_bilateral_cuda.o vf_bilateral_cuda.ptx.o
 OBJS-$(CONFIG_BITPLANENOISE_FILTER) += vf_bitplanenoise.o
 OBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o
+OBJS-$(CONFIG_BLACKDETECT_VULKAN_FILTER) += vf_blackdetect_vulkan.o
 OBJS-$(CONFIG_BLACKFRAME_FILTER) += vf_blackframe.o
 OBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o framesync.o
 OBJS-$(CONFIG_BLEND_VULKAN_FILTER) += vf_blend_vulkan.o framesync.o
vulkan.o vulkan_filter.o @@ -471,6 +472,7 @@ OBJS-$(CONFIG_SCALE_VULKAN_FILTER) += vf_scale_vulkan.o vulkan.o vulka OBJS-$(CONFIG_SCALE2REF_FILTER) += vf_scale.o scale_eval.o framesync.o OBJS-$(CONFIG_SCALE2REF_NPP_FILTER) += vf_scale_npp.o scale_eval.o OBJS-$(CONFIG_SCDET_FILTER) += vf_scdet.o +OBJS-$(CONFIG_SCDET_VULKAN_FILTER) += vf_scdet_vulkan.o OBJS-$(CONFIG_SCHARR_FILTER) += vf_convolution.o OBJS-$(CONFIG_SCROLL_FILTER) += vf_scroll.o OBJS-$(CONFIG_SEGMENT_FILTER) += f_segment.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 5ea33cdf01b91..3bc045b28f552 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -192,6 +192,7 @@ extern const FFFilter ff_vf_bilateral; extern const FFFilter ff_vf_bilateral_cuda; extern const FFFilter ff_vf_bitplanenoise; extern const FFFilter ff_vf_blackdetect; +extern const FFFilter ff_vf_blackdetect_vulkan; extern const FFFilter ff_vf_blackframe; extern const FFFilter ff_vf_blend; extern const FFFilter ff_vf_blend_vulkan; @@ -443,6 +444,7 @@ extern const FFFilter ff_vf_scale_vulkan; extern const FFFilter ff_vf_scale2ref; extern const FFFilter ff_vf_scale2ref_npp; extern const FFFilter ff_vf_scdet; +extern const FFFilter ff_vf_scdet_vulkan; extern const FFFilter ff_vf_scharr; extern const FFFilter ff_vf_scroll; extern const FFFilter ff_vf_segment; diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c index c76d43a215ea7..56f635a4130fb 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -1071,7 +1071,8 @@ int ff_filter_frame(AVFilterLink *link, AVFrame *frame) strcmp(link->dst->filter->name, "format") && strcmp(link->dst->filter->name, "idet") && strcmp(link->dst->filter->name, "null") && - strcmp(link->dst->filter->name, "scale")) { + strcmp(link->dst->filter->name, "scale") && + strcmp(link->dst->filter->name, "libplacebo")) { av_assert1(frame->format == link->format); av_assert1(frame->width == link->w); av_assert1(frame->height == link->h); diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c index 5e93f93aab362..2d6036df7423e 100644 --- a/libavfilter/avfiltergraph.c +++ b/libavfilter/avfiltergraph.c @@ -1068,8 +1068,8 @@ static void swap_channel_layouts_on_filter(AVFilterContext *filter) } /* no penalty for LFE channel mismatch */ - if (av_channel_layout_channel_from_index(&in_chlayout, AV_CHAN_LOW_FREQUENCY) >= 0 && - av_channel_layout_channel_from_index(&out_chlayout, AV_CHAN_LOW_FREQUENCY) >= 0) + if (av_channel_layout_index_from_channel(&in_chlayout, AV_CHAN_LOW_FREQUENCY) >= 0 && + av_channel_layout_index_from_channel(&out_chlayout, AV_CHAN_LOW_FREQUENCY) >= 0) score += 10; av_channel_layout_from_mask(&in_chlayout, av_channel_layout_subset(&in_chlayout, ~AV_CH_LOW_FREQUENCY)); av_channel_layout_from_mask(&out_chlayout, av_channel_layout_subset(&out_chlayout, ~AV_CH_LOW_FREQUENCY)); diff --git a/libavfilter/vf_blackdetect.c b/libavfilter/vf_blackdetect.c index 21f35f705dd74..8be33a814dd83 100644 --- a/libavfilter/vf_blackdetect.c +++ b/libavfilter/vf_blackdetect.c @@ -31,6 +31,7 @@ #include "libavutil/timestamp.h" #include "avfilter.h" #include "filters.h" +#include "formats.h" #include "video.h" typedef struct BlackDetectContext { @@ -45,6 +46,7 @@ typedef struct BlackDetectContext { double picture_black_ratio_th; double pixel_black_th; unsigned int pixel_black_th_i; + int alpha; unsigned int nb_black_pixels; ///< number of black pixels counted so far AVRational time_base; @@ -63,6 +65,7 @@ static const AVOption blackdetect_options[] = { { "pic_th", "set the picture 
black ratio threshold", OFFSET(picture_black_ratio_th), AV_OPT_TYPE_DOUBLE, {.dbl=.98}, 0, 1, FLAGS }, { "pixel_black_th", "set the pixel black threshold", OFFSET(pixel_black_th), AV_OPT_TYPE_DOUBLE, {.dbl=.10}, 0, 1, FLAGS }, { "pix_th", "set the pixel black threshold", OFFSET(pixel_black_th), AV_OPT_TYPE_DOUBLE, {.dbl=.10}, 0, 1, FLAGS }, + { "alpha", "check alpha instead of luma", OFFSET(alpha), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, { NULL } }; @@ -71,11 +74,21 @@ AVFILTER_DEFINE_CLASS(blackdetect); #define YUVJ_FORMATS \ AV_PIX_FMT_YUVJ411P, AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P +#define YUVA_FORMATS \ + AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P, \ + AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_YUVA444P16, \ + AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P12, AV_PIX_FMT_YUVA422P16, \ + AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16 + static const enum AVPixelFormat yuvj_formats[] = { YUVJ_FORMATS, AV_PIX_FMT_NONE }; -static const enum AVPixelFormat pix_fmts[] = { +static const enum AVPixelFormat yuva_formats[] = { + YUVA_FORMATS, AV_PIX_FMT_NONE +}; + +static const enum AVPixelFormat yuv_formats[] = { AV_PIX_FMT_GRAY8, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, @@ -91,13 +104,23 @@ static const enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, - AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P, - AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_YUVA444P16, - AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P12, AV_PIX_FMT_YUVA422P16, - AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16, - AV_PIX_FMT_NONE + YUVA_FORMATS, AV_PIX_FMT_NONE }; +static int query_format(const AVFilterContext *ctx, + AVFilterFormatsConfig **cfg_in, + AVFilterFormatsConfig **cfg_out) +{ + const BlackDetectContext *s = ctx->priv; + AVFilterFormats *formats; + if (s->alpha) + formats = ff_make_format_list(yuva_formats); + else + formats = ff_make_format_list(yuv_formats); + + return ff_set_common_formats2(ctx, cfg_in, cfg_out, formats); +} + static int config_input(AVFilterLink *inlink) { AVFilterContext *ctx = inlink->dst; @@ -114,9 +137,9 @@ static int config_input(AVFilterLink *inlink) return AVERROR(ENOMEM); av_log(s, AV_LOG_VERBOSE, - "black_min_duration:%s pixel_black_th:%f picture_black_ratio_th:%f\n", + "black_min_duration:%s pixel_black_th:%f picture_black_ratio_th:%f alpha:%d\n", av_ts2timestr(s->black_min_duration, &s->time_base), - s->pixel_black_th, s->picture_black_ratio_th); + s->pixel_black_th, s->picture_black_ratio_th, s->alpha); return 0; } @@ -140,7 +163,8 @@ static int black_counter(AVFilterContext *ctx, void *arg, const unsigned int threshold = s->pixel_black_th_i; unsigned int *counterp = &s->counter[jobnr]; AVFrame *in = arg; - const int linesize = in->linesize[0]; + const int plane = s->alpha ? 
3 : 0; + const int linesize = in->linesize[plane]; const int w = in->width; const int h = in->height; const int start = (h * jobnr) / nb_jobs; @@ -149,7 +173,7 @@ static int black_counter(AVFilterContext *ctx, void *arg, unsigned int counter = 0; if (s->depth == 8) { - const uint8_t *p = in->data[0] + start * linesize; + const uint8_t *p = in->data[plane] + start * linesize; for (int i = 0; i < size; i++) { for (int x = 0; x < w; x++) @@ -157,7 +181,7 @@ static int black_counter(AVFilterContext *ctx, void *arg, p += linesize; } } else { - const uint16_t *p = (const uint16_t *)(in->data[0] + start * linesize); + const uint16_t *p = (const uint16_t *)(in->data[plane] + start * linesize); for (int i = 0; i < size; i++) { for (int x = 0; x < w; x++) @@ -180,7 +204,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *picref) const int max = (1 << s->depth) - 1; const int factor = (1 << (s->depth - 8)); const int full = picref->color_range == AVCOL_RANGE_JPEG || - ff_fmt_is_in(picref->format, yuvj_formats); + ff_fmt_is_in(picref->format, yuvj_formats) || + s->alpha; s->pixel_black_th_i = full ? s->pixel_black_th * max : // luminance_minimum_value + pixel_black_th * luminance_range_size @@ -252,6 +277,6 @@ const FFFilter ff_vf_blackdetect = { .priv_size = sizeof(BlackDetectContext), FILTER_INPUTS(blackdetect_inputs), FILTER_OUTPUTS(ff_video_default_filterpad), - FILTER_PIXFMTS_ARRAY(pix_fmts), + FILTER_QUERY_FUNC2(query_format), .uninit = uninit, }; diff --git a/libavfilter/vf_blackdetect_vulkan.c b/libavfilter/vf_blackdetect_vulkan.c new file mode 100644 index 0000000000000..4e977abe3d773 --- /dev/null +++ b/libavfilter/vf_blackdetect_vulkan.c @@ -0,0 +1,431 @@ +/* + * Copyright 2025 (c) Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <float.h> +#include "libavutil/vulkan_spirv.h" +#include "libavutil/opt.h" +#include "libavutil/timestamp.h" +#include "vulkan_filter.h" + +#include "filters.h" +#include "video.h" + +typedef struct BlackDetectVulkanContext { + FFVulkanContext vkctx; + + int initialized; + FFVkExecPool e; + AVVulkanDeviceQueueFamily *qf; + FFVulkanShader shd; + AVBufferPool *sum_buf_pool; + + double black_min_duration_time; + double picture_black_ratio_th; + double pixel_black_th; + int alpha; + + int64_t black_start; + int64_t black_end; +} BlackDetectVulkanContext; + +typedef struct BlackDetectPushData { + float threshold; +} BlackDetectPushData; + +typedef struct BlackDetectBuf { +#define SLICES 16 + uint32_t slice_sum[SLICES]; +} BlackDetectBuf; + +static av_cold int init_filter(AVFilterContext *ctx) +{ + int err; + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + BlackDetectVulkanContext *s = ctx->priv; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanShader *shd; + FFVkSPIRVCompiler *spv; + FFVulkanDescriptorSetBinding *desc; + const int plane = s->alpha ? 3 : 0; + + const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->vkctx.input_format); + if (pixdesc->flags & AV_PIX_FMT_FLAG_RGB) { + av_log(ctx, AV_LOG_ERROR, "RGB inputs are not supported\n"); + return AVERROR(ENOTSUP); + } + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0); + if (!s->qf) { + av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n"); + err = AVERROR(ENOTSUP); + goto fail; + } + + RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, s->qf->num*4, 0, 0, 0, NULL)); + RET(ff_vk_shader_init(vkctx, &s->shd, "blackdetect", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_KHR_shader_subgroup_ballot" }, 1, + 32, 32, 1, + 0)); + shd = &s->shd; + + GLSLC(0, layout(push_constant, std430) uniform pushConstants { ); + GLSLC(1, float threshold; ); + GLSLC(0, }; ); + + ff_vk_shader_add_push_const(shd, 0, sizeof(BlackDetectPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "input_img", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = ff_vk_shader_rep_fmt(s->vkctx.input_format, FF_VK_REP_FLOAT), + .mem_quali = "readonly", + .dimensions = 2, + .elems = av_pix_fmt_count_planes(s->vkctx.input_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, { + .name = "sum_buffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "uint slice_sum[];", + } + }; + + RET(ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 2, 0, 0)); + + GLSLC(0, shared uint wg_sum; ); + GLSLC(0, ); + GLSLC(0, void main() ); + GLSLC(0, { ); + GLSLC(1, wg_sum = 0u; ); + GLSLC(1, barrier(); ); + GLSLC(0, ); + GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); ); + GLSLF(1, if (!IS_WITHIN(pos, imageSize(input_img[%d]))) ,plane); + GLSLC(2, return; ); + GLSLF(1, float value = imageLoad(input_img[%d], pos).x; ,plane); + GLSLC(1, uvec4 isblack = subgroupBallot(value <= threshold); ); + GLSLC(1, if (subgroupElect()) ); + GLSLC(2, atomicAdd(wg_sum, subgroupBallotBitCount(isblack)); ); + GLSLC(1, barrier(); ); + GLSLC(1, if (gl_LocalInvocationIndex == 0u) ); + 
GLSLF(2, atomicAdd(slice_sum[gl_WorkGroupID.x %% %du], wg_sum); ,SLICES); + GLSLC(0, } ); + + RET(spv->compile_shader(vkctx, spv, &s->shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(vkctx, &s->shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &s->e, &s->shd)); + + s->initialized = 1; + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + if (spv) + spv->uninit(&spv); + + return err; +} + +static void evaluate(AVFilterLink *link, AVFrame *in, + const BlackDetectBuf *sum) +{ + AVFilterContext *ctx = link->dst; + BlackDetectVulkanContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(link); + uint64_t nb_black_pixels = 0; + double ratio; + + for (int i = 0; i < FF_ARRAY_ELEMS(sum->slice_sum); i++) + nb_black_pixels += sum->slice_sum[i]; + + ratio = (double) nb_black_pixels / (link->w * link->h); + + av_log(ctx, AV_LOG_DEBUG, + "frame:%"PRId64" picture_black_ratio:%f pts:%s t:%s type:%c\n", + inl->frame_count_out, ratio, + av_ts2str(in->pts), av_ts2timestr(in->pts, &in->time_base), + av_get_picture_type_char(in->pict_type)); + + if (ratio >= s->picture_black_ratio_th) { + if (s->black_start == AV_NOPTS_VALUE) { + s->black_start = in->pts; + av_dict_set(&in->metadata, "lavfi.black_start", + av_ts2timestr(in->pts, &in->time_base), 0); + } + } else if (s->black_start != AV_NOPTS_VALUE) { + av_dict_set(&in->metadata, "lavfi.black_end", + av_ts2timestr(in->pts, &in->time_base), 0); + if ((in->pts - s->black_start) >= s->black_min_duration_time / av_q2d(in->time_base)) { + av_log(s, AV_LOG_INFO, + "black_start:%s black_end:%s black_duration:%s\n", + av_ts2timestr(s->black_start, &in->time_base), + av_ts2timestr(in->pts, &in->time_base), + av_ts2timestr(in->pts - s->black_start, &in->time_base)); + } + s->black_start = AV_NOPTS_VALUE; + } +} + +static int blackdetect_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) +{ + int err; + AVFilterContext *ctx = link->dst; + BlackDetectVulkanContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + + VkImageView in_views[AV_NUM_DATA_POINTERS]; + VkImageMemoryBarrier2 img_bar[4]; + int nb_img_bar = 0; + + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + FFVkExecContext *exec = NULL; + AVBufferRef *sum_buf = NULL; + FFVkBuffer *sum_vk; + + BlackDetectBuf *sum; + BlackDetectPushData push_data; + + if (in->color_range == AVCOL_RANGE_JPEG || s->alpha) { + push_data.threshold = s->pixel_black_th; + } else { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(vkctx->input_format); + const int depth = desc->comp[0].depth; + const int ymin = 16 << (depth - 8); + const int ymax = 235 << (depth - 8); + const int imax = (1 << depth) - 1; + push_data.threshold = (s->pixel_black_th * (ymax - ymin) + ymin) / imax; + } + + if (!s->initialized) + RET(init_filter(ctx)); + + err = ff_vk_get_pooled_buffer(vkctx, &s->sum_buf_pool, &sum_buf, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + NULL, + sizeof(BlackDetectBuf), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + if (err < 0) + return err; + sum_vk = (FFVkBuffer *)sum_buf->data; + sum = (BlackDetectBuf *) sum_vk->mapped_mem; + + exec = ff_vk_exec_get(vkctx, &s->e); + ff_vk_exec_start(vkctx, exec); + + RET(ff_vk_exec_add_dep_frame(vkctx, exec, in, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, in_views, in, FF_VK_REP_FLOAT)); + + 
ff_vk_shader_update_img_array(vkctx, exec, &s->shd, in, in_views, 0, 0, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + + ff_vk_frame_barrier(vkctx, exec, in, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* zero sum buffer */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_NONE, + .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = sum_vk->buf, + .size = sum_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + vk->CmdFillBuffer(exec->buf, sum_vk->buf, 0, sum_vk->size, 0x0); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = sum_vk->buf, + .size = sum_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd, 0, 1, 0, + sum_vk, 0, sum_vk->size, + VK_FORMAT_UNDEFINED)); + + ff_vk_exec_bind_shader(vkctx, exec, &s->shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push_data), &push_data); + + vk->CmdDispatch(exec->buf, + FFALIGN(in->width, s->shd.lg_size[0]) / s->shd.lg_size[0], + FFALIGN(in->height, s->shd.lg_size[1]) / s->shd.lg_size[1], + s->shd.lg_size[2]); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .dstAccessMask = VK_ACCESS_HOST_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = sum_vk->buf, + .size = sum_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + RET(ff_vk_exec_submit(vkctx, exec)); + ff_vk_exec_wait(vkctx, exec); + evaluate(link, in, sum); + + av_buffer_unref(&sum_buf); + return ff_filter_frame(outlink, in); + +fail: + if (exec) + ff_vk_exec_discard_deps(&s->vkctx, exec); + av_frame_free(&in); + av_buffer_unref(&sum_buf); + return err; +} + +static void blackdetect_vulkan_uninit(AVFilterContext *avctx) +{ + BlackDetectVulkanContext *s = avctx->priv; + FFVulkanContext *vkctx = &s->vkctx; + + ff_vk_exec_pool_free(vkctx, &s->e); + ff_vk_shader_free(vkctx, &s->shd); + + av_buffer_pool_uninit(&s->sum_buf_pool); + + ff_vk_uninit(&s->vkctx); + + s->initialized = 0; +} + +static int 
config_output(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + BlackDetectVulkanContext *s = ctx->priv; + FFVulkanContext *vkctx = &s->vkctx; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(vkctx->input_format); + + if (s->alpha && !(desc->flags & AV_PIX_FMT_FLAG_ALPHA)) { + av_log(ctx, AV_LOG_ERROR, "Input format %s does not have an alpha channel\n", + av_get_pix_fmt_name(vkctx->input_format)); + return AVERROR(EINVAL); + } + + if (desc->flags & (AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_XYZ) || + !(desc->flags & AV_PIX_FMT_FLAG_PLANAR)) { + av_log(ctx, AV_LOG_ERROR, "Input format %s is not planar YUV\n", + av_get_pix_fmt_name(vkctx->input_format)); + return AVERROR(EINVAL); + } + + return ff_vk_filter_config_output(outlink); +} + +#define OFFSET(x) offsetof(BlackDetectVulkanContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption blackdetect_vulkan_options[] = { + { "d", "set minimum detected black duration in seconds", OFFSET(black_min_duration_time), AV_OPT_TYPE_DOUBLE, {.dbl=2}, 0, DBL_MAX, FLAGS }, + { "black_min_duration", "set minimum detected black duration in seconds", OFFSET(black_min_duration_time), AV_OPT_TYPE_DOUBLE, {.dbl=2}, 0, DBL_MAX, FLAGS }, + { "picture_black_ratio_th", "set the picture black ratio threshold", OFFSET(picture_black_ratio_th), AV_OPT_TYPE_DOUBLE, {.dbl=.98}, 0, 1, FLAGS }, + { "pic_th", "set the picture black ratio threshold", OFFSET(picture_black_ratio_th), AV_OPT_TYPE_DOUBLE, {.dbl=.98}, 0, 1, FLAGS }, + { "pixel_black_th", "set the pixel black threshold", OFFSET(pixel_black_th), AV_OPT_TYPE_DOUBLE, {.dbl=.10}, 0, 1, FLAGS }, + { "pix_th", "set the pixel black threshold", OFFSET(pixel_black_th), AV_OPT_TYPE_DOUBLE, {.dbl=.10}, 0, 1, FLAGS }, + { "alpha", "check alpha instead of luma", OFFSET(alpha), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(blackdetect_vulkan); + +static const AVFilterPad blackdetect_vulkan_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = &blackdetect_vulkan_filter_frame, + .config_props = &ff_vk_filter_config_input, + }, +}; + +static const AVFilterPad blackdetect_vulkan_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = &config_output, + }, +}; + +const FFFilter ff_vf_blackdetect_vulkan = { + .p.name = "blackdetect_vulkan", + .p.description = NULL_IF_CONFIG_SMALL("Detect video intervals that are (almost) black."), + .p.priv_class = &blackdetect_vulkan_class, + .p.flags = AVFILTER_FLAG_HWDEVICE, + .priv_size = sizeof(BlackDetectVulkanContext), + .init = &ff_vk_filter_init, + .uninit = &blackdetect_vulkan_uninit, + FILTER_INPUTS(blackdetect_vulkan_inputs), + FILTER_OUTPUTS(blackdetect_vulkan_outputs), + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_VULKAN), + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_gblur_vulkan.c b/libavfilter/vf_gblur_vulkan.c index 80b66de735b5e..fb676a7fc9981 100644 --- a/libavfilter/vf_gblur_vulkan.c +++ b/libavfilter/vf_gblur_vulkan.c @@ -171,7 +171,6 @@ static int init_gblur_pipeline(GBlurVulkanContext *s, RET(ff_vk_shader_register_exec(&s->vkctx, &s->e, shd)); RET(ff_vk_create_buf(&s->vkctx, params_buf, sizeof(float) * ksize, NULL, NULL, - VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); RET(ff_vk_map_buffer(&s->vkctx, params_buf, &kernel_mapped, 0)); diff --git a/libavfilter/vf_interlace_vulkan.c b/libavfilter/vf_interlace_vulkan.c 
index b5cd321fef217..7afb30c2d76f6 100644 --- a/libavfilter/vf_interlace_vulkan.c +++ b/libavfilter/vf_interlace_vulkan.c @@ -189,7 +189,9 @@ static int interlace_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) AVFrame *out = NULL, *input_top, *input_bot; AVFilterContext *ctx = link->dst; InterlaceVulkanContext *s = ctx->priv; + const AVFilterLink *inlink = ctx->inputs[0]; AVFilterLink *outlink = ctx->outputs[0]; + FilterLink *l = ff_filter_link(outlink); if (!s->initialized) RET(init_filter(ctx)); @@ -226,6 +228,9 @@ static int interlace_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) if (s->mode == MODE_TFF) out->flags |= AV_FRAME_FLAG_TOP_FIELD_FIRST; + out->pts = av_rescale_q(out->pts, inlink->time_base, outlink->time_base); + out->duration = av_rescale_q(1, av_inv_q(l->frame_rate), outlink->time_base); + av_frame_free(&s->cur); av_frame_free(&in); @@ -260,9 +265,12 @@ static void interlace_vulkan_uninit(AVFilterContext *avctx) static int config_out_props(AVFilterLink *outlink) { + AVFilterLink *inlink = outlink->src->inputs[0]; + const FilterLink *il = ff_filter_link(inlink); FilterLink *ol = ff_filter_link(outlink); - ol->frame_rate = av_mul_q(ol->frame_rate, av_make_q(1, 2)); + ol->frame_rate = av_mul_q(il->frame_rate, av_make_q(1, 2)); + outlink->time_base = av_mul_q(inlink->time_base, av_make_q(2, 1)); return ff_vk_filter_config_output(outlink); } diff --git a/libavfilter/vf_libplacebo.c b/libavfilter/vf_libplacebo.c index 86e1f43dea8c2..9ff64053cc0ea 100644 --- a/libavfilter/vf_libplacebo.c +++ b/libavfilter/vf_libplacebo.c @@ -193,8 +193,14 @@ typedef struct LibplaceboContext { int color_range; int color_primaries; int color_trc; + int rotation; AVDictionary *extra_opts; +#if PL_API_VER >= 351 + pl_cache cache; + char *shader_cache; +#endif + int have_hwdevice; /* pl_render_params */ @@ -522,6 +528,21 @@ static int libplacebo_init(AVFilterContext *avctx) return AVERROR(ENOMEM); } +#if PL_API_VER >= 351 + if (s->shader_cache && s->shader_cache[0]) { + s->cache = pl_cache_create(pl_cache_params( + .log = s->log, + .get = pl_cache_get_file, + .set = pl_cache_set_file, + .priv = s->shader_cache, + )); + if (!s->cache) { + libplacebo_uninit(avctx); + return AVERROR(ENOMEM); + } + } +#endif + if (s->out_format_string) { s->out_format = av_get_pix_fmt(s->out_format_string); if (s->out_format == AV_PIX_FMT_NONE) { @@ -676,6 +697,9 @@ static int init_vulkan(AVFilterContext *avctx, const AVVulkanDeviceContext *hwct } s->gpu = s->vulkan->gpu; +#if PL_API_VER >= 351 + pl_gpu_set_cache(s->gpu, s->cache); +#endif /* Parse the user shaders, if requested */ if (s->shader_bin_len) @@ -714,6 +738,9 @@ static void libplacebo_uninit(AVFilterContext *avctx) av_freep(&s->inputs); } +#if PL_API_VER >= 351 + pl_cache_destroy(&s->cache); +#endif pl_options_free(&s->opts); pl_vulkan_destroy(&s->vulkan); pl_log_destroy(&s->log); @@ -802,6 +829,13 @@ static void update_crops(AVFilterContext *ctx, LibplaceboInput *in, image->crop.y0 = av_expr_eval(s->crop_y_pexpr, s->var_values, NULL); image->crop.x1 = image->crop.x0 + s->var_values[VAR_CROP_W]; image->crop.y1 = image->crop.y0 + s->var_values[VAR_CROP_H]; + image->rotation = s->rotation; + if (s->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + /* Libplacebo expects the input crop relative to the actual frame + * dimensions, so un-transpose them here */ + FFSWAP(float, image->crop.x0, image->crop.y0); + FFSWAP(float, image->crop.x1, image->crop.y1); + } if (src == ref) { /* Only update the target crop once, for the 'reference' frame */ @@ -1198,6 
+1232,14 @@ static int libplacebo_config_input(AVFilterLink *inlink) AVFilterContext *avctx = inlink->dst; LibplaceboContext *s = avctx->priv; + if (s->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + /* Swap width and height for 90 degree rotations to make the size and + * scaling calculations work out correctly */ + FFSWAP(int, inlink->w, inlink->h); + if (inlink->sample_aspect_ratio.num) + inlink->sample_aspect_ratio = av_inv_q(inlink->sample_aspect_ratio); + } + if (inlink->format == AV_PIX_FMT_VULKAN) return ff_vk_filter_config_input(inlink); @@ -1328,6 +1370,9 @@ static const AVOption libplacebo_options[] = { { "fillcolor", "Background fill color", OFFSET(fillcolor), AV_OPT_TYPE_COLOR, {.str = "black@0"}, .flags = DYNAMIC }, { "corner_rounding", "Corner rounding radius", OFFSET(corner_rounding), AV_OPT_TYPE_FLOAT, {.dbl = 0.0}, 0.0, 1.0, .flags = DYNAMIC }, { "extra_opts", "Pass extra libplacebo-specific options using a :-separated list of key=value pairs", OFFSET(extra_opts), AV_OPT_TYPE_DICT, .flags = DYNAMIC }, +#if PL_API_VER >= 351 + { "shader_cache", "Set shader cache path", OFFSET(shader_cache), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = STATIC }, +#endif {"colorspace", "select colorspace", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64=-1}, -1, AVCOL_SPC_NB-1, DYNAMIC, .unit = "colorspace"}, {"auto", "keep the same colorspace", 0, AV_OPT_TYPE_CONST, {.i64=-1}, INT_MIN, INT_MAX, STATIC, .unit = "colorspace"}, @@ -1386,6 +1431,13 @@ static const AVOption libplacebo_options[] = { {"smpte2084", NULL, 0, AV_OPT_TYPE_CONST, {.i64=AVCOL_TRC_SMPTE2084}, INT_MIN, INT_MAX, STATIC, .unit = "color_trc"}, {"arib-std-b67", NULL, 0, AV_OPT_TYPE_CONST, {.i64=AVCOL_TRC_ARIB_STD_B67}, INT_MIN, INT_MAX, STATIC, .unit = "color_trc"}, + {"rotate", "rotate the input clockwise", OFFSET(rotation), AV_OPT_TYPE_INT, {.i64=PL_ROTATION_0}, PL_ROTATION_0, PL_ROTATION_360, DYNAMIC, .unit = "rotation"}, + {"0", NULL, 0, AV_OPT_TYPE_CONST, {.i64=PL_ROTATION_0}, .flags = STATIC, .unit = "rotation"}, + {"90", NULL, 0, AV_OPT_TYPE_CONST, {.i64=PL_ROTATION_90}, .flags = STATIC, .unit = "rotation"}, + {"180", NULL, 0, AV_OPT_TYPE_CONST, {.i64=PL_ROTATION_180}, .flags = STATIC, .unit = "rotation"}, + {"270", NULL, 0, AV_OPT_TYPE_CONST, {.i64=PL_ROTATION_270}, .flags = STATIC, .unit = "rotation"}, + {"360", NULL, 0, AV_OPT_TYPE_CONST, {.i64=PL_ROTATION_360}, .flags = STATIC, .unit = "rotation"}, + { "upscaler", "Upscaler function", OFFSET(upscaler), AV_OPT_TYPE_STRING, {.str = "spline36"}, .flags = DYNAMIC }, { "downscaler", "Downscaler function", OFFSET(downscaler), AV_OPT_TYPE_STRING, {.str = "mitchell"}, .flags = DYNAMIC }, { "frame_mixer", "Frame mixing function", OFFSET(frame_mixer), AV_OPT_TYPE_STRING, {.str = "none"}, .flags = DYNAMIC }, diff --git a/libavfilter/vf_scdet_vulkan.c b/libavfilter/vf_scdet_vulkan.c new file mode 100644 index 0000000000000..fadc0842aeb83 --- /dev/null +++ b/libavfilter/vf_scdet_vulkan.c @@ -0,0 +1,412 @@ +/* + * Copyright 2025 (c) Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/opt.h" +#include "libavutil/timestamp.h" +#include "vulkan_filter.h" + +#include "filters.h" + +typedef struct SceneDetectVulkanContext { + FFVulkanContext vkctx; + + int initialized; + FFVkExecPool e; + AVVulkanDeviceQueueFamily *qf; + FFVulkanShader shd; + AVBufferPool *det_buf_pool; + + double threshold; + int sc_pass; + + int nb_planes; + double prev_mafd; + AVFrame *prev; + AVFrame *cur; +} SceneDetectVulkanContext; + +typedef struct SceneDetectBuf { +#define SLICES 16 + uint32_t frame_sad[SLICES]; +} SceneDetectBuf; + +static av_cold int init_filter(AVFilterContext *ctx) +{ + int err; + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + SceneDetectVulkanContext *s = ctx->priv; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanShader *shd; + FFVkSPIRVCompiler *spv; + FFVulkanDescriptorSetBinding *desc; + + const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->vkctx.input_format); + const int lumaonly = !(pixdesc->flags & AV_PIX_FMT_FLAG_RGB) && + (pixdesc->flags & AV_PIX_FMT_FLAG_PLANAR); + s->nb_planes = lumaonly ? 1 : av_pix_fmt_count_planes(s->vkctx.input_format); + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0); + if (!s->qf) { + av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n"); + err = AVERROR(ENOTSUP); + goto fail; + } + + RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, s->qf->num*4, 0, 0, 0, NULL)); + RET(ff_vk_shader_init(vkctx, &s->shd, "scdet", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_KHR_shader_subgroup_arithmetic" }, 1, + 32, 32, 1, + 0)); + shd = &s->shd; + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "prev_img", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = ff_vk_shader_rep_fmt(s->vkctx.input_format, FF_VK_REP_UINT), + .mem_quali = "readonly", + .dimensions = 2, + .elems = av_pix_fmt_count_planes(s->vkctx.input_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, { + .name = "cur_img", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = ff_vk_shader_rep_fmt(s->vkctx.input_format, FF_VK_REP_UINT), + .mem_quali = "readonly", + .dimensions = 2, + .elems = av_pix_fmt_count_planes(s->vkctx.input_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, { + .name = "sad_buffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "uint frame_sad[];", + } + }; + + RET(ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 3, 0, 0)); + + GLSLC(0, shared uint wg_sum; ); + GLSLC(0, void main() ); + GLSLC(0, { ); + GLSLF(1, const uint slice = gl_WorkGroupID.x %% %u; ,SLICES); + GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); ); + GLSLC(1, wg_sum = 0; ); + GLSLC(1, barrier(); ); + for (int i = 0; i < s->nb_planes; i++) { + GLSLF(1, if (IS_WITHIN(pos, imageSize(cur_img[%d]))) { ,i); + GLSLF(2, uvec4 prev = imageLoad(prev_img[%d], pos); ,i); + GLSLF(2, uvec4 cur = imageLoad(cur_img[%d], pos); ,i); + GLSLC(2, uvec4 sad = abs(ivec4(cur) - ivec4(prev)); ); + GLSLC(2, uint sum = subgroupAdd(sad.x + sad.y + sad.z); ); + 
GLSLC(2, if (subgroupElect()) ); + GLSLC(3, atomicAdd(wg_sum, sum); ); + GLSLC(1, } ); + } + GLSLC(1, barrier(); ); + GLSLC(1, if (gl_LocalInvocationIndex == 0) ); + GLSLC(2, atomicAdd(frame_sad[slice], wg_sum); ); + GLSLC(0, } ); + + RET(spv->compile_shader(vkctx, spv, &s->shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(vkctx, &s->shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &s->e, &s->shd)); + + s->initialized = 1; + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + if (spv) + spv->uninit(&spv); + + return err; +} + +static double evaluate(AVFilterContext *ctx, const SceneDetectBuf *buf) +{ + SceneDetectVulkanContext *s = ctx->priv; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->vkctx.input_format); + const AVFilterLink *inlink = ctx->inputs[0]; + uint64_t count; + double mafd, diff; + + uint64_t sad = 0; + for (int i = 0; i < SLICES; i++) + sad += buf->frame_sad[i]; + + av_assert2(s->nb_planes == 1 || !(desc->log2_chroma_w || desc->log2_chroma_h)); + count = s->nb_planes * inlink->w * inlink->h; + mafd = (double) sad * 100.0 / count / (1ULL << desc->comp[0].depth); + diff = fabs(mafd - s->prev_mafd); + s->prev_mafd = mafd; + + return av_clipf(FFMIN(mafd, diff), 0.0, 100.0); +} + +static int scdet_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) +{ + int err; + AVFilterContext *ctx = link->dst; + SceneDetectVulkanContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + + VkImageView prev_views[AV_NUM_DATA_POINTERS]; + VkImageView cur_views[AV_NUM_DATA_POINTERS]; + VkImageMemoryBarrier2 img_bar[8]; + int nb_img_bar = 0; + + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + FFVkExecContext *exec = NULL; + AVBufferRef *buf = NULL; + FFVkBuffer *buf_vk; + + SceneDetectBuf *sad; + double score = 0.0; + char str[64]; + + if (!s->initialized) + RET(init_filter(ctx)); + + av_frame_free(&s->prev); + s->prev = s->cur; + s->cur = av_frame_clone(in); + if (!s->prev) + goto done; + + RET(ff_vk_get_pooled_buffer(vkctx, &s->det_buf_pool, &buf, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + NULL, + sizeof(SceneDetectBuf), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)); + buf_vk = (FFVkBuffer *)buf->data; + sad = (SceneDetectBuf *) buf_vk->mapped_mem; + + exec = ff_vk_exec_get(vkctx, &s->e); + ff_vk_exec_start(vkctx, exec); + + RET(ff_vk_exec_add_dep_frame(vkctx, exec, s->prev, + VK_PIPELINE_STAGE_2_NONE, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, prev_views, s->prev, FF_VK_REP_UINT)); + + ff_vk_shader_update_img_array(vkctx, exec, &s->shd, s->prev, prev_views, 0, 0, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + + ff_vk_frame_barrier(vkctx, exec, s->prev, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + RET(ff_vk_exec_add_dep_frame(vkctx, exec, s->cur, + VK_PIPELINE_STAGE_2_NONE, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, cur_views, s->cur, FF_VK_REP_UINT)); + + ff_vk_shader_update_img_array(vkctx, exec, &s->shd, s->cur, cur_views, 0, 1, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + + ff_vk_frame_barrier(vkctx, exec, s->cur, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + 
VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* zero buffer */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_NONE, + .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buf_vk->buf, + .size = buf_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + vk->CmdFillBuffer(exec->buf, buf_vk->buf, 0, buf_vk->size, 0x0); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buf_vk->buf, + .size = buf_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd, 0, 2, 0, + buf_vk, 0, buf_vk->size, + VK_FORMAT_UNDEFINED)); + + ff_vk_exec_bind_shader(vkctx, exec, &s->shd); + + vk->CmdDispatch(exec->buf, + FFALIGN(in->width, s->shd.lg_size[0]) / s->shd.lg_size[0], + FFALIGN(in->height, s->shd.lg_size[1]) / s->shd.lg_size[1], + s->shd.lg_size[2]); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .dstAccessMask = VK_ACCESS_HOST_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buf_vk->buf, + .size = buf_vk->size, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1, + }); + + RET(ff_vk_exec_submit(vkctx, exec)); + ff_vk_exec_wait(vkctx, exec); + score = evaluate(ctx, sad); + +done: + snprintf(str, sizeof(str), "%0.3f", s->prev_mafd); + av_dict_set(&in->metadata, "lavfi.scd.mafd", str, 0); + snprintf(str, sizeof(str), "%0.3f", score); + av_dict_set(&in->metadata, "lavfi.scd.score", str, 0); + + if (score >= s->threshold) { + const char *pts = av_ts2timestr(in->pts, &link->time_base); + av_dict_set(&in->metadata, "lavfi.scd.time", pts, 0); + av_log(s, AV_LOG_INFO, "lavfi.scd.score: %.3f, lavfi.scd.time: %s\n", + score, pts); + } + + av_buffer_unref(&buf); + if (!s->sc_pass || score >= s->threshold) + return ff_filter_frame(outlink, in); + else { + av_frame_free(&in); + return 0; + } + +fail: + if (exec) + ff_vk_exec_discard_deps(&s->vkctx, exec); + av_frame_free(&in); + av_buffer_unref(&buf); + return err; +} + +static void scdet_vulkan_uninit(AVFilterContext *avctx) +{ + SceneDetectVulkanContext *s = avctx->priv; + FFVulkanContext *vkctx = &s->vkctx; + + av_frame_free(&s->prev); + 
av_frame_free(&s->cur); + + ff_vk_exec_pool_free(vkctx, &s->e); + ff_vk_shader_free(vkctx, &s->shd); + + av_buffer_pool_uninit(&s->det_buf_pool); + + ff_vk_uninit(&s->vkctx); + + s->initialized = 0; +} + +#define OFFSET(x) offsetof(SceneDetectVulkanContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption scdet_vulkan_options[] = { + { "threshold", "set scene change detect threshold", OFFSET(threshold), AV_OPT_TYPE_DOUBLE, {.dbl = 10.}, 0, 100., FLAGS }, + { "t", "set scene change detect threshold", OFFSET(threshold), AV_OPT_TYPE_DOUBLE, {.dbl = 10.}, 0, 100., FLAGS }, + { "sc_pass", "Set the flag to pass scene change frames", OFFSET(sc_pass), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, FLAGS }, + { "s", "Set the flag to pass scene change frames", OFFSET(sc_pass), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, FLAGS }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(scdet_vulkan); + +static const AVFilterPad scdet_vulkan_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = &scdet_vulkan_filter_frame, + .config_props = &ff_vk_filter_config_input, + }, +}; + +static const AVFilterPad scdet_vulkan_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = &ff_vk_filter_config_output, + }, +}; + +const FFFilter ff_vf_scdet_vulkan = { + .p.name = "scdet_vulkan", + .p.description = NULL_IF_CONFIG_SMALL("Detect video scene change"), + .p.priv_class = &scdet_vulkan_class, + .p.flags = AVFILTER_FLAG_HWDEVICE, + .priv_size = sizeof(SceneDetectVulkanContext), + .init = &ff_vk_filter_init, + .uninit = &scdet_vulkan_uninit, + FILTER_INPUTS(scdet_vulkan_inputs), + FILTER_OUTPUTS(scdet_vulkan_outputs), + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_VULKAN), + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c index 498660d7d0121..f8e5727bfcfa2 100644 --- a/libavfilter/x86/vf_spp.c +++ b/libavfilter/x86/vf_spp.c @@ -21,159 +21,9 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/crc.h" -#include "libavutil/x86/asm.h" #include "libavfilter/vf_spp.h" #if HAVE_MMX_INLINE -static void hardthresh_mmx(int16_t dst[64], const int16_t src[64], - int qp, const uint8_t *permutation) -{ - int bias = 0; //FIXME - unsigned int threshold1; - - threshold1 = qp * ((1<<4) - bias) - 1; - -#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ - "movq " #src0 ", %%mm0 \n" \ - "movq " #src1 ", %%mm1 \n" \ - "movq " #src2 ", %%mm2 \n" \ - "movq " #src3 ", %%mm3 \n" \ - "psubw %%mm4, %%mm0 \n" \ - "psubw %%mm4, %%mm1 \n" \ - "psubw %%mm4, %%mm2 \n" \ - "psubw %%mm4, %%mm3 \n" \ - "paddusw %%mm5, %%mm0 \n" \ - "paddusw %%mm5, %%mm1 \n" \ - "paddusw %%mm5, %%mm2 \n" \ - "paddusw %%mm5, %%mm3 \n" \ - "paddw %%mm6, %%mm0 \n" \ - "paddw %%mm6, %%mm1 \n" \ - "paddw %%mm6, %%mm2 \n" \ - "paddw %%mm6, %%mm3 \n" \ - "psubusw %%mm6, %%mm0 \n" \ - "psubusw %%mm6, %%mm1 \n" \ - "psubusw %%mm6, %%mm2 \n" \ - "psubusw %%mm6, %%mm3 \n" \ - "psraw $3, %%mm0 \n" \ - "psraw $3, %%mm1 \n" \ - "psraw $3, %%mm2 \n" \ - "psraw $3, %%mm3 \n" \ - \ - "movq %%mm0, %%mm7 \n" \ - "punpcklwd %%mm2, %%mm0 \n" /*A*/ \ - "punpckhwd %%mm2, %%mm7 \n" /*C*/ \ - "movq %%mm1, %%mm2 \n" \ - "punpcklwd %%mm3, %%mm1 \n" /*B*/ \ - "punpckhwd %%mm3, %%mm2 \n" /*D*/ \ - "movq %%mm0, %%mm3 \n" \ - "punpcklwd %%mm1, %%mm0 \n" /*A*/ \ - "punpckhwd %%mm7, %%mm3 \n" /*C*/ \ - "punpcklwd %%mm2, %%mm7 \n" /*B*/ \ - "punpckhwd %%mm2, %%mm1 \n" /*D*/ \ - \ - "movq %%mm0, " #dst0 " \n" \ - "movq 
%%mm7, " #dst1 " \n" \ - "movq %%mm3, " #dst2 " \n" \ - "movq %%mm1, " #dst3 " \n" - - __asm__ volatile( - "movd %2, %%mm4 \n" - "movd %3, %%mm5 \n" - "movd %4, %%mm6 \n" - "packssdw %%mm4, %%mm4 \n" - "packssdw %%mm5, %%mm5 \n" - "packssdw %%mm6, %%mm6 \n" - "packssdw %%mm4, %%mm4 \n" - "packssdw %%mm5, %%mm5 \n" - "packssdw %%mm6, %%mm6 \n" - REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0)) - REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) - REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) - REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) - : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed? - ); - dst[0] = (src[0] + 4) >> 3; -} - -static void softthresh_mmx(int16_t dst[64], const int16_t src[64], - int qp, const uint8_t *permutation) -{ - int bias = 0; //FIXME - unsigned int threshold1; - - threshold1 = qp*((1<<4) - bias) - 1; - -#undef REQUANT_CORE -#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ - "movq " #src0 ", %%mm0 \n" \ - "movq " #src1 ", %%mm1 \n" \ - "pxor %%mm6, %%mm6 \n" \ - "pxor %%mm7, %%mm7 \n" \ - "pcmpgtw %%mm0, %%mm6 \n" \ - "pcmpgtw %%mm1, %%mm7 \n" \ - "pxor %%mm6, %%mm0 \n" \ - "pxor %%mm7, %%mm1 \n" \ - "psubusw %%mm4, %%mm0 \n" \ - "psubusw %%mm4, %%mm1 \n" \ - "pxor %%mm6, %%mm0 \n" \ - "pxor %%mm7, %%mm1 \n" \ - "movq " #src2 ", %%mm2 \n" \ - "movq " #src3 ", %%mm3 \n" \ - "pxor %%mm6, %%mm6 \n" \ - "pxor %%mm7, %%mm7 \n" \ - "pcmpgtw %%mm2, %%mm6 \n" \ - "pcmpgtw %%mm3, %%mm7 \n" \ - "pxor %%mm6, %%mm2 \n" \ - "pxor %%mm7, %%mm3 \n" \ - "psubusw %%mm4, %%mm2 \n" \ - "psubusw %%mm4, %%mm3 \n" \ - "pxor %%mm6, %%mm2 \n" \ - "pxor %%mm7, %%mm3 \n" \ - \ - "paddsw %%mm5, %%mm0 \n" \ - "paddsw %%mm5, %%mm1 \n" \ - "paddsw %%mm5, %%mm2 \n" \ - "paddsw %%mm5, %%mm3 \n" \ - "psraw $3, %%mm0 \n" \ - "psraw $3, %%mm1 \n" \ - "psraw $3, %%mm2 \n" \ - "psraw $3, %%mm3 \n" \ - \ - "movq %%mm0, %%mm7 \n" \ - "punpcklwd %%mm2, %%mm0 \n" /*A*/ \ - "punpckhwd %%mm2, %%mm7 \n" /*C*/ \ - "movq %%mm1, %%mm2 \n" \ - "punpcklwd %%mm3, %%mm1 \n" /*B*/ \ - "punpckhwd %%mm3, %%mm2 \n" /*D*/ \ - "movq %%mm0, %%mm3 \n" \ - "punpcklwd %%mm1, %%mm0 \n" /*A*/ \ - "punpckhwd %%mm7, %%mm3 \n" /*C*/ \ - "punpcklwd %%mm2, %%mm7 \n" /*B*/ \ - "punpckhwd %%mm2, %%mm1 \n" /*D*/ \ - \ - "movq %%mm0, " #dst0 " \n" \ - "movq %%mm7, " #dst1 " \n" \ - "movq %%mm3, " #dst2 " \n" \ - "movq %%mm1, " #dst3 " \n" - - __asm__ volatile( - "movd %2, %%mm4 \n" - "movd %3, %%mm5 \n" - "packssdw %%mm4, %%mm4 \n" - "packssdw %%mm5, %%mm5 \n" - "packssdw %%mm4, %%mm4 \n" - "packssdw %%mm5, %%mm5 \n" - REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0)) - REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) - REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) - REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) - : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed? 
- ); - - dst[0] = (src[0] + 4) >> 3; -} - static void store_slice_mmx(uint8_t *dst, const int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale, @@ -223,20 +73,7 @@ av_cold void ff_spp_init_x86(SPPContext *s) int cpu_flags = av_get_cpu_flags(); if (cpu_flags & AV_CPU_FLAG_MMX) { - static const uint32_t mmx_idct_perm_crc = 0xe5e8adc4; - uint32_t idct_perm_crc = - av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, - s->dct->idct_permutation, - sizeof(s->dct->idct_permutation)); - int64_t bps; s->store_slice = store_slice_mmx; - av_opt_get_int(s->dct, "bits_per_sample", 0, &bps); - if (bps <= 8 && idct_perm_crc == mmx_idct_perm_crc) { - switch (s->mode) { - case 0: s->requantize = hardthresh_mmx; break; - case 1: s->requantize = softthresh_mmx; break; - } - } } #endif } diff --git a/libavformat/Makefile b/libavformat/Makefile index 6c9992adab689..9884b4a4cb5a7 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -62,6 +62,7 @@ OBJS-$(CONFIG_RTPDEC) += rdt.o \ rtpdec_mpeg12.o \ rtpdec_mpeg4.o \ rtpdec_mpegts.o \ + rtpdec_opus.o \ rtpdec_qcelp.o \ rtpdec_qdm2.o \ rtpdec_qt.o \ @@ -637,6 +638,7 @@ OBJS-$(CONFIG_WEBM_CHUNK_MUXER) += webm_chunk.o OBJS-$(CONFIG_WEBP_MUXER) += webpenc.o OBJS-$(CONFIG_WEBVTT_DEMUXER) += webvttdec.o subtitles.o OBJS-$(CONFIG_WEBVTT_MUXER) += webvttenc.o +OBJS-$(CONFIG_WHIP_MUXER) += whip.o avc.o http.o srtp.o tls_openssl.o OBJS-$(CONFIG_WSAUD_DEMUXER) += westwood_aud.o OBJS-$(CONFIG_WSAUD_MUXER) += westwood_audenc.o OBJS-$(CONFIG_WSD_DEMUXER) += wsddec.o rawdec.o diff --git a/libavformat/allformats.c b/libavformat/allformats.c index b5a23f9c179be..17215d733ded7 100644 --- a/libavformat/allformats.c +++ b/libavformat/allformats.c @@ -94,7 +94,6 @@ extern const FFInputFormat ff_av1_demuxer; extern const FFInputFormat ff_avi_demuxer; extern const FFOutputFormat ff_avi_muxer; extern const FFOutputFormat ff_avif_muxer; -extern const FFInputFormat ff_avisynth_demuxer; extern const FFOutputFormat ff_avm2_muxer; extern const FFInputFormat ff_avr_demuxer; extern const FFInputFormat ff_avs_demuxer; @@ -155,7 +154,6 @@ extern const FFInputFormat ff_dv_demuxer; extern const FFOutputFormat ff_dv_muxer; extern const FFInputFormat ff_dvbsub_demuxer; extern const FFInputFormat ff_dvbtxt_demuxer; -extern const FFInputFormat ff_dvdvideo_demuxer; extern const FFInputFormat ff_dxa_demuxer; extern const FFInputFormat ff_ea_demuxer; extern const FFInputFormat ff_ea_cdata_demuxer; @@ -517,6 +515,7 @@ extern const FFOutputFormat ff_webp_muxer; extern const FFInputFormat ff_webvtt_demuxer; extern const FFOutputFormat ff_webvtt_muxer; extern const FFInputFormat ff_wsaud_demuxer; +extern const FFOutputFormat ff_whip_muxer; extern const FFOutputFormat ff_wsaud_muxer; extern const FFInputFormat ff_wsd_demuxer; extern const FFInputFormat ff_wsvqa_demuxer; @@ -573,7 +572,9 @@ extern const FFInputFormat ff_image_xpm_pipe_demuxer; extern const FFInputFormat ff_image_xwd_pipe_demuxer; /* external libraries */ +extern const FFInputFormat ff_avisynth_demuxer; extern const FFOutputFormat ff_chromaprint_muxer; +extern const FFInputFormat ff_dvdvideo_demuxer; extern const FFInputFormat ff_libgme_demuxer; extern const FFInputFormat ff_libmodplug_demuxer; extern const FFInputFormat ff_libopenmpt_demuxer; diff --git a/libavformat/avformat.h b/libavformat/avformat.h index 498c3020a5852..2034d2aecc14f 100644 --- a/libavformat/avformat.h +++ b/libavformat/avformat.h @@ -1870,10 +1870,6 @@ typedef struct AVFormatContext { /** * A callback for closing the streams opened with 
AVFormatContext.io_open(). * - * Using this is preferred over io_close, because this can return an error. - * Therefore this callback is used instead of io_close by the generic - * libavformat code if io_close is NULL or the default. - * * @param s the format context * @param pb IO context to be closed and freed * @return 0 on success, a negative AVERROR code on failure diff --git a/libavformat/avio.c b/libavformat/avio.c index d109f3adff03d..b146ac9f19234 100644 --- a/libavformat/avio.c +++ b/libavformat/avio.c @@ -339,8 +339,9 @@ static const struct URLProtocol *url_find_protocol(const char *filename) } } av_freep(&protocols); - if (av_strstart(filename, "https:", NULL) || av_strstart(filename, "tls:", NULL)) - av_log(NULL, AV_LOG_WARNING, "https protocol not found, recompile FFmpeg with " + if (av_strstart(filename, "https:", NULL) || av_strstart(filename, "tls:", NULL) || + av_strstart(filename, "dtls:", NULL)) + av_log(NULL, AV_LOG_WARNING, "https or dtls protocol not found, recompile FFmpeg with " "openssl, gnutls or securetransport enabled.\n"); return NULL; diff --git a/libavformat/demux.c b/libavformat/demux.c index 2795863567361..ecd4f40da9bcc 100644 --- a/libavformat/demux.c +++ b/libavformat/demux.c @@ -383,11 +383,10 @@ void avformat_close_input(AVFormatContext **ps) if (ffifmt(s->iformat)->read_close) ffifmt(s->iformat)->read_close(s); + ff_format_io_close(s, &pb); avformat_free_context(s); *ps = NULL; - - avio_close(pb); } static void force_codec_ids(AVFormatContext *s, AVStream *st) diff --git a/libavformat/dhav.c b/libavformat/dhav.c index b2ead99609cb5..d9db775802d36 100644 --- a/libavformat/dhav.c +++ b/libavformat/dhav.c @@ -22,6 +22,7 @@ #include <time.h> +#include "libavutil/intreadwrite.h" #include "libavutil/mem.h" #include "libavutil/parseutils.h" #include "avio_internal.h" @@ -232,37 +233,60 @@ static void get_timeinfo(unsigned date, struct tm *timeinfo) timeinfo->tm_sec = sec; } +#define MAX_DURATION_BUFFER_SIZE (1024*1024) + static int64_t get_duration(AVFormatContext *s) { - DHAVContext *dhav = s->priv_data; int64_t start_pos = avio_tell(s->pb); + int64_t end_pos = -1; int64_t start = 0, end = 0; struct tm timeinfo; - int max_interations = 100000; + uint8_t *end_buffer; + int64_t end_buffer_size; + int64_t end_buffer_pos; + int64_t offset; + unsigned date; if (!s->pb->seekable) return 0; - avio_seek(s->pb, avio_size(s->pb) - 8, SEEK_SET); - while (avio_tell(s->pb) > 12 && max_interations--) { - if (avio_rl32(s->pb) == MKTAG('d','h','a','v')) { - int64_t seek_back = avio_rl32(s->pb); + if (start_pos + 16 > avio_size(s->pb)) return 0; - avio_seek(s->pb, -seek_back, SEEK_CUR); - read_chunk(s); - get_timeinfo(dhav->date, &timeinfo); - end = av_timegm(&timeinfo) * 1000LL; + avio_skip(s->pb, 16); + date = avio_rl32(s->pb); + get_timeinfo(date, &timeinfo); + start = av_timegm(&timeinfo) * 1000LL; + + end_buffer_size = FFMIN(MAX_DURATION_BUFFER_SIZE, avio_size(s->pb)); + end_buffer = av_malloc(end_buffer_size); + if (!end_buffer) { + avio_seek(s->pb, start_pos, SEEK_SET); + return 0; + } + end_buffer_pos = avio_size(s->pb) - end_buffer_size; + avio_seek(s->pb, end_buffer_pos, SEEK_SET); + avio_read(s->pb, end_buffer, end_buffer_size); + + offset = end_buffer_size - 8; + while (offset > 0) { + if (AV_RL32(end_buffer + offset) == MKTAG('d','h','a','v')) { + int64_t seek_back = AV_RL32(end_buffer + offset + 4); + end_pos = end_buffer_pos + offset - seek_back + 8; break; } else { - avio_seek(s->pb, -12, SEEK_CUR); + offset -= 9; } } - avio_seek(s->pb, start_pos, SEEK_SET); + if 
(end_pos < 0 || end_pos + 16 > end_buffer_pos + end_buffer_size) { + avio_seek(s->pb, start_pos, SEEK_SET); + return 0; + } - read_chunk(s); - get_timeinfo(dhav->date, &timeinfo); - start = av_timegm(&timeinfo) * 1000LL; + date = AV_RL32(end_buffer + (end_pos - end_buffer_pos) + 16); + get_timeinfo(date, &timeinfo); + end = av_timegm(&timeinfo) * 1000LL; avio_seek(s->pb, start_pos, SEEK_SET); diff --git a/libavformat/http.c b/libavformat/http.c index f7b2a8a02933d..ff63c259699a8 100644 --- a/libavformat/http.c +++ b/libavformat/http.c @@ -562,6 +562,12 @@ int ff_http_averror(int status_code, int default_averror) return default_averror; } +const char* ff_http_get_new_location(URLContext *h) +{ + HTTPContext *s = h->priv_data; + return s->new_location; +} + static int http_write_reply(URLContext* h, int status_code) { int ret, body = 0, reply_code, message_len; diff --git a/libavformat/http.h b/libavformat/http.h index 5f650ef143f77..d1b691826bf3b 100644 --- a/libavformat/http.h +++ b/libavformat/http.h @@ -62,4 +62,6 @@ int ff_http_do_new_request2(URLContext *h, const char *uri, AVDictionary **optio int ff_http_averror(int status_code, int default_averror); +const char* ff_http_get_new_location(URLContext *h); + #endif /* AVFORMAT_HTTP_H */ diff --git a/libavformat/imfdec.c b/libavformat/imfdec.c index a86b4763ff888..b4df37daa3574 100644 --- a/libavformat/imfdec.c +++ b/libavformat/imfdec.c @@ -380,6 +380,7 @@ static int open_track_resource_context(AVFormatContext *s, track_resource->ctx->io_open = s->io_open; track_resource->ctx->io_close2 = s->io_close2; + track_resource->ctx->opaque = s->opaque; track_resource->ctx->flags |= s->flags & ~AVFMT_FLAG_CUSTOM_IO; if ((ret = ff_copy_whiteblacklists(track_resource->ctx, s)) < 0) diff --git a/libavformat/matroska.c b/libavformat/matroska.c index bbad9a7f549a0..60584e268731d 100644 --- a/libavformat/matroska.c +++ b/libavformat/matroska.c @@ -82,6 +82,7 @@ const CodecTags ff_mkv_codec_tags[]={ {"V_AVS3" , AV_CODEC_ID_AVS3}, {"V_DIRAC" , AV_CODEC_ID_DIRAC}, {"V_FFV1" , AV_CODEC_ID_FFV1}, + {"V_JPEG2000" , AV_CODEC_ID_JPEG2000}, {"V_MJPEG" , AV_CODEC_ID_MJPEG}, {"V_MPEG1" , AV_CODEC_ID_MPEG1VIDEO}, {"V_MPEG2" , AV_CODEC_ID_MPEG2VIDEO}, diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c index 29e35e6dd4db0..da5166319e90d 100644 --- a/libavformat/matroskadec.c +++ b/libavformat/matroskadec.c @@ -2877,6 +2877,11 @@ static int mkv_parse_video_codec(MatroskaTrack *track, AVCodecParameters *par, { if (!strcmp(track->codec_id, "V_MS/VFW/FOURCC") && track->codec_priv.size >= 40) { + uint32_t size = AV_RL32A(track->codec_priv.data); + // VFW extradata is padded to an even length, yet + // the size field contains the real length. 
+ if (size & 1 && size == track->codec_priv.size - 1) + --track->codec_priv.size; track->ms_compat = 1; par->bits_per_coded_sample = AV_RL16(track->codec_priv.data + 14); par->codec_tag = AV_RL32(track->codec_priv.data + 16); @@ -3824,9 +3829,6 @@ static int matroska_parse_webvtt(MatroskaDemuxContext *matroska, text_len = len; } - if (text_len <= 0) - return AVERROR_INVALIDDATA; - err = av_new_packet(pkt, text_len); if (err < 0) { return err; diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c index 6d0d791f180ee..408890fa89914 100644 --- a/libavformat/matroskaenc.c +++ b/libavformat/matroskaenc.c @@ -1960,8 +1960,8 @@ static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv, // look for a codec ID string specific to mkv to use, // if none are found, use AVI codes - if (par->codec_id == AV_CODEC_ID_FFV1) { - /* FFV1 is actually supported natively in Matroska, + if (par->codec_id == AV_CODEC_ID_JPEG2000) { + /* JPEG2000 is actually supported natively in Matroska, * yet we use the VfW way to mux it for compatibility * with old demuxers. (FIXME: Are they really important?) */ } else if (par->codec_id != AV_CODEC_ID_RAWVIDEO || par->codec_tag) { diff --git a/libavformat/movenc.c b/libavformat/movenc.c index 4bc8bd1b2ab76..402611e81ed1e 100644 --- a/libavformat/movenc.c +++ b/libavformat/movenc.c @@ -3966,7 +3966,7 @@ static int mov_write_edts_tag(AVIOContext *pb, MOVMuxContext *mov, int flags = 0; if (track->entry) { - if (start_dts != track->cluster[0].dts || start_ct != track->cluster[0].cts) { + if (start_dts != track->cluster[0].dts || (start_ct != track->cluster[0].cts && track->cluster[0].dts >= 0)) { av_log(mov->fc, AV_LOG_DEBUG, "EDTS using dts:%"PRId64" cts:%d instead of dts:%"PRId64" cts:%"PRId64" tid:%d\n", @@ -6504,14 +6504,14 @@ static int mov_flush_fragment(AVFormatContext *s, int force) av_rescale(mov->tracks[first_track].cluster[0].dts, AV_TIME_BASE, mov->tracks[first_track].timescale), (has_video ? starts_with_key : mov->tracks[first_track].cluster[0].flags & MOV_SYNC_SAMPLE) ? 
AVIO_DATA_MARKER_SYNC_POINT : AVIO_DATA_MARKER_BOUNDARY_POINT); - for (i = 0; i < mov->nb_tracks; i++) { + for (i = first_track; i < mov->nb_tracks; i++) { MOVTrack *track = &mov->tracks[i]; int buf_size, write_moof = 1, moof_tracks = -1; uint8_t *buf; + if (!track->entry) + continue; if (mov->flags & FF_MOV_FLAG_SEPARATE_MOOF) { - if (!track->entry) - continue; mdat_size = avio_tell(track->mdat_buf); moof_tracks = i; } else { @@ -6928,7 +6928,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt) trk->flags |= MOV_TRACK_CTTS; trk->cluster[trk->entry].cts = pkt->pts - pkt->dts; trk->cluster[trk->entry].flags = 0; - if (trk->start_cts == AV_NOPTS_VALUE) + if (trk->start_cts == AV_NOPTS_VALUE || (pkt->dts <= 0 && trk->start_cts > pkt->pts - pkt->dts)) trk->start_cts = pkt->pts - pkt->dts; if (trk->end_pts == AV_NOPTS_VALUE) trk->end_pts = trk->cluster[trk->entry].dts + @@ -7731,6 +7731,12 @@ static int mov_init(AVFormatContext *s) FF_MOV_FLAG_FRAG_EVERY_FRAME)) mov->flags |= FF_MOV_FLAG_FRAGMENT; + if (mov->flags & FF_MOV_FLAG_HYBRID_FRAGMENTED && + mov->flags & FF_MOV_FLAG_FASTSTART) { + av_log(s, AV_LOG_ERROR, "Setting both hybrid_fragmented and faststart is not supported.\n"); + return AVERROR(EINVAL); + } + /* Set other implicit flags immediately */ if (mov->flags & FF_MOV_FLAG_HYBRID_FRAGMENTED) mov->flags |= FF_MOV_FLAG_FRAGMENT; diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index 54594b3a11bae..deb69a0548bdd 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -940,6 +940,8 @@ static int mpegts_set_stream_info(AVStream *st, PESContext *pes, mpegts_find_stream_type(st, pes->stream_type, ISO_types); if (pes->stream_type == STREAM_TYPE_AUDIO_MPEG2 || pes->stream_type == STREAM_TYPE_AUDIO_AAC) sti->request_probe = 50; + if (pes->stream_type == STREAM_TYPE_PRIVATE_DATA) + sti->request_probe = AVPROBE_SCORE_STREAM_RETRY; if ((prog_reg_desc == AV_RL32("HDMV") || prog_reg_desc == AV_RL32("HDPR")) && st->codecpar->codec_id == AV_CODEC_ID_NONE) { @@ -2508,7 +2510,7 @@ static void pmt_cb(MpegTSFilter *filter, const uint8_t *section, int section_len if (!st) goto out; - if (pes && !pes->stream_type) + if (pes && pes->stream_type != stream_type) mpegts_set_stream_info(st, pes, stream_type, prog_reg_desc); add_pid_to_program(prg, pid); diff --git a/libavformat/oggdec.c b/libavformat/oggdec.c index 9baf8040a9017..da3ef815db237 100644 --- a/libavformat/oggdec.c +++ b/libavformat/oggdec.c @@ -77,6 +77,7 @@ static void free_stream(AVFormatContext *s, int i) av_freep(&stream->private); av_freep(&stream->new_metadata); + av_freep(&stream->new_extradata); } //FIXME We could avoid some structure duplication @@ -239,10 +240,6 @@ static int ogg_replace_stream(AVFormatContext *s, uint32_t serial, char *magic, os->start_trimming = 0; os->end_trimming = 0; - /* Chained files have extradata as a new packet */ - if (codec == &ff_opus_codec) - os->header = -1; - return i; } @@ -892,6 +889,16 @@ static int ogg_read_packet(AVFormatContext *s, AVPacket *pkt) os->new_metadata_size = 0; } + if (os->new_extradata) { + ret = av_packet_add_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, + os->new_extradata, os->new_extradata_size); + if (ret < 0) + return ret; + + os->new_extradata = NULL; + os->new_extradata_size = 0; + } + return psize; } diff --git a/libavformat/oggdec.h b/libavformat/oggdec.h index bc670d0f1e58e..c15fbe738eb4a 100644 --- a/libavformat/oggdec.h +++ b/libavformat/oggdec.h @@ -42,8 +42,8 @@ struct ogg_codec { * Attempt to process a packet as a data packet * @return < 0 (AVERROR) 
code or -1 on error * == 0 if the packet was a regular data packet. - * == 0 or 1 if the packet was a header from a chained bitstream. - * (1 will cause the packet to be skiped in calling code (ogg_packet()) + * == 1 if the packet was a header from a chained bitstream. + * This will cause the packet to be skiped in calling code (ogg_packet() */ int (*packet)(AVFormatContext *, int); /** @@ -94,6 +94,8 @@ struct ogg_stream { int end_trimming; ///< set the number of packets to drop from the end uint8_t *new_metadata; size_t new_metadata_size; + uint8_t *new_extradata; + size_t new_extradata_size; void *private; }; diff --git a/libavformat/oggparseflac.c b/libavformat/oggparseflac.c index f25ed9cc15544..d66b85b09e833 100644 --- a/libavformat/oggparseflac.c +++ b/libavformat/oggparseflac.c @@ -27,6 +27,8 @@ #include "oggdec.h" #define OGG_FLAC_METADATA_TYPE_STREAMINFO 0x7F +#define OGG_FLAC_MAGIC "\177FLAC" +#define OGG_FLAC_MAGIC_SIZE sizeof(OGG_FLAC_MAGIC)-1 static int flac_header (AVFormatContext * s, int idx) @@ -78,6 +80,27 @@ flac_header (AVFormatContext * s, int idx) return 1; } +static int +flac_packet (AVFormatContext * s, int idx) +{ + struct ogg *ogg = s->priv_data; + struct ogg_stream *os = ogg->streams + idx; + + if (os->psize > OGG_FLAC_MAGIC_SIZE && + !memcmp( + os->buf + os->pstart, + OGG_FLAC_MAGIC, + OGG_FLAC_MAGIC_SIZE)) + return 1; + + if (os->psize > 0 && + ((os->buf[os->pstart] & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT)) { + return 1; + } + + return 0; +} + static int old_flac_header (AVFormatContext * s, int idx) { @@ -127,10 +150,11 @@ old_flac_header (AVFormatContext * s, int idx) } const struct ogg_codec ff_flac_codec = { - .magic = "\177FLAC", - .magicsize = 5, + .magic = OGG_FLAC_MAGIC, + .magicsize = OGG_FLAC_MAGIC_SIZE, .header = flac_header, .nb_header = 2, + .packet = flac_packet, }; const struct ogg_codec ff_old_flac_codec = { diff --git a/libavformat/oggparseopus.c b/libavformat/oggparseopus.c index 218e9df581406..65b93b405324c 100644 --- a/libavformat/oggparseopus.c +++ b/libavformat/oggparseopus.c @@ -36,6 +36,51 @@ struct oggopus_private { #define OPUS_SEEK_PREROLL_MS 80 #define OPUS_HEAD_SIZE 19 +static int parse_opus_header(AVFormatContext *avf, AVStream *st, struct ogg_stream *os, + struct oggopus_private *priv, uint8_t *packet, + size_t psize) +{ + int channels; + int ret; + + if (psize < OPUS_HEAD_SIZE || (AV_RL8(packet + 8) & 0xF0) != 0) + return AVERROR_INVALIDDATA; + + st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + st->codecpar->codec_id = AV_CODEC_ID_OPUS; + + channels = AV_RL8(packet + 9); + if (st->codecpar->ch_layout.nb_channels && + channels != st->codecpar->ch_layout.nb_channels) { + av_log(avf, AV_LOG_ERROR, "Channel change is not supported\n"); + return AVERROR_PATCHWELCOME; + } + + st->codecpar->ch_layout.nb_channels = channels; + + priv->pre_skip = AV_RL16(packet + 10); + st->codecpar->initial_padding = priv->pre_skip; + os->start_trimming = priv->pre_skip; + /*orig_sample_rate = AV_RL32(packet + 12);*/ + /*gain = AV_RL16(packet + 16);*/ + /*channel_map = AV_RL8 (packet + 18);*/ + + ret = ff_alloc_extradata(st->codecpar, os->psize); + if (ret < 0) + return ret; + + memcpy(st->codecpar->extradata, packet, os->psize); + + st->codecpar->sample_rate = 48000; + st->codecpar->seek_preroll = av_rescale(OPUS_SEEK_PREROLL_MS, + st->codecpar->sample_rate, 1000); + avpriv_set_pts_info(st, 64, 1, 48000); + + priv->need_comments = 1; + + return 1; +} + static int opus_header(AVFormatContext *avf, int idx) { struct ogg *ogg = avf->priv_data; @@ 
-43,7 +88,6 @@ static int opus_header(AVFormatContext *avf, int idx) AVStream *st = avf->streams[idx]; struct oggopus_private *priv = os->private; uint8_t *packet = os->buf + os->pstart; - int ret; if (!priv) { priv = os->private = av_mallocz(sizeof(*priv)); @@ -51,32 +95,8 @@ static int opus_header(AVFormatContext *avf, int idx) return AVERROR(ENOMEM); } - if (os->flags & OGG_FLAG_BOS) { - if (os->psize < OPUS_HEAD_SIZE || (AV_RL8(packet + 8) & 0xF0) != 0) - return AVERROR_INVALIDDATA; - st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; - st->codecpar->codec_id = AV_CODEC_ID_OPUS; - st->codecpar->ch_layout.nb_channels = AV_RL8(packet + 9); - - priv->pre_skip = AV_RL16(packet + 10); - st->codecpar->initial_padding = priv->pre_skip; - os->start_trimming = priv->pre_skip; - /*orig_sample_rate = AV_RL32(packet + 12);*/ - /*gain = AV_RL16(packet + 16);*/ - /*channel_map = AV_RL8 (packet + 18);*/ - - if ((ret = ff_alloc_extradata(st->codecpar, os->psize)) < 0) - return ret; - - memcpy(st->codecpar->extradata, packet, os->psize); - - st->codecpar->sample_rate = 48000; - st->codecpar->seek_preroll = av_rescale(OPUS_SEEK_PREROLL_MS, - st->codecpar->sample_rate, 1000); - avpriv_set_pts_info(st, 64, 1, 48000); - priv->need_comments = 1; - return 1; - } + if (os->flags & OGG_FLAG_BOS) + return parse_opus_header(avf, st, os, priv, packet, os->psize); if (priv->need_comments) { if (os->psize < 8 || memcmp(packet, "OpusTags", 8)) @@ -125,6 +145,19 @@ static int opus_packet(AVFormatContext *avf, int idx) return AVERROR_INVALIDDATA; } + if (os->psize > 8 && !memcmp(packet, "OpusHead", 8)) { + ret = parse_opus_header(avf, st, os, priv, packet, os->psize); + if (ret < 0) + return ret; + + return 1; + } + + if (os->psize > 8 && !memcmp(packet, "OpusTags", 8)) { + priv->need_comments = 0; + return 1; + } + if ((!os->lastpts || os->lastpts == AV_NOPTS_VALUE) && !(os->flags & OGG_FLAG_EOS)) { int seg, d; int duration; diff --git a/libavformat/oggparsevorbis.c b/libavformat/oggparsevorbis.c index 9f50ab9ffc5ac..62cc2da6de70a 100644 --- a/libavformat/oggparsevorbis.c +++ b/libavformat/oggparsevorbis.c @@ -293,6 +293,62 @@ static int vorbis_update_metadata(AVFormatContext *s, int idx) return ret; } +static int vorbis_parse_header(AVFormatContext *s, AVStream *st, + const uint8_t *p, unsigned int psize) +{ + unsigned blocksize, bs0, bs1; + int srate; + int channels; + + if (psize != 30) + return AVERROR_INVALIDDATA; + + p += 7; /* skip "\001vorbis" tag */ + + if (bytestream_get_le32(&p) != 0) /* vorbis_version */ + return AVERROR_INVALIDDATA; + + channels = bytestream_get_byte(&p); + if (st->codecpar->ch_layout.nb_channels && + channels != st->codecpar->ch_layout.nb_channels) { + av_log(s, AV_LOG_ERROR, "Channel change is not supported\n"); + return AVERROR_PATCHWELCOME; + } + st->codecpar->ch_layout.nb_channels = channels; + srate = bytestream_get_le32(&p); + p += 4; // skip maximum bitrate + st->codecpar->bit_rate = bytestream_get_le32(&p); // nominal bitrate + p += 4; // skip minimum bitrate + + blocksize = bytestream_get_byte(&p); + bs0 = blocksize & 15; + bs1 = blocksize >> 4; + + if (bs0 > bs1) + return AVERROR_INVALIDDATA; + if (bs0 < 6 || bs1 > 13) + return AVERROR_INVALIDDATA; + + if (bytestream_get_byte(&p) != 1) /* framing_flag */ + return AVERROR_INVALIDDATA; + + st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + st->codecpar->codec_id = AV_CODEC_ID_VORBIS; + + if (srate > 0) { + if (st->codecpar->sample_rate && + srate != st->codecpar->sample_rate) { + av_log(s, AV_LOG_ERROR, "Sample rate change is not 
supported\n"); + return AVERROR_PATCHWELCOME; + } + + st->codecpar->sample_rate = srate; + avpriv_set_pts_info(st, 64, 1, srate); + } + + return 1; +} + static int vorbis_header(AVFormatContext *s, int idx) { struct ogg *ogg = s->priv_data; @@ -329,50 +385,10 @@ static int vorbis_header(AVFormatContext *s, int idx) priv->packet[pkt_type >> 1] = av_memdup(os->buf + os->pstart, os->psize); if (!priv->packet[pkt_type >> 1]) return AVERROR(ENOMEM); - if (os->buf[os->pstart] == 1) { - const uint8_t *p = os->buf + os->pstart + 7; /* skip "\001vorbis" tag */ - unsigned blocksize, bs0, bs1; - int srate; - int channels; - - if (os->psize != 30) - return AVERROR_INVALIDDATA; - - if (bytestream_get_le32(&p) != 0) /* vorbis_version */ - return AVERROR_INVALIDDATA; - - channels = bytestream_get_byte(&p); - if (st->codecpar->ch_layout.nb_channels && - channels != st->codecpar->ch_layout.nb_channels) { - av_log(s, AV_LOG_ERROR, "Channel change is not supported\n"); - return AVERROR_PATCHWELCOME; - } - st->codecpar->ch_layout.nb_channels = channels; - srate = bytestream_get_le32(&p); - p += 4; // skip maximum bitrate - st->codecpar->bit_rate = bytestream_get_le32(&p); // nominal bitrate - p += 4; // skip minimum bitrate - - blocksize = bytestream_get_byte(&p); - bs0 = blocksize & 15; - bs1 = blocksize >> 4; - - if (bs0 > bs1) - return AVERROR_INVALIDDATA; - if (bs0 < 6 || bs1 > 13) - return AVERROR_INVALIDDATA; - - if (bytestream_get_byte(&p) != 1) /* framing_flag */ - return AVERROR_INVALIDDATA; - - st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; - st->codecpar->codec_id = AV_CODEC_ID_VORBIS; - - if (srate > 0) { - st->codecpar->sample_rate = srate; - avpriv_set_pts_info(st, 64, 1, srate); - } - } else if (os->buf[os->pstart] == 3) { + if (pkt_type == 1) + return vorbis_parse_header(s, st, os->buf + os->pstart, os->psize); + + if (pkt_type == 3) { if (vorbis_update_metadata(s, idx) >= 0 && priv->len[1] > 10) { unsigned new_len; diff --git a/libavformat/protocols.c b/libavformat/protocols.c index 93a6d67261e8f..d394454d414f3 100644 --- a/libavformat/protocols.c +++ b/libavformat/protocols.c @@ -62,6 +62,7 @@ extern const URLProtocol ff_subfile_protocol; extern const URLProtocol ff_tee_protocol; extern const URLProtocol ff_tcp_protocol; extern const URLProtocol ff_tls_protocol; +extern const URLProtocol ff_dtls_protocol; extern const URLProtocol ff_udp_protocol; extern const URLProtocol ff_udplite_protocol; extern const URLProtocol ff_unix_protocol; diff --git a/libavformat/rtpdec.c b/libavformat/rtpdec.c index a7d5a79a83c14..ebd5402bce89f 100644 --- a/libavformat/rtpdec.c +++ b/libavformat/rtpdec.c @@ -61,12 +61,6 @@ static const RTPDynamicProtocolHandler speex_dynamic_handler = { .codec_id = AV_CODEC_ID_SPEEX, }; -static const RTPDynamicProtocolHandler opus_dynamic_handler = { - .enc_name = "opus", - .codec_type = AVMEDIA_TYPE_AUDIO, - .codec_id = AV_CODEC_ID_OPUS, -}; - static const RTPDynamicProtocolHandler t140_dynamic_handler = { /* RFC 4103 */ .enc_name = "t140", .codec_type = AVMEDIA_TYPE_SUBTITLE, @@ -125,7 +119,7 @@ static const RTPDynamicProtocolHandler *const rtp_dynamic_protocol_handler_list[ &ff_vp9_dynamic_handler, &gsm_dynamic_handler, &l24_dynamic_handler, - &opus_dynamic_handler, + &ff_opus_dynamic_handler, &realmedia_mp3_dynamic_handler, &speex_dynamic_handler, &t140_dynamic_handler, @@ -531,43 +525,6 @@ int ff_rtp_send_rtcp_feedback(RTPDemuxContext *s, URLContext *fd, return 0; } -static int opus_write_extradata(AVCodecParameters *codecpar) -{ - uint8_t *bs; - int ret; - - /* This 
function writes an extradata with a channel mapping family of 0. - * This mapping family only supports mono and stereo layouts. And RFC7587 - * specifies that the number of channels in the SDP must be 2. - */ - if (codecpar->ch_layout.nb_channels > 2) { - return AVERROR_INVALIDDATA; - } - - ret = ff_alloc_extradata(codecpar, 19); - if (ret < 0) - return ret; - - bs = (uint8_t *)codecpar->extradata; - - /* Opus magic */ - bytestream_put_buffer(&bs, "OpusHead", 8); - /* Version */ - bytestream_put_byte (&bs, 0x1); - /* Channel count */ - bytestream_put_byte (&bs, codecpar->ch_layout.nb_channels); - /* Pre skip */ - bytestream_put_le16 (&bs, 0); - /* Input sample rate */ - bytestream_put_le32 (&bs, 48000); - /* Output gain */ - bytestream_put_le16 (&bs, 0x0); - /* Mapping family */ - bytestream_put_byte (&bs, 0x0); - - return 0; -} - /** * open a new RTP parse context for stream 'st'. 'st' can be NULL for * MPEG-2 TS streams. @@ -576,7 +533,6 @@ RTPDemuxContext *ff_rtp_parse_open(AVFormatContext *s1, AVStream *st, int payload_type, int queue_size) { RTPDemuxContext *s; - int ret; s = av_mallocz(sizeof(RTPDemuxContext)); if (!s) @@ -600,16 +556,13 @@ RTPDemuxContext *ff_rtp_parse_open(AVFormatContext *s1, AVStream *st, if (st->codecpar->sample_rate == 8000) st->codecpar->sample_rate = 16000; break; - case AV_CODEC_ID_OPUS: - ret = opus_write_extradata(st->codecpar); - if (ret < 0) { - av_log(s1, AV_LOG_ERROR, - "Error creating opus extradata: %s\n", - av_err2str(ret)); - av_free(s); - return NULL; - } + case AV_CODEC_ID_PCM_MULAW: { + AVCodecParameters *par = st->codecpar; + par->bits_per_coded_sample = av_get_bits_per_sample(par->codec_id); + par->block_align = par->ch_layout.nb_channels * par->bits_per_coded_sample / 8; + par->bit_rate = par->block_align * 8LL * par->sample_rate; break; + } default: break; } diff --git a/libavformat/rtpdec_formats.h b/libavformat/rtpdec_formats.h index 72a8f16a90999..1ff2a72d2ae00 100644 --- a/libavformat/rtpdec_formats.h +++ b/libavformat/rtpdec_formats.h @@ -77,6 +77,7 @@ extern const RTPDynamicProtocolHandler ff_mpeg4_generic_dynamic_handler; extern const RTPDynamicProtocolHandler ff_mpegts_dynamic_handler; extern const RTPDynamicProtocolHandler ff_ms_rtp_asf_pfa_handler; extern const RTPDynamicProtocolHandler ff_ms_rtp_asf_pfv_handler; +extern const RTPDynamicProtocolHandler ff_opus_dynamic_handler; extern const RTPDynamicProtocolHandler ff_qcelp_dynamic_handler; extern const RTPDynamicProtocolHandler ff_qdm2_dynamic_handler; extern const RTPDynamicProtocolHandler ff_qt_rtp_aud_handler; diff --git a/libavformat/rtpdec_opus.c b/libavformat/rtpdec_opus.c new file mode 100644 index 0000000000000..4ed9d8842bd74 --- /dev/null +++ b/libavformat/rtpdec_opus.c @@ -0,0 +1,151 @@ +/* + * RTP Depacketization of Opus, RFC 7587 + * Copyright (c) 2025 Jonathan Baudanza + * Copyright (c) 2022 Erik Linge + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/bytestream.h" +#include "libavutil/mem.h" +#include "libavutil/avstring.h" +#include "rtpdec_formats.h" +#include "internal.h" + +static int opus_duration(const uint8_t *src, int size) +{ + unsigned nb_frames = 1; + unsigned toc = src[0]; + unsigned toc_config = toc >> 3; + unsigned toc_count = toc & 3; + unsigned frame_size = toc_config < 12 ? FFMAX(480, 960 * (toc_config & 3)) : + toc_config < 16 ? 480 << (toc_config & 1) : + 120 << (toc_config & 3); + if (toc_count == 3) { + if (size<2) + return AVERROR_INVALIDDATA; + nb_frames = src[1] & 0x3F; + } else if (toc_count) { + nb_frames = 2; + } + + return frame_size * nb_frames; +} + +static int opus_write_extradata(AVCodecParameters *codecpar) +{ + uint8_t *bs; + int ret; + + /* This function writes an extradata with a channel mapping family of 0. + * This mapping family only supports mono and stereo layouts. And RFC7587 + * specifies that the number of channels in the SDP must be 2. + */ + if (codecpar->ch_layout.nb_channels > 2) { + return AVERROR_INVALIDDATA; + } + + ret = ff_alloc_extradata(codecpar, 19); + if (ret < 0) + return ret; + + bs = (uint8_t *)codecpar->extradata; + + /* Opus magic */ + bytestream_put_buffer(&bs, "OpusHead", 8); + /* Version */ + bytestream_put_byte (&bs, 0x1); + /* Channel count */ + bytestream_put_byte (&bs, codecpar->ch_layout.nb_channels); + /* Pre skip */ + bytestream_put_le16 (&bs, 0); + /* Input sample rate */ + bytestream_put_le32 (&bs, 48000); + /* Output gain */ + bytestream_put_le16 (&bs, 0x0); + /* Mapping family */ + bytestream_put_byte (&bs, 0x0); + + return 0; +} + +static int opus_init(AVFormatContext *s, int st_index, PayloadContext *priv_data) +{ + return opus_write_extradata(s->streams[st_index]->codecpar); +} + +static int opus_parse_packet(AVFormatContext *ctx, PayloadContext *data, + AVStream *st, AVPacket *pkt, uint32_t *timestamp, + const uint8_t *buf, int len, uint16_t seq, + int flags) +{ + int rv; + int duration; + + if ((rv = av_new_packet(pkt, len)) < 0) + return rv; + + memcpy(pkt->data, buf, len); + pkt->stream_index = st->index; + + duration = opus_duration(buf, len); + if (duration != AVERROR_INVALIDDATA) { + pkt->duration = duration; + } + + return 0; +} + +static int parse_fmtp(AVFormatContext *s, + AVStream *stream, PayloadContext *data, + const char *attr, const char *value) +{ + if (!strcmp(attr, "sprop-maxcapturerate")) { + int rate = atoi(value); + if (rate < 8000 || rate > 48000) { + av_log(s, AV_LOG_ERROR, + "fmtp field 'sprop-maxcapturerate' must be between 8000 to 48000 (provided value: %s)", + value); + return AVERROR_INVALIDDATA; + } + stream->codecpar->sample_rate = rate; + } + return 0; +} + +static int opus_parse_sdp_line(AVFormatContext *s, int st_index, + PayloadContext *data, const char *line) +{ + const char *p; + + if (st_index < 0) + return 0; + + if (av_strstart(line, "fmtp:", &p)) { + return ff_parse_fmtp(s, s->streams[st_index], data, p, parse_fmtp); + } + return 0; +} + +const RTPDynamicProtocolHandler ff_opus_dynamic_handler = { + .enc_name = "opus", + .codec_type = AVMEDIA_TYPE_AUDIO, + .codec_id = AV_CODEC_ID_OPUS, + .parse_packet = opus_parse_packet, + .init = opus_init, + .parse_sdp_a_line = opus_parse_sdp_line, +}; diff --git a/libavformat/srtp.h b/libavformat/srtp.h index 3189f8f54bd0e..35224cc9ba901 
100644 --- a/libavformat/srtp.h +++ b/libavformat/srtp.h @@ -27,7 +27,7 @@ struct AVAES; struct AVHMAC; -struct SRTPContext { +typedef struct SRTPContext { struct AVAES *aes; struct AVHMAC *hmac; int rtp_hmac_size, rtcp_hmac_size; @@ -40,7 +40,7 @@ struct SRTPContext { uint32_t roc; uint32_t rtcp_index; -}; +} SRTPContext; int ff_srtp_set_crypto(struct SRTPContext *s, const char *suite, const char *params); diff --git a/libavformat/tls.c b/libavformat/tls.c index f96ff6215d63f..e06b7022bf4a6 100644 --- a/libavformat/tls.c +++ b/libavformat/tls.c @@ -1,6 +1,7 @@ /* - * TLS/SSL Protocol + * TLS/DTLS/SSL Protocol * Copyright (c) 2011 Martin Storsjo + * Copyright (c) 2025 Jack Lau * * This file is part of FFmpeg. * @@ -20,6 +21,7 @@ */ #include "avformat.h" +#include "internal.h" #include "network.h" #include "os_support.h" #include "url.h" @@ -93,7 +95,7 @@ int ff_tls_open_underlying(TLSShared *c, URLContext *parent, const char *uri, AV c->listen = 1; } - ff_url_join(buf, sizeof(buf), "tcp", NULL, c->underlying_host, port, "%s", p); + ff_url_join(buf, sizeof(buf), c->is_dtls ? "udp" : "tcp", NULL, c->underlying_host, port, "%s", p); hints.ai_flags = AI_NUMERICHOST; if (!getaddrinfo(c->underlying_host, NULL, &hints, &ai)) { @@ -124,7 +126,65 @@ int ff_tls_open_underlying(TLSShared *c, URLContext *parent, const char *uri, AV } freeenv_utf8(env_http_proxy); - return ffurl_open_whitelist(&c->tcp, buf, AVIO_FLAG_READ_WRITE, - &parent->interrupt_callback, options, - parent->protocol_whitelist, parent->protocol_blacklist, parent); + if (c->is_dtls) { + av_dict_set_int(options, "connect", 1, 0); + av_dict_set_int(options, "fifo_size", 0, 0); + /* Set the max packet size to the buffer size. */ + av_dict_set_int(options, "pkt_size", c->mtu, 0); + } + ret = ffurl_open_whitelist(c->is_dtls ? &c->udp : &c->tcp, buf, AVIO_FLAG_READ_WRITE, + &parent->interrupt_callback, options, + parent->protocol_whitelist, parent->protocol_blacklist, parent); + if (c->is_dtls) { + if (ret < 0) { + av_log(c, AV_LOG_ERROR, "WHIP: Failed to connect udp://%s:%d\n", c->underlying_host, port); + return ret; + } + /* Make the socket non-blocking, set to READ and WRITE mode after connected */ + ff_socket_nonblock(ffurl_get_file_handle(c->udp), 1); + c->udp->flags |= AVIO_FLAG_READ | AVIO_FLAG_NONBLOCK; + } + return ret; } + +/** + * Read all data from the given URL url and store it in the given buffer bp. + */ +int ff_url_read_all(const char *url, AVBPrint *bp) +{ + int ret = 0; + AVDictionary *opts = NULL; + URLContext *uc = NULL; + char buf[MAX_URL_SIZE]; + + ret = ffurl_open_whitelist(&uc, url, AVIO_FLAG_READ, NULL, &opts, NULL, NULL, NULL); + if (ret < 0) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to open url %s\n", url); + goto end; + } + + while (1) { + ret = ffurl_read(uc, buf, sizeof(buf)); + if (ret == AVERROR_EOF) { + /* Reset the error because we read all response as answer util EOF. 
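AVERROR_EOF is the normal termination of this loop; past that check, a return value of zero or any other negative value is treated as a read failure, and the total amount of data collected is bounded by the av_bprint_is_complete() check that follows each av_bprintf().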
*/ + ret = 0; + break; + } + if (ret <= 0) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to read from url=%s, key is %s\n", url, bp->str); + goto end; + } + + av_bprintf(bp, "%.*s", ret, buf); + if (!av_bprint_is_complete(bp)) { + av_log(NULL, AV_LOG_ERROR, "TLS: Exceed max size %.*s, %s\n", ret, buf, bp->str); + ret = AVERROR(EIO); + goto end; + } + } + +end: + ffurl_closep(&uc); + av_dict_free(&opts); + return ret; +} \ No newline at end of file diff --git a/libavformat/tls.h b/libavformat/tls.h index 6c6aa01a9a928..cb626f1977a30 100644 --- a/libavformat/tls.h +++ b/libavformat/tls.h @@ -1,6 +1,7 @@ /* - * TLS/SSL Protocol + * TLS/DTLS/SSL Protocol * Copyright (c) 2011 Martin Storsjo + * Copyright (c) 2025 Jack Lau * * This file is part of FFmpeg. * @@ -22,10 +23,27 @@ #ifndef AVFORMAT_TLS_H #define AVFORMAT_TLS_H +#include "libavutil/bprint.h" #include "libavutil/opt.h" #include "url.h" +/** + * Maximum size limit of a certificate and private key size. + */ +#define MAX_CERTIFICATE_SIZE 8192 + +enum DTLSState { + DTLS_STATE_NONE, + + /* Whether DTLS handshake is finished. */ + DTLS_STATE_FINISHED, + /* Whether DTLS session is closed. */ + DTLS_STATE_CLOSED, + /* Whether DTLS handshake is failed. */ + DTLS_STATE_FAILED, +}; + typedef struct TLSShared { char *ca_file; int verify; @@ -40,6 +58,25 @@ typedef struct TLSShared { int numerichost; URLContext *tcp; + + int is_dtls; + + enum DTLSState state; + + int use_external_udp; + URLContext *udp; + + /* The fingerprint of certificate, used in SDP offer. */ + char *fingerprint; + + /* The certificate and private key content used for DTLS handshake */ + char* cert_buf; + char* key_buf; + /** + * The size of RTP packet, should generally be set to MTU. + * Note that pion requires a smaller value, for example, 1200. + */ + int mtu; } TLSShared; #define TLS_OPTFL (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_ENCODING_PARAM) @@ -51,10 +88,27 @@ typedef struct TLSShared { {"key_file", "Private key file", offsetof(pstruct, options_field . key_file), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL }, \ {"listen", "Listen for incoming connections", offsetof(pstruct, options_field . listen), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, .flags = TLS_OPTFL }, \ {"verifyhost", "Verify against a specific hostname", offsetof(pstruct, options_field . host), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL }, \ - {"http_proxy", "Set proxy to tunnel through", offsetof(pstruct, options_field . http_proxy), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL } + {"http_proxy", "Set proxy to tunnel through", offsetof(pstruct, options_field . http_proxy), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL }, \ + {"use_external_udp", "Use external UDP from muxer or demuxer", offsetof(pstruct, options_field . use_external_udp), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 1, .flags = TLS_OPTFL }, \ + {"mtu", "Maximum Transmission Unit", offsetof(pstruct, options_field . mtu), AV_OPT_TYPE_INT, { .i64 = 0}, INT64_MIN, INT64_MAX, .flags = TLS_OPTFL}, \ + {"fingerprint", "The optional fingerprint for DTLS", offsetof(pstruct, options_field . fingerprint), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL}, \ + {"cert_buf", "The optional certificate buffer for DTLS", offsetof(pstruct, options_field . cert_buf), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL}, \ + {"key_buf", "The optional private key buffer for DTLS", offsetof(pstruct, options_field . 
key_buf), AV_OPT_TYPE_STRING, .flags = TLS_OPTFL} int ff_tls_open_underlying(TLSShared *c, URLContext *parent, const char *uri, AVDictionary **options); +int ff_url_read_all(const char *url, AVBPrint *bp); + +int ff_dtls_set_udp(URLContext *h, URLContext *udp); + +int ff_dtls_export_materials(URLContext *h, char *dtls_srtp_materials, size_t materials_sz); + +int ff_dtls_state(URLContext *h); + +int ff_ssl_read_key_cert(char *key_url, char *cert_url, char *key_buf, size_t key_sz, char *cert_buf, size_t cert_sz, char **fingerprint); + +int ff_ssl_gen_key_cert(char *key_buf, size_t key_sz, char *cert_buf, size_t cert_sz, char **fingerprint); + void ff_gnutls_init(void); void ff_gnutls_deinit(void); diff --git a/libavformat/tls_openssl.c b/libavformat/tls_openssl.c index 8b0cf9efb23e9..b589d5d90a4b7 100644 --- a/libavformat/tls_openssl.c +++ b/libavformat/tls_openssl.c @@ -1,6 +1,7 @@ /* - * TLS/SSL Protocol + * TLS/DTLS/SSL Protocol * Copyright (c) 2011 Martin Storsjo + * Copyright (c) 2025 Jack Lau * * This file is part of FFmpeg. * @@ -19,8 +20,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/mem.h" #include "network.h" #include "os_support.h" +#include "libavutil/random_seed.h" #include "url.h" #include "tls.h" #include "libavutil/opt.h" @@ -29,6 +32,436 @@ #include #include +/** + * Returns a heap‐allocated null‐terminated string containing + * the PEM‐encoded public key. Caller must free. + */ +static char *pkey_to_pem_string(EVP_PKEY *pkey) { + BIO *mem = NULL; + BUF_MEM *bptr = NULL; + char *pem_str = NULL; + + // Create a memory BIO + if (!(mem = BIO_new(BIO_s_mem()))) + goto err; + + // Write public key in PEM form + if (!PEM_write_bio_PrivateKey(mem, pkey, NULL, NULL, 0, NULL, NULL)) + goto err; + + // Extract pointer/length + BIO_get_mem_ptr(mem, &bptr); + if (!bptr || !bptr->length) + goto err; + + // Allocate string (+1 for NUL) + pem_str = av_malloc(bptr->length + 1); + if (!pem_str) + goto err; + + // Copy data & NUL‐terminate + memcpy(pem_str, bptr->data, bptr->length); + pem_str[bptr->length] = '\0'; + +cleanup: + BIO_free(mem); + return pem_str; + +err: + // error path: free and return NULL + free(pem_str); + pem_str = NULL; + goto cleanup; +} + +/** + * Serialize an X509 certificate to a av_malloc’d PEM string. + * Caller must free the returned pointer. + */ +static char *cert_to_pem_string(X509 *cert) +{ + BIO *mem = BIO_new(BIO_s_mem()); + BUF_MEM *bptr = NULL; + char *out = NULL; + + if (!mem) goto err; + + /* Write the PEM certificate */ + if (!PEM_write_bio_X509(mem, cert)) + goto err; + + BIO_get_mem_ptr(mem, &bptr); + if (!bptr || !bptr->length) goto err; + + out = av_malloc(bptr->length + 1); + if (!out) goto err; + + memcpy(out, bptr->data, bptr->length); + out[bptr->length] = '\0'; + +cleanup: + BIO_free(mem); + return out; + +err: + free(out); + out = NULL; + goto cleanup; +} + + +/** + * Generate a SHA-256 fingerprint of an X.509 certificate. + * + * @param ctx AVFormatContext for logging (can be NULL) + * @param cert X509 certificate to fingerprint + * @return Newly allocated fingerprint string in "AA:BB:CC:…" format, + * or NULL on error (logs via av_log if ctx is not NULL). + * Caller must free() the returned string. + */ +static char *generate_fingerprint(X509 *cert) +{ + unsigned char md[EVP_MAX_MD_SIZE]; + int n = 0; + AVBPrint fingerprint; + char *result = NULL; + int i; + + /* To prevent a crash during cleanup, always initialize it. 
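The digest is printed as upper-case hex byte pairs separated by colons, so a SHA-256 digest (32 bytes) becomes a 95-character string: 64 hex digits plus 31 ':' separators. This is the fingerprint later carried in the SDP offer.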
*/ + av_bprint_init(&fingerprint, 0, AV_BPRINT_SIZE_UNLIMITED); + + if (X509_digest(cert, EVP_sha256(), md, &n) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to generate fingerprint, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto end; + } + + for (i = 0; i < n; i++) { + av_bprintf(&fingerprint, "%02X", md[i]); + if (i + 1 < n) + av_bprintf(&fingerprint, ":"); + } + + if (!fingerprint.str || !strlen(fingerprint.str)) { + av_log(NULL, AV_LOG_ERROR, "TLS: Fingerprint is empty\n"); + goto end; + } + + result = av_strdup(fingerprint.str); + if (!result) { + av_log(NULL, AV_LOG_ERROR, "TLS: Out of memory generating fingerprint\n"); + } + +end: + av_bprint_finalize(&fingerprint, NULL); + return result; +} + +int ff_ssl_read_key_cert(char *key_url, char *cert_url, char *key_buf, size_t key_sz, char *cert_buf, size_t cert_sz, char **fingerprint) +{ + int ret = 0; + BIO *key_b = NULL, *cert_b = NULL; + AVBPrint key_bp, cert_bp; + EVP_PKEY *pkey; + X509 *cert; + char *key_tem = NULL, *cert_tem = NULL; + + /* To prevent a crash during cleanup, always initialize it. */ + av_bprint_init(&key_bp, 1, MAX_CERTIFICATE_SIZE); + av_bprint_init(&cert_bp, 1, MAX_CERTIFICATE_SIZE); + + /* Read key file. */ + ret = ff_url_read_all(key_url, &key_bp); + if (ret < 0) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to open key file %s\n", key_url); + goto end; + } + + if (!(key_b = BIO_new(BIO_s_mem()))) { + ret = AVERROR(ENOMEM); + goto end; + } + + BIO_write(key_b, key_bp.str, key_bp.len); + pkey = PEM_read_bio_PrivateKey(key_b, NULL, NULL, NULL); + if (!pkey) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to read private key from %s\n", key_url); + ret = AVERROR(EIO); + goto end; + } + + /* Read certificate. */ + ret = ff_url_read_all(cert_url, &cert_bp); + if (ret < 0) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to open cert file %s\n", cert_url); + goto end; + } + + if (!(cert_b = BIO_new(BIO_s_mem()))) { + ret = AVERROR(ENOMEM); + goto end; + } + + BIO_write(cert_b, cert_bp.str, cert_bp.len); + cert = PEM_read_bio_X509(cert_b, NULL, NULL, NULL); + if (!cert) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to read certificate from %s\n", cert_url); + ret = AVERROR(EIO); + goto end; + } + + key_tem = pkey_to_pem_string(pkey); + cert_tem = cert_to_pem_string(cert); + + snprintf(key_buf, key_sz, "%s", key_tem); + snprintf(cert_buf, cert_sz, "%s", cert_tem); + + /* Generate fingerprint. */ + *fingerprint = generate_fingerprint(cert); + if (!*fingerprint) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to generate fingerprint from %s\n", cert_url); + ret = AVERROR(EIO); + goto end; + } + +end: + BIO_free(key_b); + av_bprint_finalize(&key_bp, NULL); + BIO_free(cert_b); + av_bprint_finalize(&cert_bp, NULL); + if (key_tem) av_free(key_tem); + if (cert_tem) av_free(cert_tem); + return ret; +} + +static int openssl_gen_private_key(EVP_PKEY **pkey, EC_KEY **eckey) +{ + int ret = 0; + + /** + * Note that secp256r1 in openssl is called NID_X9_62_prime256v1 or prime256v1 in string, + * not NID_secp256k1 or secp256k1 in string. 
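 * On OpenSSL 3.0 and newer the key is generated directly with EVP_EC_gen() for this curve; older releases construct an EC_KEY from the EC_GROUP and attach it to an EVP_PKEY, as done below.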
+ * + * TODO: Should choose the curves in ClientHello.supported_groups, for example: + * Supported Group: x25519 (0x001d) + * Supported Group: secp256r1 (0x0017) + * Supported Group: secp384r1 (0x0018) + */ +#if OPENSSL_VERSION_NUMBER < 0x30000000L /* OpenSSL 3.0 */ + EC_GROUP *ecgroup = NULL; + int curve = NID_X9_62_prime256v1; +#else + const char *curve = SN_X9_62_prime256v1; +#endif + +#if OPENSSL_VERSION_NUMBER < 0x30000000L /* OpenSSL 3.0 */ + *pkey = EVP_PKEY_new(); + *eckey = EC_KEY_new(); + ecgroup = EC_GROUP_new_by_curve_name(curve); + if (!ecgroup) { + av_log(NULL, AV_LOG_ERROR, "TLS: Create EC group by curve=%d failed, %s", curve, ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + +#if OPENSSL_VERSION_NUMBER < 0x10100000L // v1.1.x + /* For openssl 1.0, we must set the group parameters, so that cert is ok. */ + EC_GROUP_set_asn1_flag(ecgroup, OPENSSL_EC_NAMED_CURVE); +#endif + + if (EC_KEY_set_group(*eckey, ecgroup) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Generate private key, EC_KEY_set_group failed, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (EC_KEY_generate_key(*eckey) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Generate private key, EC_KEY_generate_key failed, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (EVP_PKEY_set1_EC_KEY(*pkey, *eckey) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Generate private key, EVP_PKEY_set1_EC_KEY failed, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } +#else + *pkey = EVP_EC_gen(curve); + if (!*pkey) { + av_log(NULL, AV_LOG_ERROR, "TLS: Generate private key, EVP_EC_gen curve=%s failed, %s\n", curve, ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } +#endif + goto end; + +einval_end: + ret = AVERROR(EINVAL); +end: +#if OPENSSL_VERSION_NUMBER < 0x30000000L /* OpenSSL 3.0 */ + EC_GROUP_free(ecgroup); +#endif + return ret; +} + +static int openssl_gen_certificate(EVP_PKEY *pkey, X509 **cert, char **fingerprint) +{ + int ret = 0, serial, expire_day; + const char *aor = "lavf"; + X509_NAME* subject = NULL; + + *cert= X509_new(); + if (!*cert) { + goto enomem_end; + } + + // TODO: Support non-self-signed certificate, for example, load from a file. 
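    /* What follows builds a throwaway self-signed certificate: a random serial number, issuer and
     * subject both set to CN=lavf, a 365-day validity window and the supplied EC key as public key,
     * signed with SHA-1. Peers are expected to match it against the fingerprint exchanged in the
     * SDP rather than against a CA chain. */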
+ subject = X509_NAME_new(); + if (!subject) { + goto enomem_end; + } + + serial = (int)av_get_random_seed(); + if (ASN1_INTEGER_set(X509_get_serialNumber(*cert), serial) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set serial, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (X509_NAME_add_entry_by_txt(subject, "CN", MBSTRING_ASC, aor, strlen(aor), -1, 0) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set CN, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (X509_set_issuer_name(*cert, subject) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set issuer, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + if (X509_set_subject_name(*cert, subject) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set subject name, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + expire_day = 365; + if (!X509_gmtime_adj(X509_get_notBefore(*cert), 0)) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set notBefore, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + if (!X509_gmtime_adj(X509_get_notAfter(*cert), 60*60*24*expire_day)) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set notAfter, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (X509_set_version(*cert, 2) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set version, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (X509_set_pubkey(*cert, pkey) != 1) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to set public key, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + if (!X509_sign(*cert, pkey, EVP_sha1())) { + av_log(NULL, AV_LOG_ERROR, "TLS: Failed to sign certificate, %s\n", ERR_error_string(ERR_get_error(), NULL)); + goto einval_end; + } + + *fingerprint = generate_fingerprint(*cert); + if (!*fingerprint) { + goto enomem_end; + } + + goto end; +enomem_end: + ret = AVERROR(ENOMEM); + goto end; +einval_end: + ret = AVERROR(EINVAL); +end: + X509_NAME_free(subject); + //av_bprint_finalize(&fingerprint, NULL); + return ret; +} + +int ff_ssl_gen_key_cert(char *key_buf, size_t key_sz, char *cert_buf, size_t cert_sz, char **fingerprint) +{ + int ret = 0; + EVP_PKEY *pkey = NULL; + EC_KEY *ec_key = NULL; + X509 *cert = NULL; + char *key_tem = NULL, *cert_tem = NULL; + + ret = openssl_gen_private_key(&pkey, &ec_key); + if (ret < 0) goto error; + + ret = openssl_gen_certificate(pkey, &cert, fingerprint); + if (ret < 0) goto error; + + key_tem = pkey_to_pem_string(pkey); + cert_tem = cert_to_pem_string(cert); + + snprintf(key_buf, key_sz, "%s", key_tem); + snprintf(cert_buf, cert_sz, "%s", cert_tem); + + if (key_tem) av_free(key_tem); + if (cert_tem) av_free(cert_tem); +error: + return ret; +} + + +/** + * Deserialize a PEM‐encoded private or public key from a NUL-terminated C string. + * + * @param pem_str The PEM text, e.g. + * "-----BEGIN PRIVATE KEY-----\n…\n-----END PRIVATE KEY-----\n" + * @param is_priv If non-zero, parse as a PRIVATE key; otherwise, parse as a PUBLIC key. + * @return EVP_PKEY* on success (must EVP_PKEY_free()), or NULL on error. 
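 * Typical use: openssl_init_ca_key_cert() below calls pkey_from_pem_string(p->tls_shared.key_buf, 1) to turn the key_buf option back into an EVP_PKEY.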
+ */ +static EVP_PKEY *pkey_from_pem_string(const char *pem_str, int is_priv) +{ + BIO *mem = BIO_new_mem_buf(pem_str, -1); + if (!mem) { + av_log(NULL, AV_LOG_ERROR, "BIO_new_mem_buf failed\n"); + return NULL; + } + + EVP_PKEY *pkey = NULL; + if (is_priv) { + pkey = PEM_read_bio_PrivateKey(mem, NULL, NULL, NULL); + } else { + pkey = PEM_read_bio_PUBKEY(mem, NULL, NULL, NULL); + } + + if (!pkey) + av_log(NULL, AV_LOG_ERROR, "Failed to parse %s key from string\n", + is_priv ? "private" : "public"); + + BIO_free(mem); + return pkey; +} + +/** + * Deserialize a PEM‐encoded certificate from a NUL-terminated C string. + * + * @param pem_str The PEM text, e.g. + * "-----BEGIN CERTIFICATE-----\n…\n-----END CERTIFICATE-----\n" + * @return X509* on success (must X509_free()), or NULL on error. + */ +static X509 *cert_from_pem_string(const char *pem_str) +{ + BIO *mem = BIO_new_mem_buf(pem_str, -1); + if (!mem) { + av_log(NULL, AV_LOG_ERROR, "BIO_new_mem_buf failed\n"); + return NULL; + } + + X509 *cert = PEM_read_bio_X509(mem, NULL, NULL, NULL); + if (!cert) { + av_log(NULL, AV_LOG_ERROR, "Failed to parse certificate from string\n"); + return NULL; + } + + BIO_free(mem); + return cert; +} + + typedef struct TLSContext { const AVClass *class; TLSShared tls_shared; @@ -38,8 +471,56 @@ typedef struct TLSContext { BIO_METHOD* url_bio_method; #endif int io_err; + char error_message[256]; } TLSContext; +/** + * Retrieves the error message for the latest OpenSSL error. + * + * This function retrieves the error code from the thread's error queue, converts it + * to a human-readable string, and stores it in the TLSContext's error_message field. + * The error queue is then cleared using ERR_clear_error(). + */ +static const char* openssl_get_error(TLSContext *ctx) +{ + int r2 = ERR_get_error(); + if (r2) { + ERR_error_string_n(r2, ctx->error_message, sizeof(ctx->error_message)); + } else + ctx->error_message[0] = '\0'; + + ERR_clear_error(); + return ctx->error_message; +} + +int ff_dtls_set_udp(URLContext *h, URLContext *udp) +{ + TLSContext *c = h->priv_data; + c->tls_shared.udp = udp; + return 0; +} + +int ff_dtls_export_materials(URLContext *h, char *dtls_srtp_materials, size_t materials_sz) +{ + int ret = 0; + const char* dst = "EXTRACTOR-dtls_srtp"; + TLSContext *c = h->priv_data; + + ret = SSL_export_keying_material(c->ssl, dtls_srtp_materials, materials_sz, + dst, strlen(dst), NULL, 0, 0); + if (!ret) { + av_log(c, AV_LOG_ERROR, "TLS: Failed to export SRTP material, %s\n", openssl_get_error(c)); + return -1; + } + return 0; +} + +int ff_dtls_state(URLContext *h) +{ + TLSContext *c = h->priv_data; + return c->tls_shared.state; +} + /* OpenSSL 1.0.2 or below, then you would use SSL_library_init. If you are * using OpenSSL 1.1.0 or above, then the library will initialize * itself automatically. @@ -121,7 +602,7 @@ void ff_openssl_deinit(void) } #endif -static int print_tls_error(URLContext *h, int ret) +static int print_ssl_error(URLContext *h, int ret) { TLSContext *c = h->priv_data; int printed = 0, e, averr = AVERROR(EIO); @@ -193,7 +674,7 @@ static int url_bio_destroy(BIO *b) static int url_bio_bread(BIO *b, char *buf, int len) { TLSContext *c = GET_BIO_DATA(b); - int ret = ffurl_read(c->tls_shared.tcp, buf, len); + int ret = ffurl_read(c->tls_shared.is_dtls ? 
c->tls_shared.udp : c->tls_shared.tcp, buf, len); if (ret >= 0) return ret; BIO_clear_retry_flags(b); @@ -209,7 +690,7 @@ static int url_bio_bread(BIO *b, char *buf, int len) static int url_bio_bwrite(BIO *b, const char *buf, int len) { TLSContext *c = GET_BIO_DATA(b); - int ret = ffurl_write(c->tls_shared.tcp, buf, len); + int ret = ffurl_write(c->tls_shared.is_dtls ? c->tls_shared.udp : c->tls_shared.tcp, buf, len); if (ret >= 0) return ret; BIO_clear_retry_flags(b); @@ -250,11 +731,300 @@ static BIO_METHOD url_bio_method = { }; #endif +static av_cold void init_bio_method(URLContext *h) +{ + TLSContext *p = h->priv_data; + BIO *bio; +#if OPENSSL_VERSION_NUMBER >= 0x1010000fL + p->url_bio_method = BIO_meth_new(BIO_TYPE_SOURCE_SINK, "urlprotocol bio"); + BIO_meth_set_write(p->url_bio_method, url_bio_bwrite); + BIO_meth_set_read(p->url_bio_method, url_bio_bread); + BIO_meth_set_puts(p->url_bio_method, url_bio_bputs); + BIO_meth_set_ctrl(p->url_bio_method, url_bio_ctrl); + BIO_meth_set_create(p->url_bio_method, url_bio_create); + BIO_meth_set_destroy(p->url_bio_method, url_bio_destroy); + bio = BIO_new(p->url_bio_method); + BIO_set_data(bio, p); +#else + bio = BIO_new(&url_bio_method); + bio->ptr = p; +#endif + SSL_set_bio(p->ssl, bio, bio); +} + +static void openssl_info_callback(const SSL *ssl, int where, int ret) { + const char *method = "undefined"; + TLSContext *ctx = (TLSContext*)SSL_get_ex_data(ssl, 0); + + if (where & SSL_ST_CONNECT) { + method = "SSL_connect"; + } else if (where & SSL_ST_ACCEPT) + method = "SSL_accept"; + + if (where & SSL_CB_LOOP) { + av_log(ctx, AV_LOG_DEBUG, "Info method=%s state=%s(%s), where=%d, ret=%d\n", + method, SSL_state_string(ssl), SSL_state_string_long(ssl), where, ret); + } else if (where & SSL_CB_ALERT) { + method = (where & SSL_CB_READ) ? "read":"write"; + av_log(ctx, AV_LOG_DEBUG, "Alert method=%s state=%s(%s), where=%d, ret=%d\n", + method, SSL_state_string(ssl), SSL_state_string_long(ssl), where, ret); + } +} + +/** + * Always return 1 to accept any certificate. This is because we allow the peer to + * use a temporary self-signed certificate for DTLS. + */ +static int openssl_dtls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) +{ + return 1; +} + +static int dtls_handshake(URLContext *h) +{ + int ret = 0, r0, r1; + TLSContext *p = h->priv_data; + + r0 = SSL_do_handshake(p->ssl); + r1 = SSL_get_error(p->ssl, r0); + if (r0 <= 0) { + if (r1 != SSL_ERROR_WANT_READ && r1 != SSL_ERROR_WANT_WRITE && r1 != SSL_ERROR_ZERO_RETURN) { + av_log(p, AV_LOG_ERROR, "TLS: Read failed, r0=%d, r1=%d %s\n", r0, r1, openssl_get_error(p)); + ret = AVERROR(EIO); + goto end; + } + } else { + av_log(p, AV_LOG_TRACE, "TLS: Read %d bytes, r0=%d, r1=%d\n", r0, r0, r1); + } + + /* Check whether the DTLS is completed. 
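SSL_is_init_finished() reports 1 only once the handshake has fully completed; until then we return 0 and rely on the caller to drive the handshake again when more datagrams arrive, and only after completion is the shared state moved to DTLS_STATE_FINISHED.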
*/ + if (SSL_is_init_finished(p->ssl) != 1) + goto end; + + p->tls_shared.state = DTLS_STATE_FINISHED; +end: + return ret; +} + +static av_cold int openssl_init_ca_key_cert(URLContext *h) +{ + int ret; + TLSContext *p = h->priv_data; + TLSShared *c = &p->tls_shared; + EVP_PKEY *pkey = NULL; + X509 *cert = NULL; + /* setup ca, private key, certificate */ + if (c->ca_file) { + if (!SSL_CTX_load_verify_locations(p->ctx, c->ca_file, NULL)) + av_log(h, AV_LOG_ERROR, "SSL_CTX_load_verify_locations %s\n", openssl_get_error(p)); + } + + if (c->cert_file) { + ret = SSL_CTX_use_certificate_chain_file(p->ctx, c->cert_file); + if (ret <= 0) { + av_log(h, AV_LOG_ERROR, "Unable to load cert file %s: %s\n", + c->cert_file, openssl_get_error(p)); + ret = AVERROR(EIO); + goto fail; + } + } else if (p->tls_shared.cert_buf) { + cert = cert_from_pem_string(p->tls_shared.cert_buf); + if (SSL_CTX_use_certificate(p->ctx, cert) != 1) { + av_log(p, AV_LOG_ERROR, "SSL: Init SSL_CTX_use_certificate failed, %s\n", openssl_get_error(p)); + ret = AVERROR(EINVAL); + return ret; + } + } else if (p->tls_shared.is_dtls){ + av_log(p, AV_LOG_ERROR, "TLS: Init cert failed, %s\n", openssl_get_error(p)); + ret = AVERROR(EINVAL); + goto fail; + } + + if (c->key_file) { + ret = SSL_CTX_use_PrivateKey_file(p->ctx, c->key_file, SSL_FILETYPE_PEM); + if (ret <= 0) { + av_log(h, AV_LOG_ERROR, "Unable to load key file %s: %s\n", + c->key_file, openssl_get_error(p)); + ret = AVERROR(EIO); + goto fail; + } + } else if (p->tls_shared.key_buf) { + pkey = pkey_from_pem_string(p->tls_shared.key_buf, 1); + if (SSL_CTX_use_PrivateKey(p->ctx, pkey) != 1) { + av_log(p, AV_LOG_ERROR, "TLS: Init SSL_CTX_use_PrivateKey failed, %s\n", openssl_get_error(p)); + ret = AVERROR(EINVAL); + return ret; + } + } else if (p->tls_shared.is_dtls){ + av_log(p, AV_LOG_ERROR, "TLS: Init pkey failed, %s\n", openssl_get_error(p)); + ret = AVERROR(EINVAL); + goto fail; + } + ret = 0; +fail: + return ret; +} + +/** + * Once the DTLS role has been negotiated - active for the DTLS client or passive for the + * DTLS server - we proceed to set up the DTLS state and initiate the handshake. + */ +static int dtls_start(URLContext *h, const char *url, int flags, AVDictionary **options) +{ + TLSContext *p = h->priv_data; + TLSShared *c = &p->tls_shared; + int ret = 0; + c->is_dtls = 1; + const char* ciphers = "ALL"; + /** + * The profile for OpenSSL's SRTP is SRTP_AES128_CM_SHA1_80, see ssl/d1_srtp.c. + * The profile for FFmpeg's SRTP is SRTP_AES128_CM_HMAC_SHA1_80, see libavformat/srtp.c. + */ + const char* profiles = "SRTP_AES128_CM_SHA1_80"; + /* Refer to the test cases regarding these curves in the WebRTC code. */ +#if OPENSSL_VERSION_NUMBER >= 0x10100000L /* OpenSSL 1.1.0 */ + const char* curves = "X25519:P-256:P-384:P-521"; +#elif OPENSSL_VERSION_NUMBER >= 0x10002000L /* OpenSSL 1.0.2 */ + const char* curves = "P-256:P-384:P-521"; +#endif + +#if OPENSSL_VERSION_NUMBER < 0x10002000L /* OpenSSL v1.0.2 */ + p->ctx = SSL_CTX_new(DTLSv1_method()); +#else + p->ctx = SSL_CTX_new(DTLS_method()); +#endif + if (!p->ctx) { + ret = AVERROR(ENOMEM); + goto fail; + } + +#if OPENSSL_VERSION_NUMBER >= 0x10002000L /* OpenSSL 1.0.2 */ + /* For ECDSA, we could set the curves list. 
*/ + if (SSL_CTX_set1_curves_list(p->ctx, curves) != 1) { + av_log(p, AV_LOG_ERROR, "TLS: Init SSL_CTX_set1_curves_list failed, curves=%s, %s\n", + curves, openssl_get_error(p)); + ret = AVERROR(EINVAL); + return ret; + } +#endif + +#if OPENSSL_VERSION_NUMBER < 0x10100000L // v1.1.x +#if OPENSSL_VERSION_NUMBER < 0x10002000L // v1.0.2 + if (ctx->dtls_eckey) + SSL_CTX_set_tmp_ecdh(p->ctx, p->dtls_eckey); +#else + SSL_CTX_set_ecdh_auto(p->ctx, 1); +#endif +#endif + + /** + * We activate "ALL" cipher suites to align with the peer's capabilities, + * ensuring maximum compatibility. + */ + if (SSL_CTX_set_cipher_list(p->ctx, ciphers) != 1) { + av_log(p, AV_LOG_ERROR, "TLS: Init SSL_CTX_set_cipher_list failed, ciphers=%s, %s\n", + ciphers, openssl_get_error(p)); + ret = AVERROR(EINVAL); + return ret; + } + ret = openssl_init_ca_key_cert(h); + if (ret < 0) goto fail; + + /* Server will send Certificate Request. */ + SSL_CTX_set_verify(p->ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, openssl_dtls_verify_callback); + /* The depth count is "level 0:peer certificate", "level 1: CA certificate", + * "level 2: higher level CA certificate", and so on. */ + SSL_CTX_set_verify_depth(p->ctx, 4); + /* Whether we should read as many input bytes as possible (for non-blocking reads) or not. */ + SSL_CTX_set_read_ahead(p->ctx, 1); + /* Setup the SRTP context */ + if (SSL_CTX_set_tlsext_use_srtp(p->ctx, profiles)) { + av_log(p, AV_LOG_ERROR, "TLS: Init SSL_CTX_set_tlsext_use_srtp failed, profiles=%s, %s\n", + profiles, openssl_get_error(p)); + ret = AVERROR(EINVAL); + return ret; + } + + /* The ssl should not be created unless the ctx has been initialized. */ + p->ssl = SSL_new(p->ctx); + if (!p->ssl) { + ret = AVERROR(ENOMEM); + goto fail; + } + + /* Setup the callback for logging. */ + SSL_set_ex_data(p->ssl, 0, p); + SSL_set_info_callback(p->ssl, openssl_info_callback); + /** + * We have set the MTU to fragment the DTLS packet. It is important to note that the + * packet is split to ensure that each handshake packet is smaller than the MTU. + */ + SSL_set_options(p->ssl, SSL_OP_NO_QUERY_MTU); + SSL_set_mtu(p->ssl, p->tls_shared.mtu); +#if OPENSSL_VERSION_NUMBER >= 0x100010b0L /* OpenSSL 1.0.1k */ + DTLS_set_link_mtu(p->ssl, p->tls_shared.mtu); +#endif + init_bio_method(h); + + if (p->tls_shared.use_external_udp != 1) { + if ((ret = ff_tls_open_underlying(&p->tls_shared, h, url, options)) < 0) { + av_log(p, AV_LOG_ERROR, "Failed to connect %s\n", url); + return ret; + } + } + + /* Setup DTLS as passive, which is server role. */ + c->listen ? SSL_set_accept_state(p->ssl) : SSL_set_connect_state(p->ssl); + + /** + * During initialization, we only need to call SSL_do_handshake once because SSL_read consumes + * the handshake message if the handshake is incomplete. + * To simplify maintenance, we initiate the handshake for both the DTLS server and client after + * sending out the ICE response in the start_active_handshake function. It's worth noting that + * although the DTLS server may receive the ClientHello immediately after sending out the ICE + * response, this shouldn't be an issue as the handshake function is called before any DTLS + * packets are received. + * + * The SSL_do_handshake can't be called if DTLS hasn't prepare for udp. + */ + if (p->tls_shared.use_external_udp != 1) { + ret = dtls_handshake(h); + // Fatal SSL error, for example, no available suite when peer is DTLS 1.0 while we are DTLS 1.2. 
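        // A negative return from dtls_handshake() means the handshake failed outright rather than
        // merely needing more packets (those cases return 0), so it is mapped to an I/O error here.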
+ if (ret < 0) { + av_log(p, AV_LOG_ERROR, "TLS: Failed to drive SSL context, ret=%d\n", ret); + return AVERROR(EIO); + } + } + + av_log(p, AV_LOG_VERBOSE, "TLS: Setup ok, MTU=%d, fingerprint %s\n", + p->tls_shared.mtu, p->tls_shared.fingerprint); + + ret = 0; +fail: + return ret; +} + +/** + * Cleanup the DTLS context. + */ +static av_cold int dtls_close(URLContext *h) +{ + TLSContext *ctx = h->priv_data; + SSL_free(ctx->ssl); + SSL_CTX_free(ctx->ctx); + av_freep(&ctx->tls_shared.fingerprint); + av_freep(&ctx->tls_shared.cert_buf); + av_freep(&ctx->tls_shared.key_buf); +#if OPENSSL_VERSION_NUMBER < 0x30000000L /* OpenSSL 3.0 */ + EC_KEY_free(ctx->dtls_eckey); +#endif + return 0; +} + static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **options) { TLSContext *p = h->priv_data; TLSShared *c = &p->tls_shared; - BIO *bio; int ret; #if OPENSSL_VERSION_NUMBER < 0x10100000L @@ -271,52 +1041,26 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op // support for the old protocols immediately after creating the context. p->ctx = SSL_CTX_new(c->listen ? SSLv23_server_method() : SSLv23_client_method()); if (!p->ctx) { - av_log(h, AV_LOG_ERROR, "%s\n", ERR_error_string(ERR_get_error(), NULL)); + av_log(h, AV_LOG_ERROR, "%s\n", openssl_get_error(p)); ret = AVERROR(EIO); goto fail; } SSL_CTX_set_options(p->ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); - if (c->ca_file) { - if (!SSL_CTX_load_verify_locations(p->ctx, c->ca_file, NULL)) - av_log(h, AV_LOG_ERROR, "SSL_CTX_load_verify_locations %s\n", ERR_error_string(ERR_get_error(), NULL)); - } - if (c->cert_file && !SSL_CTX_use_certificate_chain_file(p->ctx, c->cert_file)) { - av_log(h, AV_LOG_ERROR, "Unable to load cert file %s: %s\n", - c->cert_file, ERR_error_string(ERR_get_error(), NULL)); - ret = AVERROR(EIO); - goto fail; - } - if (c->key_file && !SSL_CTX_use_PrivateKey_file(p->ctx, c->key_file, SSL_FILETYPE_PEM)) { - av_log(h, AV_LOG_ERROR, "Unable to load key file %s: %s\n", - c->key_file, ERR_error_string(ERR_get_error(), NULL)); - ret = AVERROR(EIO); - goto fail; - } + ret = openssl_init_ca_key_cert(h); + if (ret < 0) goto fail; // Note, this doesn't check that the peer certificate actually matches // the requested hostname. if (c->verify) SSL_CTX_set_verify(p->ctx, SSL_VERIFY_PEER|SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL); p->ssl = SSL_new(p->ctx); if (!p->ssl) { - av_log(h, AV_LOG_ERROR, "%s\n", ERR_error_string(ERR_get_error(), NULL)); + av_log(h, AV_LOG_ERROR, "%s\n", openssl_get_error(p)); ret = AVERROR(EIO); goto fail; } -#if OPENSSL_VERSION_NUMBER >= 0x1010000fL - p->url_bio_method = BIO_meth_new(BIO_TYPE_SOURCE_SINK, "urlprotocol bio"); - BIO_meth_set_write(p->url_bio_method, url_bio_bwrite); - BIO_meth_set_read(p->url_bio_method, url_bio_bread); - BIO_meth_set_puts(p->url_bio_method, url_bio_bputs); - BIO_meth_set_ctrl(p->url_bio_method, url_bio_ctrl); - BIO_meth_set_create(p->url_bio_method, url_bio_create); - BIO_meth_set_destroy(p->url_bio_method, url_bio_destroy); - bio = BIO_new(p->url_bio_method); - BIO_set_data(bio, p); -#else - bio = BIO_new(&url_bio_method); - bio->ptr = p; -#endif - SSL_set_bio(p->ssl, bio, bio); + SSL_set_ex_data(p->ssl, 0, p); + SSL_CTX_set_info_callback(p->ctx, openssl_info_callback); + init_bio_method(h); if (!c->listen && !c->numerichost) SSL_set_tlsext_host_name(p->ssl, c->host); ret = c->listen ? 
SSL_accept(p->ssl) : SSL_connect(p->ssl); @@ -325,7 +1069,7 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op ret = AVERROR(EIO); goto fail; } else if (ret < 0) { - ret = print_tls_error(h, ret); + ret = print_ssl_error(h, ret); goto fail; } @@ -338,31 +1082,35 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op static int tls_read(URLContext *h, uint8_t *buf, int size) { TLSContext *c = h->priv_data; + URLContext *uc = c->tls_shared.is_dtls ? c->tls_shared.udp + : c->tls_shared.tcp; int ret; // Set or clear the AVIO_FLAG_NONBLOCK on c->tls_shared.tcp - c->tls_shared.tcp->flags &= ~AVIO_FLAG_NONBLOCK; - c->tls_shared.tcp->flags |= h->flags & AVIO_FLAG_NONBLOCK; + uc->flags &= ~AVIO_FLAG_NONBLOCK; + uc->flags |= h->flags & AVIO_FLAG_NONBLOCK; ret = SSL_read(c->ssl, buf, size); if (ret > 0) return ret; if (ret == 0) return AVERROR_EOF; - return print_tls_error(h, ret); + return print_ssl_error(h, ret); } static int tls_write(URLContext *h, const uint8_t *buf, int size) { TLSContext *c = h->priv_data; + URLContext *uc = c->tls_shared.is_dtls ? c->tls_shared.udp + : c->tls_shared.tcp; int ret; // Set or clear the AVIO_FLAG_NONBLOCK on c->tls_shared.tcp - c->tls_shared.tcp->flags &= ~AVIO_FLAG_NONBLOCK; - c->tls_shared.tcp->flags |= h->flags & AVIO_FLAG_NONBLOCK; + uc->flags &= ~AVIO_FLAG_NONBLOCK; + uc->flags |= h->flags & AVIO_FLAG_NONBLOCK; ret = SSL_write(c->ssl, buf, size); if (ret > 0) return ret; if (ret == 0) return AVERROR_EOF; - return print_tls_error(h, ret); + return print_ssl_error(h, ret); } static int tls_get_file_handle(URLContext *h) @@ -401,3 +1149,22 @@ const URLProtocol ff_tls_protocol = { .flags = URL_PROTOCOL_FLAG_NETWORK, .priv_data_class = &tls_class, }; + +static const AVClass dtls_class = { + .class_name = "dtls", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const URLProtocol ff_dtls_protocol = { + .name = "dtls", + .url_open2 = dtls_start, + .url_handshake = dtls_handshake, + .url_close = dtls_close, + .url_read = tls_read, + .url_write = tls_write, + .priv_data_size = sizeof(TLSContext), + .flags = URL_PROTOCOL_FLAG_NETWORK, + .priv_data_class = &dtls_class, +}; diff --git a/libavformat/whip.c b/libavformat/whip.c new file mode 100644 index 0000000000000..0671e23635d65 --- /dev/null +++ b/libavformat/whip.c @@ -0,0 +1,1917 @@ +/* + * WebRTC-HTTP ingestion protocol (WHIP) muxer + * Copyright (c) 2023 The FFmpeg Project + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/codec_desc.h" +#include "libavcodec/h264.h" +#include "libavcodec/startcode.h" +#include "libavutil/base64.h" +#include "libavutil/bprint.h" +#include "libavutil/crc.h" +#include "libavutil/hmac.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/lfg.h" +#include "libavutil/opt.h" +#include "libavutil/mem.h" +#include "libavutil/random_seed.h" +#include "libavutil/time.h" +#include "avc.h" +#include "nal.h" +#include "avio_internal.h" +#include "http.h" +#include "internal.h" +#include "mux.h" +#include "network.h" +#include "srtp.h" +#include "tls.h" + +/** + * Maximum size limit of a Session Description Protocol (SDP), + * be it an offer or answer. + */ +#define MAX_SDP_SIZE 8192 + +/** + * The size of the Secure Real-time Transport Protocol (SRTP) master key material + * that is exported by Secure Sockets Layer (SSL) after a successful Datagram + * Transport Layer Security (DTLS) handshake. This material consists of a key + * of 16 bytes and a salt of 14 bytes. + */ +#define DTLS_SRTP_KEY_LEN 16 +#define DTLS_SRTP_SALT_LEN 14 + +/** + * The maximum size of the Secure Real-time Transport Protocol (SRTP) HMAC checksum + * and padding that is appended to the end of the packet. To calculate the maximum + * size of the User Datagram Protocol (UDP) packet that can be sent out, subtract + * this size from the `pkt_size`. + */ +#define DTLS_SRTP_CHECKSUM_LEN 16 + +/** + * When sending ICE or DTLS messages, responses are received via UDP. However, the peer + * may not be ready and return EAGAIN, in which case we should wait for a short duration + * and retry reading. + * For instance, if we try to read from UDP and get EAGAIN, we sleep for 5ms and retry. + * This macro is used to limit the total duration in milliseconds (e.g., 50ms), so we + * will try at most 5 times. + * Keep in mind that this macro should have a minimum duration of 5 ms. + */ +#define ICE_DTLS_READ_INTERVAL 50 + +/* The magic cookie for Session Traversal Utilities for NAT (STUN) messages. */ +#define STUN_MAGIC_COOKIE 0x2112A442 + +/** + * The DTLS content type. + * See https://tools.ietf.org/html/rfc2246#section-6.2.1 + * change_cipher_spec(20), alert(21), handshake(22), application_data(23) + */ +#define DTLS_CONTENT_TYPE_CHANGE_CIPHER_SPEC 20 + +/** + * The DTLS record layer header has a total size of 13 bytes, consisting of + * ContentType (1 byte), ProtocolVersion (2 bytes), Epoch (2 bytes), + * SequenceNumber (6 bytes), and Length (2 bytes). + * See https://datatracker.ietf.org/doc/html/rfc9147#section-4 + */ +#define DTLS_RECORD_LAYER_HEADER_LEN 13 + +/** + * The DTLS version number, which is 0xfeff for DTLS 1.0, or 0xfefd for DTLS 1.2. + * See https://datatracker.ietf.org/doc/html/rfc9147#name-the-dtls-record-layer + */ +#define DTLS_VERSION_10 0xfeff +#define DTLS_VERSION_12 0xfefd + +/** + * Maximum size of the buffer for sending and receiving UDP packets. + * Please note that this size does not limit the size of the UDP packet that can be sent. + * To set the limit for packet size, modify the `pkt_size` parameter. + * For instance, it is possible to set the UDP buffer to 4096 to send or receive packets, + * but please keep in mind that the `pkt_size` option limits the packet size to 1400. 
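 * The usable RTP payload is smaller still, because SRTP protection appends up to
 * DTLS_SRTP_CHECKSUM_LEN bytes; with a pkt_size of 1400 that leaves 1400 - 16 = 1384 bytes per packet.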
+ */
+#define MAX_UDP_BUFFER_SIZE 4096
+
+/* Referring to Chrome's definition of RTP payload types. */
+#define WHIP_RTP_PAYLOAD_TYPE_H264 106
+#define WHIP_RTP_PAYLOAD_TYPE_OPUS 111
+
+/**
+ * The STUN message header, which is 20 bytes long, comprises the
+ * STUNMessageType (2B), MessageLength (2B), MagicCookie (4B),
+ * and TransactionID (12B).
+ * See https://datatracker.ietf.org/doc/html/rfc5389#section-6
+ */
+#define ICE_STUN_HEADER_SIZE 20
+
+/**
+ * The RTP header is 12 bytes long, comprising the Version(1B), PT(1B),
+ * SequenceNumber(2B), Timestamp(4B), and SSRC(4B).
+ * See https://www.rfc-editor.org/rfc/rfc3550#section-5.1
+ */
+#define WHIP_RTP_HEADER_SIZE 12
+
+/**
+ * For RTCP, PT is [128, 223] (or without marker [0, 95]). Literally, RTCP starts
+ * from 64 not 0, so PT is [192, 223] (or without marker [64, 95]), see "RTCP Control
+ * Packet Types (PT)" at
+ * https://www.iana.org/assignments/rtp-parameters/rtp-parameters.xhtml#rtp-parameters-4
+ *
+ * For RTP, the PT is [96, 127], or [224, 255] with marker. See "RTP Payload Types (PT)
+ * for standard audio and video encodings" at
+ * https://www.iana.org/assignments/rtp-parameters/rtp-parameters.xhtml#rtp-parameters-1
+ */
+#define WHIP_RTCP_PT_START 192
+#define WHIP_RTCP_PT_END   223
+
+/**
+ * In the case of ICE-LITE, these fields are not used; instead, they are defined
+ * as constant values.
+ */
+#define WHIP_SDP_SESSION_ID "4489045141692799359"
+#define WHIP_SDP_CREATOR_IP "127.0.0.1"
+
+/* Calculate the elapsed time from starttime to endtime in milliseconds. */
+#define ELAPSED(starttime, endtime) ((int)(endtime - starttime) / 1000)
+
+/* STUN Attribute, comprehension-required range (0x0000-0x7FFF) */
+enum STUNAttr {
+    STUN_ATTR_USERNAME          = 0x0006, /// shared secret response/bind request
+    STUN_ATTR_USE_CANDIDATE     = 0x0025, /// bind request
+    STUN_ATTR_MESSAGE_INTEGRITY = 0x0008, /// bind request/response
+    STUN_ATTR_FINGERPRINT       = 0x8028, /// rfc5389
+};
+
+enum WHIPState {
+    WHIP_STATE_NONE,
+
+    /* The initial state. */
+    WHIP_STATE_INIT,
+    /* The muxer has sent the offer to the peer. */
+    WHIP_STATE_OFFER,
+    /* The muxer has received the answer from the peer. */
+    WHIP_STATE_ANSWER,
+    /**
+     * After parsing the answer received from the peer, the muxer negotiates the abilities
+     * in the offer that it generated.
+     */
+    WHIP_STATE_NEGOTIATED,
+    /* The muxer has connected to the peer via UDP. */
+    WHIP_STATE_UDP_CONNECTED,
+    /* The muxer has sent the ICE request to the peer. */
+    WHIP_STATE_ICE_CONNECTING,
+    /* The muxer has received the ICE response from the peer. */
+    WHIP_STATE_ICE_CONNECTED,
+    /* The muxer starts attempting the DTLS handshake. */
+    WHIP_STATE_DTLS_CONNECTING,
+    /* The muxer has finished the DTLS handshake with the peer. */
+    WHIP_STATE_DTLS_FINISHED,
+    /* The muxer has finished the SRTP setup. */
+    WHIP_STATE_SRTP_FINISHED,
+    /* The muxer is ready to send/receive media frames. */
+    WHIP_STATE_READY,
+    /* The muxer has failed. */
+    WHIP_STATE_FAILED,
+};
+
+typedef struct WHIPContext {
+    AVClass *av_class;
+
+    /* The state of the RTC connection. */
+    enum WHIPState state;
+    /* The callback return value for DTLS. */
+    int dtls_ret;
+    int dtls_closed;
+
+    /* Parameters for the input audio and video codecs. */
+    AVCodecParameters *audio_par;
+    AVCodecParameters *video_par;
+
+    /**
+     * The h264_mp4toannexb Bitstream Filter (BSF) bypasses the AnnexB packet;
+     * therefore, it is essential to insert the SPS and PPS before each IDR frame
+     * in such cases.
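+     *
+     * This flag is set in whip_check_bitstream() below when the incoming packets
+     * are already in Annex B form (i.e. start with a 00 00 01 start code), and it
+     * is acted upon for every IDR packet in whip_write_packet().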
+ */ + int h264_annexb_insert_sps_pps; + + /* The random number generator. */ + AVLFG rnd; + + /* The ICE username and pwd fragment generated by the muxer. */ + char ice_ufrag_local[9]; + char ice_pwd_local[33]; + /* The SSRC of the audio and video stream, generated by the muxer. */ + uint32_t audio_ssrc; + uint32_t video_ssrc; + /* The PT(Payload Type) of stream, generated by the muxer. */ + uint8_t audio_payload_type; + uint8_t video_payload_type; + /** + * This is the SDP offer generated by the muxer based on the codec parameters, + * DTLS, and ICE information. + */ + char *sdp_offer; + + /* The ICE username and pwd from remote server. */ + char *ice_ufrag_remote; + char *ice_pwd_remote; + /** + * This represents the ICE candidate protocol, priority, host and port. + * Currently, we only support one candidate and choose the first UDP candidate. + * However, we plan to support multiple candidates in the future. + */ + char *ice_protocol; + char *ice_host; + int ice_port; + + /* The SDP answer received from the WebRTC server. */ + char *sdp_answer; + /* The resource URL returned in the Location header of WHIP HTTP response. */ + char *whip_resource_url; + + /* These variables represent timestamps used for calculating and tracking the cost. */ + int64_t whip_starttime; + int64_t whip_init_time; + int64_t whip_offer_time; + int64_t whip_answer_time; + int64_t whip_udp_time; + int64_t whip_ice_time; + int64_t whip_dtls_time; + int64_t whip_srtp_time; + + /* The certificate and private key content used for DTLS hanshake */ + char cert_buf[MAX_CERTIFICATE_SIZE]; + char key_buf[MAX_CERTIFICATE_SIZE]; + /* The fingerprint of certificate, used in SDP offer. */ + char *dtls_fingerprint; + /** + * This represents the material used to build the SRTP master key. It is + * generated by DTLS and has the following layout: + * 16B 16B 14B 14B + * client_key | server_key | client_salt | server_salt + */ + uint8_t dtls_srtp_materials[(DTLS_SRTP_KEY_LEN + DTLS_SRTP_SALT_LEN) * 2]; + + char ssl_error_message[256]; + + /* TODO: Use AVIOContext instead of URLContext */ + URLContext *dtls_uc; + + /* The SRTP send context, to encrypt outgoing packets. */ + SRTPContext srtp_audio_send; + SRTPContext srtp_video_send; + SRTPContext srtp_rtcp_send; + /* The SRTP receive context, to decrypt incoming packets. */ + SRTPContext srtp_recv; + + /* The UDP transport is used for delivering ICE, DTLS and SRTP packets. */ + URLContext *udp; + /* The buffer for UDP transmission. */ + char buf[MAX_UDP_BUFFER_SIZE]; + + /* The timeout in milliseconds for ICE and DTLS handshake. */ + int handshake_timeout; + /** + * The size of RTP packet, should generally be set to MTU. + * Note that pion requires a smaller value, for example, 1200. + */ + int pkt_size; + /** + * The optional Bearer token for WHIP Authorization. + * See https://www.ietf.org/archive/id/draft-ietf-wish-whip-08.html#name-authentication-and-authoriz + */ + char* authorization; + /* The certificate and private key used for DTLS handshake. */ + char* cert_file; + char* key_file; +} WHIPContext; + +/** + * Whether the packet is a DTLS packet. 
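+ *
+ * For illustration, the first bytes of a DTLS 1.2 handshake record look like
+ * 0x16 0xfe 0xfd ..., i.e. ContentType 22 (handshake) followed by the
+ * ProtocolVersion DTLS_VERSION_12 (0xfefd) that is checked below.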
+ */ +static int is_dtls_packet(uint8_t *b, int size) { + uint16_t version = AV_RB16(&b[1]); + return size > DTLS_RECORD_LAYER_HEADER_LEN && + b[0] >= DTLS_CONTENT_TYPE_CHANGE_CIPHER_SPEC && + (version == DTLS_VERSION_10 || version == DTLS_VERSION_12); +} + + +/** + * Get or Generate a self-signed certificate and private key for DTLS, + * fingerprint for SDP + */ +static av_cold int certificate_key_init(AVFormatContext *s) +{ + int ret = 0; + WHIPContext *whip = s->priv_data; + + if (whip->cert_file && whip->key_file) { + /* Read the private key and certificate from the file. */ + if ((ret = ff_ssl_read_key_cert(whip->key_file, whip->cert_file, + whip->key_buf, sizeof(whip->key_buf), + whip->cert_buf, sizeof(whip->cert_buf), + &whip->dtls_fingerprint)) < 0) { + av_log(s, AV_LOG_ERROR, "DTLS: Failed to read DTLS certificate from cert=%s, key=%s\n", + whip->cert_file, whip->key_file); + return ret; + } + } else { + /* Generate a private key to ctx->dtls_pkey and self-signed certificate. */ + if ((ret = ff_ssl_gen_key_cert(whip->key_buf, sizeof(whip->key_buf), + whip->cert_buf, sizeof(whip->cert_buf), + &whip->dtls_fingerprint)) < 0) { + av_log(s, AV_LOG_ERROR, "DTLS: Failed to generate DTLS private key and certificate\n"); + return ret; + } + } + + return ret; +} + +/** + * When DTLS state change. + */ +static int dtls_context_on_state(AVFormatContext *s, const char* type, const char* desc) +{ + int ret = 0; + WHIPContext *whip = s->priv_data; + int state = ff_dtls_state(whip->dtls_uc); + + if (state == DTLS_STATE_CLOSED) { + whip->dtls_closed = 1; + av_log(whip, AV_LOG_VERBOSE, "WHIP: DTLS session closed, type=%s, desc=%s, elapsed=%dms\n", + type ? type : "", desc ? desc : "", ELAPSED(whip->whip_starttime, av_gettime())); + goto error; + } + + if (state == DTLS_STATE_FAILED) { + whip->state = WHIP_STATE_FAILED; + av_log(whip, AV_LOG_ERROR, "WHIP: DTLS session failed, type=%s, desc=%s\n", + type ? type : "", desc ? desc : ""); + whip->dtls_ret = AVERROR(EIO); + goto error; + } + + if (state == DTLS_STATE_FINISHED && whip->state < WHIP_STATE_DTLS_FINISHED) { + whip->state = WHIP_STATE_DTLS_FINISHED; + whip->whip_dtls_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: DTLS handshake is done, elapsed=%dms\n", + ELAPSED(whip->whip_starttime, av_gettime())); + return ret; + } +error: + return -1; +} + +static av_cold int dtls_initialize(AVFormatContext *s) +{ + WHIPContext *whip = s->priv_data; + /* reuse the udp created by whip */ + ff_dtls_set_udp(whip->dtls_uc, whip->udp); + return 0; +} + +/** + * Initialize and check the options for the WebRTC muxer. + */ +static av_cold int initialize(AVFormatContext *s) +{ + int ret, ideal_pkt_size = 532; + WHIPContext *whip = s->priv_data; + uint32_t seed; + + whip->whip_starttime = av_gettime(); + + ret = certificate_key_init(s); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to init certificate and key\n"); + return ret; + } + + /* Initialize the random number generator. 
*/ + seed = av_get_random_seed(); + av_lfg_init(&whip->rnd, seed); + + if (whip->pkt_size < ideal_pkt_size) + av_log(whip, AV_LOG_WARNING, "WHIP: pkt_size=%d(<%d) is too small, may cause packet loss\n", + whip->pkt_size, ideal_pkt_size); + + if (whip->state < WHIP_STATE_INIT) + whip->state = WHIP_STATE_INIT; + whip->whip_init_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: Init state=%d, handshake_timeout=%dms, pkt_size=%d, seed=%d, elapsed=%dms\n", + whip->state, whip->handshake_timeout, whip->pkt_size, seed, ELAPSED(whip->whip_starttime, av_gettime())); + + return 0; +} + +/** + * When duplicating a stream, the demuxer has already set the extradata, profile, and + * level of the par. Keep in mind that this function will not be invoked since the + * profile and level are set. + * + * When utilizing an encoder, such as libx264, to encode a stream, the extradata in + * par->extradata contains the SPS, which includes profile and level information. + * However, the profile and level of par remain unspecified. Therefore, it is necessary + * to extract the profile and level data from the extradata and assign it to the par's + * profile and level. Keep in mind that AVFMT_GLOBALHEADER must be enabled; otherwise, + * the extradata will remain empty. + */ +static int parse_profile_level(AVFormatContext *s, AVCodecParameters *par) +{ + int ret = 0; + const uint8_t *r = par->extradata, *r1, *end = par->extradata + par->extradata_size; + H264SPS seq, *const sps = &seq; + uint32_t state; + WHIPContext *whip = s->priv_data; + + if (par->codec_id != AV_CODEC_ID_H264) + return ret; + + if (par->profile != AV_PROFILE_UNKNOWN && par->level != AV_LEVEL_UNKNOWN) + return ret; + + if (!par->extradata || par->extradata_size <= 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unable to parse profile from empty extradata=%p, size=%d\n", + par->extradata, par->extradata_size); + return AVERROR(EINVAL); + } + + while (1) { + r = avpriv_find_start_code(r, end, &state); + if (r >= end) + break; + + r1 = ff_nal_find_startcode(r, end); + if ((state & 0x1f) == H264_NAL_SPS) { + ret = ff_avc_decode_sps(sps, r, r1 - r); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to decode SPS, state=%x, size=%d\n", + state, (int)(r1 - r)); + return ret; + } + + av_log(whip, AV_LOG_VERBOSE, "WHIP: Parse profile=%d, level=%d from SPS\n", + sps->profile_idc, sps->level_idc); + par->profile = sps->profile_idc; + par->level = sps->level_idc; + } + + r = r1; + } + + return ret; +} + +/** + * Parses video SPS/PPS from the extradata of codecpar and checks the codec. + * Currently only supports video(h264) and audio(opus). Note that only baseline + * and constrained baseline profiles of h264 are supported. + * + * If the profile is less than 0, the function considers the profile as baseline. + * It may need to parse the profile from SPS/PPS. This situation occurs when ingesting + * desktop and transcoding. + * + * @param s Pointer to the AVFormatContext + * @returns Returns 0 if successful or AVERROR_xxx in case of an error. + * + * TODO: FIXME: There is an issue with the timestamp of OPUS audio, especially when + * the input is an MP4 file. The timestamp deviates from the expected value of 960, + * causing Chrome to play the audio stream with noise. This problem can be replicated + * by transcoding a specific file into MP4 format and publishing it using the WHIP + * muxer. However, when directly transcoding and publishing through the WHIP muxer, + * the issue is not present, and the audio timestamp remains consistent. 
The root + * cause is still unknown, and this comment has been added to address this issue + * in the future. Further research is needed to resolve the problem. + */ +static int parse_codec(AVFormatContext *s) +{ + int i, ret = 0; + WHIPContext *whip = s->priv_data; + + for (i = 0; i < s->nb_streams; i++) { + AVCodecParameters *par = s->streams[i]->codecpar; + const AVCodecDescriptor *desc = avcodec_descriptor_get(par->codec_id); + switch (par->codec_type) { + case AVMEDIA_TYPE_VIDEO: + if (whip->video_par) { + av_log(whip, AV_LOG_ERROR, "WHIP: Only one video stream is supported by RTC\n"); + return AVERROR(EINVAL); + } + whip->video_par = par; + + if (par->codec_id != AV_CODEC_ID_H264) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unsupported video codec %s by RTC, choose h264\n", + desc ? desc->name : "unknown"); + return AVERROR_PATCHWELCOME; + } + + if (par->video_delay > 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unsupported B frames by RTC\n"); + return AVERROR_PATCHWELCOME; + } + + if ((ret = parse_profile_level(s, par)) < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to parse SPS/PPS from extradata\n"); + return AVERROR(EINVAL); + } + + if (par->profile == AV_PROFILE_UNKNOWN) { + av_log(whip, AV_LOG_WARNING, "WHIP: No profile found in extradata, consider baseline\n"); + return AVERROR(EINVAL); + } + if (par->level == AV_LEVEL_UNKNOWN) { + av_log(whip, AV_LOG_WARNING, "WHIP: No level found in extradata, consider 3.1\n"); + return AVERROR(EINVAL); + } + break; + case AVMEDIA_TYPE_AUDIO: + if (whip->audio_par) { + av_log(whip, AV_LOG_ERROR, "WHIP: Only one audio stream is supported by RTC\n"); + return AVERROR(EINVAL); + } + whip->audio_par = par; + + if (par->codec_id != AV_CODEC_ID_OPUS) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unsupported audio codec %s by RTC, choose opus\n", + desc ? desc->name : "unknown"); + return AVERROR_PATCHWELCOME; + } + + if (par->ch_layout.nb_channels != 2) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unsupported audio channels %d by RTC, choose stereo\n", + par->ch_layout.nb_channels); + return AVERROR_PATCHWELCOME; + } + + if (par->sample_rate != 48000) { + av_log(whip, AV_LOG_ERROR, "WHIP: Unsupported audio sample rate %d by RTC, choose 48000\n", par->sample_rate); + return AVERROR_PATCHWELCOME; + } + break; + default: + av_log(whip, AV_LOG_ERROR, "WHIP: Codec type '%s' for stream %d is not supported by RTC\n", + av_get_media_type_string(par->codec_type), i); + return AVERROR_PATCHWELCOME; + } + } + + return ret; +} + +/** + * Generate SDP offer according to the codec parameters, DTLS and ICE information. + * + * Note that we don't use av_sdp_create to generate SDP offer because it doesn't + * support DTLS and ICE information. + * + * @return 0 if OK, AVERROR_xxx on error + */ +static int generate_sdp_offer(AVFormatContext *s) +{ + int ret = 0, profile, level, profile_iop; + const char *acodec_name = NULL, *vcodec_name = NULL; + AVBPrint bp; + WHIPContext *whip = s->priv_data; + + /* To prevent a crash during cleanup, always initialize it. 
*/ + av_bprint_init(&bp, 1, MAX_SDP_SIZE); + + if (whip->sdp_offer) { + av_log(whip, AV_LOG_ERROR, "WHIP: SDP offer is already set\n"); + ret = AVERROR(EINVAL); + goto end; + } + + snprintf(whip->ice_ufrag_local, sizeof(whip->ice_ufrag_local), "%08x", + av_lfg_get(&whip->rnd)); + snprintf(whip->ice_pwd_local, sizeof(whip->ice_pwd_local), "%08x%08x%08x%08x", + av_lfg_get(&whip->rnd), av_lfg_get(&whip->rnd), av_lfg_get(&whip->rnd), + av_lfg_get(&whip->rnd)); + + whip->audio_ssrc = av_lfg_get(&whip->rnd); + whip->video_ssrc = av_lfg_get(&whip->rnd); + + whip->audio_payload_type = WHIP_RTP_PAYLOAD_TYPE_OPUS; + whip->video_payload_type = WHIP_RTP_PAYLOAD_TYPE_H264; + + av_bprintf(&bp, "" + "v=0\r\n" + "o=FFmpeg %s 2 IN IP4 %s\r\n" + "s=FFmpegPublishSession\r\n" + "t=0 0\r\n" + "a=group:BUNDLE 0 1\r\n" + "a=extmap-allow-mixed\r\n" + "a=msid-semantic: WMS\r\n", + WHIP_SDP_SESSION_ID, + WHIP_SDP_CREATOR_IP); + + if (whip->audio_par) { + if (whip->audio_par->codec_id == AV_CODEC_ID_OPUS) + acodec_name = "opus"; + + av_bprintf(&bp, "" + "m=audio 9 UDP/TLS/RTP/SAVPF %u\r\n" + "c=IN IP4 0.0.0.0\r\n" + "a=ice-ufrag:%s\r\n" + "a=ice-pwd:%s\r\n" + "a=fingerprint:sha-256 %s\r\n" + "a=setup:passive\r\n" + "a=mid:0\r\n" + "a=sendonly\r\n" + "a=msid:FFmpeg audio\r\n" + "a=rtcp-mux\r\n" + "a=rtpmap:%u %s/%d/%d\r\n" + "a=ssrc:%u cname:FFmpeg\r\n" + "a=ssrc:%u msid:FFmpeg audio\r\n", + whip->audio_payload_type, + whip->ice_ufrag_local, + whip->ice_pwd_local, + whip->dtls_fingerprint, + whip->audio_payload_type, + acodec_name, + whip->audio_par->sample_rate, + whip->audio_par->ch_layout.nb_channels, + whip->audio_ssrc, + whip->audio_ssrc); + } + + if (whip->video_par) { + profile_iop = profile = whip->video_par->profile; + level = whip->video_par->level; + if (whip->video_par->codec_id == AV_CODEC_ID_H264) { + vcodec_name = "H264"; + profile_iop &= AV_PROFILE_H264_CONSTRAINED; + profile &= (~AV_PROFILE_H264_CONSTRAINED); + } + + av_bprintf(&bp, "" + "m=video 9 UDP/TLS/RTP/SAVPF %u\r\n" + "c=IN IP4 0.0.0.0\r\n" + "a=ice-ufrag:%s\r\n" + "a=ice-pwd:%s\r\n" + "a=fingerprint:sha-256 %s\r\n" + "a=setup:passive\r\n" + "a=mid:1\r\n" + "a=sendonly\r\n" + "a=msid:FFmpeg video\r\n" + "a=rtcp-mux\r\n" + "a=rtcp-rsize\r\n" + "a=rtpmap:%u %s/90000\r\n" + "a=fmtp:%u level-asymmetry-allowed=1;packetization-mode=1;profile-level-id=%02x%02x%02x\r\n" + "a=ssrc:%u cname:FFmpeg\r\n" + "a=ssrc:%u msid:FFmpeg video\r\n", + whip->video_payload_type, + whip->ice_ufrag_local, + whip->ice_pwd_local, + whip->dtls_fingerprint, + whip->video_payload_type, + vcodec_name, + whip->video_payload_type, + profile, + profile_iop, + level, + whip->video_ssrc, + whip->video_ssrc); + } + + if (!av_bprint_is_complete(&bp)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Offer exceed max %d, %s\n", MAX_SDP_SIZE, bp.str); + ret = AVERROR(EIO); + goto end; + } + + whip->sdp_offer = av_strdup(bp.str); + if (!whip->sdp_offer) { + ret = AVERROR(ENOMEM); + goto end; + } + + if (whip->state < WHIP_STATE_OFFER) + whip->state = WHIP_STATE_OFFER; + whip->whip_offer_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: Generated state=%d, offer: %s\n", whip->state, whip->sdp_offer); + +end: + av_bprint_finalize(&bp, NULL); + return ret; +} + +/** + * Exchange SDP offer with WebRTC peer to get the answer. + * + * @return 0 if OK, AVERROR_xxx on error + */ +static int exchange_sdp(AVFormatContext *s) +{ + int ret; + char buf[MAX_URL_SIZE]; + AVBPrint bp; + WHIPContext *whip = s->priv_data; + /* The URL context is an HTTP transport layer for the WHIP protocol. 
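+     * In rough outline: the SDP offer is POSTed with Content-Type
+     * "application/sdp", the response body is read back as the SDP answer,
+     * and the Location header (if any) is remembered as the WHIP resource
+     * that the final DELETE request will be sent to.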
*/ + URLContext *whip_uc = NULL; + AVDictionary *opts = NULL; + char *hex_data = NULL; + + /* To prevent a crash during cleanup, always initialize it. */ + av_bprint_init(&bp, 1, MAX_SDP_SIZE); + + if (!whip->sdp_offer || !strlen(whip->sdp_offer)) { + av_log(whip, AV_LOG_ERROR, "WHIP: No offer to exchange\n"); + ret = AVERROR(EINVAL); + goto end; + } + + ret = snprintf(buf, sizeof(buf), "Cache-Control: no-cache\r\nContent-Type: application/sdp\r\n"); + if (whip->authorization) + ret += snprintf(buf + ret, sizeof(buf) - ret, "Authorization: Bearer %s\r\n", whip->authorization); + if (ret <= 0 || ret >= sizeof(buf)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to generate headers, size=%d, %s\n", ret, buf); + ret = AVERROR(EINVAL); + goto end; + } + + av_dict_set(&opts, "headers", buf, 0); + av_dict_set_int(&opts, "chunked_post", 0, 0); + + hex_data = av_mallocz(2 * strlen(whip->sdp_offer) + 1); + if (!hex_data) { + ret = AVERROR(ENOMEM); + goto end; + } + ff_data_to_hex(hex_data, whip->sdp_offer, strlen(whip->sdp_offer), 0); + av_dict_set(&opts, "post_data", hex_data, 0); + + ret = ffurl_open_whitelist(&whip_uc, s->url, AVIO_FLAG_READ_WRITE, &s->interrupt_callback, + &opts, s->protocol_whitelist, s->protocol_blacklist, NULL); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to request url=%s, offer: %s\n", s->url, whip->sdp_offer); + goto end; + } + + if (ff_http_get_new_location(whip_uc)) { + whip->whip_resource_url = av_strdup(ff_http_get_new_location(whip_uc)); + if (!whip->whip_resource_url) { + ret = AVERROR(ENOMEM); + goto end; + } + } + + while (1) { + ret = ffurl_read(whip_uc, buf, sizeof(buf)); + if (ret == AVERROR_EOF) { + /* Reset the error because we read all response as answer util EOF. */ + ret = 0; + break; + } + if (ret <= 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to read response from url=%s, offer is %s, answer is %s\n", + s->url, whip->sdp_offer, whip->sdp_answer); + goto end; + } + + av_bprintf(&bp, "%.*s", ret, buf); + if (!av_bprint_is_complete(&bp)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Answer exceed max size %d, %.*s, %s\n", MAX_SDP_SIZE, ret, buf, bp.str); + ret = AVERROR(EIO); + goto end; + } + } + + if (!av_strstart(bp.str, "v=", NULL)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Invalid answer: %s\n", bp.str); + ret = AVERROR(EINVAL); + goto end; + } + + whip->sdp_answer = av_strdup(bp.str); + if (!whip->sdp_answer) { + ret = AVERROR(ENOMEM); + goto end; + } + + if (whip->state < WHIP_STATE_ANSWER) + whip->state = WHIP_STATE_ANSWER; + av_log(whip, AV_LOG_VERBOSE, "WHIP: Got state=%d, answer: %s\n", whip->state, whip->sdp_answer); + +end: + ffurl_closep(&whip_uc); + av_bprint_finalize(&bp, NULL); + av_dict_free(&opts); + av_freep(&hex_data); + return ret; +} + +/** + * Parses the ICE ufrag, pwd, and candidates from the SDP answer. + * + * This function is used to extract the ICE ufrag, pwd, and candidates from the SDP answer. + * It returns an error if any of these fields is NULL. The function only uses the first + * candidate if there are multiple candidates. However, support for multiple candidates + * will be added in the future. + * + * @param s Pointer to the AVFormatContext + * @returns Returns 0 if successful or AVERROR_xxx if an error occurs. 
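+ *
+ * For illustration only, the answer lines of interest typically look like
+ * (hypothetical values; only the first UDP host candidate is used):
+ *   a=ice-ufrag:6Sdq
+ *   a=ice-pwd:y2leKTHBam0Rm9yJyzfsGvGw
+ *   a=candidate:1 1 udp 2130706431 192.0.2.10 8000 typ host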
+ */ +static int parse_answer(AVFormatContext *s) +{ + int ret = 0; + AVIOContext *pb; + char line[MAX_URL_SIZE]; + const char *ptr; + int i; + WHIPContext *whip = s->priv_data; + + if (!whip->sdp_answer || !strlen(whip->sdp_answer)) { + av_log(whip, AV_LOG_ERROR, "WHIP: No answer to parse\n"); + ret = AVERROR(EINVAL); + goto end; + } + + pb = avio_alloc_context(whip->sdp_answer, strlen(whip->sdp_answer), 0, NULL, NULL, NULL, NULL); + if (!pb) + return AVERROR(ENOMEM); + + for (i = 0; !avio_feof(pb); i++) { + ff_get_chomp_line(pb, line, sizeof(line)); + if (av_strstart(line, "a=ice-ufrag:", &ptr) && !whip->ice_ufrag_remote) { + whip->ice_ufrag_remote = av_strdup(ptr); + if (!whip->ice_ufrag_remote) { + ret = AVERROR(ENOMEM); + goto end; + } + } else if (av_strstart(line, "a=ice-pwd:", &ptr) && !whip->ice_pwd_remote) { + whip->ice_pwd_remote = av_strdup(ptr); + if (!whip->ice_pwd_remote) { + ret = AVERROR(ENOMEM); + goto end; + } + } else if (av_strstart(line, "a=candidate:", &ptr) && !whip->ice_protocol) { + ptr = av_stristr(ptr, "udp"); + if (ptr && av_stristr(ptr, "host")) { + char protocol[17], host[129]; + int priority, port; + ret = sscanf(ptr, "%16s %d %128s %d typ host", protocol, &priority, host, &port); + if (ret != 4) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed %d to parse line %d %s from %s\n", + ret, i, line, whip->sdp_answer); + ret = AVERROR(EIO); + goto end; + } + + if (av_strcasecmp(protocol, "udp")) { + av_log(whip, AV_LOG_ERROR, "WHIP: Protocol %s is not supported by RTC, choose udp, line %d %s of %s\n", + protocol, i, line, whip->sdp_answer); + ret = AVERROR(EIO); + goto end; + } + + whip->ice_protocol = av_strdup(protocol); + whip->ice_host = av_strdup(host); + whip->ice_port = port; + if (!whip->ice_protocol || !whip->ice_host) { + ret = AVERROR(ENOMEM); + goto end; + } + } + } + } + + if (!whip->ice_pwd_remote || !strlen(whip->ice_pwd_remote)) { + av_log(whip, AV_LOG_ERROR, "WHIP: No remote ice pwd parsed from %s\n", whip->sdp_answer); + ret = AVERROR(EINVAL); + goto end; + } + + if (!whip->ice_ufrag_remote || !strlen(whip->ice_ufrag_remote)) { + av_log(whip, AV_LOG_ERROR, "WHIP: No remote ice ufrag parsed from %s\n", whip->sdp_answer); + ret = AVERROR(EINVAL); + goto end; + } + + if (!whip->ice_protocol || !whip->ice_host || !whip->ice_port) { + av_log(whip, AV_LOG_ERROR, "WHIP: No ice candidate parsed from %s\n", whip->sdp_answer); + ret = AVERROR(EINVAL); + goto end; + } + + if (whip->state < WHIP_STATE_NEGOTIATED) + whip->state = WHIP_STATE_NEGOTIATED; + whip->whip_answer_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: SDP state=%d, offer=%luB, answer=%luB, ufrag=%s, pwd=%luB, transport=%s://%s:%d, elapsed=%dms\n", + whip->state, strlen(whip->sdp_offer), strlen(whip->sdp_answer), whip->ice_ufrag_remote, strlen(whip->ice_pwd_remote), + whip->ice_protocol, whip->ice_host, whip->ice_port, ELAPSED(whip->whip_starttime, av_gettime())); + +end: + avio_context_free(&pb); + return ret; +} + +/** + * Creates and marshals an ICE binding request packet. + * + * This function creates and marshals an ICE binding request packet. The function only + * generates the username attribute and does not include goog-network-info, ice-controlling, + * use-candidate, and priority. However, some of these attributes may be added in the future. 
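+ *
+ * The request written below is laid out roughly as: a 20-byte STUN header,
+ * USERNAME ("remote-ufrag:local-ufrag"), an empty USE-CANDIDATE attribute,
+ * MESSAGE-INTEGRITY (HMAC-SHA1 keyed with the remote ICE password) and a
+ * trailing FINGERPRINT (CRC-32 of the message XORed with 0x5354554e).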
+ * + * @param s Pointer to the AVFormatContext + * @param buf Pointer to memory buffer to store the request packet + * @param buf_size Size of the memory buffer + * @param request_size Pointer to an integer that receives the size of the request packet + * @return Returns 0 if successful or AVERROR_xxx if an error occurs. + */ +static int ice_create_request(AVFormatContext *s, uint8_t *buf, int buf_size, int *request_size) +{ + int ret, size, crc32; + char username[128]; + AVIOContext *pb = NULL; + AVHMAC *hmac = NULL; + WHIPContext *whip = s->priv_data; + + pb = avio_alloc_context(buf, buf_size, 1, NULL, NULL, NULL, NULL); + if (!pb) + return AVERROR(ENOMEM); + + hmac = av_hmac_alloc(AV_HMAC_SHA1); + if (!hmac) { + ret = AVERROR(ENOMEM); + goto end; + } + + /* Write 20 bytes header */ + avio_wb16(pb, 0x0001); /* STUN binding request */ + avio_wb16(pb, 0); /* length */ + avio_wb32(pb, STUN_MAGIC_COOKIE); /* magic cookie */ + avio_wb32(pb, av_lfg_get(&whip->rnd)); /* transaction ID */ + avio_wb32(pb, av_lfg_get(&whip->rnd)); /* transaction ID */ + avio_wb32(pb, av_lfg_get(&whip->rnd)); /* transaction ID */ + + /* The username is the concatenation of the two ICE ufrag */ + ret = snprintf(username, sizeof(username), "%s:%s", whip->ice_ufrag_remote, whip->ice_ufrag_local); + if (ret <= 0 || ret >= sizeof(username)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to build username %s:%s, max=%lu, ret=%d\n", + whip->ice_ufrag_remote, whip->ice_ufrag_local, sizeof(username), ret); + ret = AVERROR(EIO); + goto end; + } + + /* Write the username attribute */ + avio_wb16(pb, STUN_ATTR_USERNAME); /* attribute type username */ + avio_wb16(pb, ret); /* size of username */ + avio_write(pb, username, ret); /* bytes of username */ + ffio_fill(pb, 0, (4 - (ret % 4)) % 4); /* padding */ + + /* Write the use-candidate attribute */ + avio_wb16(pb, STUN_ATTR_USE_CANDIDATE); /* attribute type use-candidate */ + avio_wb16(pb, 0); /* size of use-candidate */ + + /* Build and update message integrity */ + avio_wb16(pb, STUN_ATTR_MESSAGE_INTEGRITY); /* attribute type message integrity */ + avio_wb16(pb, 20); /* size of message integrity */ + ffio_fill(pb, 0, 20); /* fill with zero to directly write and skip it */ + size = avio_tell(pb); + buf[2] = (size - 20) >> 8; + buf[3] = (size - 20) & 0xFF; + av_hmac_init(hmac, whip->ice_pwd_remote, strlen(whip->ice_pwd_remote)); + av_hmac_update(hmac, buf, size - 24); + av_hmac_final(hmac, buf + size - 20, 20); + + /* Write the fingerprint attribute */ + avio_wb16(pb, STUN_ATTR_FINGERPRINT); /* attribute type fingerprint */ + avio_wb16(pb, 4); /* size of fingerprint */ + ffio_fill(pb, 0, 4); /* fill with zero to directly write and skip it */ + size = avio_tell(pb); + buf[2] = (size - 20) >> 8; + buf[3] = (size - 20) & 0xFF; + /* Refer to the av_hash_alloc("CRC32"), av_hash_init and av_hash_final */ + crc32 = av_crc(av_crc_get_table(AV_CRC_32_IEEE_LE), 0xFFFFFFFF, buf, size - 8) ^ 0xFFFFFFFF; + avio_skip(pb, -4); + avio_wb32(pb, crc32 ^ 0x5354554E); /* xor with "STUN" */ + + *request_size = size; + +end: + avio_context_free(&pb); + av_hmac_free(hmac); + return ret; +} + +/** + * Create an ICE binding response. + * + * This function generates an ICE binding response and writes it to the provided + * buffer. The response is signed using the local password for message integrity. + * + * @param s Pointer to the AVFormatContext structure. + * @param tid Pointer to the transaction ID of the binding request. The tid_size should be 12. 
+ * @param tid_size The size of the transaction ID, should be 12. + * @param buf Pointer to the buffer where the response will be written. + * @param buf_size The size of the buffer provided for the response. + * @param response_size Pointer to an integer that will store the size of the generated response. + * @return Returns 0 if successful or AVERROR_xxx if an error occurs. + */ +static int ice_create_response(AVFormatContext *s, char *tid, int tid_size, uint8_t *buf, int buf_size, int *response_size) +{ + int ret = 0, size, crc32; + AVIOContext *pb = NULL; + AVHMAC *hmac = NULL; + WHIPContext *whip = s->priv_data; + + if (tid_size != 12) { + av_log(whip, AV_LOG_ERROR, "WHIP: Invalid transaction ID size. Expected 12, got %d\n", tid_size); + return AVERROR(EINVAL); + } + + pb = avio_alloc_context(buf, buf_size, 1, NULL, NULL, NULL, NULL); + if (!pb) + return AVERROR(ENOMEM); + + hmac = av_hmac_alloc(AV_HMAC_SHA1); + if (!hmac) { + ret = AVERROR(ENOMEM); + goto end; + } + + /* Write 20 bytes header */ + avio_wb16(pb, 0x0101); /* STUN binding response */ + avio_wb16(pb, 0); /* length */ + avio_wb32(pb, STUN_MAGIC_COOKIE); /* magic cookie */ + avio_write(pb, tid, tid_size); /* transaction ID */ + + /* Build and update message integrity */ + avio_wb16(pb, STUN_ATTR_MESSAGE_INTEGRITY); /* attribute type message integrity */ + avio_wb16(pb, 20); /* size of message integrity */ + ffio_fill(pb, 0, 20); /* fill with zero to directly write and skip it */ + size = avio_tell(pb); + buf[2] = (size - 20) >> 8; + buf[3] = (size - 20) & 0xFF; + av_hmac_init(hmac, whip->ice_pwd_local, strlen(whip->ice_pwd_local)); + av_hmac_update(hmac, buf, size - 24); + av_hmac_final(hmac, buf + size - 20, 20); + + /* Write the fingerprint attribute */ + avio_wb16(pb, STUN_ATTR_FINGERPRINT); /* attribute type fingerprint */ + avio_wb16(pb, 4); /* size of fingerprint */ + ffio_fill(pb, 0, 4); /* fill with zero to directly write and skip it */ + size = avio_tell(pb); + buf[2] = (size - 20) >> 8; + buf[3] = (size - 20) & 0xFF; + /* Refer to the av_hash_alloc("CRC32"), av_hash_init and av_hash_final */ + crc32 = av_crc(av_crc_get_table(AV_CRC_32_IEEE_LE), 0xFFFFFFFF, buf, size - 8) ^ 0xFFFFFFFF; + avio_skip(pb, -4); + avio_wb32(pb, crc32 ^ 0x5354554E); /* xor with "STUN" */ + + *response_size = size; + +end: + avio_context_free(&pb); + av_hmac_free(hmac); + return ret; +} + +/** + * A Binding request has class=0b00 (request) and method=0b000000000001 (Binding) + * and is encoded into the first 16 bits as 0x0001. + * See https://datatracker.ietf.org/doc/html/rfc5389#section-6 + */ +static int ice_is_binding_request(uint8_t *b, int size) +{ + return size >= ICE_STUN_HEADER_SIZE && AV_RB16(&b[0]) == 0x0001; +} + +/** + * A Binding response has class=0b10 (success response) and method=0b000000000001, + * and is encoded into the first 16 bits as 0x0101. + */ +static int ice_is_binding_response(uint8_t *b, int size) +{ + return size >= ICE_STUN_HEADER_SIZE && AV_RB16(&b[0]) == 0x0101; +} + +/** + * In RTP packets, the first byte is represented as 0b10xxxxxx, where the initial + * two bits (0b10) indicate the RTP version, + * see https://www.rfc-editor.org/rfc/rfc3550#section-5.1 + * The RTCP packet header is similar to RTP, + * see https://www.rfc-editor.org/rfc/rfc3550#section-6.4.1 + */ +static int media_is_rtp_rtcp(uint8_t *b, int size) +{ + return size >= WHIP_RTP_HEADER_SIZE && (b[0] & 0xC0) == 0x80; +} + +/* Whether the packet is RTCP. 
*/ +static int media_is_rtcp(uint8_t *b, int size) +{ + return size >= WHIP_RTP_HEADER_SIZE && b[1] >= WHIP_RTCP_PT_START && b[1] <= WHIP_RTCP_PT_END; +} + +/** + * This function handles incoming binding request messages by responding to them. + * If the message is not a binding request, it will be ignored. + */ +static int ice_handle_binding_request(AVFormatContext *s, char *buf, int buf_size) +{ + int ret = 0, size; + char tid[12]; + WHIPContext *whip = s->priv_data; + + /* Ignore if not a binding request. */ + if (!ice_is_binding_request(buf, buf_size)) + return ret; + + if (buf_size < ICE_STUN_HEADER_SIZE) { + av_log(whip, AV_LOG_ERROR, "WHIP: Invalid STUN message, expected at least %d, got %d\n", + ICE_STUN_HEADER_SIZE, buf_size); + return AVERROR(EINVAL); + } + + /* Parse transaction id from binding request in buf. */ + memcpy(tid, buf + 8, 12); + + /* Build the STUN binding response. */ + ret = ice_create_response(s, tid, sizeof(tid), whip->buf, sizeof(whip->buf), &size); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to create STUN binding response, size=%d\n", size); + return ret; + } + + ret = ffurl_write(whip->udp, whip->buf, size); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to send STUN binding response, size=%d\n", size); + return ret; + } + + return 0; +} + +/** + * To establish a connection with the UDP server, we utilize ICE-LITE in a Client-Server + * mode. In this setup, FFmpeg acts as the UDP client, while the peer functions as the + * UDP server. + */ +static int udp_connect(AVFormatContext *s) +{ + int ret = 0; + char url[256]; + AVDictionary *opts = NULL; + WHIPContext *whip = s->priv_data; + + /* Build UDP URL and create the UDP context as transport. */ + ff_url_join(url, sizeof(url), "udp", NULL, whip->ice_host, whip->ice_port, NULL); + + av_dict_set_int(&opts, "connect", 1, 0); + av_dict_set_int(&opts, "fifo_size", 0, 0); + /* Set the max packet size to the buffer size. */ + av_dict_set_int(&opts, "pkt_size", whip->pkt_size, 0); + + ret = ffurl_open_whitelist(&whip->udp, url, AVIO_FLAG_WRITE, &s->interrupt_callback, + &opts, s->protocol_whitelist, s->protocol_blacklist, NULL); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to connect udp://%s:%d\n", whip->ice_host, whip->ice_port); + goto end; + } + + /* Make the socket non-blocking, set to READ and WRITE mode after connected */ + ff_socket_nonblock(ffurl_get_file_handle(whip->udp), 1); + whip->udp->flags |= AVIO_FLAG_READ | AVIO_FLAG_NONBLOCK; + + if (whip->state < WHIP_STATE_UDP_CONNECTED) + whip->state = WHIP_STATE_UDP_CONNECTED; + whip->whip_udp_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: UDP state=%d, elapsed=%dms, connected to udp://%s:%d\n", + whip->state, ELAPSED(whip->whip_starttime, av_gettime()), whip->ice_host, whip->ice_port); + +end: + av_dict_free(&opts); + return ret; +} + +static int ice_dtls_handshake(AVFormatContext *s) +{ + int ret = 0, size, i; + int64_t starttime = av_gettime(), now; + WHIPContext *whip = s->priv_data; + AVDictionary *opts = NULL; + char str[8]; + char buf[256], *cert_buf = NULL, *key_buf = NULL; + + if (whip->state < WHIP_STATE_UDP_CONNECTED || !whip->udp) { + av_log(whip, AV_LOG_ERROR, "WHIP: UDP not connected, state=%d, udp=%p\n", whip->state, whip->udp); + return AVERROR(EINVAL); + } + + while (1) { + if (whip->state <= WHIP_STATE_ICE_CONNECTING) { + /* Build the STUN binding request. 
*/ + ret = ice_create_request(s, whip->buf, sizeof(whip->buf), &size); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to create STUN binding request, size=%d\n", size); + goto end; + } + + ret = ffurl_write(whip->udp, whip->buf, size); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to send STUN binding request, size=%d\n", size); + goto end; + } + + if (whip->state < WHIP_STATE_ICE_CONNECTING) + whip->state = WHIP_STATE_ICE_CONNECTING; + } + +next_packet: + if (whip->state >= WHIP_STATE_DTLS_FINISHED) + /* DTLS handshake is done, exit the loop. */ + break; + + now = av_gettime(); + if (now - starttime >= whip->handshake_timeout * 1000) { + av_log(whip, AV_LOG_ERROR, "WHIP: DTLS handshake timeout=%dms, cost=%dms, elapsed=%dms, state=%d\n", + whip->handshake_timeout, ELAPSED(starttime, now), ELAPSED(whip->whip_starttime, now), whip->state); + ret = AVERROR(ETIMEDOUT); + goto end; + } + + /* Read the STUN or DTLS messages from peer. */ + for (i = 0; i < ICE_DTLS_READ_INTERVAL / 5 && whip->state < WHIP_STATE_DTLS_CONNECTING; i++) { + ret = ffurl_read(whip->udp, whip->buf, sizeof(whip->buf)); + if (ret > 0) + break; + if (ret == AVERROR(EAGAIN)) { + av_usleep(5 * 1000); + continue; + } + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to read message\n"); + goto end; + } + + /* Got nothing, continue to process handshake. */ + if (ret <= 0 && whip->state < WHIP_STATE_DTLS_CONNECTING) + continue; + + /* Handle the ICE binding response. */ + if (ice_is_binding_response(whip->buf, ret)) { + if (whip->state < WHIP_STATE_ICE_CONNECTED) { + whip->state = WHIP_STATE_ICE_CONNECTED; + whip->whip_ice_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: ICE STUN ok, state=%d, url=udp://%s:%d, location=%s, username=%s:%s, res=%dB, elapsed=%dms\n", + whip->state, whip->ice_host, whip->ice_port, whip->whip_resource_url ? whip->whip_resource_url : "", + whip->ice_ufrag_remote, whip->ice_ufrag_local, ret, ELAPSED(whip->whip_starttime, av_gettime())); + + ff_url_join(buf, sizeof(buf), "dtls", NULL, whip->ice_host, whip->ice_port, NULL); + snprintf(str, sizeof(str), "%d", whip->pkt_size); + av_dict_set(&opts, "mtu", str, 0); + if (whip->cert_file) { + av_dict_set(&opts, "cert_file", whip->cert_file, 0); + } else + av_dict_set(&opts, "cert_buf", whip->cert_buf, 0); + + if (whip->key_file) { + av_dict_set(&opts, "key_file", whip->key_file, 0); + } else + av_dict_set(&opts, "key_buf", whip->key_buf, 0); + + av_dict_set(&opts, "fingerprint", whip->dtls_fingerprint, 0); + av_dict_set(&opts, "use_external_udp", "1", 0); + av_dict_set(&opts, "listen", "1", 0); + /* If got the first binding response, start DTLS handshake. */ + ret = ffurl_open_whitelist(&whip->dtls_uc, buf, AVIO_FLAG_READ_WRITE, &s->interrupt_callback, + &opts, s->protocol_whitelist, s->protocol_blacklist, NULL); + if (ret < 0) + goto end; + dtls_initialize(s); + } + goto next_packet; + } + + /* When a binding request is received, it is necessary to respond immediately. */ + if (ice_is_binding_request(whip->buf, ret)) { + if ((ret = ice_handle_binding_request(s, whip->buf, ret)) < 0) + goto end; + goto next_packet; + } + + /* If got any DTLS messages, handle it. 
*/ + if (is_dtls_packet(whip->buf, ret) && whip->state >= WHIP_STATE_ICE_CONNECTED || whip->state == WHIP_STATE_DTLS_CONNECTING) { + whip->state = WHIP_STATE_DTLS_CONNECTING; + if ((ret = ffurl_handshake(whip->dtls_uc)) < 0) + goto end; + dtls_context_on_state(s, NULL, NULL); + goto next_packet; + } + } + +end: + if (cert_buf) + av_free(cert_buf); + if (key_buf) + av_free(key_buf); + return ret; +} + +/** + * Establish the SRTP context using the keying material exported from DTLS. + * + * Create separate SRTP contexts for sending video and audio, as their sequences differ + * and should not share a single context. Generate a single SRTP context for receiving + * RTCP only. + * + * @return 0 if OK, AVERROR_xxx on error + */ +static int setup_srtp(AVFormatContext *s) +{ + int ret; + char recv_key[DTLS_SRTP_KEY_LEN + DTLS_SRTP_SALT_LEN]; + char send_key[DTLS_SRTP_KEY_LEN + DTLS_SRTP_SALT_LEN]; + char buf[AV_BASE64_SIZE(DTLS_SRTP_KEY_LEN + DTLS_SRTP_SALT_LEN)]; + /** + * The profile for OpenSSL's SRTP is SRTP_AES128_CM_SHA1_80, see ssl/d1_srtp.c. + * The profile for FFmpeg's SRTP is SRTP_AES128_CM_HMAC_SHA1_80, see libavformat/srtp.c. + */ + const char* suite = "SRTP_AES128_CM_HMAC_SHA1_80"; + WHIPContext *whip = s->priv_data; + ret = ff_dtls_export_materials(whip->dtls_uc, whip->dtls_srtp_materials, sizeof(whip->dtls_srtp_materials)); + if (ret < 0) + goto end; + /** + * This represents the material used to build the SRTP master key. It is + * generated by DTLS and has the following layout: + * 16B 16B 14B 14B + * client_key | server_key | client_salt | server_salt + */ + char *client_key = whip->dtls_srtp_materials; + char *server_key = whip->dtls_srtp_materials + DTLS_SRTP_KEY_LEN; + char *client_salt = server_key + DTLS_SRTP_KEY_LEN; + char *server_salt = client_salt + DTLS_SRTP_SALT_LEN; + + /* As DTLS server, the recv key is client master key plus salt. */ + memcpy(recv_key, client_key, DTLS_SRTP_KEY_LEN); + memcpy(recv_key + DTLS_SRTP_KEY_LEN, client_salt, DTLS_SRTP_SALT_LEN); + + /* As DTLS server, the send key is server master key plus salt. 
*/ + memcpy(send_key, server_key, DTLS_SRTP_KEY_LEN); + memcpy(send_key + DTLS_SRTP_KEY_LEN, server_salt, DTLS_SRTP_SALT_LEN); + + /* Setup SRTP context for outgoing packets */ + if (!av_base64_encode(buf, sizeof(buf), send_key, sizeof(send_key))) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to encode send key\n"); + ret = AVERROR(EIO); + goto end; + } + + ret = ff_srtp_set_crypto(&whip->srtp_audio_send, suite, buf); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to set crypto for audio send\n"); + goto end; + } + + ret = ff_srtp_set_crypto(&whip->srtp_video_send, suite, buf); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to set crypto for video send\n"); + goto end; + } + + ret = ff_srtp_set_crypto(&whip->srtp_rtcp_send, suite, buf); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "Failed to set crypto for rtcp send\n"); + goto end; + } + + /* Setup SRTP context for incoming packets */ + if (!av_base64_encode(buf, sizeof(buf), recv_key, sizeof(recv_key))) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to encode recv key\n"); + ret = AVERROR(EIO); + goto end; + } + + ret = ff_srtp_set_crypto(&whip->srtp_recv, suite, buf); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to set crypto for recv\n"); + goto end; + } + + if (whip->state < WHIP_STATE_SRTP_FINISHED) + whip->state = WHIP_STATE_SRTP_FINISHED; + whip->whip_srtp_time = av_gettime(); + av_log(whip, AV_LOG_VERBOSE, "WHIP: SRTP setup done, state=%d, suite=%s, key=%luB, elapsed=%dms\n", + whip->state, suite, sizeof(send_key), ELAPSED(whip->whip_starttime, av_gettime())); + +end: + return ret; +} + +/** + * Callback triggered by the RTP muxer when it creates and sends out an RTP packet. + * + * This function modifies the video STAP packet, removing the markers, and updating the + * NRI of the first NALU. Additionally, it uses the corresponding SRTP context to encrypt + * the RTP packet, where the video packet is handled by the video SRTP context. + */ +static int on_rtp_write_packet(void *opaque, const uint8_t *buf, int buf_size) +{ + int ret, cipher_size, is_rtcp, is_video; + uint8_t payload_type; + AVFormatContext *s = opaque; + WHIPContext *whip = s->priv_data; + SRTPContext *srtp; + + /* Ignore if not RTP or RTCP packet. */ + if (!media_is_rtp_rtcp(buf, buf_size)) + return 0; + + /* Only support audio, video and rtcp. */ + is_rtcp = media_is_rtcp(buf, buf_size); + payload_type = buf[1] & 0x7f; + is_video = payload_type == whip->video_payload_type; + if (!is_rtcp && payload_type != whip->video_payload_type && payload_type != whip->audio_payload_type) + return 0; + + /* Get the corresponding SRTP context. */ + srtp = is_rtcp ? &whip->srtp_rtcp_send : (is_video? &whip->srtp_video_send : &whip->srtp_audio_send); + + /* Encrypt by SRTP and send out. */ + cipher_size = ff_srtp_encrypt(srtp, buf, buf_size, whip->buf, sizeof(whip->buf)); + if (cipher_size <= 0 || cipher_size < buf_size) { + av_log(whip, AV_LOG_WARNING, "WHIP: Failed to encrypt packet=%dB, cipher=%dB\n", buf_size, cipher_size); + return 0; + } + + ret = ffurl_write(whip->udp, whip->buf, cipher_size); + if (ret < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to write packet=%dB, ret=%d\n", cipher_size, ret); + return ret; + } + + return ret; +} + +/** + * Creates dedicated RTP muxers for each stream in the AVFormatContext to build RTP + * packets from the encoded frames. + * + * The corresponding SRTP context is utilized to encrypt each stream's RTP packets. For + * example, a video SRTP context is used for the video stream. 
Additionally, the + * "on_rtp_write_packet" callback function is set as the write function for each RTP + * muxer to send out encrypted RTP packets. + * + * @return 0 if OK, AVERROR_xxx on error + */ +static int create_rtp_muxer(AVFormatContext *s) +{ + int ret, i, is_video, buffer_size, max_packet_size; + AVFormatContext *rtp_ctx = NULL; + AVDictionary *opts = NULL; + uint8_t *buffer = NULL; + char buf[64]; + WHIPContext *whip = s->priv_data; + + const AVOutputFormat *rtp_format = av_guess_format("rtp", NULL, NULL); + if (!rtp_format) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to guess rtp muxer\n"); + ret = AVERROR(ENOSYS); + goto end; + } + + /* The UDP buffer size, may greater than MTU. */ + buffer_size = MAX_UDP_BUFFER_SIZE; + /* The RTP payload max size. Reserved some bytes for SRTP checksum and padding. */ + max_packet_size = whip->pkt_size - DTLS_SRTP_CHECKSUM_LEN; + + for (i = 0; i < s->nb_streams; i++) { + rtp_ctx = avformat_alloc_context(); + if (!rtp_ctx) { + ret = AVERROR(ENOMEM); + goto end; + } + + rtp_ctx->oformat = rtp_format; + if (!avformat_new_stream(rtp_ctx, NULL)) { + ret = AVERROR(ENOMEM); + goto end; + } + /* Pass the interrupt callback on */ + rtp_ctx->interrupt_callback = s->interrupt_callback; + /* Copy the max delay setting; the rtp muxer reads this. */ + rtp_ctx->max_delay = s->max_delay; + /* Copy other stream parameters. */ + rtp_ctx->streams[0]->sample_aspect_ratio = s->streams[i]->sample_aspect_ratio; + rtp_ctx->flags |= s->flags & AVFMT_FLAG_BITEXACT; + rtp_ctx->strict_std_compliance = s->strict_std_compliance; + + /* Set the synchronized start time. */ + rtp_ctx->start_time_realtime = s->start_time_realtime; + + avcodec_parameters_copy(rtp_ctx->streams[0]->codecpar, s->streams[i]->codecpar); + rtp_ctx->streams[0]->time_base = s->streams[i]->time_base; + + /** + * For H.264, consistently utilize the annexb format through the Bitstream Filter (BSF); + * therefore, we deactivate the extradata detection for the RTP muxer. + */ + if (s->streams[i]->codecpar->codec_id == AV_CODEC_ID_H264) { + av_freep(&rtp_ctx->streams[i]->codecpar->extradata); + rtp_ctx->streams[i]->codecpar->extradata_size = 0; + } + + buffer = av_malloc(buffer_size); + if (!buffer) { + ret = AVERROR(ENOMEM); + goto end; + } + + rtp_ctx->pb = avio_alloc_context(buffer, buffer_size, 1, s, NULL, on_rtp_write_packet, NULL); + if (!rtp_ctx->pb) { + ret = AVERROR(ENOMEM); + goto end; + } + rtp_ctx->pb->max_packet_size = max_packet_size; + rtp_ctx->pb->av_class = &ff_avio_class; + + is_video = s->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO; + snprintf(buf, sizeof(buf), "%d", is_video? whip->video_payload_type : whip->audio_payload_type); + av_dict_set(&opts, "payload_type", buf, 0); + snprintf(buf, sizeof(buf), "%d", is_video? 
            whip->video_ssrc : whip->audio_ssrc);
+        av_dict_set(&opts, "ssrc", buf, 0);
+
+        ret = avformat_write_header(rtp_ctx, &opts);
+        if (ret < 0) {
+            av_log(whip, AV_LOG_ERROR, "WHIP: Failed to write rtp header\n");
+            goto end;
+        }
+
+        ff_format_set_url(rtp_ctx, av_strdup(s->url));
+        s->streams[i]->time_base = rtp_ctx->streams[0]->time_base;
+        s->streams[i]->priv_data = rtp_ctx;
+        rtp_ctx = NULL;
+    }
+
+    if (whip->state < WHIP_STATE_READY)
+        whip->state = WHIP_STATE_READY;
+    av_log(whip, AV_LOG_INFO, "WHIP: Muxer state=%d, buffer_size=%d, max_packet_size=%d, "
+                              "elapsed=%dms(init:%d,offer:%d,answer:%d,udp:%d,ice:%d,dtls:%d,srtp:%d)\n",
+        whip->state, buffer_size, max_packet_size, ELAPSED(whip->whip_starttime, av_gettime()),
+        ELAPSED(whip->whip_starttime, whip->whip_init_time),
+        ELAPSED(whip->whip_init_time, whip->whip_offer_time),
+        ELAPSED(whip->whip_offer_time, whip->whip_answer_time),
+        ELAPSED(whip->whip_answer_time, whip->whip_udp_time),
+        ELAPSED(whip->whip_udp_time, whip->whip_ice_time),
+        ELAPSED(whip->whip_ice_time, whip->whip_dtls_time),
+        ELAPSED(whip->whip_dtls_time, whip->whip_srtp_time));
+
+end:
+    if (rtp_ctx)
+        avio_context_free(&rtp_ctx->pb);
+    avformat_free_context(rtp_ctx);
+    av_dict_free(&opts);
+    return ret;
+}
+
+/**
+ * Since RTC runs over connectionless UDP, the server relies on a timeout to
+ * detect that a session is dead, and publishers cannot republish the stream
+ * until that session has timed out.
+ * This function notifies the server that the stream has ended, so that the
+ * server can expire and close the session immediately and publishers can
+ * republish the stream quickly.
+ */
+static int dispose_session(AVFormatContext *s)
+{
+    int ret;
+    char buf[MAX_URL_SIZE];
+    URLContext *whip_uc = NULL;
+    AVDictionary *opts = NULL;
+    WHIPContext *whip = s->priv_data;
+
+    if (!whip->whip_resource_url)
+        return 0;
+
+    ret = snprintf(buf, sizeof(buf), "Cache-Control: no-cache\r\n");
+    if (whip->authorization)
+        ret += snprintf(buf + ret, sizeof(buf) - ret, "Authorization: Bearer %s\r\n", whip->authorization);
+    if (ret <= 0 || ret >= sizeof(buf)) {
+        av_log(whip, AV_LOG_ERROR, "WHIP: Failed to generate headers, size=%d, %s\n", ret, buf);
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    av_dict_set(&opts, "headers", buf, 0);
+    av_dict_set_int(&opts, "chunked_post", 0, 0);
+    av_dict_set(&opts, "method", "DELETE", 0);
+    ret = ffurl_open_whitelist(&whip_uc, whip->whip_resource_url, AVIO_FLAG_READ_WRITE, &s->interrupt_callback,
+        &opts, s->protocol_whitelist, s->protocol_blacklist, NULL);
+    if (ret < 0) {
+        av_log(whip, AV_LOG_ERROR, "WHIP: Failed to DELETE url=%s\n", whip->whip_resource_url);
+        goto end;
+    }
+
+    while (1) {
+        ret = ffurl_read(whip_uc, buf, sizeof(buf));
+        if (ret == AVERROR_EOF) {
+            ret = 0;
+            break;
+        }
+        if (ret < 0) {
+            av_log(whip, AV_LOG_ERROR, "WHIP: Failed to read response from DELETE url=%s\n", whip->whip_resource_url);
+            goto end;
+        }
+    }
+
+    av_log(whip, AV_LOG_INFO, "WHIP: Dispose resource %s ok\n", whip->whip_resource_url);
+
+end:
+    ffurl_closep(&whip_uc);
+    av_dict_free(&opts);
+    return ret;
+}
+
+/**
+ * Since the h264_mp4toannexb filter only processes the MP4 ISOM format and bypasses
+ * the annexb format, it is necessary to manually insert encoder metadata before each
+ * IDR when dealing with annexb format packets. For instance, in the case of H.264,
+ * we must insert SPS and PPS before the IDR frame.
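+ *
+ * Illustrative before/after for one Annex B access unit carrying an IDR slice:
+ *   in:  00 00 01 IDR-slice ...
+ *   out: 00 00 01 SPS | 00 00 01 PPS | 00 00 01 IDR-slice ...
+ * Packets that already contain SPS and PPS, or that carry no IDR, are left as-is.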
+ */ +static int h264_annexb_insert_sps_pps(AVFormatContext *s, AVPacket *pkt) +{ + int ret = 0; + AVPacket *in = NULL; + AVCodecParameters *par = s->streams[pkt->stream_index]->codecpar; + uint32_t nal_size = 0, out_size = par ? par->extradata_size : 0; + uint8_t unit_type, sps_seen = 0, pps_seen = 0, idr_seen = 0, *out; + const uint8_t *buf, *buf_end, *r1; + + if (!pkt || !pkt->data || pkt->size <= 0) + return ret; + if (!par || !par->extradata || par->extradata_size <= 0) + return ret; + + /* Discover NALU type from packet. */ + buf_end = pkt->data + pkt->size; + for (buf = ff_nal_find_startcode(pkt->data, buf_end); buf < buf_end; buf += nal_size) { + while (!*(buf++)); + r1 = ff_nal_find_startcode(buf, buf_end); + if ((nal_size = r1 - buf) > 0) { + unit_type = *buf & 0x1f; + if (unit_type == H264_NAL_SPS) { + sps_seen = 1; + } else if (unit_type == H264_NAL_PPS) { + pps_seen = 1; + } else if (unit_type == H264_NAL_IDR_SLICE) { + idr_seen = 1; + } + + out_size += 3 + nal_size; + } + } + + if (!idr_seen || (sps_seen && pps_seen)) + return ret; + + /* See av_bsf_send_packet */ + in = av_packet_alloc(); + if (!in) + return AVERROR(ENOMEM); + + ret = av_packet_make_refcounted(pkt); + if (ret < 0) + goto fail; + + av_packet_move_ref(in, pkt); + + /* Create a new packet with sps/pps inserted. */ + ret = av_new_packet(pkt, out_size); + if (ret < 0) + goto fail; + + ret = av_packet_copy_props(pkt, in); + if (ret < 0) + goto fail; + + memcpy(pkt->data, par->extradata, par->extradata_size); + out = pkt->data + par->extradata_size; + buf_end = in->data + in->size; + for (buf = ff_nal_find_startcode(in->data, buf_end); buf < buf_end; buf += nal_size) { + while (!*(buf++)); + r1 = ff_nal_find_startcode(buf, buf_end); + if ((nal_size = r1 - buf) > 0) { + AV_WB24(out, 0x00001); + memcpy(out + 3, buf, nal_size); + out += 3 + nal_size; + } + } + +fail: + if (ret < 0) + av_packet_unref(pkt); + av_packet_free(&in); + + return ret; +} + +static av_cold int whip_init(AVFormatContext *s) +{ + int ret; + WHIPContext *whip = s->priv_data; + + if ((ret = initialize(s)) < 0) + goto end; + + if ((ret = parse_codec(s)) < 0) + goto end; + + if ((ret = generate_sdp_offer(s)) < 0) + goto end; + + if ((ret = exchange_sdp(s)) < 0) + goto end; + + if ((ret = parse_answer(s)) < 0) + goto end; + + if ((ret = udp_connect(s)) < 0) + goto end; + + if ((ret = ice_dtls_handshake(s)) < 0) + goto end; + + if ((ret = setup_srtp(s)) < 0) + goto end; + + if ((ret = create_rtp_muxer(s)) < 0) + goto end; + +end: + if (ret < 0 && whip->state < WHIP_STATE_FAILED) + whip->state = WHIP_STATE_FAILED; + if (ret >= 0 && whip->state >= WHIP_STATE_FAILED && whip->dtls_ret < 0) + ret = whip->dtls_ret; + return ret; +} + +static int whip_write_packet(AVFormatContext *s, AVPacket *pkt) +{ + int ret; + WHIPContext *whip = s->priv_data; + AVStream *st = s->streams[pkt->stream_index]; + AVFormatContext *rtp_ctx = st->priv_data; + + /* TODO: Send binding request every 1s as WebRTC heartbeat. */ + + /** + * Receive packets from the server such as ICE binding requests, DTLS messages, + * and RTCP like PLI requests, then respond to them. 
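+     *
+     * For example, a keyframe request would show up here as an RTCP PLI
+     * (payload-specific feedback, PT 206); in the code below only DTLS
+     * messages are forwarded, other packets are simply drained.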
+ */ + ret = ffurl_read(whip->udp, whip->buf, sizeof(whip->buf)); + if (ret > 0) { + if (is_dtls_packet(whip->buf, ret)) { + if ((ret = ffurl_write(whip->dtls_uc, whip->buf, ret)) < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to handle DTLS message\n"); + goto end; + } + } + } else if (ret != AVERROR(EAGAIN)) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to read from UDP socket\n"); + goto end; + } + + if (whip->h264_annexb_insert_sps_pps && st->codecpar->codec_id == AV_CODEC_ID_H264) { + if ((ret = h264_annexb_insert_sps_pps(s, pkt)) < 0) { + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to insert SPS/PPS before IDR\n"); + goto end; + } + } + + ret = ff_write_chained(rtp_ctx, 0, pkt, s, 0); + if (ret < 0) { + if (ret == AVERROR(EINVAL)) { + av_log(whip, AV_LOG_WARNING, "WHIP: Ignore failed to write packet=%dB, ret=%d\n", pkt->size, ret); + ret = 0; + } else + av_log(whip, AV_LOG_ERROR, "WHIP: Failed to write packet, size=%d\n", pkt->size); + goto end; + } + +end: + if (ret < 0 && whip->state < WHIP_STATE_FAILED) + whip->state = WHIP_STATE_FAILED; + if (ret >= 0 && whip->state >= WHIP_STATE_FAILED && whip->dtls_ret < 0) + ret = whip->dtls_ret; + if (ret >= 0 && whip->dtls_closed) + ret = AVERROR(EIO); + return ret; +} + +static av_cold void whip_deinit(AVFormatContext *s) +{ + int i, ret; + WHIPContext *whip = s->priv_data; + + ret = dispose_session(s); + if (ret < 0) + av_log(whip, AV_LOG_WARNING, "WHIP: Failed to dispose resource, ret=%d\n", ret); + + for (i = 0; i < s->nb_streams; i++) { + AVFormatContext* rtp_ctx = s->streams[i]->priv_data; + if (!rtp_ctx) + continue; + + av_write_trailer(rtp_ctx); + /** + * Keep in mind that it is necessary to free the buffer of pb since we allocate + * it and pass it to pb using avio_alloc_context, while avio_context_free does + * not perform this action. 
+ */ + av_freep(&rtp_ctx->pb->buffer); + avio_context_free(&rtp_ctx->pb); + avformat_free_context(rtp_ctx); + s->streams[i]->priv_data = NULL; + } + + av_freep(&whip->sdp_offer); + av_freep(&whip->sdp_answer); + av_freep(&whip->whip_resource_url); + av_freep(&whip->ice_ufrag_remote); + av_freep(&whip->ice_pwd_remote); + av_freep(&whip->ice_protocol); + av_freep(&whip->ice_host); + av_freep(&whip->authorization); + av_freep(&whip->cert_file); + av_freep(&whip->key_file); + ffurl_closep(&whip->udp); + ff_srtp_free(&whip->srtp_audio_send); + ff_srtp_free(&whip->srtp_video_send); + ff_srtp_free(&whip->srtp_rtcp_send); + ff_srtp_free(&whip->srtp_recv); + ffurl_close(whip->dtls_uc); +} + +static int whip_check_bitstream(AVFormatContext *s, AVStream *st, const AVPacket *pkt) +{ + int ret = 1, extradata_isom = 0; + uint8_t *b = pkt->data; + WHIPContext *whip = s->priv_data; + + if (st->codecpar->codec_id == AV_CODEC_ID_H264) { + extradata_isom = st->codecpar->extradata_size > 0 && st->codecpar->extradata[0] == 1; + if (pkt->size >= 5 && AV_RB32(b) != 0x0000001 && (AV_RB24(b) != 0x000001 || extradata_isom)) { + ret = ff_stream_add_bitstream_filter(st, "h264_mp4toannexb", NULL); + av_log(whip, AV_LOG_VERBOSE, "WHIP: Enable BSF h264_mp4toannexb, packet=[%x %x %x %x %x ...], extradata_isom=%d\n", + b[0], b[1], b[2], b[3], b[4], extradata_isom); + } else + whip->h264_annexb_insert_sps_pps = 1; + } + + return ret; +} + +#define OFFSET(x) offsetof(WHIPContext, x) +#define DEC AV_OPT_FLAG_DECODING_PARAM +static const AVOption options[] = { + { "handshake_timeout", "Timeout in milliseconds for ICE and DTLS handshake.", OFFSET(handshake_timeout), AV_OPT_TYPE_INT, { .i64 = 5000 }, -1, INT_MAX, DEC }, + { "pkt_size", "The maximum size, in bytes, of RTP packets that send out", OFFSET(pkt_size), AV_OPT_TYPE_INT, { .i64 = 1200 }, -1, INT_MAX, DEC }, + { "authorization", "The optional Bearer token for WHIP Authorization", OFFSET(authorization), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC }, + { "cert_file", "The optional certificate file path for DTLS", OFFSET(cert_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC }, + { "key_file", "The optional private key file path for DTLS", OFFSET(key_file), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC }, + { NULL }, +}; + +static const AVClass whip_muxer_class = { + .class_name = "WHIP muxer", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const FFOutputFormat ff_whip_muxer = { + .p.name = "whip", + .p.long_name = NULL_IF_CONFIG_SMALL("WHIP(WebRTC-HTTP ingestion protocol) muxer"), + .p.audio_codec = AV_CODEC_ID_OPUS, + .p.video_codec = AV_CODEC_ID_H264, + .p.flags = AVFMT_GLOBALHEADER | AVFMT_NOFILE, + .p.priv_class = &whip_muxer_class, + .priv_data_size = sizeof(WHIPContext), + .init = whip_init, + .write_packet = whip_write_packet, + .deinit = whip_deinit, + .check_bitstream = whip_check_bitstream, +}; diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S index 50ce7d4dfd96b..2e4e451ec2517 100644 --- a/libavutil/aarch64/asm.S +++ b/libavutil/aarch64/asm.S @@ -196,7 +196,7 @@ DISABLE_SVE2 .popsection #endif -.macro function name, export=0, align=2 +.macro function name, export=0, align=4 .macro endfunc ELF .size \name, . - \name FUNC .endfunc @@ -217,7 +217,7 @@ FUNC .func \name .endif .endm -.macro const name, align=2, relocate=0 +.macro const name, align=4, relocate=0 .macro endconst ELF .size \name, . 
- \name .purgem endconst diff --git a/libavutil/avassert.h b/libavutil/avassert.h index 1895fb75513bf..8dbdb01566709 100644 --- a/libavutil/avassert.h +++ b/libavutil/avassert.h @@ -31,6 +31,7 @@ #ifdef HAVE_AV_CONFIG_H # include "config.h" #endif +#include "attributes.h" #include "log.h" #include "macros.h" @@ -75,4 +76,45 @@ */ void av_assert0_fpu(void); +/** + * Asserts that are used as compiler optimization hints depending + * upon ASSERT_LEVEL and NDEBUG. + * + * Undefined behaviour occurs if execution reaches a point marked + * with av_unreachable() or if a condition used with av_assume() + * is false. + * + * The condition used with av_assume() should not have side-effects + * and should be visible to the compiler. + */ +#if defined(ASSERT_LEVEL) ? ASSERT_LEVEL > 0 : !defined(HAVE_AV_CONFIG_H) && !defined(NDEBUG) +#define av_unreachable(msg) \ +do { \ + av_log(NULL, AV_LOG_PANIC, \ + "Reached supposedly unreachable code at %s:%d: %s\n", \ + __FILE__, __LINE__, msg); \ + abort(); \ +} while (0) +#define av_assume(cond) av_assert0(cond) +#else +#if AV_GCC_VERSION_AT_LEAST(4, 5) || AV_HAS_BUILTIN(__builtin_unreachable) +#define av_unreachable(msg) __builtin_unreachable() +#elif defined(_MSC_VER) +#define av_unreachable(msg) __assume(0) +#define av_assume(cond) __assume(cond) +#elif __STDC_VERSION__ >= 202311L +#include <stddef.h> +#define av_unreachable(msg) unreachable() +#else +#define av_unreachable(msg) ((void)0) +#endif + +#ifndef av_assume +#define av_assume(cond) do { \ + if (!(cond)) \ + av_unreachable(); \ +} while (0) +#endif +#endif + #endif /* AVUTIL_AVASSERT_H */ diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index 978d7e29d3b37..ce485a85a2176 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -79,6 +79,7 @@ typedef struct VulkanDeviceFeatures { VkPhysicalDeviceVulkan12Features vulkan_1_2; VkPhysicalDeviceVulkan13Features vulkan_1_3; VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore; + VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR subgroup_rotate; #ifdef VK_KHR_shader_expect_assume VkPhysicalDeviceShaderExpectAssumeFeaturesKHR expect_assume; @@ -205,6 +206,8 @@ static void device_features_init(AVHWDeviceContext *ctx, VulkanDeviceFeatures *f FF_VK_STRUCT_EXT(s, &feats->device, &feats->timeline_semaphore, FF_VK_EXT_PORTABILITY_SUBSET, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES); + FF_VK_STRUCT_EXT(s, &feats->device, &feats->subgroup_rotate, FF_VK_EXT_SUBGROUP_ROTATE, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_ROTATE_FEATURES_KHR); #ifdef VK_KHR_shader_expect_assume FF_VK_STRUCT_EXT(s, &feats->device, &feats->expect_assume, FF_VK_EXT_EXPECT_ASSUME, @@ -283,6 +286,7 @@ static void device_features_copy_needed(VulkanDeviceFeatures *dst, VulkanDeviceF COPY_VAL(vulkan_1_3.dynamicRendering); COPY_VAL(timeline_semaphore.timelineSemaphore); + COPY_VAL(subgroup_rotate.shaderSubgroupRotate); COPY_VAL(video_maintenance_1.videoMaintenance1); #ifdef VK_KHR_video_maintenance2 @@ -406,6 +410,23 @@ static const struct FFVkFormatEntry { { VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16, AV_PIX_FMT_Y212, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, 1, { VK_FORMAT_R16G16B16A16_UNORM } }, { VK_FORMAT_G16B16G16R16_422_UNORM, AV_PIX_FMT_Y216, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, 1, { VK_FORMAT_R16G16B16A16_UNORM } }, + /* Planar YUVA 420 at 8, 10 and 16 bits */ + { VK_FORMAT_R8_UNORM, AV_PIX_FMT_YUVA420P, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } }, +
{ VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA420P10, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA420P16, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + + /* Planar YUVA 422 at 8, 10, 12 and 16 bits */ + { VK_FORMAT_R8_UNORM, AV_PIX_FMT_YUVA422P, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA422P10, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA422P12, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA422P16, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + + /* Planar YUVA 444 at 8, 10, 12 and 16 bits */ + { VK_FORMAT_R8_UNORM, AV_PIX_FMT_YUVA444P, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA444P10, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA444P12, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + { VK_FORMAT_R16_UNORM, AV_PIX_FMT_YUVA444P16, VK_IMAGE_ASPECT_COLOR_BIT, 4, 4, 4, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } }, + /* Single plane 444 at 8, 10, 12 and 16 bits */ { VK_FORMAT_B8G8R8A8_UNORM, AV_PIX_FMT_UYVA, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, 1, { VK_FORMAT_B8G8R8A8_UNORM } }, { VK_FORMAT_A2R10G10B10_UNORM_PACK32, AV_PIX_FMT_XV30, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, 1, { VK_FORMAT_R16G16B16A16_UNORM } }, @@ -588,6 +609,7 @@ static const VulkanOptExtension optional_device_exts[] = { { VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME, FF_VK_EXT_COOP_MATRIX }, { VK_NV_OPTICAL_FLOW_EXTENSION_NAME, FF_VK_EXT_OPTICAL_FLOW }, { VK_EXT_SHADER_OBJECT_EXTENSION_NAME, FF_VK_EXT_SHADER_OBJECT }, + { VK_KHR_SHADER_SUBGROUP_ROTATE_EXTENSION_NAME, FF_VK_EXT_SUBGROUP_ROTATE }, #ifdef VK_KHR_shader_expect_assume { VK_KHR_SHADER_EXPECT_ASSUME_EXTENSION_NAME, FF_VK_EXT_EXPECT_ASSUME }, #endif @@ -2638,11 +2660,12 @@ static AVBufferRef *vulkan_pool_alloc(void *opaque, size_t size) if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_FD_MEMORY) try_export_flags(hwfc, &eiinfo.handleTypes, &e, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT); -#endif - if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_DMABUF_MEMORY) + if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_DMABUF_MEMORY && + hwctx->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) try_export_flags(hwfc, &eiinfo.handleTypes, &e, VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); +#endif for (int i = 0; i < av_pix_fmt_count_planes(hwfc->sw_format); i++) { eminfo[i].sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; @@ -2779,8 +2802,8 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc) /* Image usage flags */ if (!hwctx->usage) { - hwctx->usage = supported_usage & (VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + hwctx->usage = supported_usage & 
(VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT); diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h index c027fa51c38c2..d58881d09c9ae 100644 --- a/libavutil/mem_internal.h +++ b/libavutil/mem_internal.h @@ -131,4 +131,6 @@ #define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,)) +#define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,)) + #endif /* AVUTIL_MEM_INTERNAL_H */ diff --git a/libavutil/refstruct.c b/libavutil/refstruct.c index ce804f781aee7..7a5eb2482665b 100644 --- a/libavutil/refstruct.c +++ b/libavutil/refstruct.c @@ -45,7 +45,7 @@ #define REFSTRUCT_COOKIE AV_NE((uint64_t)MKBETAG('R', 'e', 'f', 'S') << 32 | MKBETAG('t', 'r', 'u', 'c'), \ MKTAG('R', 'e', 'f', 'S') | (uint64_t)MKTAG('t', 'r', 'u', 'c') << 32) -#if __STDC_VERSION__ >= 201112L && !defined(_MSC_VER) +#ifndef _MSC_VER #define REFCOUNT_OFFSET FFALIGN(sizeof(RefCount), FFMAX(ALIGN_64, _Alignof(max_align_t))) #else #define REFCOUNT_OFFSET FFALIGN(sizeof(RefCount), ALIGN_64) diff --git a/libavutil/version.h b/libavutil/version.h index 4717cd562b1b0..2979f802332a7 100644 --- a/libavutil/version.h +++ b/libavutil/version.h @@ -79,7 +79,7 @@ */ #define LIBAVUTIL_VERSION_MAJOR 60 -#define LIBAVUTIL_VERSION_MINOR 2 +#define LIBAVUTIL_VERSION_MINOR 3 #define LIBAVUTIL_VERSION_MICRO 100 #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c index 2cc8ec110e790..a989e080abd37 100644 --- a/libavutil/vulkan.c +++ b/libavutil/vulkan.c @@ -989,6 +989,16 @@ int ff_vk_create_buf(FFVulkanContext *s, FFVkBuffer *buf, size_t size, int use_ded_mem; FFVulkanFunctions *vk = &s->vkfn; + /* Buffer usage flags corresponding to buffer descriptor types */ + const VkBufferUsageFlags desc_usage = + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; + + if ((s->extensions & FF_VK_EXT_DESCRIPTOR_BUFFER) && (usage & desc_usage)) + usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT; + VkBufferCreateInfo buf_spawn = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = pNext, @@ -1611,7 +1621,10 @@ const char *ff_vk_shader_rep_fmt(enum AVPixelFormat pix_fmt, case AV_PIX_FMT_GBRAP: case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUV422P: - case AV_PIX_FMT_YUV444P: { + case AV_PIX_FMT_YUV444P: + case AV_PIX_FMT_YUVA420P: + case AV_PIX_FMT_YUVA422P: + case AV_PIX_FMT_YUVA444P: { const char *rep_tab[] = { [FF_VK_REP_NATIVE] = "r8ui", [FF_VK_REP_FLOAT] = "r8", @@ -1640,7 +1653,15 @@ const char *ff_vk_shader_rep_fmt(enum AVPixelFormat pix_fmt, case AV_PIX_FMT_YUV422P16: case AV_PIX_FMT_YUV444P10: case AV_PIX_FMT_YUV444P12: - case AV_PIX_FMT_YUV444P16: { + case AV_PIX_FMT_YUV444P16: + case AV_PIX_FMT_YUVA420P10: + case AV_PIX_FMT_YUVA420P16: + case AV_PIX_FMT_YUVA422P10: + case AV_PIX_FMT_YUVA422P12: + case AV_PIX_FMT_YUVA422P16: + case AV_PIX_FMT_YUVA444P10: + case AV_PIX_FMT_YUVA444P12: + case AV_PIX_FMT_YUVA444P16: { const char *rep_tab[] = { [FF_VK_REP_NATIVE] = "r16ui", [FF_VK_REP_FLOAT] = "r16f", diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h index cd61d71577948..8b413013e6564 100644 --- a/libavutil/vulkan_functions.h +++ b/libavutil/vulkan_functions.h @@ -48,6 +48,7 @@ typedef uint64_t FFVulkanExtensions; #define FF_VK_EXT_PUSH_DESCRIPTOR (1ULL << 14) /* VK_KHR_push_descriptor */ #define 
FF_VK_EXT_RELAXED_EXTENDED_INSTR (1ULL << 15) /* VK_KHR_shader_relaxed_extended_instruction */ #define FF_VK_EXT_EXPECT_ASSUME (1ULL << 16) /* VK_KHR_shader_expect_assume */ +#define FF_VK_EXT_SUBGROUP_ROTATE (1ULL << 17) /* VK_KHR_shader_subgroup_rotate */ /* Video extensions */ #define FF_VK_EXT_VIDEO_QUEUE (1ULL << 36) /* VK_KHR_video_queue */ diff --git a/libavutil/vulkan_loader.h b/libavutil/vulkan_loader.h index eaf6e2e6bb86d..a7976fe5606d2 100644 --- a/libavutil/vulkan_loader.h +++ b/libavutil/vulkan_loader.h @@ -58,6 +58,7 @@ static inline uint64_t ff_vk_extensions_to_mask(const char * const *extensions, { VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME, FF_VK_EXT_COOP_MATRIX }, { VK_NV_OPTICAL_FLOW_EXTENSION_NAME, FF_VK_EXT_OPTICAL_FLOW }, { VK_EXT_SHADER_OBJECT_EXTENSION_NAME, FF_VK_EXT_SHADER_OBJECT }, + { VK_KHR_SHADER_SUBGROUP_ROTATE_EXTENSION_NAME, FF_VK_EXT_SUBGROUP_ROTATE }, { VK_KHR_VIDEO_MAINTENANCE_1_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_1 }, #ifdef VK_KHR_video_maintenance2 { VK_KHR_VIDEO_MAINTENANCE_2_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_2 }, diff --git a/libswscale/format.c b/libswscale/format.c index b859af7b043b6..e4c1348b9034a 100644 --- a/libswscale/format.c +++ b/libswscale/format.c @@ -483,7 +483,7 @@ static int infer_trc_ref(SwsColor *csp, const SwsColor *ref) return 1; } -int ff_infer_colors(SwsColor *src, SwsColor *dst) +bool ff_infer_colors(SwsColor *src, SwsColor *dst) { int incomplete = 0; diff --git a/libswscale/format.h b/libswscale/format.h index 11b4345f7c021..3b6d745159c80 100644 --- a/libswscale/format.h +++ b/libswscale/format.h @@ -21,6 +21,8 @@ #ifndef SWSCALE_FORMAT_H #define SWSCALE_FORMAT_H +#include + #include "libavutil/csp.h" #include "libavutil/pixdesc.h" @@ -129,7 +131,7 @@ static inline int ff_fmt_align(enum AVPixelFormat fmt) int ff_test_fmt(const SwsFormat *fmt, int output); -/* Returns 1 if the formats are incomplete, 0 otherwise */ -int ff_infer_colors(SwsColor *src, SwsColor *dst); +/* Returns true if the formats are incomplete, false otherwise */ +bool ff_infer_colors(SwsColor *src, SwsColor *dst); #endif /* SWSCALE_FORMAT_H */ diff --git a/libswscale/graph.c b/libswscale/graph.c index cd56f51f88c91..dc7784aa499ad 100644 --- a/libswscale/graph.c +++ b/libswscale/graph.c @@ -44,10 +44,9 @@ static int pass_alloc_output(SwsPass *pass) pass->num_slices * pass->slice_h, pass->format, 64); } -/* slice_align should be a power of two, or 0 to disable slice threading */ -static SwsPass *pass_add(SwsGraph *graph, void *priv, enum AVPixelFormat fmt, - int w, int h, SwsPass *input, int slice_align, - sws_filter_run_t run) +SwsPass *ff_sws_graph_add_pass(SwsGraph *graph, enum AVPixelFormat fmt, + int width, int height, SwsPass *input, + int align, void *priv, sws_filter_run_t run) { int ret; SwsPass *pass = av_mallocz(sizeof(*pass)); @@ -58,8 +57,8 @@ static SwsPass *pass_add(SwsGraph *graph, void *priv, enum AVPixelFormat fmt, pass->run = run; pass->priv = priv; pass->format = fmt; - pass->width = w; - pass->height = h; + pass->width = width; + pass->height = height; pass->input = input; pass->output.fmt = AV_PIX_FMT_NONE; @@ -69,12 +68,12 @@ static SwsPass *pass_add(SwsGraph *graph, void *priv, enum AVPixelFormat fmt, return NULL; } - if (!slice_align) { + if (!align) { pass->slice_h = pass->height; pass->num_slices = 1; } else { pass->slice_h = (pass->height + graph->num_threads - 1) / graph->num_threads; - pass->slice_h = FFALIGN(pass->slice_h, slice_align); + pass->slice_h = FFALIGN(pass->slice_h, align); pass->num_slices = 
(pass->height + pass->slice_h - 1) / pass->slice_h; } @@ -84,41 +83,27 @@ static SwsPass *pass_add(SwsGraph *graph, void *priv, enum AVPixelFormat fmt, return pass; } -/* Wrapper around pass_add that chains a pass "in-place" */ -static int pass_append(SwsGraph *graph, void *priv, enum AVPixelFormat fmt, - int w, int h, SwsPass **pass, int slice_align, - sws_filter_run_t run) +/* Wrapper around ff_sws_graph_add_pass() that chains a pass "in-place" */ +static int pass_append(SwsGraph *graph, enum AVPixelFormat fmt, int w, int h, + SwsPass **pass, int align, void *priv, sws_filter_run_t run) { - SwsPass *new = pass_add(graph, priv, fmt, w, h, *pass, slice_align, run); + SwsPass *new = ff_sws_graph_add_pass(graph, fmt, w, h, *pass, align, priv, run); if (!new) return AVERROR(ENOMEM); *pass = new; return 0; } -static int vshift(enum AVPixelFormat fmt, int plane) -{ - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); - return (plane == 1 || plane == 2) ? desc->log2_chroma_h : 0; -} - -/* Shift an image vertically by y lines */ -static SwsImg shift_img(const SwsImg *img_base, int y) -{ - SwsImg img = *img_base; - for (int i = 0; i < 4 && img.data[i]; i++) - img.data[i] += (y >> vshift(img.fmt, i)) * img.linesize[i]; - return img; -} - static void run_copy(const SwsImg *out_base, const SwsImg *in_base, int y, int h, const SwsPass *pass) { - SwsImg in = shift_img(in_base, y); - SwsImg out = shift_img(out_base, y); + SwsImg in = ff_sws_img_shift(in_base, y); + SwsImg out = ff_sws_img_shift(out_base, y); + + for (int i = 0; i < FF_ARRAY_ELEMS(out.data) && out.data[i]; i++) { + const int lines = h >> ff_fmt_vshift(in.fmt, i); + av_assert1(in.data[i]); - for (int i = 0; i < FF_ARRAY_ELEMS(in.data) && in.data[i]; i++) { - const int lines = h >> vshift(in.fmt, i); if (in.linesize[i] == out.linesize[i]) { memcpy(out.data[i], in.data[i], lines * out.linesize[i]); } else { @@ -219,7 +204,7 @@ static void run_legacy_unscaled(const SwsImg *out, const SwsImg *in_base, { SwsContext *sws = slice_ctx(pass, y); SwsInternal *c = sws_internal(sws); - const SwsImg in = shift_img(in_base, y); + const SwsImg in = ff_sws_img_shift(in_base, y); c->convert_unscaled(c, (const uint8_t *const *) in.data, in.linesize, y, h, out->data, out->linesize); @@ -230,7 +215,7 @@ static void run_legacy_swscale(const SwsImg *out_base, const SwsImg *in, { SwsContext *sws = slice_ctx(pass, y); SwsInternal *c = sws_internal(sws); - const SwsImg out = shift_img(out_base, y); + const SwsImg out = ff_sws_img_shift(out_base, y); ff_swscale(c, (const uint8_t *const *) in->data, in->linesize, 0, sws->src_h, out.data, out.linesize, y, h); @@ -325,19 +310,19 @@ static int init_legacy_subpass(SwsGraph *graph, SwsContext *sws, align = 0; /* disable slice threading */ if (c->src0Alpha && !c->dst0Alpha && isALPHA(sws->dst_format)) { - ret = pass_append(graph, c, AV_PIX_FMT_RGBA, src_w, src_h, &input, 1, run_rgb0); + ret = pass_append(graph, AV_PIX_FMT_RGBA, src_w, src_h, &input, 1, c, run_rgb0); if (ret < 0) return ret; } if (c->srcXYZ && !(c->dstXYZ && unscaled)) { - ret = pass_append(graph, c, AV_PIX_FMT_RGB48, src_w, src_h, &input, 1, run_xyz2rgb); + ret = pass_append(graph, AV_PIX_FMT_RGB48, src_w, src_h, &input, 1, c, run_xyz2rgb); if (ret < 0) return ret; } - pass = pass_add(graph, sws, sws->dst_format, dst_w, dst_h, input, align, - c->convert_unscaled ? run_legacy_unscaled : run_legacy_swscale); + pass = ff_sws_graph_add_pass(graph, sws->dst_format, dst_w, dst_h, input, align, sws, + c->convert_unscaled ? 
run_legacy_unscaled : run_legacy_swscale); if (!pass) return AVERROR(ENOMEM); pass->setup = setup_legacy_swscale; @@ -387,7 +372,7 @@ static int init_legacy_subpass(SwsGraph *graph, SwsContext *sws, } if (c->dstXYZ && !(c->srcXYZ && unscaled)) { - ret = pass_append(graph, c, AV_PIX_FMT_RGB48, dst_w, dst_h, &pass, 1, run_rgb2xyz); + ret = pass_append(graph, AV_PIX_FMT_RGB48, dst_w, dst_h, &pass, 1, c, run_rgb2xyz); if (ret < 0) return ret; } @@ -490,8 +475,8 @@ static void run_lut3d(const SwsImg *out_base, const SwsImg *in_base, int y, int h, const SwsPass *pass) { SwsLut3D *lut = pass->priv; - const SwsImg in = shift_img(in_base, y); - const SwsImg out = shift_img(out_base, y); + const SwsImg in = ff_sws_img_shift(in_base, y); + const SwsImg out = ff_sws_img_shift(out_base, y); ff_sws_lut3d_apply(lut, in.data[0], in.linesize[0], out.data[0], out.linesize[0], pass->width, h); @@ -548,8 +533,8 @@ static int adapt_colors(SwsGraph *graph, SwsFormat src, SwsFormat dst, return ret; } - pass = pass_add(graph, lut, fmt_out, src.width, src.height, - input, 1, run_lut3d); + pass = ff_sws_graph_add_pass(graph, fmt_out, src.width, src.height, + input, 1, lut, run_lut3d); if (!pass) { ff_sws_lut3d_free(&lut); return AVERROR(ENOMEM); @@ -589,7 +574,8 @@ static int init_passes(SwsGraph *graph) graph->noop = 1; /* Add threaded memcpy pass */ - pass = pass_add(graph, NULL, dst.format, dst.width, dst.height, pass, 1, run_copy); + pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, + pass, 1, NULL, run_copy); if (!pass) return AVERROR(ENOMEM); } diff --git a/libswscale/graph.h b/libswscale/graph.h index b42d54be04a78..0630b31ce639e 100644 --- a/libswscale/graph.h +++ b/libswscale/graph.h @@ -21,6 +21,8 @@ #ifndef SWSCALE_GRAPH_H #define SWSCALE_GRAPH_H +#include <stdbool.h> + #include "libavutil/slicethread.h" #include "swscale.h" #include "format.h" @@ -34,6 +36,20 @@ typedef struct SwsImg { int linesize[4]; } SwsImg; +static av_always_inline av_const int ff_fmt_vshift(enum AVPixelFormat fmt, int plane) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + return (plane == 1 || plane == 2) ? desc->log2_chroma_h : 0; +} + +static av_const inline SwsImg ff_sws_img_shift(const SwsImg *base, const int y) +{ + SwsImg img = *base; + for (int i = 0; i < 4 && img.data[i]; i++) + img.data[i] += (y >> ff_fmt_vshift(img.fmt, i)) * img.linesize[i]; + return img; +} + typedef struct SwsPass SwsPass; typedef struct SwsGraph SwsGraph; @@ -95,8 +111,8 @@ typedef struct SwsGraph { SwsContext *ctx; AVSliceThread *slicethread; int num_threads; /* resolved at init() time */ - int incomplete; /* set during init() if formats had to be inferred */ - int noop; /* set during init() if the graph is a no-op */ + bool incomplete; /* set during init() if formats had to be inferred */ + bool noop; /* set during init() if the graph is a no-op */ /** Sorted sequence of filter passes to apply */ SwsPass **passes; @@ -128,6 +144,24 @@ typedef struct SwsGraph { int ff_sws_graph_create(SwsContext *ctx, const SwsFormat *dst, const SwsFormat *src, int field, SwsGraph **out_graph); + +/** + * Allocate and add a new pass to the filter graph. + * + * @param graph Filter graph to add the pass to. + * @param fmt Pixel format of the output image. + * @param width Width of the output image. + * @param height Height of the output image. + * @param input Previous pass to read from, or NULL for the input image. + * @param align Minimum slice alignment for this pass, or 0 for no threading.
+ * @param priv Private state for the filter run function. + * @param run Filter function to run. + * @return The newly created pass, or NULL on error. + */ +SwsPass *ff_sws_graph_add_pass(SwsGraph *graph, enum AVPixelFormat fmt, + int width, int height, SwsPass *input, + int align, void *priv, sws_filter_run_t run); + /** * Uninitialize any state associate with this filter graph and free it. */ diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index b84120549eb0a..61073c6c0a45c 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -699,7 +699,7 @@ static void packed16togbra16(const uint8_t *src, int srcStride, dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift); dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift); dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift); - dst[3][x] = 0xFFFF; + dst[3][x] = av_bswap16(0xFFFF >> shift); } } else if (src_alpha) { for (x = 0; x < width; x++) { @@ -729,7 +729,7 @@ static void packed16togbra16(const uint8_t *src, int srcStride, dst[0][x] = av_bswap16(*src_line++ >> shift); dst[1][x] = av_bswap16(*src_line++ >> shift); dst[2][x] = av_bswap16(*src_line++ >> shift); - dst[3][x] = 0xFFFF; + dst[3][x] = av_bswap16(0xFFFF >> shift); } } else if (src_alpha) { for (x = 0; x < width; x++) { @@ -759,7 +759,7 @@ static void packed16togbra16(const uint8_t *src, int srcStride, dst[0][x] = av_bswap16(*src_line++) >> shift; dst[1][x] = av_bswap16(*src_line++) >> shift; dst[2][x] = av_bswap16(*src_line++) >> shift; - dst[3][x] = 0xFFFF; + dst[3][x] = 0xFFFF >> shift; } } else if (src_alpha) { for (x = 0; x < width; x++) { @@ -789,7 +789,7 @@ static void packed16togbra16(const uint8_t *src, int srcStride, dst[0][x] = *src_line++ >> shift; dst[1][x] = *src_line++ >> shift; dst[2][x] = *src_line++ >> shift; - dst[3][x] = 0xFFFF; + dst[3][x] = 0xFFFF >> shift; } } else if (src_alpha) { for (x = 0; x < width; x++) { @@ -818,6 +818,7 @@ static void packed30togbra10(const uint8_t *src, int srcStride, int x, h, i; int dst_alpha = dst[3] != NULL; int scale_high = bpc - 10, scale_low = 10 - scale_high; + uint16_t alpha_val = (1U << bpc) - 1; for (h = 0; h < srcSliceH; h++) { uint32_t *src_line = (uint32_t *)(src + srcStride * h); unsigned component; @@ -834,7 +835,7 @@ static void packed30togbra10(const uint8_t *src, int srcStride, dst[1][x] = av_bswap16(component << scale_high | component >> scale_low); component = p & 0x3FF; dst[2][x] = av_bswap16(component << scale_high | component >> scale_low); - dst[3][x] = 0xFFFF; + dst[3][x] = av_bswap16(alpha_val); src_line++; } } else { @@ -860,7 +861,7 @@ static void packed30togbra10(const uint8_t *src, int srcStride, dst[1][x] = component << scale_high | component >> scale_low; component = p & 0x3FF; dst[2][x] = component << scale_high | component >> scale_low; - dst[3][x] = 0xFFFF; + dst[3][x] = alpha_val; src_line++; } } else { @@ -1377,8 +1378,15 @@ static int planarRgbToplanarRgbWrapper(SwsInternal *c, dst[1], dstStride[1]); ff_copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->opts.src_w, dst[2], dstStride[2]); - if (dst[3]) - fillPlane(dst[3], dstStride[3], c->opts.src_w, srcSliceH, srcSliceY, 255); + if (dst[3]) { + if (is16BPS(c->opts.dst_format) || isNBPS(c->opts.dst_format)) { + const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->opts.dst_format); + fillPlane16(dst[3], dstStride[3], c->opts.src_w, srcSliceH, srcSliceY, 1, + desc_dst->comp[3].depth, isBE(c->opts.dst_format)); + } else { + fillPlane(dst[3], dstStride[3], c->opts.src_w, 
srcSliceH, srcSliceY, 255); + } + } return srcSliceH; } @@ -2221,7 +2229,7 @@ static int planarCopyWrapper(SwsInternal *c, const uint8_t *const src[], // ignore palette for GRAY8 if (plane == 1 && desc_dst->nb_components < 3) continue; - if (!src[plane] || (plane == 1 && desc_src->nb_components < 3)) { + if (!src[plane] || (plane == 1 && desc_src->nb_components < 3) || (plane == 3 && desc_src->nb_components <= 3)) { if (is16BPS(c->opts.dst_format) || isNBPS(c->opts.dst_format)) { fillPlane16(dst[plane], dstStride[plane], length, height, y, plane == 3, desc_dst->comp[plane].depth, diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c index 70810581305db..0f1f8311c9f78 100644 --- a/libswscale/tests/swscale.c +++ b/libswscale/tests/swscale.c @@ -79,11 +79,12 @@ static int speedup_count; static const char *speedup_color(double ratio) { - return ratio > 1.10 ? "\033[1;32m" : /* bold green */ - ratio > 1.02 ? "\033[32m" : /* green */ - ratio > 0.98 ? "" : /* default */ - ratio > 0.95 ? "\033[33m" : /* yellow */ - ratio > 0.90 ? "\033[31m" : /* red */ + return ratio > 10.00 ? "\033[1;94m" : /* bold blue */ + ratio > 2.00 ? "\033[1;32m" : /* bold green */ + ratio > 1.02 ? "\033[32m" : /* green */ + ratio > 0.98 ? "" : /* default */ + ratio > 0.90 ? "\033[33m" : /* yellow */ + ratio > 0.75 ? "\033[31m" : /* red */ "\033[1;31m"; /* bold red */ } diff --git a/libswscale/utils.c b/libswscale/utils.c index f659e22fdc661..94a47ea5d0fcd 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -692,13 +692,35 @@ static void fill_rgb2yuv_table(SwsInternal *c, const int table[4], int dstRange) AV_WL16(p + 16*4 + 2*i, map[i] >= 0 ? c->input_rgb2yuv_table[map[i]] : 0); } -static int fill_xyztables(SwsInternal *c) +#if CONFIG_SMALL +static void init_xyz_tables(uint16_t xyzgamma_tab[4096], uint16_t xyzgammainv_tab[65536], + uint16_t rgbgamma_tab[65536], uint16_t rgbgammainv_tab[4096]) +#else +static uint16_t xyzgamma_tab[4096], rgbgammainv_tab[4096]; +static uint16_t rgbgamma_tab[65536], xyzgammainv_tab[65536]; +static av_cold void init_xyz_tables(void) +#endif { - int i; - double xyzgamma = XYZ_GAMMA; - double rgbgamma = 1.0 / RGB_GAMMA; + double xyzgamma = XYZ_GAMMA; + double rgbgamma = 1.0 / RGB_GAMMA; double xyzgammainv = 1.0 / XYZ_GAMMA; double rgbgammainv = RGB_GAMMA; + + /* set input gamma vectors */ + for (int i = 0; i < 4096; i++) { + xyzgamma_tab[i] = lrint(pow(i / 4095.0, xyzgamma) * 65535.0); + rgbgammainv_tab[i] = lrint(pow(i / 4095.0, rgbgammainv) * 65535.0); + } + + /* set output gamma vectors */ + for (int i = 0; i < 65536; i++) { + rgbgamma_tab[i] = lrint(pow(i / 65535.0, rgbgamma) * 4095.0); + xyzgammainv_tab[i] = lrint(pow(i / 65535.0, xyzgammainv) * 4095.0); + } +} + +static int fill_xyztables(SwsInternal *c) +{ static const int16_t xyz2rgb_matrix[3][4] = { {13270, -6295, -2041}, {-3969, 7682, 170}, @@ -707,10 +729,7 @@ static int fill_xyztables(SwsInternal *c) {1689, 1464, 739}, { 871, 2929, 296}, { 79, 488, 3891} }; -#if !CONFIG_SMALL - static uint16_t xyzgamma_tab[4096], rgbgammainv_tab[4096]; - static uint16_t rgbgamma_tab[65536], xyzgammainv_tab[65536]; -#endif + if (c->xyzgamma) return 0; @@ -724,26 +743,16 @@ static int fill_xyztables(SwsInternal *c) c->rgbgammainv = c->xyzgamma + 4096; c->rgbgamma = c->rgbgammainv + 4096; c->xyzgammainv = c->rgbgamma + 65536; + init_xyz_tables(c->xyzgamma, c->xyzgammainv, c->rgbgamma, c->rgbgammainv); #else c->xyzgamma = xyzgamma_tab; c->rgbgamma = rgbgamma_tab; c->xyzgammainv = xyzgammainv_tab; c->rgbgammainv = rgbgammainv_tab; - 
if (xyzgamma_tab[4095]) - return 0; -#endif - /* set input gamma vectors */ - for (i = 0; i < 4096; i++) { - c->xyzgamma[i] = lrint(pow(i / 4095.0, xyzgamma) * 65535.0); - c->rgbgammainv[i] = lrint(pow(i / 4095.0, rgbgammainv) * 65535.0); - } - - /* set output gamma vectors */ - for (i = 0; i < 65536; i++) { - c->rgbgamma[i] = lrint(pow(i / 65535.0, rgbgamma) * 4095.0); - c->xyzgammainv[i] = lrint(pow(i / 65535.0, xyzgammainv) * 4095.0); - } + static AVOnce xyz_init_static_once = AV_ONCE_INIT; + ff_thread_once(&xyz_init_static_once, init_xyz_tables); +#endif return 0; } diff --git a/tests/checkasm/pixblockdsp.c b/tests/checkasm/pixblockdsp.c index 26a697a3468c2..79763de1ea435 100644 --- a/tests/checkasm/pixblockdsp.c +++ b/tests/checkasm/pixblockdsp.c @@ -90,11 +90,8 @@ void checkasm_check_pixblockdsp(void) uint16_t *dst0 = (uint16_t *)dst0_; uint16_t *dst1 = (uint16_t *)dst1_; PixblockDSPContext h; - AVCodecContext avctx = { - .bits_per_raw_sample = 8, - }; - ff_pixblockdsp_init(&h, &avctx); + ff_pixblockdsp_init(&h, 8); if (check_func(h.get_pixels, "get_pixels")) check_get_pixels(uint8_t, 1); diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c index cecd0dee0fa29..bddc9a79fc59b 100644 --- a/tests/checkasm/vp9dsp.c +++ b/tests/checkasm/vp9dsp.c @@ -310,13 +310,13 @@ static int is_zero(const int16_t *c, int sz) static void check_itxfm(void) { - LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]); - LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]); - LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]); - LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]); - LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]); - LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]); - LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]); + LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]); + LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]); + LOCAL_ALIGNED_64(uint8_t, dst0, [32 * 32 * 2]); + LOCAL_ALIGNED_64(uint8_t, dst1, [32 * 32 * 2]); + LOCAL_ALIGNED_64(int16_t, coef, [32 * 32 * 2]); + LOCAL_ALIGNED_64(int16_t, subcoef0, [32 * 32 * 2]); + LOCAL_ALIGNED_64(int16_t, subcoef1, [32 * 32 * 2]); declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); VP9DSPContext dsp; int y, x, tx, txtp, bit_depth, sub; diff --git a/tests/fate/ac3.mak b/tests/fate/ac3.mak index 1ecb5a3f54b52..e52678a2fdc12 100644 --- a/tests/fate/ac3.mak +++ b/tests/fate/ac3.mak @@ -91,6 +91,13 @@ fate-ac3-fixed-encode: CMD = md5 -i $(SRC) -c ac3_fixed -ab 128k -f ac3 -flags + fate-ac3-fixed-encode: CMP = oneline fate-ac3-fixed-encode: REF = e9d78bca187b4bbafc4512bcea8efd3e +# This tests that the LFE does not get lost when converting the input 7.1 +# to a channel layout supported by the encoder. 
+FATE_AC3-$(call FRAMECRC, WAV, PCM_S16LE, ARESAMPLE_FILTER AC3_FIXED_ENCODER) += fate-ac3-fixed-encode-2 +fate-ac3-fixed-encode-2: tests/data/asynth-44100-8.wav +fate-ac3-fixed-encode-2: SRC = $(TARGET_PATH)/tests/data/asynth-44100-8.wav +fate-ac3-fixed-encode-2: CMD = framecrc -i $(SRC) -c:a ac3_fixed -ab 256k -frames:a 6 -af aresample + FATE_EAC3-$(call ALLYES, EAC3_DEMUXER EAC3_MUXER EAC3_CORE_BSF) += fate-eac3-core-bsf fate-eac3-core-bsf: CMD = md5pipe -i $(TARGET_SAMPLES)/eac3/the_great_wall_7.1.eac3 -c:a copy -bsf:a eac3_core -fflags +bitexact -f eac3 fate-eac3-core-bsf: CMP = oneline diff --git a/tests/fate/cbs.mak b/tests/fate/cbs.mak index 32207e2ee223c..138dab67a9d15 100644 --- a/tests/fate/cbs.mak +++ b/tests/fate/cbs.mak @@ -172,6 +172,11 @@ $(foreach N,$(FATE_CBS_DISCARD_TYPES),$(eval $(call FATE_CBS_DISCARD_TEST,hevc,$ FATE_CBS_HEVC-$(call ALLYES, HEVC_DEMUXER HEVC_MUXER HEVC_PARSER FILTER_UNITS_BSF HEVC_METADATA_BSF FILE_PROTOCOL) += $(FATE_CBS_hevc_DISCARD) +fate-cbs-hevc-metadata-set-color: CMD = md5 -i $(TARGET_SAMPLES)/hevc-conformance/AMP_A_Samsung_4.bit -c:v copy -bsf:v hevc_metadata=colour_primaries=0:transfer_characteristics=0:matrix_coefficients=3 -f hevc +fate-cbs-hevc-metadata-set-color: CMP = oneline +fate-cbs-hevc-metadata-set-color: REF = d073124fca9e30a46c173292f948967c +FATE_CBS_HEVC-$(call ALLYES, HEVC_DEMUXER, HEVC_METADATA_BSF, HEVC_MUXER) += fate-cbs-hevc-metadata-set-color + FATE_SAMPLES_AVCONV += $(FATE_CBS_HEVC-yes) fate-cbs-hevc: $(FATE_CBS_HEVC-yes) diff --git a/tests/fate/hevc.mak b/tests/fate/hevc.mak index e432345ef73be..8113c04300c26 100644 --- a/tests/fate/hevc.mak +++ b/tests/fate/hevc.mak @@ -292,7 +292,10 @@ fate-hevc-mv-position: CMD = framecrc -i $(TARGET_SAMPLES)/hevc/multiview.mov -m FATE_HEVC-$(call FRAMECRC, MOV, HEVC) += fate-hevc-mv-position fate-hevc-alpha: CMD = framecrc -i $(TARGET_SAMPLES)/hevc/alpha.mp4 -FATE_HEVC-$(call FRAMECRC, HEVC, HEVC) += fate-hevc-alpha +FATE_HEVC-$(call FRAMECRC, MOV, HEVC) += fate-hevc-alpha + +fate-hevc-color-reserved: CMD = framecrc -bsf:v hevc_metadata=colour_primaries=0:transfer_characteristics=0:matrix_coefficients=3 -i $(TARGET_SAMPLES)/hevc-conformance/AMP_A_Samsung_4.bit -vf scale,format=nv12 -frames:v 1 +FATE_HEVC-$(call FRAMECRC, HEVC, HEVC, HEVC_METADATA_BSF SCALE_FILTER) += fate-hevc-color-reserved FATE_SAMPLES_AVCONV += $(FATE_HEVC-yes) FATE_SAMPLES_FFPROBE += $(FATE_HEVC_FFPROBE-yes) diff --git a/tests/fate/matroska.mak b/tests/fate/matroska.mak index 563d7564852a9..b00d19942d55e 100644 --- a/tests/fate/matroska.mak +++ b/tests/fate/matroska.mak @@ -100,6 +100,18 @@ fate-matroska-non-rotation-displaymatrix: CMD = transcode mov $(TARGET_SAMPLES)/ "-c copy" \ "-show_entries stream_side_data_list" +# This test tests container cropping. The expected output is that +# only the copied streams have cropping (and displaymatrix) side data +# and that stream #1 (for which applying cropping was not disabled) +# and the reencoded stream #2 decode to the same. 
+FATE_MATROSKA_FFMPEG_FFPROBE-$(call TRANSCODE, UTVIDEO, MATROSKA, MOV_DEMUXER HEVC_DECODER) \ + += fate-matroska-crop +fate-matroska-crop: CMD = transcode mov $(TARGET_SAMPLES)/heif-conformance/MIAF007.heic matroska \ + "-map 0:0 -map 0:0 -map 0:0 -c:0 copy -c:1 copy -c:2 utvideo" \ + "-map 0" \ + "-show_entries stream=index,codec_name,width,height:stream_side_data_list" "" \ + "-apply_cropping:0 none" + # This tests DOVI (reading from MP4 and Matroska and writing to Matroska) # as well as writing the Cues at the front (by shifting data) if # the initially reserved amount of space turns out to be insufficient. diff --git a/tests/fate/mov.mak b/tests/fate/mov.mak index f7e5e522178a7..b966249dc0738 100644 --- a/tests/fate/mov.mak +++ b/tests/fate/mov.mak @@ -84,6 +84,14 @@ fate-mov-ibi-elst-starts-b: CMD = framemd5 -flags +bitexact -i $(TARGET_SAMPLES) # Makes sure that we handle overlapping framgments fate-mov-frag-overlap: CMD = framemd5 -i $(TARGET_SAMPLES)/mov/frag_overlap.mp4 +fate-mov-mp4-frag-flush: CMD = md5 -f lavfi -i color=blue,format=rgb24,trim=duration=0.04 -f lavfi -i anullsrc,aformat=s16,atrim=duration=2 -c:v png -c:a pcm_s16le -movflags +empty_moov+hybrid_fragmented -frag_duration 1000000 -frag_interleave 1 -f mp4 +fate-mov-mp4-frag-flush: CMP = oneline +fate-mov-mp4-frag-flush: REF = a10c0e2e2dfc120f31ca5e59e0e4392a +FATE_MOV_FFMPEG-$(call ALLYES, LAVFI_INDEV COLOR_FILTER FORMAT_FILTER TRIM_FILTER \ + ANULLSRC_FILTER AFORMAT_FILTER ATRIM_FILTER \ + WRAPPED_AVFRAME_DECODER PCM_S16LE_DECODER PCM_S16BE_DECODER \ + PNG_ENCODER PCM_S16LE_ENCODER MP4_MUXER) += fate-mov-mp4-frag-flush + # Makes sure that we pick the right frames according to edit list when there is no keyframe with PTS < edit list start. # For example, when video starts on a B-frame, and edit list starts on that B-frame too. # GOP structure : B B I in presentation order. diff --git a/tests/fate/pixfmt.mak b/tests/fate/pixfmt.mak index 859aeebec0c52..5f8e343fdc810 100644 --- a/tests/fate/pixfmt.mak +++ b/tests/fate/pixfmt.mak @@ -136,9 +136,9 @@ $(FATE_PIXFMT_EXT): REF = $(SRC_PATH)/tests/ref/pixfmt/$(@:fate-pixfmt-%=%) FATE_PIXFMT_16-YUV-$(call ALLYES, SCALE_FILTER YUVTESTSRC_FILTER LAVFI_INDEV) += $(PIXFMT_16_LIST) FATE_PIXFMT_16-RGB-$(call ALLYES, SCALE_FILTER RGBTESTSRC_FILTER LAVFI_INDEV) += $(PIXFMT_16_LIST) -FATE_PIXFMT_16-YUV := $(FATE_PIXFMT_16-YUV-yes:%=fate-pixfmt-yuv444p16-%) -FATE_PIXFMT_16-YUV := $(FATE_PIXFMT_16-RGB-yes:%=fate-pixfmt-p416-%) -FATE_PIXFMT_16-RGB := $(FATE_PIXFMT_16-RGB-yes:%=fate-pixfmt-gbrp16-%) +FATE_PIXFMT_16-YUV += $(FATE_PIXFMT_16-YUV-yes:%=fate-pixfmt-yuv444p16-%) +FATE_PIXFMT_16-YUV += $(FATE_PIXFMT_16-YUV-yes:%=fate-pixfmt-p416-%) +FATE_PIXFMT_16-RGB += $(FATE_PIXFMT_16-RGB-yes:%=fate-pixfmt-gbrp16-%) $(FATE_PIXFMT_16-YUV): CMD = pixfmt_conversion_ext "yuv" "le" $(FATE_PIXFMT_16-RGB): CMD = pixfmt_conversion_ext "rgb" "le" diff --git a/tests/fate/qt.mak b/tests/fate/qt.mak index 42e5fd9107688..436da172f1a67 100644 --- a/tests/fate/qt.mak +++ b/tests/fate/qt.mak @@ -57,7 +57,7 @@ fate-svq3-1: CMD = framecrc -i $(TARGET_SAMPLES)/svq3/Vertical400kbit.sorenson3. 
fate-svq3-2: CMD = framecrc -flags +bitexact -ignore_editlist 1 -i $(TARGET_SAMPLES)/svq3/svq3_decoding_regression.mov -an FATE_SVQ3 += fate-svq3-watermark -fate-svq3-watermark: CMD = framecrc -flags +bitexact -i $(TARGET_SAMPLES)/svq3/svq3_watermark.mov +fate-svq3-watermark: CMD = framecrc -flags +bitexact -i $(TARGET_SAMPLES)/svq3/svq3_watermark.mov -fps_mode passthrough FATE_QT-$(call FRAMECRC, MOV, SVQ3, ZLIB) += $(FATE_SVQ3) fate-svq3: $(FATE_SVQ3) diff --git a/tests/ref/fate/ac3-fixed-encode-2 b/tests/ref/fate/ac3-fixed-encode-2 new file mode 100644 index 0000000000000..8e945b6637945 --- /dev/null +++ b/tests/ref/fate/ac3-fixed-encode-2 @@ -0,0 +1,13 @@ +#tb 0: 1/44100 +#media_type 0: audio +#codec_id 0: ac3 +#sample_rate 0: 44100 +#channel_layout_name 0: 5.1(side) +0, -256, -256, 1536, 1114, 0x32fd276c +0, 1280, 1280, 1536, 1116, 0x1ac63ba7 +0, 2816, 2816, 1536, 1114, 0xdde82dbc +0, 4352, 4352, 1536, 1114, 0x39313179 +0, 5888, 5888, 1536, 1116, 0x166214e2 +0, 7424, 7424, 1536, 1114, 0xfbcc27ad +0, 8960, 8960, 1536, 1114, 0xe7ed3321 +0, 10496, 10496, 1536, 1114, 0xa1823473 diff --git a/tests/ref/fate/dxv3enc-dxt1 b/tests/ref/fate/dxv3enc-dxt1 index 74849a803113c..e09000e1815e8 100644 --- a/tests/ref/fate/dxv3enc-dxt1 +++ b/tests/ref/fate/dxv3enc-dxt1 @@ -3,4 +3,4 @@ #codec_id 0: dxv #dimensions 0: 1920x1080 #sar 0: 1/1 -0, 0, 0, 1, 76521, 0xed387a5e +0, 0, 0, 1, 76190, 0x0e6f0326 diff --git a/tests/ref/fate/hevc-color-reserved b/tests/ref/fate/hevc-color-reserved new file mode 100644 index 0000000000000..cba6397aa855f --- /dev/null +++ b/tests/ref/fate/hevc-color-reserved @@ -0,0 +1,6 @@ +#tb 0: 1/25 +#media_type 0: video +#codec_id 0: rawvideo +#dimensions 0: 2560x1600 +#sar 0: 0/1 +0, 0, 0, 1, 6144000, 0x427b9a00 diff --git a/tests/ref/fate/matroska-crop b/tests/ref/fate/matroska-crop new file mode 100644 index 0000000000000..12a863942c082 --- /dev/null +++ b/tests/ref/fate/matroska-crop @@ -0,0 +1,70 @@ +fc4932f90dfc955b55cfbdbb210fdd16 *tests/data/fate/matroska-crop.matroska +355698 tests/data/fate/matroska-crop.matroska +#tb 0: 1/1 +#media_type 0: video +#codec_id 0: rawvideo +#dimensions 0: 720x1280 +#sar 0: 0/1 +#tb 1: 1/1 +#media_type 1: video +#codec_id 1: rawvideo +#dimensions 1: 360x640 +#sar 1: 0/1 +#tb 2: 1/1 +#media_type 2: video +#codec_id 2: rawvideo +#dimensions 2: 360x640 +#sar 2: 0/1 +0, 0, 0, 1, 1382400, 0xc8267e89 +1, 0, 0, 1, 345600, 0x84b4bdaa +2, 0, 0, 1, 345600, 0x84b4bdaa +[STREAM] +index=0 +codec_name=hevc +width=1280 +height=720 +[SIDE_DATA] +side_data_type=Frame Cropping +crop_top=180 +crop_bottom=180 +crop_left=320 +crop_right=320 +[/SIDE_DATA] +[SIDE_DATA] +side_data_type=Display Matrix +displaymatrix= +00000000: 0 65536 0 +00000001: 65536 0 0 +00000002: 0 0 1073741824 + +rotation=-90 +[/SIDE_DATA] +[/STREAM] +[STREAM] +index=1 +codec_name=hevc +width=1280 +height=720 +[SIDE_DATA] +side_data_type=Frame Cropping +crop_top=180 +crop_bottom=180 +crop_left=320 +crop_right=320 +[/SIDE_DATA] +[SIDE_DATA] +side_data_type=Display Matrix +displaymatrix= +00000000: 0 65536 0 +00000001: 65536 0 0 +00000002: 0 0 1073741824 + +rotation=-90 +[/SIDE_DATA] +[/STREAM] +[STREAM] +index=2 +codec_name=utvideo +width=360 +height=640 +[/STREAM] diff --git a/tests/ref/fate/matroska-mastering-display-metadata b/tests/ref/fate/matroska-mastering-display-metadata index 6a2ff15b1b220..6f10dc57a67fb 100644 --- a/tests/ref/fate/matroska-mastering-display-metadata +++ b/tests/ref/fate/matroska-mastering-display-metadata @@ -1,7 +1,7 @@ -c1e5e2ecf433cf05af8556debc7d4d0b 
*tests/data/fate/matroska-mastering-display-metadata.matroska -1669773 tests/data/fate/matroska-mastering-display-metadata.matroska +bdca53906b34c57192416a0f737b885e *tests/data/fate/matroska-mastering-display-metadata.matroska +1669723 tests/data/fate/matroska-mastering-display-metadata.matroska #extradata 0: 4, 0x040901a3 -#extradata 3: 202, 0xfce96279 +#extradata 3: 201, 0x9a706279 #tb 0: 1/1000 #media_type 0: video #codec_id 0: prores diff --git a/tests/ref/fate/ogg-flac-chained-meta.txt b/tests/ref/fate/ogg-flac-chained-meta.txt index ad20ba935f745..28e22aa29e613 100644 --- a/tests/ref/fate/ogg-flac-chained-meta.txt +++ b/tests/ref/fate/ogg-flac-chained-meta.txt @@ -5,8 +5,6 @@ Stream ID: 0, frame PTS: 0, metadata: N/A Stream ID: 0, packet PTS: 4608, packet DTS: 4608 Stream ID: 0, frame PTS: 4608, metadata: N/A Stream ID: 0, packet PTS: 0, packet DTS: 0 -Stream ID: 0, packet PTS: 0, packet DTS: 0 -Stream ID: 0, packet PTS: 0, packet DTS: 0 Stream ID: 0, frame PTS: 0, metadata: N/A Stream ID: 0, packet PTS: 4608, packet DTS: 4608 Stream ID: 0, frame PTS: 4608, metadata: N/A diff --git a/tests/ref/fate/ogg-opus-chained-meta.txt b/tests/ref/fate/ogg-opus-chained-meta.txt index fc84b8b703fb7..addc41c1eb73c 100644 --- a/tests/ref/fate/ogg-opus-chained-meta.txt +++ b/tests/ref/fate/ogg-opus-chained-meta.txt @@ -13,7 +13,6 @@ Stream ID: 0, frame PTS: 3528, metadata: N/A Stream ID: 0, packet PTS: 4488, packet DTS: 4488 Stream ID: 0, frame PTS: 4488, metadata: N/A Stream ID: 0, packet PTS: -312, packet DTS: -312 -Stream ID: 0, new metadata: encoder=Lavc61.19.100 libopus;Lavc61.19.100 libopus:title=First Stream;Second Stream Stream ID: 0, frame PTS: -312, metadata: N/A Stream ID: 0, packet PTS: 648, packet DTS: 648 Stream ID: 0, frame PTS: 648, metadata: N/A diff --git a/tests/ref/fate/svq3-watermark b/tests/ref/fate/svq3-watermark index f4068c612e85c..95d67e3da4dd6 100644 --- a/tests/ref/fate/svq3-watermark +++ b/tests/ref/fate/svq3-watermark @@ -12,3 +12,4 @@ 0, 7, 7, 1, 102240, 0x342bf32f 0, 8, 8, 1, 102240, 0x7b311bf1 0, 9, 9, 1, 102240, 0xf56e0cd3 +0, 9, 9, 1, 102240, 0xfb95c7d3 diff --git a/tests/ref/fate/ts-demux b/tests/ref/fate/ts-demux index 6a830d0d99fd7..d56cc279379d7 100644 --- a/tests/ref/fate/ts-demux +++ b/tests/ref/fate/ts-demux @@ -24,6 +24,6 @@ packet|codec_type=video|stream_index=0|pts=3912686363|pts_time=43474.292922|dts= packet|codec_type=audio|stream_index=1|pts=3912644825|pts_time=43473.831389|dts=3912644825|dts_time=43473.831389|duration=2880|duration_time=0.032000|size=906|pos=474888|flags=K__|data_hash=CRC32:0893d398 packet|codec_type=audio|stream_index=2|pts=3912645580|pts_time=43473.839778|dts=3912645580|dts_time=43473.839778|duration=2880|duration_time=0.032000|size=354|pos=491808|flags=K__|data_hash=CRC32:f5963fa6 
stream|index=0|codec_name=mpeg2video|profile=4|codec_type=video|codec_tag_string=[2][0][0][0]|codec_tag=0x0002|width=1280|height=720|coded_width=0|coded_height=0|has_b_frames=1|sample_aspect_ratio=1:1|display_aspect_ratio=16:9|pix_fmt=yuv420p|level=4|color_range=tv|color_space=unknown|color_transfer=unknown|color_primaries=unknown|chroma_location=left|field_order=progressive|refs=1|ts_id=32776|ts_packetsize=188|id=0x31|r_frame_rate=60000/1001|avg_frame_rate=60000/1001|time_base=1/90000|start_pts=3912669846|start_time=43474.109400|duration_ts=19519|duration=0.216878|bit_rate=15000000|max_bit_rate=N/A|bits_per_raw_sample=N/A|nb_frames=N/A|nb_read_frames=N/A|nb_read_packets=15|extradata_size=150|extradata_hash=CRC32:53134fa8|disposition:default=0|disposition:dub=0|disposition:original=0|disposition:comment=0|disposition:lyrics=0|disposition:karaoke=0|disposition:forced=0|disposition:hearing_impaired=0|disposition:visual_impaired=0|disposition:clean_effects=0|disposition:attached_pic=0|disposition:timed_thumbnails=0|disposition:non_diegetic=0|disposition:captions=0|disposition:descriptions=0|disposition:metadata=0|disposition:dependent=0|disposition:still_image=0|disposition:multilayer=0|side_datum/cpb_properties:side_data_type=CPB properties|side_datum/cpb_properties:max_bitrate=15000000|side_datum/cpb_properties:min_bitrate=0|side_datum/cpb_properties:avg_bitrate=0|side_datum/cpb_properties:buffer_size=9781248|side_datum/cpb_properties:vbv_delay=-1 -stream|index=1|codec_name=ac3|profile=unknown|codec_type=audio|codec_tag_string=[4][0][0][0]|codec_tag=0x0004|sample_fmt=fltp|sample_rate=48000|channels=6|channel_layout=5.1(side)|bits_per_sample=0|initial_padding=0|dmix_mode=0|ltrt_cmixlev=0.000000|ltrt_surmixlev=0.000000|loro_cmixlev=0.000000|loro_surmixlev=0.000000|ts_id=32776|ts_packetsize=188|id=0x34|r_frame_rate=0/0|avg_frame_rate=0/0|time_base=1/90000|start_pts=3912633305|start_time=43473.703389|duration_ts=14400|duration=0.160000|bit_rate=384000|max_bit_rate=N/A|bits_per_raw_sample=N/A|nb_frames=N/A|nb_read_frames=N/A|nb_read_packets=5|disposition:default=0|disposition:dub=0|disposition:original=0|disposition:comment=0|disposition:lyrics=0|disposition:karaoke=0|disposition:forced=0|disposition:hearing_impaired=0|disposition:visual_impaired=0|disposition:clean_effects=0|disposition:attached_pic=0|disposition:timed_thumbnails=0|disposition:non_diegetic=0|disposition:captions=0|disposition:descriptions=0|disposition:metadata=0|disposition:dependent=0|disposition:still_image=0|disposition:multilayer=0|tag:language=eng 
-stream|index=2|codec_name=ac3|profile=unknown|codec_type=audio|codec_tag_string=[4][0][0][0]|codec_tag=0x0004|sample_fmt=fltp|sample_rate=48000|channels=2|channel_layout=stereo|bits_per_sample=0|initial_padding=0|dmix_mode=0|ltrt_cmixlev=0.000000|ltrt_surmixlev=0.000000|loro_cmixlev=0.000000|loro_surmixlev=0.000000|ts_id=32776|ts_packetsize=188|id=0x35|r_frame_rate=0/0|avg_frame_rate=0/0|time_base=1/90000|start_pts=3912634060|start_time=43473.711778|duration_ts=14400|duration=0.160000|bit_rate=192000|max_bit_rate=N/A|bits_per_raw_sample=N/A|nb_frames=N/A|nb_read_frames=N/A|nb_read_packets=5|disposition:default=0|disposition:dub=0|disposition:original=0|disposition:comment=0|disposition:lyrics=0|disposition:karaoke=0|disposition:forced=0|disposition:hearing_impaired=0|disposition:visual_impaired=0|disposition:clean_effects=0|disposition:attached_pic=0|disposition:timed_thumbnails=0|disposition:non_diegetic=0|disposition:captions=0|disposition:descriptions=0|disposition:metadata=0|disposition:dependent=0|disposition:still_image=0|disposition:multilayer=0|tag:language=es +stream|index=1|codec_name=ac3|profile=unknown|codec_type=audio|codec_tag_string=[6][0][0][0]|codec_tag=0x0006|sample_fmt=fltp|sample_rate=48000|channels=6|channel_layout=5.1(side)|bits_per_sample=0|initial_padding=0|dmix_mode=0|ltrt_cmixlev=0.000000|ltrt_surmixlev=0.000000|loro_cmixlev=0.000000|loro_surmixlev=0.000000|ts_id=32776|ts_packetsize=188|id=0x34|r_frame_rate=0/0|avg_frame_rate=0/0|time_base=1/90000|start_pts=3912633305|start_time=43473.703389|duration_ts=14400|duration=0.160000|bit_rate=384000|max_bit_rate=N/A|bits_per_raw_sample=N/A|nb_frames=N/A|nb_read_frames=N/A|nb_read_packets=5|disposition:default=0|disposition:dub=0|disposition:original=0|disposition:comment=0|disposition:lyrics=0|disposition:karaoke=0|disposition:forced=0|disposition:hearing_impaired=0|disposition:visual_impaired=0|disposition:clean_effects=0|disposition:attached_pic=0|disposition:timed_thumbnails=0|disposition:non_diegetic=0|disposition:captions=0|disposition:descriptions=0|disposition:metadata=0|disposition:dependent=0|disposition:still_image=0|disposition:multilayer=0|tag:language=eng +stream|index=2|codec_name=ac3|profile=unknown|codec_type=audio|codec_tag_string=[6][0][0][0]|codec_tag=0x0006|sample_fmt=fltp|sample_rate=48000|channels=2|channel_layout=stereo|bits_per_sample=0|initial_padding=0|dmix_mode=0|ltrt_cmixlev=0.000000|ltrt_surmixlev=0.000000|loro_cmixlev=0.000000|loro_surmixlev=0.000000|ts_id=32776|ts_packetsize=188|id=0x35|r_frame_rate=0/0|avg_frame_rate=0/0|time_base=1/90000|start_pts=3912634060|start_time=43473.711778|duration_ts=14400|duration=0.160000|bit_rate=192000|max_bit_rate=N/A|bits_per_raw_sample=N/A|nb_frames=N/A|nb_read_frames=N/A|nb_read_packets=5|disposition:default=0|disposition:dub=0|disposition:original=0|disposition:comment=0|disposition:lyrics=0|disposition:karaoke=0|disposition:forced=0|disposition:hearing_impaired=0|disposition:visual_impaired=0|disposition:clean_effects=0|disposition:attached_pic=0|disposition:timed_thumbnails=0|disposition:non_diegetic=0|disposition:captions=0|disposition:descriptions=0|disposition:metadata=0|disposition:dependent=0|disposition:still_image=0|disposition:multilayer=0|tag:language=es format|filename=mp3ac325-4864-small.ts|nb_streams=3|nb_programs=1|nb_stream_groups=0|format_name=mpegts|start_time=43473.703389|duration=0.622889|size=512000|bit_rate=6575810|probe_score=50 diff --git a/tests/ref/vsynth/vsynth3-asv1 b/tests/ref/vsynth/vsynth3-asv1 index 
0abbf787ecf28..af1dc644b0745 100644 --- a/tests/ref/vsynth/vsynth3-asv1 +++ b/tests/ref/vsynth/vsynth3-asv1 @@ -1,4 +1,4 @@ -81eeea0d0e6219b2f381cf2100e9a12f *tests/data/fate/vsynth3-asv1.avi -34704 tests/data/fate/vsynth3-asv1.avi +69ae6df10440e68c53bee4e713851199 *tests/data/fate/vsynth3-asv1.avi +31524 tests/data/fate/vsynth3-asv1.avi 3c8636e22a96267451684f42d7a6f608 *tests/data/fate/vsynth3-asv1.out.rawvideo stddev: 13.16 PSNR: 25.74 MAXDIFF: 112 bytes: 86700/ 86700 diff --git a/tests/ref/vsynth/vsynth3-asv2 b/tests/ref/vsynth/vsynth3-asv2 index 90b8a47f3415f..9fa9822c0bce4 100644 --- a/tests/ref/vsynth/vsynth3-asv2 +++ b/tests/ref/vsynth/vsynth3-asv2 @@ -1,4 +1,4 @@ -8402fb1112fb8119c019154a472b5cd0 *tests/data/fate/vsynth3-asv2.avi -36208 tests/data/fate/vsynth3-asv2.avi +63000eaedeb60bede8baeb090f02881a *tests/data/fate/vsynth3-asv2.avi +33696 tests/data/fate/vsynth3-asv2.avi 5469c0735b7c9279e5e8e3439fc6acab *tests/data/fate/vsynth3-asv2.out.rawvideo stddev: 9.07 PSNR: 28.97 MAXDIFF: 51 bytes: 86700/ 86700
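A few usage sketches for the new pieces in this series follow; none of them are part of the patch itself. For the WHIP muxer registered above (ff_whip_muxer: H.264 video, Opus audio, and the handshake_timeout, pkt_size, authorization, cert_file and key_file options), an invocation looks roughly like the line below; the endpoint URL and Bearer token are placeholders, and the encoder flags are only illustrative, assuming libx264 and libopus are enabled:

    ffmpeg -re -i input.mp4 -c:v libx264 -tune zerolatency -c:a libopus \
        -f whip -authorization "my-bearer-token" "https://example.com/whip/endpoint"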
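The cleanup comment in whip_deinit() above touches a general libavformat rule: avio_alloc_context() never takes ownership of the buffer it is given, and avio_context_free() releases only the AVIOContext. A minimal sketch of that pattern (the helper name is made up):

    #include "libavformat/avio.h"
    #include "libavutil/mem.h"

    static void free_custom_pb(AVIOContext **pb)
    {
        if (!*pb)
            return;
        av_freep(&(*pb)->buffer); /* the buffer handed to avio_alloc_context() */
        avio_context_free(pb);    /* frees the context only, not the buffer */
    }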
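The av_unreachable() and av_assume() macros added to avassert.h above act as optimization hints in release builds and as hard assertions when ASSERT_LEVEL is raised. A minimal sketch of the intended use; both helper functions are hypothetical:

    #include "libavutil/avassert.h"

    /* Caller guarantees block_size is one of 4, 8, 16 or 32. */
    static int block_size_log2(int block_size)
    {
        switch (block_size) {
        case 4:  return 2;
        case 8:  return 3;
        case 16: return 4;
        case 32: return 5;
        default:
            av_unreachable("block_size is a power of two between 4 and 32");
            return 0; /* never reached; keeps the no-op fallback warning-free */
        }
    }

    /* len is known to be positive at every call site. */
    static void double_all(int *v, int len)
    {
        av_assume(len > 0); /* lets the compiler drop the len <= 0 path */
        for (int i = 0; i < len; i++)
            v[i] *= 2;
    }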
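ff_sws_graph_add_pass(), now non-static and documented in graph.h above, combines with the new ff_sws_img_shift() helper to let other libswscale code attach a custom pass to an SwsGraph. A rough sketch, assuming an 8-bit grayscale graph so only one plane has to be produced; run_invert and add_invert_pass are made-up names, while the signatures follow the graph.h/graph.c hunks above:

    #include "libavutil/error.h"
    #include "libswscale/graph.h"

    /* Invert an 8-bit grayscale image, one slice at a time. */
    static void run_invert(const SwsImg *out_base, const SwsImg *in_base,
                           int y, int h, const SwsPass *pass)
    {
        const SwsImg in  = ff_sws_img_shift(in_base, y);
        const SwsImg out = ff_sws_img_shift(out_base, y);

        for (int line = 0; line < h; line++)
            for (int x = 0; x < pass->width; x++)
                out.data[0][line * out.linesize[0] + x] =
                    255 - in.data[0][line * in.linesize[0] + x];
    }

    static int add_invert_pass(SwsGraph *graph, SwsPass *input, int w, int h)
    {
        /* align=1 enables slice threading without extra alignment demands. */
        SwsPass *pass = ff_sws_graph_add_pass(graph, AV_PIX_FMT_GRAY8, w, h,
                                              input, 1, NULL, run_invert);
        return pass ? 0 : AVERROR(ENOMEM);
    }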