PKGBUILDs/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch

ffmpeg: jc-kynesim/test/4.4.1/main
Generated with: git diff-index --binary n4.4.4
Upstream commit (21-Mar-2023):
https://github.com/jc-kynesim/rpi-ffmpeg/commit/06605ea7f20102aa140632007aa07edd6bf86546
diff --git a/CREDITS b/CREDITS
index f1aea93d6b..e29f0b853c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1,6 +1,6 @@
-See the Git history of the project (https://git.ffmpeg.org/ffmpeg) to
+See the Git history of the project (git://source.ffmpeg.org/ffmpeg) to
get the names of people who have contributed to FFmpeg.
To check the log, you can type the command "git log" in the FFmpeg
source directory, or browse the online repository at
-https://git.ffmpeg.org/ffmpeg
+http://source.ffmpeg.org.
diff --git a/Changelog b/Changelog
index 620ca2bf40..a6508cd8ac 100644
--- a/Changelog
+++ b/Changelog
@@ -1,267 +1,6 @@
Entries are sorted chronologically from oldest to youngest within each release,
releases are sorted from youngest to oldest.
-version 4.4.4:
-- avcodec/tests/snowenc: Fix 2nd test
-- avcodec/tests/snowenc: return a failure if DWT/IDWT mismatches
-- avcodec/snowenc: Fix visual weight calculation
-- avcodec/tests/snowenc: unbreak DWT tests
-- avcodec/vp3: Add missing check for av_malloc
-- avformat/nutdec: Add check for avformat_new_stream
-- avcodec/mpeg12dec: Check input size
-- avcodec/escape124: Fix some return codes
-- avcodec/escape124: fix signdness of end of input check
-- Use https for repository links
-- avcodec/rpzaenc: stop accessing out of bounds frame
-- avcodec/motionpixels: Mask pixels to valid values
-- avcodec/xpmdec: Check size before allocation to avoid truncation
-- avcodec/bink: Avoid undefined out of array end pointers in binkb_decode_plane()
-- avcodec/bink: Fix off by 1 error in ref end
-- avcodec/utils: Ensure linesize for SVQ3
-- avcodec/utils: allocate a line more for VC1 and WMV3
-- avcodec/videodsp_template: Adjust pointers to avoid undefined pointer things
-- avcodec/pngdec: Check deloco index more exactly
-- avcodec/ffv1dec: Check that num h/v slices is supported
-- avformat/mov: Check samplesize and offset to avoid integer overflow
-- avcodec/pictordec: Remove mid exit branch
-- avcodec/eac3dec: avoid float noise in fixed mode addition to overflow
-- avcodec/utils: use 32pixel alignment for bink
-- avcodec/scpr3: Check bx
-- avcodec/012v: Order operations for odd size handling
-- avcodec/eatgq: : Check index increments in tgq_decode_block()
-- avcodec/scpr: Test bx before use
-- avformat/mxfdec: Use 64bit in remainder
-- avcodec/sunrast: Fix maplength check
-- avcodec/wavpack: Avoid undefined shift in get_tail()
-- avcodec/wavpack: Check for end of input in wv_unpack_dsd_high()
-- avformat/id3v2: Check taglen in read_uslt()
-- avcodec/tiff: Ignore tile_count
-- avcodec/ffv1dec: restructure slice coordinate reading a bit
-- avcodec/mlpdec: Check max matrix instead of max channel in noise check
-- swscale/input: Use more unsigned intermediates
-- avcodec/alsdec: The minimal block is at least 7 bits
-- avformat/replaygain: avoid undefined / negative abs
-- swscale/output: Bias 16bps output calculations to improve non overflowing range
-- avcodec/speedhq: Check buf_size to be big enough for DC
-- avcodec/ffv1dec: Fail earlier if prior context is corrupted
-- avcodec/nvenc: fix b-frame DTS behavior with fractional framerates
-- avfilter/vf_untile: swap the chroma shift values used for plane offsets
-- avcodec/nvenc: fix vbv buffer size in cq mode
-- avcodec/mjpegenc: take into account component count when writing the SOF header size
-- swscale: aarch64: Fix yuv2rgb with negative stride
-
-version 4.4.3:
-- avformat/vividas: Check packet size
-- configure: link to libatomic when it's present
-- avcodec/dstdec: Check for overflow in build_filter()
-- avformat/spdifdec: Use 64bit to compute bit rate
-- avformat/rpl: Use 64bit for duration computation
-- avformat/xwma: Use av_rescale() for duration computation
-- avformat/sdsdec: Use av_rescale() to avoid intermediate overflow in duration calculation
-- avformat/sbgdec: Check ts_int in genrate_intervals
-- avformat/rmdec: check tag_size
-- avformat/nutdec: Check fields
-- avformat/flvdec: Use 64bit for sum_flv_tag_size
-- avformat/jacosubdec: Fix overflow in get_shift()
-- avformat/dxa: avoid bpc overflows
-- avformat/cafdec: Check that nb_frasmes fits within 64bit
-- avformat/asfdec_o: Limit packet offset
-- avformat/ape: Check frames size
-- avformat/icodec: Check nb_pal
-- avformat/aiffdec: Use 64bit for block_duration use
-- avformat/aiffdec: Check block_duration
-- avformat/mxfdec: only probe max run in
-- avformat/mxfdec: Check run_in is within 65536
-- avcodec/mjpegdec: Check for unsupported bayer case
-- avcodec/apedec: Fix integer overflow in filter_3800()
-- avcodec/tta: Check 24bit scaling for overflow
-- avcodec/mobiclip: Check quantizer for overflow
-- avcodec/exr: Check preview psize
-- avcodec/tiff: Fix loop detection
-- libavformat/hls: Free keys
-- avcodec/fmvc: Move frame allocation to a later stage
-- avfilter/vf_showinfo: remove backspaces
-- avcodec/speedhq: Check width
-- avcodec/bink: disallow odd positioned scaled blocks
-- avformat/asfdec_o: limit recursion depth in asf_read_unknown()
-- doc/git-howto.texi: Document commit signing
-- libavcodec/8bps: Check that line lengths fit within the buffer
-- avcodec/midivid: Perform lzss_uncompress() before ff_reget_buffer()
-- libavformat/iff: Check for overflow in body_end calculation
-- avformat/avidec: Prevent entity expansion attacks
-- avcodec/h263dec: Sanity check against minimal I/P frame size
-- avcodec/hevcdec: Check s->ref in the md5 path similar to hwaccel
-- avcodec/mpegaudiodec_template: use unsigned shift in handle_crc()
-- avformat/subviewerdec: Make read_ts() more flexible
-- avcodec/mjpegdec: bayer and rct are incompatible
-- MAINTAINERS: Add ED25519 key for signing my commits in the future
-- avcodec/hevc_filter: copy_CTB() only within width&height
-- avcodec/tiff: Check tile_length and tile_width
-- avcodec/mss4: Check image size with av_image_check_size2()
-- avformat/flvdec: Check for EOF in index reading
-- avformat/nutdec: Check get_packetheader() in mainheader
-- avformat/asfdec_f: Use 64bit for packet start time
-- avcodec/exr: Check x/ysize
-- tools/target_dec_fuzzer: Adjust threshold for MMVIDEO
-- avcodec/lagarith: Check dst/src in zero run code
-- avcodec/h264dec: Skip late SEI
-- avcodec/sbrdsp_fixed: Fix integer overflows in sbr_qmf_deint_neg_c()
-- avfilter/vf_signature: Fix integer overflow in filter_frame()
-- avformat/rtsp: break on unknown protocols
-- avcodec/hevcdsp_template: stay within tables in sao_band_filter()
-- avcodec/tiff: Check pixel format types for dng
-- avcodec/qpeldsp: copy less for the mc0x cases
-- avformat/aaxdec: Check for empty segments
-- avcodec/ffv1dec: Limit golomb rice coded slices to width 8M
-- avformat/iff: simplify duration calculation
-- avcodec/wnv1: Check for width =1
-- avcodec/ffv1dec_template: fix indention
-- avformat/sctp: close socket on errors
-- avcodec/aasc: Fix indention
-- avcodec/qdrw: adjust max colors to array size
-- avcodec/alacdsp: Make intermediates unsigned
-- avformat/aiffdec: cleanup size handling for extreem cases
-- avformat/matroskadec: avoid integer overflows in SAR computation
-- avcodec/jpeglsdec: fix end check for xfrm
-- avcodec/cdgraphics: limit scrolling to the line
-- avformat/hls: Limit start_seq_no to one bit less
-- avformat/aiffdec: avoid integer overflow in get_meta()
-- avformat/ape: more bits in size for less overflows
-- avformat/aviobuf: Check buf_size in ffio_ensure_seekback()
-- avformat/bfi: Check offsets better
-- avformat/asfdec_f: Check packet_frag_timestamp
-- avcodec/texturedspenc: Fix indexing in color distribution determination
-- avformat/act: Check ff_get_wav_header() for failure
-- avcodec/libxavs2: Improve r redundancy in occured
-- avformat/libzmq: Improve r redundancy in occured
-- avfilter/vsrc_mandelbrot: Check for malloc failure
-- avfilter/vf_frei0r: Copy to frame allocated according to frei0r requirements
-- avfilter/video: Add ff_default_get_video_buffer2() to set specific alignment
-- avformat/genh: Check sample rate
-- configure: bump year
-- lavc/videotoolbox: do not pass AVCodecContext to decoder output callback
-- lavc/pthread_frame: always transfer stashed hwaccel state
-- avcodec/arm/sbcenc: avoid callee preserved vfp registers
-- avfilter/vf_scale: overwrite the width and height expressions with the original values
-- lavc/pthread_frame: avoid leaving stale hwaccel state in worker threads
-- configure: extend SDL check to accept all 2.x versions
-- lavf/tls_mbedtls: add support for mbedtls version 3
-
-version 4.4.2:
-- fate: update reference files after the recent dash manifest muxer changes
-- avformat/webmdashenc: fix on-demand profile string
-- Update for FFmpeg 4.4.2
-- avcodec/exr: Avoid signed overflow in displayWindow
-- avcodec/diracdec: avoid signed integer overflow in global mv
-- avcodec/takdsp: Fix integer overflow in decorrelate_sf()
-- avcodec/apedec: fix a integer overflow in long_filter_high_3800()
-- avfilter/vf_subtitles: pass storage size to libass
-- avformat/aqtitledec: Skip unrepresentable durations
-- avformat/cafdec: Do not store empty keys in read_info_chunk()
-- avformat/mxfdec: Do not clear array in mxf_read_strong_ref_array() before writing
-- avformat/mxfdec: Check for avio_read() failure in mxf_read_strong_ref_array()
-- avformat/mxfdec: Check count in mxf_read_strong_ref_array()
-- avformat/hls: Check target_duration
-- avcodec/pixlet: Avoid signed integer overflow in scaling in filterfn()
-- avformat/matroskadec: Check pre_ns
-- avcodec/sonic: Use unsigned for predictor_k to avoid undefined behavior
-- avcodec/libuavs3d: Check ff_set_dimensions() for failure
-- avcodec/mjpegbdec: Set buf_size
-- avformat/matroskadec: Use rounded down duration in get_cue_desc() check
-- avcodec/argo: Check packet size
-- avcodec/g729_parser: Check channels
-- avformat/avidec: Check height
-- avformat/rmdec: Better duplicate tags check
-- avformat/mov: Disallow empty sidx
-- avformat/argo_asf: Fix order of operations in error check in argo_asf_write_trailer()
-- avformat/matroskadec: Check duration
-- avformat/mov: Corner case encryption error cleanup in mov_read_senc()
-- avcodec/jpeglsdec: Fix if( code style
-- avcodec/jpeglsdec: Check get_ur_golomb_jpegls() for error
-- avcodec/motion_est: fix indention of ff_get_best_fcode()
-- avcodec/motion_est: Fix xy indexing on range violation in ff_get_best_fcode()
-- avformat/hls: Use unsigned for iv computation
-- avcodec/jpeglsdec: Increase range for N in ls_get_code_runterm() by using unsigned
-- avformat/matroskadec: Check desc_bytes
-- avformat/utils: Fix invalid NULL pointer operation in ff_parse_key_value()
-- avformat/matroskadec: Fix infinite loop with bz decompression
-- avformat/mov: Check size before subtraction
-- avcodec/cfhd: Avoid signed integer overflow in coeff
-- avcodec/apedec: Fix integer overflows in predictor_update_3930()
-- avcodec/apedec: fix integer overflow in 8bit samples
-- avformat/flvdec: timestamps cannot use the full int64 range
-- avcodec/tiff: Remove messing with jpeg context
-- avcodec/tiff: Use ff_set_dimensions() for setting up mjpeg context dimensions
-- avcodec/tiff: Pass max_pixels to mjpeg context
-- avcodec/vqavideo: reset accounting on error
-- avcodec/alacdsp: fix integer overflow in decorrelate_stereo()
-- avformat/4xm: Check for duplicate track ids
-- avformat/4xm: Consider max_streams on reallocating tracks array
-- avformat/mov: Check next offset in mov_read_dref()
-- avformat/vivo: Favor setting fps from explicit fractions
-- avformat/vivo: Do not use the general expression evaluator for parsing a floating point value
-- avformat/mxfdec: Check for duplicate mxf_read_index_entry_array()
-- avcodec/apedec: Change avg to uint32_t
-- avformat/mxfdec: Check component_depth in mxf_get_color_range()
-- avformat/mov: Disallow duplicate smdm
-- avformat/mov: Check for EOF in mov_read_glbl()
-- avcodec/vp3: Check version in all cases when VP4 code is not built
-- avformat/mov: Check channels for mov_parse_stsd_audio()
-- avformat/avidec: Check read_odml_index() for failure
-- avformat/aiffdec: Use av_rescale() for bitrate
-- avformat/aiffdec: sanity check block_align
-- avformat/aiffdec: Check sample_rate
-- avcodec/libdav1d: free the Dav1dData packet on dav1d_send_data() failure
-- avcodec/zmbvenc: Fix memleak upon init error
-- avcodec/dnxhdenc: Fix segfault when using too many slice threads
-- avcodec/wma(dec|enc): Fix memleaks upon allocation error
-- avfilter/avfilter: Actually error out on init error
-- avcodec/opus_silk: Remove wrong size information in function declaration
-- avformat/omadec: Don't output uninitialized values
-- avformat/jacosubenc: Fix writing extradata
-- avformat/cafenc: Fix memleak when trailer is never written
-- avformat/cafenc: Don't segfault upon allocation error
-- avformat/cafenc: Fix potential integer overflow
-- avformat/movenc: Limit ism_lookahead to a sane value
-- avutil/utils: Remove racy check from avutil_version()
-- avformat/sccdec: Don't use uninitialized data, fix crash, simplify logic
-- avformat/subtitles: Honour ff_subtitles_read_line() documentation
-- avformat/tee: Fix leak of FIFO-options dictionary
-- avformat/tee: Fix leak of strings
-- avcodec/rasc: Fix potential use of uninitialized value
-- avfilter/vf_w3fdif: Fix segfault on allocation error
-- avfilter/af_surround: Fix memleaks upon allocation error
-- avfilter/af_vibrato: Fix segfault upon allocation error
-- avfilter/aeval: Fix leak of expressions upon reallocation error
-- avdevice/xv: Increase array size
-- avfilter/asrc_flite: Fix use-after-frees
-- avfilter/asrc_flite: Don't segfault when using list_voices option
-- Revert "avfilter/vf_idet: reduce noisyness if the filter has been auto inserted"
-- avformat/matroskadec: Don't unnecessarily reduce aspect ratio
-- avcodec/h263: Fix global-buffer-overflow with noout flag2 set
-- avcodec/vaapi_encode: Fix segfault upon closing uninitialized encoder
-- avcodec/movtextenc: Fix infinite loop due to variable truncation
-- avcodec/libopenh264dec: Increase array sizes, fix stack-buffer overread
-- avcodec/libkvazaar: Increase array size
-- avformat/aadec: Don't use the same loop counter in inner and outer loop
-- avformat/moflex: Don't use uninitialized timebase for data stream
-- lavf/udp: do not return an uninitialized value from udp_open()
-- avcodec/nvenc: zero-initialize NV_ENC_REGISTER_RESOURCE struct
-- configure: Add missing libshine->mpegaudioheader dependency
-- avcodec/Makefile: Add missing entry for ADPCM_IMA_AMV_ENCODER
-- avcodec/Makefile: Only compile nvenc.o if needed
-- avcodec/av1_vaapi: improve decode quality
-- avcodec/av1_vaapi: enable segmentation features
-- avcodec/av1_vaapi: setting 2 output surface for film grain
-- avcodec/vaapi: increase av1 decode pool size
-- avcodec/dxva2_av1: fix global motion params
-- avcodec/av1_vaapi: add gm params valid check
-- avcodec/av1dec: support setup shear process
-- avcodec/av1: extend some definitions in spec section 3
-- cbs_av1: fix incorrect data type
-- avcodec/libdav1d: let libdav1d choose optimal max frame delay
-- avcodec/libdav1d: pass auto threads value to libdav1d
-
version 4.4.1:
- avcodec/flac_parser: Consider AV_INPUT_BUFFER_PADDING_SIZE
- avcodec/ttadsp: Fix integer overflows in tta_filter_process_c()
diff --git a/MAINTAINERS b/MAINTAINERS
index b825b8d68e..3b6cfad4fc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -615,7 +615,6 @@ Jean Delvare 7CA6 9F44 60F1 BDC4 1FD2 C858 A552 6B9B B3CD 4E6A
Loren Merritt ABD9 08F4 C920 3F65 D8BE 35D7 1540 DAA7 060F 56DE
Lynne FE50 139C 6805 72CA FD52 1F8D A2FE A5F0 3F03 4464
Michael Niedermayer 9FF2 128B 147E F673 0BAD F133 611E C787 040B 0FAB
- DD1E C9E8 DE08 5C62 9B3E 1846 B18E 8928 B394 8D64
Nicolas George 24CE 01CE 9ACC 5CEB 74D8 8D9D B063 D997 36E5 4C93
Nikolay Aleksandrov 8978 1D8C FB71 588E 4B27 EAA8 C4F0 B5FC E011 13B1
Panagiotis Issaris 6571 13A3 33D9 3726 F728 AA98 F643 B12E ECF3 E029
diff --git a/RELEASE b/RELEASE
index cbe06cdbfc..cca25a93cd 100644
--- a/RELEASE
+++ b/RELEASE
@@ -1 +1 @@
-4.4.4
+4.4.1
diff --git a/configure b/configure
index fb55e04ee7..f2fc33e89b 100755
--- a/configure
+++ b/configure
@@ -207,6 +207,7 @@ External library support:
--disable-bzlib disable bzlib [autodetect]
--disable-coreimage disable Apple CoreImage framework [autodetect]
--enable-chromaprint enable audio fingerprinting with chromaprint [no]
+ --disable-epoxy disable epoxy [autodetect]
--enable-frei0r enable frei0r video filtering [no]
--enable-gcrypt enable gcrypt, needed for rtmp(t)e support
if openssl, librtmp or gmp is not used [no]
@@ -279,6 +280,7 @@ External library support:
if openssl, gnutls or mbedtls is not used [no]
--enable-libtwolame enable MP2 encoding via libtwolame [no]
--enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
+ --disable-libudev disable libudev [autodetect]
--enable-libv4l2 enable libv4l2/v4l-utils [no]
--enable-libvidstab enable video stabilization using vid.stab [no]
--enable-libvmaf enable vmaf filter via libvmaf [no]
@@ -340,12 +342,17 @@ External library support:
--enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
--enable-libnpp enable Nvidia Performance Primitives-based code [no]
--enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
+ --enable-rpi enable other rpi specific stuff [no]
+ --enable-sand enable sand video formats [rpi]
+ --enable-vout-drm enable the vout_drm module - for internal testing only [no]
+ --enable-vout-egl enable the vout_egl module - for internal testing only [no]
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
--enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no]
--enable-rkmpp enable Rockchip Media Process Platform code [no]
--disable-v4l2-m2m disable V4L2 mem2mem code [autodetect]
+ --enable-v4l2-request enable V4L2 request API code [no]
--disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
--disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
--disable-videotoolbox disable VideoToolbox code [autodetect]
@@ -1703,7 +1710,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
avfoundation
bzlib
coreimage
+ epoxy
iconv
+ libudev
libxcb
libxcb_shm
libxcb_shape
@@ -1868,7 +1877,10 @@ HWACCEL_LIBRARY_LIST="
mmal
omx
opencl
+ v4l2_request
vulkan
+ rpi4_8
+ rpi4_10
"
DOCUMENT_LIST="
@@ -1884,12 +1896,17 @@ FEATURE_LIST="
gray
hardcoded_tables
omx_rpi
+ rpi
runtime_cpudetect
safe_bitstream_reader
+ sand
shared
small
static
swscale_alpha
+ vout_drm
+ vout_egl
+ v4l2_req_hevc_vx
"
# this list should be kept in linking order
@@ -1930,6 +1947,7 @@ SUBSYSTEM_LIST="
pixelutils
network
rdft
+ rpi
"
# COMPONENT_LIST needs to come last to ensure correct dependency checking
@@ -2416,9 +2434,11 @@ CONFIG_EXTRA="
rangecoder
riffdec
riffenc
+ rpi
rtpdec
rtpenc_chain
rv34dsp
+ sand
scene_sad
sinewin
snappy
@@ -2750,6 +2770,8 @@ hap_decoder_select="snappy texturedsp"
hap_encoder_deps="libsnappy"
hap_encoder_select="texturedspenc"
hevc_decoder_select="atsc_a53 bswapdsp cabac golomb hevcparse videodsp"
+hevc_rpi_decoder_deps="rpi"
+hevc_rpi_decoder_select="hevc_decoder sand"
huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
hymt_decoder_select="huffyuv_decoder"
@@ -2920,6 +2942,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext"
dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
ffnvcodec_deps_any="libdl LoadLibrary"
nvdec_deps="ffnvcodec"
+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
vaapi_x11_deps="xlib"
videotoolbox_hwaccel_deps="videotoolbox pthreads"
videotoolbox_hwaccel_extralibs="-framework QuartzCore"
@@ -2961,6 +2984,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
hevc_dxva2_hwaccel_select="hevc_decoder"
hevc_nvdec_hwaccel_deps="nvdec"
hevc_nvdec_hwaccel_select="hevc_decoder"
+hevc_v4l2request_hwaccel_deps="v4l2_request"
+hevc_v4l2request_hwaccel_select="hevc_decoder"
+hevc_rpi4_10_hwaccel_deps="rpi"
+hevc_rpi4_10_hwaccel_select="hevc_decoder"
+hevc_rpi4_8_hwaccel_deps="rpi"
+hevc_rpi4_8_hwaccel_select="hevc_decoder"
hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
hevc_vaapi_hwaccel_select="hevc_decoder"
hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
@@ -3268,7 +3297,7 @@ librav1e_encoder_deps="librav1e"
librav1e_encoder_select="extract_extradata_bsf"
librsvg_decoder_deps="librsvg"
libshine_encoder_deps="libshine"
-libshine_encoder_select="audio_frame_queue mpegaudioheader"
+libshine_encoder_select="audio_frame_queue"
libspeex_decoder_deps="libspeex"
libspeex_encoder_deps="libspeex"
libspeex_encoder_select="audio_frame_queue"
@@ -3438,8 +3467,13 @@ sndio_indev_deps="sndio"
sndio_outdev_deps="sndio"
v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_indev_suggest="libv4l2"
+v4l2_outdev_deps="libdrm"
v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_outdev_suggest="libv4l2"
+vout_drm_outdev_deps="libdrm"
+vout_egl_outdev_deps="xlib epoxy"
+vout_rpi_outdev_deps="rpi"
+vout_rpi_outdev_select="sand"
vfwcap_indev_deps="vfw32 vfwcap_defines"
xcbgrab_indev_deps="libxcb"
xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
@@ -3658,6 +3692,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping"
tonemap_opencl_filter_deps="opencl const_nan"
transpose_opencl_filter_deps="opencl"
transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
+unsand_filter_select="sand"
unsharp_opencl_filter_deps="opencl"
uspp_filter_deps="gpl avcodec"
vaguedenoiser_filter_deps="gpl"
@@ -3706,23 +3741,23 @@ cws2fws_extralibs="zlib_extralibs"
# libraries, in any order
avcodec_deps="avutil"
-avcodec_suggest="libm stdatomic"
+avcodec_suggest="libm"
avcodec_select="null_bsf"
avdevice_deps="avformat avcodec avutil"
-avdevice_suggest="libm stdatomic"
+avdevice_suggest="libm"
avfilter_deps="avutil"
-avfilter_suggest="libm stdatomic"
+avfilter_suggest="libm"
avformat_deps="avcodec avutil"
-avformat_suggest="libm network zlib stdatomic"
+avformat_suggest="libm network zlib"
avresample_deps="avutil"
avresample_suggest="libm"
-avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt stdatomic"
+avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt"
postproc_deps="avutil gpl"
-postproc_suggest="libm stdatomic"
+postproc_suggest="libm"
swresample_deps="avutil"
-swresample_suggest="libm libsoxr stdatomic"
+swresample_suggest="libm libsoxr"
swscale_deps="avutil"
-swscale_suggest="libm stdatomic"
+swscale_suggest="libm"
avcodec_extralibs="pthreads_extralibs iconv_extralibs dxva2_extralibs"
avfilter_extralibs="pthreads_extralibs"
@@ -6155,6 +6190,12 @@ check_func_headers glob.h glob
enabled xlib &&
check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
+enabled libudev &&
+ check_pkg_config libudev libudev libudev.h udev_new
+
+enabled epoxy &&
+ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
+
check_headers direct.h
check_headers dirent.h
check_headers dxgidebug.h
@@ -6186,14 +6227,7 @@ check_headers asm/types.h
# it seems there are versions of clang in some distros that try to use the
# gcc headers, which explodes for stdatomic
# so we also check that atomics actually work here
-#
-# some configurations also require linking to libatomic, so try
-# both with -latomic and without
-for LATOMIC in "-latomic" ""; do
- check_builtin stdatomic stdatomic.h \
- "atomic_int foo, bar = ATOMIC_VAR_INIT(-1); atomic_store(&foo, 0); foo += bar" \
- $LATOMIC && eval stdatomic_extralibs="\$LATOMIC" && break
-done
+check_builtin stdatomic stdatomic.h "atomic_int foo, bar = ATOMIC_VAR_INIT(-1); atomic_store(&foo, 0); foo += bar"
check_lib advapi32 "windows.h" RegCloseKey -ladvapi32
check_lib bcrypt "windows.h bcrypt.h" BCryptGenRandom -lbcrypt &&
@@ -6499,11 +6533,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt
check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
die "ERROR: mbedTLS not found"; }
enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+( enabled rpi ||
+ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
{ ! enabled cross_compile &&
add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
add_ldflags -L/opt/vc/lib/ &&
- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
+ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
die "ERROR: mmal not found" &&
check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
@@ -6544,8 +6579,16 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r
{ enabled libdrm ||
die "ERROR: rkmpp requires --enable-libdrm"; }
}
+enabled v4l2_request && { enabled libdrm ||
+ die "ERROR: v4l2-request requires --enable-libdrm"; } &&
+ { enabled libudev ||
+ die "ERROR: v4l2-request requires libudev"; }
enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
+
+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
+ { enabled xlib || die "ERROR: vout_egl requires xlib"; }
if enabled gcrypt; then
GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
@@ -6562,7 +6605,7 @@ fi
if enabled sdl2; then
SDL2_CONFIG="${cross_prefix}sdl2-config"
- test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 3.0.0" SDL_events.h SDL_PollEvent
+ test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent
if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then
sdl2_cflags=$("${SDL2_CONFIG}" --cflags)
sdl2_extralibs=$("${SDL2_CONFIG}" --libs)
@@ -6625,6 +6668,10 @@ if enabled v4l2_m2m; then
check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
fi
+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
+disable v4l2_req_hevc_vx
+
check_headers sys/videoio.h
test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
@@ -7112,6 +7159,9 @@ check_deps $CONFIG_LIST \
enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
enabled avresample && warn "Building with deprecated library libavresample"
+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
+
case $target_os in
haiku)
disable memalign
@@ -7591,7 +7641,7 @@ cat > $TMPH <<EOF
#define FFMPEG_CONFIG_H
#define FFMPEG_CONFIGURATION "$(c_escape $FFMPEG_CONFIGURATION)"
#define FFMPEG_LICENSE "$(c_escape $license)"
-#define CONFIG_THIS_YEAR 2023
+#define CONFIG_THIS_YEAR 2021
#define FFMPEG_DATADIR "$(eval c_escape $datadir)"
#define AVCONV_DATADIR "$(eval c_escape $datadir)"
#define CC_IDENT "$(c_escape ${cc_ident:-Unknown compiler})"
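The stdatomic hunk above drops the 4.4.3-era fallback of retrying the probe with -latomic (the matching entry, "configure: link to libatomic when it's present", is also removed from the Changelog), reverting to a plain single-shot check. For reference, the probe that check_builtin compiles is essentially the following C program; a standalone version, assuming only C11 atomics:

    #include <stdatomic.h>

    /* Standalone equivalent of configure's stdatomic probe (illustrative;
     * configure wraps this body in its own test harness). On toolchains
     * that need -latomic, linking this without it fails. */
    int main(void)
    {
        atomic_int foo, bar = ATOMIC_VAR_INIT(-1);
        atomic_store(&foo, 0);
        foo += bar;                 /* atomic read-modify-write */
        return atomic_load(&foo) == -1 ? 0 : 1;
    }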
diff --git a/doc/Doxyfile b/doc/Doxyfile
index f0b9005da7..f7efc43803 100644
--- a/doc/Doxyfile
+++ b/doc/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = FFmpeg
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 4.4.4
+PROJECT_NUMBER = 4.4.1
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/doc/authors.texi b/doc/authors.texi
index ce088392f8..6c8c1d7efa 100644
--- a/doc/authors.texi
+++ b/doc/authors.texi
@@ -3,9 +3,9 @@
The FFmpeg developers.
For details about the authorship, see the Git history of the project
-(https://git.ffmpeg.org/ffmpeg), e.g. by typing the command
+(git://source.ffmpeg.org/ffmpeg), e.g. by typing the command
@command{git log} in the FFmpeg source directory, or browsing the
-online repository at @url{https://git.ffmpeg.org/ffmpeg}.
+online repository at @url{http://source.ffmpeg.org}.
Maintainers for the specific components are listed in the file
@file{MAINTAINERS} in the source code tree.
diff --git a/doc/git-howto.texi b/doc/git-howto.texi
index a6723931ce..2b4fb80233 100644
--- a/doc/git-howto.texi
+++ b/doc/git-howto.texi
@@ -53,7 +53,7 @@ Most distribution and operating system provide a package for it.
@section Cloning the source tree
@example
-git clone https://git.ffmpeg.org/ffmpeg.git <target>
+git clone git://source.ffmpeg.org/ffmpeg <target>
@end example
This will put the FFmpeg sources into the directory @var{<target>}.
@@ -187,18 +187,11 @@ to make sure you don't have untracked files or deletions.
git add [-i|-p|-A] <filenames/dirnames>
@end example
-Make sure you have told Git your name, email address and GPG key
+Make sure you have told Git your name and email address
@example
git config --global user.name "My Name"
git config --global user.email my@@email.invalid
-git config --global user.signingkey ABCDEF0123245
-@end example
-
-Enable signing all commits or use -S
-
-@example
-git config --global commit.gpgsign true
@end example
Use @option{--global} to set the global configuration for all your Git checkouts.
@@ -400,19 +393,6 @@ git checkout -b svn_23456 $SHA1
where @var{$SHA1} is the commit hash from the @command{git log} output.
-@chapter gpg key generation
-
-If you have no gpg key yet, we recommend that you create a ed25519 based key as it
-is small, fast and secure. Especially it results in small signatures in git.
-
-@example
-gpg --default-new-key-algo "ed25519/cert,sign+cv25519/encr" --quick-generate-key "human@@server.com"
-@end example
-
-When generating a key, make sure the email specified matches the email used in git as some sites like
-github consider mismatches a reason to declare such commits unverified. After generating a key you
-can add it to the MAINTAINER file and upload it to a keyserver.
-
@chapter Pre-push checklist
Once you have a set of commits that you feel are ready for pushing,
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index dec012a299..8aa13007f9 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -2189,8 +2189,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
ifilter->channel_layout != frame->channel_layout;
break;
case AVMEDIA_TYPE_VIDEO:
- need_reinit |= ifilter->width != frame->width ||
- ifilter->height != frame->height;
+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
+ ifilter->height != av_frame_cropped_height(frame);
break;
}
@@ -2201,6 +2201,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
(ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
need_reinit = 1;
+ if (no_cvt_hw && fg->graph)
+ need_reinit = 0;
+
if (need_reinit) {
ret = ifilter_parameters_from_frame(ifilter, frame);
if (ret < 0)
@@ -2469,8 +2472,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
decoded_frame->top_field_first = ist->top_field_first;
ist->frames_decoded++;
-
- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
if (err < 0)
goto fail;
@@ -2674,7 +2676,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo
case AVMEDIA_TYPE_VIDEO:
ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt,
&decode_failed);
- if (!repeating || !pkt || got_output) {
+ // Pi: Do not inc dts if no_cvt_hw set
+ // V4L2 H264 decode has long latency and sometimes spits out a long
+ // stream of output without input. In this case incrementing DTS is wrong.
+ // There may be cases where the condition as written is correct so only
+ // "fix" in the cases which cause problems
+ if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
if (pkt && pkt->duration) {
duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
} else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
@@ -2898,6 +2905,16 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat
} else {
const HWAccel *hwaccel = NULL;
int i;
+
+ if (no_cvt_hw) {
+ config = avcodec_get_hw_config(s->codec, 0);
+ if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
+ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
+ ist->hwaccel_pix_fmt = *p;
+ break;
+ }
+ }
+
for (i = 0; hwaccels[i].name; i++) {
if (hwaccels[i].pix_fmt == *p) {
hwaccel = &hwaccels[i];
@@ -2993,6 +3010,15 @@ static int init_input_stream(int ist_index, char *error, int error_len)
return ret;
}
+#if CONFIG_HEVC_RPI_DECODER
+ ret = -1;
+ if (strcmp(codec->name, "hevc_rpi") == 0 &&
+ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
+ ist->dec = codec = avcodec_find_decoder_by_name("hevc");
+ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
+ }
+ if (ret < 0)
+#endif
if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
if (ret == AVERROR_EXPERIMENTAL)
abort_codec_experimental(codec, 0);
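The reinit test above compares cropped rather than coded dimensions. av_frame_cropped_width()/av_frame_cropped_height() are helpers added elsewhere in this patch (outside this excerpt); presumably along the lines below, since the Pi and V4L2 decoders emit coded-size frames (e.g. 1920x1088 for 1080p content) carrying crop metadata, and comparing the visible size avoids rebuilding the filter graph when only the alignment padding differs:

    #include <libavutil/frame.h>

    /* Sketch of the cropped-size helpers assumed by the hunks above;
     * the real definitions live in a part of the patch not shown here. */
    static inline int cropped_width_sketch(const AVFrame *frame)
    {
        return frame->width  - (int)(frame->crop_left + frame->crop_right);
    }

    static inline int cropped_height_sketch(const AVFrame *frame)
    {
        return frame->height - (int)(frame->crop_top + frame->crop_bottom);
    }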
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index 606f2afe0c..448cd2e009 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -61,6 +61,7 @@ enum HWAccelID {
HWACCEL_GENERIC,
HWACCEL_VIDEOTOOLBOX,
HWACCEL_QSV,
+ HWACCEL_RPI,
};
typedef struct HWAccel {
@@ -611,6 +612,7 @@ extern int video_sync_method;
extern float frame_drop_threshold;
extern int do_benchmark;
extern int do_benchmark_all;
+extern int no_cvt_hw;
extern int do_deinterlace;
extern int do_hex_dump;
extern int do_pkt_dump;
diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
index 4ab769c07b..5cdc3a7b6c 100644
--- a/fftools/ffmpeg_filter.c
+++ b/fftools/ffmpeg_filter.c
@@ -1160,8 +1160,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
ifilter->format = frame->format;
- ifilter->width = frame->width;
- ifilter->height = frame->height;
+ ifilter->width = av_frame_cropped_width(frame);
+ ifilter->height = av_frame_cropped_height(frame);
ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
ifilter->sample_rate = frame->sample_rate;
diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c
index fc4a5d31d6..cc69dce40e 100644
--- a/fftools/ffmpeg_hw.c
+++ b/fftools/ffmpeg_hw.c
@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type)
char *name;
size_t index_pos;
int index, index_limit = 1000;
+ if (!type_name)
+ return NULL;
index_pos = strlen(type_name);
name = av_malloc(index_pos + 4);
if (!name)
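The added guard matters because av_hwdevice_get_type_name() returns NULL for a type it has no name for, and hw_device_default_name() previously passed its result straight to strlen(). With the rpi pseudo hw-accels added by this patch there is plausibly no matching AVHWDeviceType, making that path reachable. Illustrative use of the guarded pattern:

    #include <stddef.h>
    #include <string.h>
    #include <libavutil/hwcontext.h>

    /* Mirrors the guard added above: never call strlen() on the result
     * of av_hwdevice_get_type_name() without a NULL check. */
    static size_t safe_type_name_len(enum AVHWDeviceType type)
    {
        const char *type_name = av_hwdevice_get_type_name(type);
        return type_name ? strlen(type_name) : 0;
    }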
diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
index 807e783422..456d4f349b 100644
--- a/fftools/ffmpeg_opt.c
+++ b/fftools/ffmpeg_opt.c
@@ -133,12 +133,22 @@ static const char *const opt_name_enc_time_bases[] = {"enc_time_base"
}\
}
+#if CONFIG_RPI
+static int rpi_init(AVCodecContext *avctx) {
+ return 0;
+}
+#endif
+
const HWAccel hwaccels[] = {
#if CONFIG_VIDEOTOOLBOX
{ "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
#endif
#if CONFIG_LIBMFX
{ "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV },
+#endif
+#if CONFIG_RPI
+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
#endif
{ 0 },
};
@@ -158,6 +168,7 @@ float frame_drop_threshold = 0;
int do_deinterlace = 0;
int do_benchmark = 0;
int do_benchmark_all = 0;
+int no_cvt_hw = 0;
int do_hex_dump = 0;
int do_pkt_dump = 0;
int copy_ts = 0;
@@ -3499,6 +3510,8 @@ const OptionDef options[] = {
"add timings for benchmarking" },
{ "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all },
"add timings for each task" },
+ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw },
+ "do not auto-convert hw frames to sw" },
{ "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress },
"write program-readable progress information", "url" },
{ "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction },
diff --git a/libavcodec/012v.c b/libavcodec/012v.c
index 41d9e2708e..b5a4066656 100644
--- a/libavcodec/012v.c
+++ b/libavcodec/012v.c
@@ -131,8 +131,8 @@ static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
memcpy(y, y_temp, sizeof(*y) * (width - x));
- memcpy(u, u_temp, sizeof(*u) * ((width - x + 1) / 2));
- memcpy(v, v_temp, sizeof(*v) * ((width - x + 1) / 2));
+ memcpy(u, u_temp, sizeof(*u) * (width - x + 1) / 2);
+ memcpy(v, v_temp, sizeof(*v) * (width - x + 1) / 2);
}
line_end += stride;
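This 012v hunk reverts upstream's "avcodec/012v: Order operations for odd size handling" (one of the 4.4.4 Changelog entries removed above) back to the 4.4.1 form this tree is based on. With 16-bit chroma samples the two expressions agree whenever width - x + 1 is even, but when it is odd the unparenthesised form copies an odd number of bytes, i.e. half of a uint16_t sample. A worked check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint16_t *u = 0;     /* only sizeof(*u) == 2 is used */
        int width = 7, x = 2;      /* width - x + 1 == 6: forms agree */
        assert(sizeof(*u) * ((width - x + 1) / 2) == 6);
        assert(sizeof(*u) *  (width - x + 1) / 2  == 6);

        x = 3;                     /* width - x + 1 == 5: forms diverge */
        assert(sizeof(*u) * ((width - x + 1) / 2) == 4); /* whole samples */
        assert(sizeof(*u) *  (width - x + 1) / 2  == 5); /* half a sample */
        return 0;
    }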
diff --git a/libavcodec/8bps.c b/libavcodec/8bps.c
index 6cc9a0c9ae..53e939d35d 100644
--- a/libavcodec/8bps.c
+++ b/libavcodec/8bps.c
@@ -70,9 +70,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
unsigned char *planemap = c->planemap;
int ret;
- if (buf_size < planes * height *2)
- return AVERROR_INVALIDDATA;
-
if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
return ret;
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index b3d284d7d0..e93c842047 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \
mediacodec.h \
packet.h \
qsv.h \
+ rpi_zc.h \
vaapi.h \
vdpau.h \
version.h \
@@ -132,6 +133,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC) += mpegvideo_enc.o mpeg12data.o \
motion_est.o ratecontrol.o \
mpegvideoencdsp.o
OBJS-$(CONFIG_MSS34DSP) += mss34dsp.o
+OBJS-$(CONFIG_NVENC) += nvenc.o
OBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
OBJS-$(CONFIG_QPELDSP) += qpeldsp.o
OBJS-$(CONFIG_QSV) += qsv.o
@@ -139,6 +141,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o
OBJS-$(CONFIG_QSVENC) += qsvenc.o
OBJS-$(CONFIG_RANGECODER) += rangecoder.o
OBJS-$(CONFIG_RDFT) += rdft.o
+OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o
OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
OBJS-$(CONFIG_SINEWIN) += sinewin.o
@@ -153,7 +156,10 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
+OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
+ weak_link.o v4l2_req_dmabufs.o
+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
+ v4l2_req_devscan.o weak_link.o
OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o
@@ -374,9 +380,9 @@ OBJS-$(CONFIG_H264_CUVID_DECODER) += cuviddec.o
OBJS-$(CONFIG_H264_MEDIACODEC_DECODER) += mediacodecdec.o
OBJS-$(CONFIG_H264_MF_ENCODER) += mfenc.o mf_utils.o
OBJS-$(CONFIG_H264_MMAL_DECODER) += mmaldec.o
-OBJS-$(CONFIG_H264_NVENC_ENCODER) += nvenc.o nvenc_h264.o
-OBJS-$(CONFIG_NVENC_ENCODER) += nvenc.o nvenc_h264.o
-OBJS-$(CONFIG_NVENC_H264_ENCODER) += nvenc.o nvenc_h264.o
+OBJS-$(CONFIG_H264_NVENC_ENCODER) += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_ENCODER) += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_H264_ENCODER) += nvenc_h264.o
OBJS-$(CONFIG_H264_OMX_ENCODER) += omx.o
OBJS-$(CONFIG_H264_QSV_DECODER) += qsvdec.o
OBJS-$(CONFIG_H264_QSV_ENCODER) += qsvenc_h264.o
@@ -396,12 +402,20 @@ OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o
OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o
OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o
OBJS-$(CONFIG_HEVC_MF_ENCODER) += mfenc.o mf_utils.o
-OBJS-$(CONFIG_HEVC_NVENC_ENCODER) += nvenc.o nvenc_hevc.o
-OBJS-$(CONFIG_NVENC_HEVC_ENCODER) += nvenc.o nvenc_hevc.o
+OBJS-$(CONFIG_HEVC_NVENC_ENCODER) += nvenc_hevc.o
+OBJS-$(CONFIG_NVENC_HEVC_ENCODER) += nvenc_hevc.o
OBJS-$(CONFIG_HEVC_QSV_DECODER) += qsvdec.o
OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \
hevc_data.o
OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o
+OBJS-$(CONFIG_RPI) += rpi_mem.o \
+ rpi_mailbox.o rpi_zc.o
+OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \
+ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \
+ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \
+ rpi_hevc_shader.o rpi_hevc_shader_template.o \
+ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
+ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o
OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o
@@ -874,7 +888,6 @@ OBJS-$(CONFIG_ADPCM_G726_ENCODER) += g726.o
OBJS-$(CONFIG_ADPCM_G726LE_DECODER) += g726.o
OBJS-$(CONFIG_ADPCM_G726LE_ENCODER) += g726.o
OBJS-$(CONFIG_ADPCM_IMA_AMV_DECODER) += adpcm.o adpcm_data.o
-OBJS-$(CONFIG_ADPCM_IMA_AMV_ENCODER) += adpcmenc.o adpcm_data.o
OBJS-$(CONFIG_ADPCM_IMA_ALP_DECODER) += adpcm.o adpcm_data.o
OBJS-$(CONFIG_ADPCM_IMA_ALP_ENCODER) += adpcmenc.o adpcm_data.o
OBJS-$(CONFIG_ADPCM_IMA_APC_DECODER) += adpcm.o adpcm_data.o
@@ -941,6 +954,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
+OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o
+OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o
+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o
+OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
@@ -1297,3 +1314,31 @@ $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h
$(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
$(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
endif
+
+ifdef CONFIG_HEVC_RPI_DECODER
+QASM_PY := ../local/bin/qasm.py
+VASMVIDCORE := ../local/bin/vasmvidcore_std
+
+ifneq ("$(wildcard $(QASM_PY))","")
+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
+ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
+
+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
+ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
+endif
+
+ifneq ("$(wildcard $(VASMVIDCORE))","")
+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
+
+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
+ python pi-util/make_array.py $<
+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
+ python pi-util/make_array.py $<
+endif
+
+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
+endif
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..c8935f205e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
+ aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
+NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
# decoders/encoders
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
index 742a3372e3..eec21aa5a2 100644
--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -27,19 +27,29 @@
#include "libavcodec/idctdsp.h"
#include "idct.h"
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+
av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLENEON) {
- c->idct_put = ff_simple_idct_put_neon;
- c->idct_add = ff_simple_idct_add_neon;
- c->idct = ff_simple_idct_neon;
- c->perm_type = FF_IDCT_PERM_PARTTRANS;
+ if (have_neon(cpu_flags)) {
+ if (!avctx->lowres && !high_bit_depth) {
+ if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+ c->idct_put = ff_simple_idct_put_neon;
+ c->idct_add = ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->perm_type = FF_IDCT_PERM_PARTTRANS;
+ }
}
+
+ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
+ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
}
}
diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S
new file mode 100644
index 0000000000..7f47611206
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_neon.S
@@ -0,0 +1,130 @@
+/*
+ * IDCT AArch64 NEON optimisations
+ *
+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Clamp 16-bit signed block coefficients to unsigned 8-bit
+// On entry:
+// x0 -> array of 64x 16-bit coefficients
+// x1 -> 8-bit results
+// x2 = row stride for results, bytes
+function ff_put_pixels_clamped_neon, export=1
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v0.8b}, [x1], x2
+ sqxtun v0.8b, v5.8h
+ st1 {v1.8b}, [x1], x2
+ sqxtun v1.8b, v6.8h
+ st1 {v2.8b}, [x1], x2
+ sqxtun v2.8b, v7.8h
+ st1 {v3.8b}, [x1], x2
+ st1 {v4.8b}, [x1], x2
+ st1 {v0.8b}, [x1], x2
+ st1 {v1.8b}, [x1], x2
+ st1 {v2.8b}, [x1]
+ ret
+endfunc
+
+// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
+// On entry:
+// x0 -> array of 64x 16-bit coefficients
+// x1 -> 8-bit results
+// x2 = row stride for results, bytes
+function ff_put_signed_pixels_clamped_neon, export=1
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ movi v4.8b, #128
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
+ sqxtn v0.8b, v0.8h
+ sqxtn v1.8b, v1.8h
+ sqxtn v2.8b, v2.8h
+ sqxtn v3.8b, v3.8h
+ sqxtn v5.8b, v16.8h
+ add v0.8b, v0.8b, v4.8b
+ sqxtn v6.8b, v17.8h
+ add v1.8b, v1.8b, v4.8b
+ sqxtn v7.8b, v18.8h
+ add v2.8b, v2.8b, v4.8b
+ sqxtn v16.8b, v19.8h
+ add v3.8b, v3.8b, v4.8b
+ st1 {v0.8b}, [x1], x2
+ add v0.8b, v5.8b, v4.8b
+ st1 {v1.8b}, [x1], x2
+ add v1.8b, v6.8b, v4.8b
+ st1 {v2.8b}, [x1], x2
+ add v2.8b, v7.8b, v4.8b
+ st1 {v3.8b}, [x1], x2
+ add v3.8b, v16.8b, v4.8b
+ st1 {v0.8b}, [x1], x2
+ st1 {v1.8b}, [x1], x2
+ st1 {v2.8b}, [x1], x2
+ st1 {v3.8b}, [x1]
+ ret
+endfunc
+
+// Add 16-bit signed block coefficients to unsigned 8-bit
+// On entry:
+// x0 -> array of 64x 16-bit coefficients
+// x1 -> 8-bit input and results
+// x2 = row stride for 8-bit input and results, bytes
+function ff_add_pixels_clamped_neon, export=1
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+ mov x3, x1
+ ld1 {v4.8b}, [x1], x2
+ ld1 {v5.8b}, [x1], x2
+ ld1 {v6.8b}, [x1], x2
+ ld1 {v7.8b}, [x1], x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw v2.8h, v2.8h, v6.8b
+ ld1 {v4.8b}, [x1], x2
+ uaddw v3.8h, v3.8h, v7.8b
+ ld1 {v5.8b}, [x1], x2
+ sqxtun v0.8b, v0.8h
+ ld1 {v6.8b}, [x1], x2
+ sqxtun v1.8b, v1.8h
+ ld1 {v7.8b}, [x1]
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ uaddw v4.8h, v16.8h, v4.8b
+ st1 {v0.8b}, [x3], x2
+ uaddw v0.8h, v17.8h, v5.8b
+ st1 {v1.8b}, [x3], x2
+ uaddw v1.8h, v18.8h, v6.8b
+ st1 {v2.8b}, [x3], x2
+ uaddw v2.8h, v19.8h, v7.8b
+ sqxtun v4.8b, v4.8h
+ sqxtun v0.8b, v0.8h
+ st1 {v3.8b}, [x3], x2
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ st1 {v4.8b}, [x3], x2
+ st1 {v0.8b}, [x3], x2
+ st1 {v1.8b}, [x3], x2
+ st1 {v2.8b}, [x3]
+ ret
+endfunc
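These three routines are NEON counterparts of the scalar versions in libavcodec/idctdsp.c, using sqxtun/sqxtn saturating narrows in place of explicit clamps. For orientation, the first of them computes, in essence:

    #include <stddef.h>
    #include <stdint.h>
    #include <libavutil/common.h>   /* av_clip_uint8() */

    /* Scalar sketch of ff_put_pixels_clamped_neon: clamp an 8x8 block of
     * 16-bit IDCT output to unsigned 8-bit pixels, one row per stride. */
    static void put_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                       ptrdiff_t line_size)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++)
                pixels[j] = av_clip_uint8(block[j]);
            pixels += line_size;
            block  += 8;
        }
    }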
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 13dfd74940..a7976fd596 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -21,10 +21,28 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
+#include "libavutil/intreadwrite.h"
#include "libavcodec/vc1dsp.h"
#include "config.h"
+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
+
void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+ /* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+ int dsize = 0;
+ while (size >= 4)
+ {
+ int found = 0;
+ while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+ {
+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ if (!found)
+ {
+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+ dst += skip;
+ src += skip;
+ size -= skip;
+ dsize += skip;
+ while (!found && size >= 4)
+ {
+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ }
+ if (found)
+ {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ ++src;
+ size -= 3;
+ dsize += 2;
+ }
+ }
+ while (size > 0)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ return dsize;
+}
+
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
+ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
+ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
+ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
+ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
+ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
+ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
+ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
+ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
}
}
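The unescape wrapper above hunts for VC-1 start-code-emulation escapes. Reading its test (AV_RL32(src) & ~0x03000000) == 0x00030000 byte by byte on a little-endian machine (the assumption its comment states): byte 0 and byte 1 must be 0x00, byte 2 must be 0x03, and the mask discards the low two bits of byte 3, so byte 3 may be anything up to 0x03. On a match, the code keeps the two zero bytes and the byte after the 0x03 but drops the 0x03 itself. An equivalent byte-wise predicate:

    #include <stdint.h>

    /* Byte-level equivalent of the escape test used above, assuming
     * little-endian unaligned 32-bit loads (as the wrapper already does). */
    static int is_vc1_escape(const uint8_t *src)
    {
        return src[0] == 0x00 && src[1] == 0x00 &&
               src[2] == 0x03 && src[3] <= 0x03;
    }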
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
new file mode 100644
index 0000000000..9a96c2523c
--- /dev/null
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -0,0 +1,1546 @@
+/*
+ * VC1 AArch64 NEON optimisations
+ *
+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// VC-1 8x8 inverse transform
+// On entry:
+// x0 -> array of 16-bit inverse transform coefficients, in column-major order
+// On exit:
+// array at x0 updated to hold transformed block; also now held in row-major order
+function ff_vc1_inv_trans_8x8_neon, export=1
+ ld1 {v1.16b, v2.16b}, [x0], #32
+ ld1 {v3.16b, v4.16b}, [x0], #32
+ ld1 {v5.16b, v6.16b}, [x0], #32
+ shl v1.8h, v1.8h, #2 // 8/2 * src[0]
+ sub x1, x0, #3*32
+ ld1 {v16.16b, v17.16b}, [x0]
+ shl v7.8h, v2.8h, #4 // 16 * src[8]
+ shl v18.8h, v2.8h, #2 // 4 * src[8]
+ shl v19.8h, v4.8h, #4 // 16 * src[24]
+ ldr d0, .Lcoeffs_it8
+ shl v5.8h, v5.8h, #2 // 8/2 * src[32]
+ shl v20.8h, v6.8h, #4 // 16 * src[40]
+ shl v21.8h, v6.8h, #2 // 4 * src[40]
+ shl v22.8h, v17.8h, #4 // 16 * src[56]
+ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40]
+ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
+ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40]
+ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56]
+ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56]
+ shl v3.8h, v3.8h, #3 // 16/2 * src[16]
+ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ ssra v1.8h, v1.8h, #1 // 12/2 * src[0]
+ ssra v5.8h, v5.8h, #1 // 12/2 * src[32]
+ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ shl v21.8h, v16.8h, #3 // 16/2 * src[48]
+ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
+ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
+ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
+ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
+ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
+ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
+ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
+ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
+ neg v3.8h, v7.8h // -t1
+ neg v4.8h, v20.8h // +t2
+ neg v6.8h, v19.8h // +t3
+ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1
+ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1
+ neg v7.8h, v18.8h // +t4
+ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1
+ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
+ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1
+ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1
+ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1
+ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
+ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
+ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3
+ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
+ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3
+ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3
+ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3
+ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3
+ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
+ trn2 v17.8h, v3.8h, v4.8h
+ trn2 v18.8h, v5.8h, v6.8h
+ trn2 v19.8h, v2.8h, v1.8h
+ trn2 v20.8h, v7.8h, v16.8h
+ trn1 v21.4s, v17.4s, v18.4s
+ trn2 v17.4s, v17.4s, v18.4s
+ trn1 v18.4s, v19.4s, v20.4s
+ trn2 v19.4s, v19.4s, v20.4s
+ trn1 v3.8h, v3.8h, v4.8h
+ trn2 v4.2d, v21.2d, v18.2d
+ trn1 v20.2d, v17.2d, v19.2d
+ trn1 v5.8h, v5.8h, v6.8h
+ trn1 v1.8h, v2.8h, v1.8h
+ trn1 v2.8h, v7.8h, v16.8h
+ trn1 v6.2d, v21.2d, v18.2d
+ trn2 v7.2d, v17.2d, v19.2d
+ shl v16.8h, v20.8h, #4 // 16 * src[24]
+ shl v17.8h, v4.8h, #4 // 16 * src[40]
+ trn1 v18.4s, v3.4s, v5.4s
+ trn1 v19.4s, v1.4s, v2.4s
+ shl v21.8h, v7.8h, #4 // 16 * src[56]
+ shl v22.8h, v6.8h, #2 // 4 * src[8]
+ shl v23.8h, v4.8h, #2 // 4 * src[40]
+ trn2 v3.4s, v3.4s, v5.4s
+ trn2 v1.4s, v1.4s, v2.4s
+ shl v2.8h, v6.8h, #4 // 16 * src[8]
+ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
+ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
+ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
+ trn1 v22.2d, v18.2d, v19.2d
+ trn2 v18.2d, v18.2d, v19.2d
+ trn1 v19.2d, v3.2d, v1.2d
+ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56]
+ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ shl v21.8h, v22.8h, #2 // 8/2 * src[0]
+ shl v18.8h, v18.8h, #2 // 8/2 * src[32]
+ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ shl v6.8h, v19.8h, #3 // 16/2 * src[16]
+ trn2 v1.2d, v3.2d, v1.2d
+ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ ssra v21.8h, v21.8h, #1 // 12/2 * src[0]
+ ssra v18.8h, v18.8h, #1 // 12/2 * src[32]
+ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
+ shl v19.8h, v1.8h, #3 // 16/2 * src[48]
+ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
+ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
+ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
+ neg v21.8h, v17.8h // +t2
+ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
+ neg v4.8h, v5.8h // +t3
+ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
+ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
+ neg v24.8h, v16.8h // +t4
+ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
+ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
+ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1
+ neg v3.8h, v2.8h // -t1
+ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1
+ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1
+ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1
+ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
+ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1
+ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1
+ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
+ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7
+ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
+ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7
+ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7
+ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
+ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7
+ st1 {v2.16b, v3.16b}, [x1], #32
+ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7
+ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7
+ st1 {v4.16b, v5.16b}, [x1], #32
+ st1 {v16.16b, v17.16b}, [x1], #32
+ st1 {v0.16b, v1.16b}, [x1]
+ ret
+endfunc
+
+// VC-1 8x4 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
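+//
+// The first pass is the 8-point transform across the 8-wide rows (weights
+// v0.h[0..2] from .Lcoeffs_it8) and the second the 4-point transform down
+// the columns (weights v0.h[4..6] from .Lcoeffs_it4); the single q-register
+// load below picks up both weight sets at once.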
+function ff_vc1_inv_trans_8x4_neon, export=1
+ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
+ mov x3, x0
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
+ ld1 {v5.8b}, [x0], x1
+ trn2 v6.4h, v1.4h, v3.4h
+ trn2 v7.4h, v2.4h, v4.4h
+ trn1 v1.4h, v1.4h, v3.4h
+ trn1 v2.4h, v2.4h, v4.4h
+ trn2 v3.4h, v16.4h, v18.4h
+ trn2 v4.4h, v17.4h, v19.4h
+ trn1 v16.4h, v16.4h, v18.4h
+ trn1 v17.4h, v17.4h, v19.4h
+ ld1 {v18.8b}, [x0], x1
+ trn1 v19.2s, v6.2s, v3.2s
+ trn2 v3.2s, v6.2s, v3.2s
+ trn1 v6.2s, v7.2s, v4.2s
+ trn2 v4.2s, v7.2s, v4.2s
+ trn1 v7.2s, v1.2s, v16.2s
+ trn1 v20.2s, v2.2s, v17.2s
+ shl v21.4h, v19.4h, #4 // 16 * src[1]
+ trn2 v1.2s, v1.2s, v16.2s
+ shl v16.4h, v3.4h, #4 // 16 * src[3]
+ trn2 v2.2s, v2.2s, v17.2s
+ shl v17.4h, v6.4h, #4 // 16 * src[5]
+ ld1 {v22.8b}, [x0], x1
+ shl v23.4h, v4.4h, #4 // 16 * src[7]
+ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2]
+ ld1 {v25.8b}, [x0]
+ shl v26.4h, v19.4h, #2 // 4 * src[1]
+ shl v27.4h, v6.4h, #2 // 4 * src[5]
+ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7]
+ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
+ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7]
+ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
+ shl v7.4h, v7.4h, #2 // 8/2 * src[0]
+ shl v20.4h, v20.4h, #2 // 8/2 * src[4]
+ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
+ shl v1.4h, v1.4h, #3 // 16/2 * src[2]
+ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5]
+ ssra v7.4h, v7.4h, #1 // 12/2 * src[0]
+ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
+ ssra v20.4h, v20.4h, #1 // 12/2 * src[4]
+ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
+ shl v3.4h, v2.4h, #3 // 16/2 * src[6]
+ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6]
+ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
+ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
+ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
+ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
+ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
+ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
+ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4]
+ neg v6.4h, v21.4h // -t1
+ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
+ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
+ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
+ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
+ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
+ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
+ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
+ neg v3.4h, v17.4h // +t2
+ neg v4.4h, v16.4h // +t3
+ neg v28.4h, v23.4h // +t4
+ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1
+ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1
+ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
+ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1
+ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1
+ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
+ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1
+ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1
+ trn1 v1.2d, v7.2d, v1.2d
+ trn1 v2.2d, v20.2d, v2.2d
+ trn1 v3.2d, v24.2d, v27.2d
+ trn1 v4.2d, v19.2d, v26.2d
+ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
+ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
+ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
+ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
+ trn2 v6.8h, v1.8h, v2.8h
+ trn1 v1.8h, v1.8h, v2.8h
+ trn2 v2.8h, v3.8h, v4.8h
+ trn1 v3.8h, v3.8h, v4.8h
+ trn2 v4.4s, v6.4s, v2.4s
+ trn1 v7.4s, v1.4s, v3.4s
+ trn2 v1.4s, v1.4s, v3.4s
+ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
+ trn1 v2.4s, v6.4s, v2.4s
+ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24]
+ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0]
+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16]
+ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
+ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
+ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16]
+ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16]
+ neg v2.8h, v3.8h // -t4/2
+ neg v6.8h, v4.8h // -t3/2
+ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1
+ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1
+ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
+ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1
+ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7
+ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7
+ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
+ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v1.8h, v1.8h, v18.8b
+ uaddw v2.8h, v2.8h, v22.8b
+ uaddw v3.8h, v3.8h, v25.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3], x1
+ st1 {v3.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 4x8 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
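+//
+// Relative to the 8x4 case the passes are swapped: the 4-point transform
+// (v0.h[4..6]) runs across the 4-wide rows first, then the 8-point
+// transform (v0.h[0..2]) down the 8-deep columns.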
+function ff_vc1_inv_trans_4x8_neon, export=1
+ mov x3, #16
+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
+ mov x4, x0
+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
+ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33
+ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43
+ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53
+ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
+ ld1 {v4.d}[1], [x2] // 70 71 72 73
+ ld1 {v5.s}[0], [x0], x1
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v7.s}[0], [x0], x1
+ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
+ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52
+ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
+ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
+ ld1 {v4.s}[0], [x0], x1
+ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
+ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
+ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
+ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
+ ld1 {v5.s}[1], [x0], x1
+ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3]
+ ld1 {v6.s}[1], [x0], x1
+ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
+ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
+ ld1 {v7.s}[1], [x0], x1
+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2]
+ ld1 {v4.s}[1], [x0]
+ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
+ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
+ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
+ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
+ neg v3.8h, v16.8h // -t3/2
+ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
+ neg v18.8h, v17.8h // -t4/2
+ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1
+ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
+ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1
+ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3
+ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3
+ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3
+ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3
+ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
+ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
+ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
+ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
+ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
+ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
+ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43
+ mov d18, v3.d[1] // 50 51 52 53
+ shl v19.4h, v3.4h, #4 // 16 * src[8]
+ mov d20, v16.d[1] // 70 71 72 73
+ shl v21.4h, v16.4h, #4 // 16 * src[24]
+ mov d22, v17.d[1] // 40 41 42 43
+ shl v23.4h, v3.4h, #2 // 4 * src[8]
+ shl v24.4h, v18.4h, #4 // 16 * src[40]
+ shl v25.4h, v20.4h, #4 // 16 * src[56]
+ shl v26.4h, v18.4h, #2 // 4 * src[40]
+ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63
+ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40]
+ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56]
+ shl v17.4h, v17.4h, #2 // 8/2 * src[0]
+ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40]
+ shl v22.4h, v22.4h, #2 // 8/2 * src[32]
+ mov d23, v1.d[1] // 60 61 62 63
+ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56]
+ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16]
+ shl v1.4h, v1.4h, #3 // 16/2 * src[16]
+ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ ssra v17.4h, v17.4h, #1 // 12/2 * src[0]
+ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ ssra v22.4h, v22.4h, #1 // 12/2 * src[32]
+ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ shl v3.4h, v23.4h, #3 // 16/2 * src[48]
+ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
+ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
+ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
+ neg v23.4h, v24.4h // +t2
+ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
+ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
+ neg v17.4h, v21.4h // +t3
+ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
+ neg v16.4h, v19.4h // -t1
+ neg v27.4h, v2.4h // +t4
+ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1
+ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1
+ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1
+ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1
+ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1
+ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
+ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1
+ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
+ trn1 v0.2d, v20.2d, v0.2d
+ trn1 v2.2d, v18.2d, v22.2d
+ trn1 v3.2d, v25.2d, v3.2d
+ trn1 v1.2d, v26.2d, v1.2d
+ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
+ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
+ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
+ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw v1.8h, v1.8h, v4.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x4], x1
+ st1 {v2.s}[0], [x4], x1
+ st1 {v3.s}[0], [x4], x1
+ st1 {v1.s}[0], [x4], x1
+ st1 {v0.s}[1], [x4], x1
+ st1 {v2.s}[1], [x4], x1
+ st1 {v3.s}[1], [x4], x1
+ st1 {v1.s}[1], [x4]
+ ret
+endfunc
+
+// VC-1 4x4 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
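+//
+// Both passes use the 4-point kernel from the per-instruction comments
+// (row indexing shown; the column pass steps by 8):
+//   t1 = 17*(src[0]+src[2])   t3 = 22*src[1] + 10*src[3]
+//   t2 = 17*(src[0]-src[2])   t4 = 22*src[3] - 10*src[1]
+// The outputs are (t1+t3), (t2+t4), (t2-t4) and (t1-t3), rounded and
+// shifted by 3 after the first pass and by 7 after the second.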
+function ff_vc1_inv_trans_4x4_neon, export=1
+ mov x3, #16
+ ldr d0, .Lcoeffs_it4
+ mov x4, x0
+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
+ ld1 {v4.d}[0], [x2] // 30 31 32 33
+ ld1 {v5.s}[0], [x0], x1
+ ld1 {v5.s}[1], [x0], x1
+ ld1 {v6.s}[0], [x0], x1
+ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13
+ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12
+ ld1 {v6.s}[1], [x0]
+ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
+ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
+ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33
+ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
+ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31
+ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
+ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
+ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3]
+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
+ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2]
+ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
+ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
+ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2]
+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2]
+ neg v7.4h, v3.4h // -t3/2
+ neg v16.4h, v4.4h // -t4/2
+ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
+ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1
+ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
+ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1
+ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3
+ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3
+ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
+ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3
+ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
+ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
+ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
+ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23
+ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
+ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
+ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
+ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23
+ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24]
+ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24]
+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
+ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16]
+ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
+ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
+ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16]
+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16]
+ neg v3.4h, v2.4h // -t4/2
+ neg v7.4h, v4.4h // -t3/2
+ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1
+ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
+ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1
+ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1
+ trn1 v0.2d, v4.2d, v3.2d
+ trn1 v1.2d, v2.2d, v7.2d
+ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
+ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v1.8h, v1.8h, v6.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x4], x1
+ st1 {v0.s}[1], [x4], x1
+ st1 {v1.s}[0], [x4], x1
+ st1 {v1.s}[1], [x4]
+ ret
+endfunc
+
+// VC-1 8x8 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
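+//
+// The scalar ops interleaved with the loads below compute the DC scaling
+// of the C model (cf. vc1_inv_trans_8x8_dc_c in libavcodec/vc1dsp.c):
+//   dc = (3*dc + 1) >> 1;  dc = (3*dc + 16) >> 5;
+// the result is then splatted and added to all 64 pixels with saturation.
+// The other _dc variants below interleave their analogous scalings in the
+// same way.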
+function ff_vc1_inv_trans_8x8_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.8b}, [x0], x1
+ add w2, w2, #1
+ ld1 {v5.8b}, [x0], x1
+ asr w2, w2, #1
+ ld1 {v6.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v7.8b}, [x0]
+ add w0, w2, #16
+ asr w0, w0, #5
+ dup v16.8h, w0
+ uaddw v0.8h, v16.8h, v0.8b
+ uaddw v1.8h, v16.8h, v1.8b
+ uaddw v2.8h, v16.8h, v2.8b
+ uaddw v3.8h, v16.8h, v3.8b
+ uaddw v4.8h, v16.8h, v4.8b
+ uaddw v5.8h, v16.8h, v5.8b
+ sqxtun v0.8b, v0.8h
+ uaddw v6.8h, v16.8h, v6.8b
+ sqxtun v1.8b, v1.8h
+ uaddw v7.8h, v16.8h, v7.8b
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v0.8b}, [x3], x1
+ sqxtun v0.8b, v5.8h
+ st1 {v1.8b}, [x3], x1
+ sqxtun v1.8b, v6.8h
+ st1 {v2.8b}, [x3], x1
+ sqxtun v2.8b, v7.8h
+ st1 {v3.8b}, [x3], x1
+ st1 {v4.8b}, [x3], x1
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 8x4 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_8x4_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v3.8b}, [x0]
+ add w0, w2, #1
+ asr w0, w0, #1
+ add w0, w0, w0, lsl #4
+ add w0, w0, #64
+ asr w0, w0, #7
+ dup v4.8h, w0
+ uaddw v0.8h, v4.8h, v0.8b
+ uaddw v1.8h, v4.8h, v1.8b
+ uaddw v2.8h, v4.8h, v2.8b
+ uaddw v3.8h, v4.8h, v3.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3], x1
+ st1 {v3.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 4x8 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x8_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v2.s}[0], [x0], x1
+ add w2, w2, w2, lsl #4
+ ld1 {v3.s}[0], [x0], x1
+ add w2, w2, #4
+ asr w2, w2, #3
+ add w2, w2, w2, lsl #1
+ ld1 {v0.s}[1], [x0], x1
+ add w2, w2, #16
+ asr w2, w2, #5
+ dup v4.8h, w2
+ ld1 {v1.s}[1], [x0], x1
+ ld1 {v2.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x0]
+ uaddw v0.8h, v4.8h, v0.8b
+ uaddw v1.8h, v4.8h, v1.8b
+ uaddw v2.8h, v4.8h, v2.8b
+ uaddw v3.8h, v4.8h, v3.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.s}[0], [x3], x1
+ st1 {v1.s}[0], [x3], x1
+ st1 {v2.s}[0], [x3], x1
+ st1 {v3.s}[0], [x3], x1
+ st1 {v0.s}[1], [x3], x1
+ st1 {v1.s}[1], [x3], x1
+ st1 {v2.s}[1], [x3], x1
+ st1 {v3.s}[1], [x3]
+ ret
+endfunc
+
+// VC-1 4x4 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x4_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ add w2, w2, w2, lsl #4
+ ld1 {v1.s}[1], [x0]
+ add w0, w2, #4
+ asr w0, w0, #3
+ add w0, w0, w0, lsl #4
+ add w0, w0, #64
+ asr w0, w0, #7
+ dup v2.8h, w0
+ uaddw v0.8h, v2.8h, v0.8b
+ uaddw v1.8h, v2.8h, v1.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x3], x1
+ st1 {v1.s}[0], [x3], x1
+ st1 {v0.s}[1], [x3], x1
+ st1 {v1.s}[1], [x3]
+ ret
+endfunc
+
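+// Packed 16-bit multiplier pools, indexed as v0.h[n] by the code above:
+//   .Lcoeffs_it8: 3 (=6/2), 9, 15           - 8-point transform weights
+//   .Lcoeffs_it4: 5 (=10/2), 11 (=22/2), 17 - 4-point transform weights
+//   .Lcoeffs:     2, 5                      - loop-filter weights
+// A q-register load at .Lcoeffs_it8 picks up the 4-point pool in its upper
+// half, which is why some functions index v0.h[4..6].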
+.align 5
+.Lcoeffs_it8:
+.quad 0x000F00090003
+.Lcoeffs_it4:
+.quad 0x0011000B0005
+.Lcoeffs:
+.quad 0x00050002
+
+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of lower block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
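+//
+// Filter decision, summarising the per-instruction comments (P1..P8 are
+// the pixels crossing the edge, P4|P5 the block boundary):
+//   a0 = |(2*P3 - 5*P4 + 5*P5 - 2*P6 + 4) >> 3|, with a1 and a2 the same
+//   tap pattern over P1..P4 and P5..P8, a3 = min(a1, a2),
+//   clip = |P4 - P5| >> 1, d = FFMIN((5*(a0 - a3)) >> 3, clip).
+// d (sign-corrected from P4-P5 and the a0 filter sign) is subtracted from
+// P4 and added to P5, but only when clip != 0, a0 < pq and a3 < a0.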
+function ff_vc1_v_loop_filter4_neon, export=1
+ sub x3, x0, w1, sxtw #2
+ ldr d0, .Lcoeffs
+ ld1 {v1.s}[0], [x0], x1 // P5
+ ld1 {v2.s}[0], [x3], x1 // P1
+ ld1 {v3.s}[0], [x3], x1 // P2
+ ld1 {v4.s}[0], [x0], x1 // P6
+ ld1 {v5.s}[0], [x3], x1 // P3
+ ld1 {v6.s}[0], [x0], x1 // P7
+ ld1 {v7.s}[0], [x3] // P4
+ ld1 {v16.s}[0], [x0] // P8
+ ushll v17.8h, v1.8b, #1 // 2*P5
+ dup v18.8h, w2 // pq
+ ushll v2.8h, v2.8b, #1 // 2*P1
+ uxtl v3.8h, v3.8b // P2
+ uxtl v4.8h, v4.8b // P6
+ uxtl v19.8h, v5.8b // P3
+ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2
+ uxtl v3.8h, v6.8b // P7
+ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6
+ ushll v5.8h, v5.8b, #1 // 2*P3
+ uxtl v6.8h, v7.8b // P4
+ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7
+ uxtl v3.8h, v16.8b // P8
+ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3
+ uxtl v1.8h, v1.8b // P5
+ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4
+ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
+ sub v3.4h, v6.4h, v1.4h // P4-P5
+ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
+ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5
+ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
+ abs v4.4h, v3.4h
+ srshr v7.4h, v17.4h, #3
+ srshr v2.4h, v2.4h, #3
+ sshr v4.4h, v4.4h, #1 // clip
+ srshr v5.4h, v5.4h, #3
+ abs v7.4h, v7.4h // a2
+ sshr v3.4h, v3.4h, #8 // clip_sign
+ abs v2.4h, v2.4h // a1
+ cmeq v16.4h, v4.4h, #0 // test clip == 0
+ abs v17.4h, v5.4h // a0
+ sshr v5.4h, v5.4h, #8 // a0_sign
+ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2
+ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq
+ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign
+ bsl v19.8b, v7.8b, v2.8b // a3
+ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq
+ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
+ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
+ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
+ mov w0, v5.s[1] // move to gp reg
+ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ cmhs v5.4h, v0.4h, v4.4h
+ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered
+ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip)
+ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ sqxtun v0.8b, v6.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x3], x1
+ st1 {v1.s}[0], [x3]
+1: ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of right block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter4_neon, export=1
+ sub x3, x0, #4 // where to start reading
+ ldr d0, .Lcoeffs
+ ld1 {v1.8b}, [x3], x1
+ sub x0, x0, #1 // where to start writing
+ ld1 {v2.8b}, [x3], x1
+ ld1 {v3.8b}, [x3], x1
+ ld1 {v4.8b}, [x3]
+ dup v5.8h, w2 // pq
+ trn1 v6.8b, v1.8b, v2.8b
+ trn2 v1.8b, v1.8b, v2.8b
+ trn1 v2.8b, v3.8b, v4.8b
+ trn2 v3.8b, v3.8b, v4.8b
+ trn1 v4.4h, v6.4h, v2.4h // P1, P5
+ trn1 v7.4h, v1.4h, v3.4h // P2, P6
+ trn2 v2.4h, v6.4h, v2.4h // P3, P7
+ trn2 v1.4h, v1.4h, v3.4h // P4, P8
+ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5
+ uxtl v6.8h, v7.8b // P2, P6
+ uxtl v7.8h, v2.8b // P3, P7
+ uxtl v1.8h, v1.8b // P4, P8
+ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6
+ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7
+ uxtl v4.8h, v4.8b // P1, P5
+ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
+ mov d6, v6.d[1] // P6
+ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
+ mov d4, v4.d[1] // P5
+ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4
+ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5
+ sub v7.4h, v1.4h, v4.4h // P4-P5
+ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
+ srshr v3.8h, v3.8h, #3
+ abs v6.4h, v7.4h
+ sshr v7.4h, v7.4h, #8 // clip_sign
+ srshr v2.4h, v2.4h, #3
+ abs v3.8h, v3.8h // a1, a2
+ sshr v6.4h, v6.4h, #1 // clip
+ mov d16, v3.d[1] // a2
+ abs v17.4h, v2.4h // a0
+ cmeq v18.4h, v6.4h, #0 // test clip == 0
+ sshr v2.4h, v2.4h, #8 // a0_sign
+ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2
+ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq
+ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign
+ bsl v19.8b, v16.8b, v3.8b // a3
+ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq
+ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
+ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
+ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
+ mov w2, v5.s[1] // move to gp reg
+ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ cmhs v5.4h, v0.4h, v6.4h
+ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered
+ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip)
+ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ sqxtun v3.8b, v4.8h
+ sqxtun v2.8b, v1.8h
+ st2 {v2.b, v3.b}[0], [x0], x1
+ st2 {v2.b, v3.b}[1], [x0], x1
+ st2 {v2.b, v3.b}[2], [x0], x1
+ st2 {v2.b, v3.b}[3], [x0]
+1: ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of lower block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
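+//
+// Same algorithm as ff_vc1_v_loop_filter4_neon above, vectorised across 8
+// columns. The filter/no-filter decision is taken once per group of 4
+// columns: the cmtst against the 0x0000ffff00000000 mask broadcasts the
+// decisive pair's "not filtered" flag to the rest of its group.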
+function ff_vc1_v_loop_filter8_neon, export=1
+ sub x3, x0, w1, sxtw #2
+ ldr d0, .Lcoeffs
+ ld1 {v1.8b}, [x0], x1 // P5
+ movi v2.2d, #0x0000ffff00000000
+ ld1 {v3.8b}, [x3], x1 // P1
+ ld1 {v4.8b}, [x3], x1 // P2
+ ld1 {v5.8b}, [x0], x1 // P6
+ ld1 {v6.8b}, [x3], x1 // P3
+ ld1 {v7.8b}, [x0], x1 // P7
+ ushll v16.8h, v1.8b, #1 // 2*P5
+ ushll v3.8h, v3.8b, #1 // 2*P1
+ ld1 {v17.8b}, [x3] // P4
+ uxtl v4.8h, v4.8b // P2
+ ld1 {v18.8b}, [x0] // P8
+ uxtl v5.8h, v5.8b // P6
+ dup v19.8h, w2 // pq
+ uxtl v20.8h, v6.8b // P3
+ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2
+ uxtl v4.8h, v7.8b // P7
+ ushll v6.8h, v6.8b, #1 // 2*P3
+ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6
+ uxtl v7.8h, v17.8b // P4
+ uxtl v17.8h, v18.8b // P8
+ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7
+ uxtl v1.8h, v1.8b // P5
+ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3
+ sub v4.8h, v7.8h, v1.8h // P4-P5
+ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4
+ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
+ abs v17.8h, v4.8h
+ sshr v4.8h, v4.8h, #8 // clip_sign
+ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
+ sshr v17.8h, v17.8h, #1 // clip
+ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5
+ srshr v16.8h, v16.8h, #3
+ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
+ cmeq v5.8h, v17.8h, #0 // test clip == 0
+ srshr v3.8h, v3.8h, #3
+ abs v16.8h, v16.8h // a2
+ abs v3.8h, v3.8h // a1
+ srshr v6.8h, v6.8h, #3
+ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2
+ abs v20.8h, v6.8h // a0
+ sshr v6.8h, v6.8h, #8 // a0_sign
+ bsl v18.16b, v16.16b, v3.16b // a3
+ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq
+ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign
+ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0
+ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq
+ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
+ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
+ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
+ mov w0, v5.s[1] // move to gp reg
+ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ mov w2, v5.s[3]
+ orr v2.16b, v3.16b, v2.16b
+ cmhs v3.8h, v0.8h, v17.8h
+ and w0, w0, w2
+ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
+ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case
+ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered
+ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ sqxtun v0.8b, v7.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3]
+1: ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of right block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter8_neon, export=1
+ sub x3, x0, #4 // where to start reading
+ ldr d0, .Lcoeffs
+ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
+ sub x0, x0, #1 // where to start writing
+ ld1 {v2.8b}, [x3], x1
+ add x4, x0, x1, lsl #2
+ ld1 {v3.8b}, [x3], x1
+ ld1 {v4.8b}, [x3], x1
+ ld1 {v5.8b}, [x3], x1
+ ld1 {v6.8b}, [x3], x1
+ ld1 {v7.8b}, [x3], x1
+ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
+ ld1 {v17.8b}, [x3]
+ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
+ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
+ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
+ dup v4.8h, w2 // pq
+ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
+ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
+ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
+ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
+ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
+ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
+ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
+ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
+ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
+ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
+ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
+ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
+ trn1 v7.2s, v6.2s, v3.2s // P1
+ trn1 v18.2s, v19.2s, v16.2s // P2
+ trn2 v3.2s, v6.2s, v3.2s // P5
+ trn2 v6.2s, v19.2s, v16.2s // P6
+ trn1 v16.2s, v2.2s, v17.2s // P3
+ trn2 v2.2s, v2.2s, v17.2s // P7
+ ushll v7.8h, v7.8b, #1 // 2*P1
+ trn1 v17.2s, v1.2s, v5.2s // P4
+ ushll v19.8h, v3.8b, #1 // 2*P5
+ trn2 v1.2s, v1.2s, v5.2s // P8
+ uxtl v5.8h, v18.8b // P2
+ uxtl v6.8h, v6.8b // P6
+ uxtl v18.8h, v16.8b // P3
+ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2
+ uxtl v2.8h, v2.8b // P7
+ ushll v5.8h, v16.8b, #1 // 2*P3
+ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6
+ uxtl v16.8h, v17.8b // P4
+ uxtl v1.8h, v1.8b // P8
+ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7
+ uxtl v2.8h, v3.8b // P5
+ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3
+ sub v3.8h, v16.8h, v2.8h // P4-P5
+ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4
+ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
+ abs v1.8h, v3.8h
+ sshr v3.8h, v3.8h, #8 // clip_sign
+ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
+ sshr v1.8h, v1.8h, #1 // clip
+ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5
+ srshr v17.8h, v19.8h, #3
+ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
+ cmeq v6.8h, v1.8h, #0 // test clip == 0
+ srshr v7.8h, v7.8h, #3
+ abs v17.8h, v17.8h // a2
+ abs v7.8h, v7.8h // a1
+ srshr v5.8h, v5.8h, #3
+ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2
+ abs v19.8h, v5.8h // a0
+ sshr v5.8h, v5.8h, #8 // a0_sign
+ bsl v18.16b, v17.16b, v7.16b // a3
+ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq
+ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign
+ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0
+ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq
+ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
+ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0
+ mov w2, v5.s[1] // move to gp reg
+ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ mov w3, v5.s[3]
+ cmhs v5.8h, v0.8h, v1.8h
+ and w5, w2, w3
+ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip)
+ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case
+ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ sqxtun v1.8b, v2.8h
+ sqxtun v0.8b, v16.8h
+ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so
+ st2 {v0.b, v1.b}[0], [x0], x1
+ st2 {v0.b, v1.b}[1], [x0], x1
+ st2 {v0.b, v1.b}[2], [x0], x1
+ st2 {v0.b, v1.b}[3], [x0]
+1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so
+ st2 {v0.b, v1.b}[4], [x4], x1
+ st2 {v0.b, v1.b}[5], [x4], x1
+ st2 {v0.b, v1.b}[6], [x4], x1
+ st2 {v0.b, v1.b}[7], [x4]
+2: ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of lower block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter16_neon, export=1
+ sub x3, x0, w1, sxtw #2
+ ldr d0, .Lcoeffs
+ ld1 {v1.16b}, [x0], x1 // P5
+ movi v2.2d, #0x0000ffff00000000
+ ld1 {v3.16b}, [x3], x1 // P1
+ ld1 {v4.16b}, [x3], x1 // P2
+ ld1 {v5.16b}, [x0], x1 // P6
+ ld1 {v6.16b}, [x3], x1 // P3
+ ld1 {v7.16b}, [x0], x1 // P7
+ ushll v16.8h, v1.8b, #1 // 2*P5[0..7]
+ ushll v17.8h, v3.8b, #1 // 2*P1[0..7]
+ ld1 {v18.16b}, [x3] // P4
+ uxtl v19.8h, v4.8b // P2[0..7]
+ ld1 {v20.16b}, [x0] // P8
+ uxtl v21.8h, v5.8b // P6[0..7]
+ dup v22.8h, w2 // pq
+ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15]
+ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
+ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15]
+ uxtl2 v4.8h, v4.16b // P2[8..15]
+ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
+ uxtl2 v5.8h, v5.16b // P6[8..15]
+ uxtl v23.8h, v6.8b // P3[0..7]
+ uxtl v24.8h, v7.8b // P7[0..7]
+ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
+ ushll v4.8h, v6.8b, #1 // 2*P3[0..7]
+ uxtl v25.8h, v18.8b // P4[0..7]
+ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
+ uxtl2 v26.8h, v6.16b // P3[8..15]
+ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+ uxtl2 v7.8h, v7.16b // P7[8..15]
+ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15]
+ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+ uxtl2 v18.8h, v18.16b // P4[8..15]
+ uxtl v23.8h, v20.8b // P8[0..7]
+ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
+ uxtl v24.8h, v1.8b // P5[0..7]
+ uxtl2 v20.8h, v20.16b // P8[8..15]
+ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+ uxtl2 v1.8h, v1.16b // P5[8..15]
+ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7]
+ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15]
+ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
+ abs v27.8h, v26.8h
+ sshr v26.8h, v26.8h, #8 // clip_sign[0..7]
+ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+ abs v28.8h, v7.8h
+ sshr v27.8h, v27.8h, #1 // clip[0..7]
+ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+ sshr v7.8h, v7.8h, #8 // clip_sign[8..15]
+ sshr v23.8h, v28.8h, #1 // clip[8..15]
+ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0
+ srshr v17.8h, v17.8h, #3
+ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0
+ srshr v16.8h, v16.8h, #3
+ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+ abs v17.8h, v17.8h // a1[0..7]
+ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+ srshr v3.8h, v3.8h, #3
+ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+ abs v16.8h, v16.8h // a2[0..7]
+ srshr v19.8h, v19.8h, #3
+ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7]
+ abs v3.8h, v3.8h // a1[8..15]
+ srshr v4.8h, v4.8h, #3
+ abs v19.8h, v19.8h // a2[8..15]
+ bsl v5.16b, v16.16b, v17.16b // a3[0..7]
+ srshr v6.8h, v6.8h, #3
+ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15]
+ abs v17.8h, v4.8h // a0[0..7]
+ sshr v4.8h, v4.8h, #8 // a0_sign[0..7]
+ bsl v16.16b, v19.16b, v3.16b // a3[8..15]
+ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ abs v19.8h, v6.8h // a0[8..15]
+ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq
+ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7]
+ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7]
+ sshr v6.8h, v6.8h, #8 // a0_sign[8..15]
+ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
+ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq
+ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15]
+ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15]
+ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
+ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
+ mov w0, v5.s[1] // move to gp reg
+ cmhs v19.8h, v3.8h, v27.8h
+ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+ mov w2, v5.s[3]
+ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+ orr v16.16b, v20.16b, v17.16b
+ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
+ cmtst v2.2d, v5.2d, v2.2d
+ cmhs v3.8h, v0.8h, v23.8h
+ mov w4, v5.s[1]
+ mov w5, v5.s[3]
+ and w0, w0, w2
+ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+ orr v2.16b, v7.16b, v2.16b
+ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
+ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
+ and w2, w4, w5
+ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
+ and w0, w0, w2
+ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
+ sqxtun v2.8b, v25.8h
+ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case
+ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
+ sqxtun v0.8b, v24.8h
+ sqxtun2 v2.16b, v18.8h
+ sqxtun2 v0.16b, v1.8h
+ st1 {v2.16b}, [x3], x1
+ st1 {v0.16b}, [x3]
+1: ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+// x0 -> top-left pel of right block
+// x1 = row stride, bytes
+// w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter16_neon, export=1
+ sub x3, x0, #4 // where to start reading
+ ldr d0, .Lcoeffs
+ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
+ sub x0, x0, #1 // where to start writing
+ ld1 {v2.8b}, [x3], x1
+ add x4, x0, x1, lsl #3
+ ld1 {v3.8b}, [x3], x1
+ add x5, x0, x1, lsl #2
+ ld1 {v4.8b}, [x3], x1
+ add x6, x4, x1, lsl #2
+ ld1 {v5.8b}, [x3], x1
+ ld1 {v6.8b}, [x3], x1
+ ld1 {v7.8b}, [x3], x1
+ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
+ ld1 {v17.8b}, [x3], x1
+ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
+ ld1 {v2.8b}, [x3], x1
+ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
+ ld1 {v19.8b}, [x3], x1
+ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
+ ld1 {v4.8b}, [x3], x1
+ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
+ ld1 {v21.8b}, [x3], x1
+ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
+ ld1 {v6.8b}, [x3], x1
+ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
+ ld1 {v23.8b}, [x3], x1
+ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
+ ld1 {v17.8b}, [x3], x1
+ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]...
+ ld1 {v25.8b}, [x3]
+ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]...
+ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
+ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]...
+ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]...
+ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
+ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
+ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]...
+ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]...
+ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
+ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]...
+ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]...
+ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]...
+ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]...
+ trn1 v31.2s, v19.2s, v27.2s // P1[0..7]
+ trn2 v19.2s, v19.2s, v27.2s // P5[0..7]
+ trn1 v27.2s, v21.2s, v23.2s // P2[0..7]
+ trn2 v21.2s, v21.2s, v23.2s // P6[0..7]
+ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]...
+ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
+ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]...
+ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
+ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]...
+ trn1 v24.2s, v29.2s, v23.2s // P1[8..15]
+ trn2 v23.2s, v29.2s, v23.2s // P5[8..15]
+ trn1 v26.2s, v25.2s, v18.2s // P2[8..15]
+ trn2 v18.2s, v25.2s, v18.2s // P6[8..15]
+ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]...
+ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
+ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
+ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]...
+ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]...
+ ushll v5.8h, v31.8b, #1 // 2*P1[0..7]
+ ushll v6.8h, v19.8b, #1 // 2*P5[0..7]
+ trn1 v7.2s, v16.2s, v20.2s // P3[0..7]
+ uxtl v17.8h, v27.8b // P2[0..7]
+ trn2 v16.2s, v16.2s, v20.2s // P7[0..7]
+ uxtl v20.8h, v21.8b // P6[0..7]
+ trn1 v21.2s, v22.2s, v25.2s // P3[8..15]
+ ushll v24.8h, v24.8b, #1 // 2*P1[8..15]
+ trn2 v22.2s, v22.2s, v25.2s // P7[8..15]
+ ushll v25.8h, v23.8b, #1 // 2*P5[8..15]
+ trn1 v27.2s, v1.2s, v3.2s // P4[0..7]
+ uxtl v26.8h, v26.8b // P2[8..15]
+ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
+ uxtl v17.8h, v18.8b // P6[8..15]
+ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
+ trn1 v18.2s, v2.2s, v4.2s // P4[8..15]
+ uxtl v28.8h, v7.8b // P3[0..7]
+ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
+ uxtl v16.8h, v16.8b // P7[0..7]
+ uxtl v26.8h, v21.8b // P3[8..15]
+ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
+ uxtl v22.8h, v22.8b // P7[8..15]
+ ushll v7.8h, v7.8b, #1 // 2*P3[0..7]
+ uxtl v27.8h, v27.8b // P4[0..7]
+ trn2 v1.2s, v1.2s, v3.2s // P8[0..7]
+ ushll v3.8h, v21.8b, #1 // 2*P3[8..15]
+ trn2 v2.2s, v2.2s, v4.2s // P8[8..15]
+ uxtl v4.8h, v18.8b // P4[8..15]
+ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+ uxtl v1.8h, v1.8b // P8[0..7]
+ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+ uxtl v2.8h, v2.8b // P8[8..15]
+ uxtl v16.8h, v19.8b // P5[0..7]
+ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+ uxtl v18.8h, v23.8b // P5[8..15]
+ dup v19.8h, w2 // pq
+ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7]
+ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15]
+ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
+ abs v23.8h, v21.8h
+ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
+ abs v26.8h, v22.8h
+ sshr v21.8h, v21.8h, #8 // clip_sign[0..7]
+ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+ sshr v23.8h, v23.8h, #1 // clip[0..7]
+ sshr v26.8h, v26.8h, #1 // clip[8..15]
+ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+ sshr v1.8h, v22.8h, #8 // clip_sign[8..15]
+ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0
+ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0
+ srshr v5.8h, v5.8h, #3
+ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+ srshr v2.8h, v6.8h, #3
+ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+ srshr v6.8h, v24.8h, #3
+ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+ abs v5.8h, v5.8h // a1[0..7]
+ srshr v24.8h, v25.8h, #3
+ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+ abs v2.8h, v2.8h // a2[0..7]
+ abs v6.8h, v6.8h // a1[8..15]
+ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+ abs v17.8h, v24.8h // a2[8..15]
+ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7]
+ srshr v3.8h, v3.8h, #3
+ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8..15]
+ srshr v7.8h, v7.8h, #3
+ bsl v20.16b, v2.16b, v5.16b // a3[0..7]
+ abs v2.8h, v3.8h // a0[8..15]
+ sshr v3.8h, v3.8h, #8 // a0_sign[8..15]
+ bsl v24.16b, v17.16b, v6.16b // a3[8..15]
+ abs v5.8h, v7.8h // a0[0..7]
+ sshr v6.8h, v7.8h, #8 // a0_sign[0..7]
+ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq
+ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15]
+ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15]
+ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq
+ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
+ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7]
+ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7]
+ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
+ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+ mov w7, v2.s[1]
+ mov w8, v2.s[3]
+ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+ mov w2, v5.s[1] // move to gp reg
+ cmhs v2.8h, v3.8h, v26.8h
+ mov w3, v5.s[3]
+ cmhs v5.8h, v0.8h, v23.8h
+ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
+ and w9, w7, w8
+ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
+ and w10, w2, w3
+ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+ and w9, w10, w9
+ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
+ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case
+ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
+ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
+ sqxtun v2.8b, v4.8h
+ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
+ sqxtun v0.8b, v27.8h
+ sqxtun v1.8b, v16.8h
+ sqxtun v3.8b, v18.8h
+ tbnz w2, #0, 1f
+ st2 {v0.b, v1.b}[0], [x0], x1
+ st2 {v0.b, v1.b}[1], [x0], x1
+ st2 {v0.b, v1.b}[2], [x0], x1
+ st2 {v0.b, v1.b}[3], [x0]
+1: tbnz w3, #0, 2f
+ st2 {v0.b, v1.b}[4], [x5], x1
+ st2 {v0.b, v1.b}[5], [x5], x1
+ st2 {v0.b, v1.b}[6], [x5], x1
+ st2 {v0.b, v1.b}[7], [x5]
+2: tbnz w7, #0, 3f
+ st2 {v2.b, v3.b}[0], [x4], x1
+ st2 {v2.b, v3.b}[1], [x4], x1
+ st2 {v2.b, v3.b}[2], [x4], x1
+ st2 {v2.b, v3.b}[3], [x4]
+3: tbnz w8, #0, 4f
+ st2 {v2.b, v3.b}[4], [x6], x1
+ st2 {v2.b, v3.b}[5], [x6], x1
+ st2 {v2.b, v3.b}[6], [x6], x1
+ st2 {v2.b, v3.b}[7], [x6]
+4: ret
+endfunc
+
+// Copy at most the specified number of bytes from source to destination buffer,
+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
+// On entry:
+// x0 -> source buffer
+// w1 = max number of bytes to copy
+// x2 -> destination buffer, optimally 8-byte aligned
+// On exit:
+// w0 = number of bytes not copied
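+//
+// The vector test below encodes the VC-1 escape rule (a 0x03 byte is
+// stuffed after 00 00 whenever the next byte would otherwise be 00..03):
+// a 32-bit little-endian window w starts an escape sequence iff
+//   (w & ~0x03000000) == 0x00030000
+// i.e. bytes 00 00 03 followed by a byte in 00..03, which is what the
+// BIC v20 / EOR v21 / CMEQ sequences check at every byte alignment across
+// each 32-byte chunk.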
+function ff_vc1_unescape_buffer_helper_neon, export=1
+ // Offset by 80 to screen out cases that are too short for us to handle,
+ // and also make it easy to test for loop termination, or to determine
+ // whether we need an odd number of half-iterations of the loop.
+ subs w1, w1, #80
+ b.mi 90f
+
+ // Set up useful constants
+ movi v20.4s, #3, lsl #24
+ movi v21.4s, #3, lsl #16
+
+ tst w1, #32
+ b.ne 1f
+
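+ // The copy loop is software-pipelined in 32-byte half-iterations that
+ // alternate between registers v0-v2 and v3-v5; the tst above selects the
+ // entry point so the total number of half-iterations has the correct
+ // parity for the requested length.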
+ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ ext v25.16b, v0.16b, v1.16b, #1
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #3
+ ext v29.16b, v1.16b, v2.16b, #1
+ ext v30.16b, v1.16b, v2.16b, #2
+ ext v31.16b, v1.16b, v2.16b, #3
+ bic v24.16b, v0.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v1.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ add w1, w1, #32
+ b 3f
+
+1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
+ ext v25.16b, v3.16b, v4.16b, #1
+ ext v26.16b, v3.16b, v4.16b, #2
+ ext v27.16b, v3.16b, v4.16b, #3
+ ext v29.16b, v4.16b, v5.16b, #1
+ ext v30.16b, v4.16b, v5.16b, #2
+ ext v31.16b, v4.16b, v5.16b, #3
+ bic v24.16b, v3.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v4.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ // Drop through...
+2: mov v0.16b, v5.16b
+ ld1 {v1.16b, v2.16b}, [x0], #32
+ cmeq v28.4s, v28.4s, #0
+ cmeq v29.4s, v29.4s, #0
+ cmeq v30.4s, v30.4s, #0
+ cmeq v31.4s, v31.4s, #0
+ orr v24.16b, v24.16b, v25.16b
+ orr v26.16b, v26.16b, v27.16b
+ orr v28.16b, v28.16b, v29.16b
+ orr v30.16b, v30.16b, v31.16b
+ ext v25.16b, v0.16b, v1.16b, #1
+ orr v22.16b, v24.16b, v26.16b
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #3
+ ext v29.16b, v1.16b, v2.16b, #1
+ orr v23.16b, v28.16b, v30.16b
+ ext v30.16b, v1.16b, v2.16b, #2
+ ext v31.16b, v1.16b, v2.16b, #3
+ bic v24.16b, v0.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ orr v22.16b, v22.16b, v23.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v1.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ addv s22, v22.4s
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ mov w3, v22.s[0]
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ cbnz w3, 90f
+ st1 {v3.16b, v4.16b}, [x2], #32
+3: mov v3.16b, v2.16b
+ ld1 {v4.16b, v5.16b}, [x0], #32
+ cmeq v28.4s, v28.4s, #0
+ cmeq v29.4s, v29.4s, #0
+ cmeq v30.4s, v30.4s, #0
+ cmeq v31.4s, v31.4s, #0
+ orr v24.16b, v24.16b, v25.16b
+ orr v26.16b, v26.16b, v27.16b
+ orr v28.16b, v28.16b, v29.16b
+ orr v30.16b, v30.16b, v31.16b
+ ext v25.16b, v3.16b, v4.16b, #1
+ orr v22.16b, v24.16b, v26.16b
+ ext v26.16b, v3.16b, v4.16b, #2
+ ext v27.16b, v3.16b, v4.16b, #3
+ ext v29.16b, v4.16b, v5.16b, #1
+ orr v23.16b, v28.16b, v30.16b
+ ext v30.16b, v4.16b, v5.16b, #2
+ ext v31.16b, v4.16b, v5.16b, #3
+ bic v24.16b, v3.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ orr v22.16b, v22.16b, v23.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v4.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ addv s22, v22.4s
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ mov w3, v22.s[0]
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ cbnz w3, 91f
+ st1 {v0.16b, v1.16b}, [x2], #32
+ subs w1, w1, #64
+ b.pl 2b
+
+90: add w0, w1, #80
+ ret
+
+91: sub w1, w1, #32
+ b 90b
+endfunc
diff --git a/libavcodec/aasc.c b/libavcodec/aasc.c
index 86cb9e85a1..26570f49e5 100644
--- a/libavcodec/aasc.c
+++ b/libavcodec/aasc.c
@@ -104,26 +104,26 @@ static int aasc_decode_frame(AVCodecContext *avctx,
ff_msrle_decode(avctx, s->frame, 8, &s->gb);
break;
case MKTAG('A', 'A', 'S', 'C'):
- switch (compr) {
- case 0:
- stride = (avctx->width * psize + psize) & ~psize;
- if (buf_size < stride * avctx->height)
- return AVERROR_INVALIDDATA;
- for (i = avctx->height - 1; i >= 0; i--) {
- memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
- buf += stride;
- buf_size -= stride;
- }
- break;
- case 1:
- bytestream2_init(&s->gb, buf, buf_size);
- ff_msrle_decode(avctx, s->frame, 8, &s->gb);
- break;
- default:
- av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+ switch (compr) {
+ case 0:
+ stride = (avctx->width * psize + psize) & ~psize;
+ if (buf_size < stride * avctx->height)
return AVERROR_INVALIDDATA;
+ for (i = avctx->height - 1; i >= 0; i--) {
+ memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
+ buf += stride;
+ buf_size -= stride;
}
break;
+ case 1:
+ bytestream2_init(&s->gb, buf, buf_size);
+ ff_msrle_decode(avctx, s->frame, 8, &s->gb);
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+ return AVERROR_INVALIDDATA;
+ }
+ break;
default:
av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
return -1;
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index 1fb900ecb1..e358f8d9e3 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -74,7 +74,6 @@
#define AC3_DYNAMIC_RANGE1 0
typedef int INTFLOAT;
-typedef unsigned int UINTFLOAT;
typedef int16_t SHORTFLOAT;
#else /* USE_FIXED */
@@ -94,7 +93,6 @@ typedef int16_t SHORTFLOAT;
#define AC3_DYNAMIC_RANGE1 1.0f
typedef float INTFLOAT;
-typedef float UINTFLOAT;
typedef float SHORTFLOAT;
#endif /* USE_FIXED */
diff --git a/libavcodec/alacdsp.c b/libavcodec/alacdsp.c
index b3c1c424f3..9996eb4319 100644
--- a/libavcodec/alacdsp.c
+++ b/libavcodec/alacdsp.c
@@ -29,12 +29,12 @@ static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
int i;
for (i = 0; i < nb_samples; i++) {
- uint32_t a, b;
+ int32_t a, b;
a = buffer[0][i];
b = buffer[1][i];
- a -= (int)(b * decorr_left_weight) >> decorr_shift;
+ a -= (b * decorr_left_weight) >> decorr_shift;
b += a;
buffer[0][i] = b;
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 2e9a3581de..d9571b437f 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -153,6 +153,7 @@ extern AVCodec ff_hap_decoder;
extern AVCodec ff_hevc_decoder;
extern AVCodec ff_hevc_qsv_decoder;
extern AVCodec ff_hevc_rkmpp_decoder;
+extern AVCodec ff_hevc_rpi_decoder;
extern AVCodec ff_hevc_v4l2m2m_decoder;
extern AVCodec ff_hnm4_video_decoder;
extern AVCodec ff_hq_hqa_decoder;
@@ -917,6 +918,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
}
}
+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
+{
+ const enum AVPixelFormat *pf = p->pix_fmts;
+
+ // Assume good if we lack info
+ if (pf == NULL)
+ return 1;
+ if (fmt == AV_PIX_FMT_NONE)
+ return 0;
+
+ for (; *pf != AV_PIX_FMT_NONE; ++pf) {
+ if (*pf == fmt)
+ return 1;
+ }
+ return 0;
+}
+
+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
+{
+ const AVCodec *p, *experimental = NULL;
+ void *i = 0;
+
+    id = remap_deprecated_codec_id(id);
+ while ((p = av_codec_iterate(&i))) {
+ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
+ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
+ experimental = p;
+ } else
+ return (AVCodec *)p;
+ }
+ }
+ return (AVCodec *)experimental;
+}
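+
+// Example use (illustrative only; "par" and "fmt" stand for caller state):
+//
+//     const AVCodec *codec = avcodec_find_decoder_by_id_and_fmt(par->codec_id, fmt);
+//     if (!codec)
+//         codec = avcodec_find_decoder(par->codec_id); // normal fallback lookup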
+
static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
{
const AVCodec *p, *experimental = NULL;
diff --git a/libavcodec/alsdec.c b/libavcodec/alsdec.c
index 917e7b6264..e736905a76 100644
--- a/libavcodec/alsdec.c
+++ b/libavcodec/alsdec.c
@@ -1017,7 +1017,7 @@ static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
*bd->shift_lsbs = 0;
- if (get_bits_left(gb) < 7)
+ if (get_bits_left(gb) < 1)
return AVERROR_INVALIDDATA;
// read block type flag and read the samples accordingly
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index de5627ad02..4684e40a46 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -102,7 +102,7 @@ typedef struct APEFilter {
int16_t *historybuffer; ///< filter memory
int16_t *delay; ///< filtered values
- uint32_t avg;
+ int avg;
} APEFilter;
typedef struct APERice {
@@ -930,7 +930,7 @@ static av_always_inline int filter_3800(APEPredictor *p,
p->coeffsB[filter][0] += (((d3 >> 29) & 4) - 2) * sign;
p->coeffsB[filter][1] -= (((d4 >> 30) & 2) - 1) * sign;
- p->filterB[filter] = p->lastA[filter] + (unsigned)(predictionB >> shift);
+ p->filterB[filter] = p->lastA[filter] + (predictionB >> shift);
p->filterA[filter] = p->filterB[filter] + (unsigned)((int)(p->filterA[filter] * 31U) >> 5);
return p->filterA[filter];
@@ -955,7 +955,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift, int len
dotprod += delay[j] * (unsigned)coeffs[j];
coeffs[j] += ((delay[j] >> 31) | 1) * sign;
}
- buffer[i] -= (unsigned)(dotprod >> shift);
+ buffer[i] -= dotprod >> shift;
for (j = 0; j < order - 1; j++)
delay[j] = delay[j + 1];
delay[order - 1] = buffer[i];
@@ -1088,13 +1088,13 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
const int delayA)
{
int32_t predictionA, sign;
- uint32_t d0, d1, d2, d3;
+ int32_t d0, d1, d2, d3;
p->buf[delayA] = p->lastA[filter];
d0 = p->buf[delayA ];
- d1 = p->buf[delayA ] - (unsigned)p->buf[delayA - 1];
- d2 = p->buf[delayA - 1] - (unsigned)p->buf[delayA - 2];
- d3 = p->buf[delayA - 2] - (unsigned)p->buf[delayA - 3];
+ d1 = p->buf[delayA ] - p->buf[delayA - 1];
+ d2 = p->buf[delayA - 1] - p->buf[delayA - 2];
+ d3 = p->buf[delayA - 2] - p->buf[delayA - 3];
predictionA = d0 * p->coeffsA[filter][0] +
d1 * p->coeffsA[filter][1] +
@@ -1105,10 +1105,10 @@ static av_always_inline int predictor_update_3930(APEPredictor *p,
p->filterA[filter] = p->lastA[filter] + ((int)(p->filterA[filter] * 31U) >> 5);
sign = APESIGN(decoded);
- p->coeffsA[filter][0] += (((int32_t)d0 < 0) * 2 - 1) * sign;
- p->coeffsA[filter][1] += (((int32_t)d1 < 0) * 2 - 1) * sign;
- p->coeffsA[filter][2] += (((int32_t)d2 < 0) * 2 - 1) * sign;
- p->coeffsA[filter][3] += (((int32_t)d3 < 0) * 2 - 1) * sign;
+ p->coeffsA[filter][0] += ((d0 < 0) * 2 - 1) * sign;
+ p->coeffsA[filter][1] += ((d1 < 0) * 2 - 1) * sign;
+ p->coeffsA[filter][2] += ((d2 < 0) * 2 - 1) * sign;
+ p->coeffsA[filter][3] += ((d3 < 0) * 2 - 1) * sign;
return p->filterA[filter];
}
@@ -1587,7 +1587,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
for (ch = 0; ch < s->channels; ch++) {
sample8 = (uint8_t *)frame->data[ch];
for (i = 0; i < blockstodecode; i++)
- *sample8++ = (s->decoded[ch][i] + 0x80U) & 0xff;
+ *sample8++ = (s->decoded[ch][i] + 0x80) & 0xff;
}
break;
case 16:
diff --git a/libavcodec/argo.c b/libavcodec/argo.c
index 8f58e682f6..f633ec2691 100644
--- a/libavcodec/argo.c
+++ b/libavcodec/argo.c
@@ -608,9 +608,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
uint32_t chunk;
int ret;
- if (avpkt->size < 4)
- return AVERROR_INVALIDDATA;
-
bytestream2_init(gb, avpkt->data, avpkt->size);
if ((ret = ff_reget_buffer(avctx, frame, 0)) < 0)
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index c4ab93aeeb..cd926f7b33 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -39,6 +39,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
+ arm/rpi_hevcpred_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
@@ -137,10 +139,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
arm/hevcdsp_idct_neon.o \
arm/hevcdsp_qpel_neon.o \
arm/hevcdsp_sao_neon.o
+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \
+ arm/rpi_hevc_misc_neon.o \
+ arm/rpi_hevcdsp_deblock_neon.o \
+ arm/rpi_hevcdsp_idct_neon.o \
+ arm/rpi_hevcdsp_res8_neon.o \
+ arm/rpi_hevcdsp_res16_neon.o \
+ arm/rpi_hevcdsp_sao_neon.o \
+ arm/rpi_hevcpred_init_neon.o \
+ arm/rpi_hevcpred_intra_angular_neon.o \
+ arm/rpi_hevcpred_intra_dc_neon.o \
+ arm/rpi_hevcpred_intra_filter_neon.o \
+ arm/rpi_hevcpred_intra_hv_neon.o \
+ arm/rpi_hevcpred_intra_planar_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b45e..4755f20e2e 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,83 +26,209 @@
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
+
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
- uint8_t *const state)
+ uint8_t *state)
{
- int bit;
- void *reg_b, *reg_c, *tmp;
+ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
+ int bit, ptr, low, tmp1, tmp2;
+ __asm__ volatile (
+ "ldr %[bit], [%[c], %[range_off]] \n\t"
+ "ldrb %[ptr], [%[state]] \n\t"
+ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp2], %[bit], #0xc0 \n\t"
+ "add %[tmp1], %[tmp1], %[ptr] \n\t"
+ "ldr %[low], [%[c], %[low_off]] \n\t"
+ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t"
+ "sub %[bit], %[bit], %[tmp2] \n\t"
+ "mov %[tmp1], %[bit] \n\t"
+ "cmp %[low], %[bit], lsl #17 \n\t"
+ "itt ge \n\t"
+ "movge %[tmp1], %[tmp2] \n\t"
+ "mvnge %[ptr], %[ptr] \n\t"
+ "clz %[tmp2], %[tmp1] \n\t"
+ "it ge \n\t"
+ "subge %[low], %[low], %[bit], lsl #17 \n\t"
+ "sub %[tmp2], %[tmp2], #23 \n\t"
+ "and %[bit], %[ptr], #1 \n\t"
+ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
+ "lsl %[low], %[low], %[tmp2] \n\t"
+ "lsls %[ptr], %[low], #16 \n\t"
+ "bne 1f \n\t"
+ "ldr %[ptr], [%[c], %[ptr_off]] \n\t"
+ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "strb %[mlps_tables], [%[state]] \n\t"
+ "rbit %[state], %[low] \n\t"
+ "ldrh %[tmp1], [%[ptr]], #2 \n\t"
+#else
+ "ldr %[tmp1], [%[c], %[end_off]] \n\t"
+ "strb %[mlps_tables], [%[state]] \n\t"
+ "rbit %[state], %[low] \n\t"
+ "cmp %[tmp1], %[ptr] \n\t"
+#if CONFIG_THUMB
+ "it cs \n\t"
+ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t"
+#else
+ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t"
+#endif
+#endif
+ "clz %[state], %[state] \n\t"
+ "movw %[mlps_tables], #0xffff \n\t"
+ "sub %[state], %[state], #16 \n\t"
+ "str %[tmp2], [%[c], %[range_off]] \n\t"
+ "rev %[tmp1], %[tmp1] \n\t"
+ "str %[ptr], [%[c], %[ptr_off]] \n\t"
+ "lsr %[tmp1], %[tmp1], #15 \n\t"
+ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp1], %[tmp1], %[state] \n\t"
+ "add %[low], %[low], %[tmp1] \n\t"
+#else
+ "add %[low], %[low], %[tmp1], lsl %[state] \n\t"
+#endif
+ "str %[low], [%[c], %[low_off]] \n\t"
+ "b 2f \n\t"
+ "1: \n\t"
+ "strb %[mlps_tables], [%[state]] \n\t"
+ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t"
+ "str %[low], [%[c], %[low_off]] \n\t"
+ "str %[tmp1], [%[c], %[range_off]] \n\t"
+ "2: \n\t"
+ : // Outputs
+ [state]"+r"(state),
+ [mlps_tables]"+r"(mlps_tables),
+ [bit]"=&r"(bit),
+ [ptr]"=&r"(ptr),
+ [low]"=&r"(low),
+ [tmp1]"=&r"(tmp1),
+ [tmp2]"=&r"(tmp2)
+ : // Inputs
+ [c]"r"(c),
+ [low_off]"J"(offsetof(CABACContext, low)),
+ [range_off]"J"(offsetof(CABACContext, range)),
+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
+ [end_off]"J"(offsetof(CABACContext, bytestream_end)),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : // Clobbers
+ "cc", "memory"
+ );
+ return bit;
+}
- __asm__ volatile(
- "ldrb %[bit] , [%[state]] \n\t"
- "add %[r_b] , %[tables] , %[lps_off] \n\t"
- "mov %[tmp] , %[range] \n\t"
- "and %[range] , %[range] , #0xC0 \n\t"
- "add %[r_b] , %[r_b] , %[bit] \n\t"
- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
- "add %[r_b] , %[tables] , %[norm_off] \n\t"
- "sub %[r_c] , %[tmp] , %[range] \n\t"
- "lsl %[tmp] , %[r_c] , #17 \n\t"
- "cmp %[tmp] , %[low] \n\t"
- "it gt \n\t"
- "movgt %[range] , %[r_c] \n\t"
- "itt cc \n\t"
- "mvncc %[bit] , %[bit] \n\t"
- "subcc %[low] , %[low] , %[tmp] \n\t"
- "add %[r_c] , %[tables] , %[mlps_off] \n\t"
- "ldrb %[tmp] , [%[r_b], %[range]] \n\t"
- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
- "lsl %[low] , %[low] , %[tmp] \n\t"
- "lsl %[range] , %[range] , %[tmp] \n\t"
- "uxth %[r_c] , %[low] \n\t"
- "strb %[r_b] , [%[state]] \n\t"
- "tst %[r_c] , %[r_c] \n\t"
- "bne 2f \n\t"
- "ldr %[r_c] , [%[c], %[byte]] \n\t"
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+ uint32_t low = c->low, range, ptr, tmp;
+ int rv;
+ __asm volatile (
+ "ldr %[range] , [%[c], %[range_off]] \n\t"
+ "mov %[rv] , #0 \n\t"
+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
+ "lsl %[low] , #1 \n\t"
+#if !UNCHECKED_BITSTREAM_READER
+ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
+#endif
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "itt cs \n\t"
+ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
+ "movcs %[rv] , #1 \n\t"
#if UNCHECKED_BITSTREAM_READER
- "ldrh %[tmp] , [%[r_c]] \n\t"
- "add %[r_c] , %[r_c] , #2 \n\t"
- "str %[r_c] , [%[c], %[byte]] \n\t"
+ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
+#else
+ "cmp %[tmp] , %[ptr] \n\t"
+#if CONFIG_THUMB
+ "it cs \n\t"
+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
#else
- "ldr %[r_b] , [%[c], %[end]] \n\t"
- "ldrh %[tmp] , [%[r_c]] \n\t"
- "cmp %[r_c] , %[r_b] \n\t"
- "itt lt \n\t"
- "addlt %[r_c] , %[r_c] , #2 \n\t"
- "strlt %[r_c] , [%[c], %[byte]] \n\t"
+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
+#endif
#endif
- "sub %[r_c] , %[low] , #1 \n\t"
- "add %[r_b] , %[tables] , %[norm_off] \n\t"
- "eor %[r_c] , %[low] , %[r_c] \n\t"
- "rev %[tmp] , %[tmp] \n\t"
- "lsr %[r_c] , %[r_c] , #15 \n\t"
- "lsr %[tmp] , %[tmp] , #15 \n\t"
- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
- "movw %[r_b] , #0xFFFF \n\t"
- "sub %[tmp] , %[tmp] , %[r_b] \n\t"
- "rsb %[r_c] , %[r_c] , #7 \n\t"
- "lsl %[tmp] , %[tmp] , %[r_c] \n\t"
- "add %[low] , %[low] , %[tmp] \n\t"
- "2: \n\t"
- : [bit]"=&r"(bit),
- [low]"+&r"(c->low),
- [range]"+&r"(c->range),
- [r_b]"=&r"(reg_b),
- [r_c]"=&r"(reg_c),
- [tmp]"=&r"(tmp)
- : [c]"r"(c),
- [state]"r"(state),
- [tables]"r"(ff_h264_cabac_tables),
- [byte]"M"(offsetof(CABACContext, bytestream)),
- [end]"M"(offsetof(CABACContext, bytestream_end)),
- [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
- [lps_off]"I"(H264_LPS_RANGE_OFFSET),
- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
- : "memory", "cc"
- );
+ "lsls %[range] , %[low], #16 \n\t"
+ "bne 1f \n\t"
- return bit & 1;
+ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , 0xFFFF \n\t"
+ "sub %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ "str %[low] , [%[c], %[low_off]] \n\t"
+ : // Outputs
+ [rv]"=&r"(rv),
+ [low]"+r"(low),
+ [range]"=&r"(range),
+ [ptr]"=&r"(ptr),
+ [tmp]"=&r"(tmp)
+ : // Inputs
+ [c]"r"(c),
+ [low_off]"J"(offsetof(CABACContext, low)),
+ [range_off]"J"(offsetof(CABACContext, range)),
+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
+ [end_off]"J"(offsetof(CABACContext, bytestream_end))
+ : // Clobbers
+ "memory", "cc"
+ );
+ return rv;
}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+ uint32_t low = c->low, range, ptr, tmp;
+ __asm volatile (
+ "ldr %[range] , [%[c], %[range_off]] \n\t"
+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
+ "lsl %[low] , #1 \n\t"
+#if !UNCHECKED_BITSTREAM_READER
+ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
+#endif
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "it cs \n\t"
+ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
+ "it cc \n\t"
+ "rsbcc %[rv] , %[rv], #0 \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
+#else
+ "cmp %[tmp] , %[ptr] \n\t"
+#if CONFIG_THUMB
+ "it cs \n\t"
+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
+#else
+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
+#endif
+#endif
+ "lsls %[range] , %[low], #16 \n\t"
+ "bne 1f \n\t"
+
+ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
+ "movw %[tmp] , 0xFFFF \n\t"
+ "sub %[low] , %[tmp] \n\t"
+ "1: \n\t"
+ "str %[low] , [%[c], %[low_off]] \n\t"
+ : // Outputs
+ [rv]"+r"(rv),
+ [low]"+r"(low),
+ [range]"=&r"(range),
+ [ptr]"=&r"(ptr),
+ [tmp]"=&r"(tmp)
+ : // Inputs
+ [c]"r"(c),
+ [low_off]"J"(offsetof(CABACContext, low)),
+ [range_off]"J"(offsetof(CABACContext, range)),
+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
+ [end_off]"J"(offsetof(CABACContext, bytestream_end))
+ : // Clobbers
+ "memory", "cc"
+ );
+ return rv;
+}
+
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h
new file mode 100644
index 0000000000..c88dec6eff
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_cabac.h
@@ -0,0 +1,607 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+ unsigned int n;
+ __asm__ (
+ "rev %[n], %[x] \n\t"
+ : [n]"=r"(n)
+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+ :
+ );
+ return n << (bits & 7);
+}
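+
+// For reference, a portable C equivalent (sketch only, not used by the
+// build): a big-endian 32-bit load at the byte containing bit "bits",
+// shifted left so that bit becomes the MSB.
+//
+// static inline uint32_t hevc_mem_bits32_ref(const void *p, const unsigned int bits)
+// {
+//     const uint8_t *b = (const uint8_t *)p + (bits >> 3);
+//     uint32_t v = ((uint32_t)b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+//     return v << (bits & 7);
+// }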
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about / won't use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ int rv;
+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+ __asm__ (
+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
+ : [rv]"=r"(rv)
+ : [t]"r"(t)
+ :
+ );
+ return rv;
+}
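+
+// In C terms (sketch only): the SSAT above signed-saturates (t >> 1) to
+// 16 bits in a single instruction, i.e. the equivalent of:
+//
+//     t >>= 1;
+//     return t < -0x8000 ? -0x8000 : t > 0x7fff ? 0x7fff : t;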
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ int t = last_coeff_abs_level_remaining << 1;
+ __asm__ (
+ "lsrs %[t], %[t], %[shift] \n\t"
+
+ "it eq \n\t"
+ "subeq %[stat], %[stat], #1 \n\t"
+ "cmp %[t], #6 \n\t"
+ "adc %[stat], %[stat], #0 \n\t"
+ "usat %[stat], #8, %[stat] \n\t"
+ : [stat]"+r"(*stat_coeff),
+ [t]"+r"(t)
+ : [shift]"r"(c_rice_param)
+ : "cc"
+ );
+}
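+
+// A direct C reading of the flag logic above (illustrative sketch):
+//
+//     unsigned int t = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+//     int s = *stat_coeff + (t >= 6) - (t == 0);   // at most one term fires
+//     *stat_coeff = s < 0 ? 0 : s > 255 ? 255 : s; // the USAT #8 clamp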
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i, reg_b, st, tmp, bit, rv;
+ __asm__ (
+ "mov %[i] , #0 \n\t"
+ "mov %[rv] , #0 \n\t"
+ "1: \n\t"
+ "add %[i] , %[i] , #1 \n\t"
+ "cmp %[rv] , #0 \n\t"
+ "ite eq \n\t"
+ "usateq %[st] , #2 , %[i] \n\t"
+ "movne %[st] , #0 \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "add %[r_b] , %[r_b] , %[bit] \n\t"
+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "movge %[range] , %[tmp] \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+
+ "clz %[tmp] , %[range] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "and %[bit] , %[bit] , #1 \n\t"
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "it ne \n\t"
+ "cmpne %[n] , %[i] \n\t"
+ "bne 1b \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+ "tst %[tmp] , %[tmp] \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "rbit %[bit] , %[low] \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "clz %[bit] , %[bit] \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "sub %[bit] , %[bit] , #16 \n\t"
+ "cmp %[n] , %[i] \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
+#endif
+
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+r"(c->low),
+ [range]"+r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+r"(c->bytestream),
+ [i]"=&r"(i),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [rv]"=&r"(rv)
+ : [state0]"r"(state0),
+ [n]"r"(n),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+ return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t * ctx_map,
+ uint8_t * p)
+{
+ unsigned int reg_b, tmp, st, bit;
+ __asm__ (
+// Get bin from map
+#if CONFIG_THUMB
+ "add %[ctx_map] , %[n] \n\t"
+ "ldrb %[st] , [%[ctx_map]] \n\t"
+#else
+ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t"
+#endif
+ "1: \n\t"
+
+// Load state & ranges
+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
+ "and %[tmp] , %[range] , #0xC0 \n\t"
+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
+ "sub %[range] , %[range] , %[tmp] \n\t"
+
+ "cmp %[low] , %[range], lsl #17 \n\t"
+ "ittt ge \n\t"
+ "mvnge %[bit] , %[bit] \n\t"
+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
+ "movge %[range] , %[tmp] \n\t"
+
+// Renorm
+ "clz %[tmp] , %[range] \n\t"
+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
+ "sub %[tmp] , #23 \n\t"
+ "strb %[r_b] , [%[state0], %[st]] \n\t"
+ "tst %[bit] , #1 \n\t"
+ "ldrb %[st] , [%[ctx_map], #-1]! \n\t"
+ "lsl %[low] , %[low] , %[tmp] \n\t"
+// GCC asm seems to need strbne written differently for Thumb and ARM
+#if CONFIG_THUMB
+ "it ne \n\t"
+ "strbne %[n] , [%[idx]] , #1 \n\t"
+#else
+ "strneb %[n] , [%[idx]] , #1 \n\t"
+#endif
+
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+ "subs %[n] , %[n] , #1 \n\t"
+ "lsl %[range] , %[range] , %[tmp] \n\t"
+#if CONFIG_THUMB
+ "itt ne \n\t"
+ "lslsne %[tmp] , %[low] , #16 \n\t"
+#else
+ "lslnes %[tmp] , %[low] , #16 \n\t"
+#endif
+ "bne 1b \n\t"
+
+// If we have bits left then n must be 0 so give up now
+ "lsls %[tmp] , %[low] , #16 \n\t"
+ "bne 2f \n\t"
+
+// Do reload
+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
+ "rbit %[bit] , %[low] \n\t"
+ "movw %[r_b] , #0xFFFF \n\t"
+ "clz %[bit] , %[bit] \n\t"
+ "cmp %[n] , #0 \n\t"
+ "rev %[tmp] , %[tmp] \n\t"
+ "sub %[bit] , %[bit] , #16 \n\t"
+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
+
+#if CONFIG_THUMB
+ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
+ "add %[low] , %[low] , %[tmp] \n\t"
+#else
+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
+#endif
+
+// Check to see if we still have more to do
+ "bne 1b \n\t"
+ "2: \n\t"
+ : [bit]"=&r"(bit),
+ [low]"+r"(c->low),
+ [range]"+r"(c->range),
+ [r_b]"=&r"(reg_b),
+ [bptr]"+r"(c->bytestream),
+ [idx]"+r"(p),
+ [n]"+r"(n),
+ [tmp]"=&r"(tmp),
+ [st]"=&r"(st),
+ [ctx_map]"+r"(ctx_map)
+ : [state0]"r"(state0),
+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+ : "memory", "cc"
+ );
+
+ return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+
+
+#define get_cabac_by22_start get_cabac_by22_start_arm
+static inline void get_cabac_by22_start_arm(CABACContext * const c)
+{
+ const uint8_t *ptr = c->bytestream;
+ register uint32_t low __asm__("r1"), range __asm__("r2");
+ uint32_t m, range8, bits;
+#if !USE_BY22_DIV
+ uintptr_t inv;
+#endif
+
+ av_assert2(offsetof (CABACContext, low) == 0);
+ av_assert2(offsetof (CABACContext, range) == 4);
+ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2);
+ __asm__ volatile (
+ "ldmia %[c], {%[low], %[range]} \n\t"
+ : // Outputs
+ [low]"=r"(low),
+ [range]"=r"(range)
+ : // Inputs
+ [c]"r"(c)
+ : // Clobbers
+ );
+#if !USE_BY22_DIV
+ inv = (uintptr_t)cabac_by22_inv_range;
+#endif
+ __asm__ volatile (
+ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t"
+#if !USE_BY22_DIV
+ "uxtb %[range8], %[range] \n\t"
+#endif
+ "rbit %[bits], %[low] \n\t"
+ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
+ "clz %[bits], %[bits] \n\t"
+ "str %[ptr], [%[c], %[ptr_off]] \n\t"
+ "rev %[m], %[m] \n\t"
+ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t"
+ "eor %[m], %[m], #0x80000000 \n\t"
+#if !USE_BY22_DIV
+ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t"
+ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t"
+ "str %[range], [%[c], %[bits_off]] \n\t"
+#else
+ "strh %[bits], [%[c], %[bits_off]] \n\t"
+#endif
+#if CONFIG_THUMB
+ "lsr %[m], %[ptr] \n\t"
+ "eor %[range], %[low], %[m] \n\t"
+#else
+ "eor %[range], %[low], %[m], lsr %[ptr] \n\t"
+#endif
+ : // Outputs
+ [ptr]"+&r"(ptr),
+ [low]"+&r"(low),
+ [range]"+&r"(range),
+#if !USE_BY22_DIV
+ [inv]"+&r"(inv),
+#endif
+ [m]"=&r"(m),
+ [range8]"=&r"(range8),
+ [bits]"=&r"(bits)
+ : // Inputs
+ [c]"r"(c),
+ [bits_off]"J"(offsetof (CABACContext, by22.bits)),
+ [ptr_off]"J"(offsetof (CABACContext, bytestream))
+ : // Clobbers
+ "memory"
+ );
+ c->low = range;
+#if !USE_BY22_DIV
+ c->range = inv;
+#endif
+}
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+ uint32_t rv = c->low &~ 1, tmp;
+ __asm__ (
+ "cmp %[inv] , #0 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+ : // Outputs
+ [rv]"+r"(rv),
+ [tmp]"=r"(tmp)
+ : // Inputs
+ [inv]"r"(c->range)
+ : // Clobbers
+ "cc"
+ );
+ return rv << 1;
+}
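+
+// Equivalent C (sketch): when the precomputed inverse (stashed in c->range by
+// get_cabac_by22_start_arm above on the !USE_BY22_DIV path) is non-zero, take
+// the high 32 bits of the 64-bit product:
+//
+//     uint32_t rv = c->low & ~1U;
+//     if (c->range != 0)
+//         rv = (uint32_t)(((uint64_t)c->range * rv) >> 32);
+//     return rv << 1;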
+
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
+{
+ uint32_t bits, ptr, tmp1, tmp2;
+ __asm__ volatile (
+ "ldrh %[bits], [%[cc], %[bits_off]] \n\t"
+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
+ "rsb %[tmp1], %[n], #32 \n\t"
+ "add %[bits], %[bits], %[n] \n\t"
+ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t"
+ "lsr %[tmp1], %[val], %[tmp1] \n\t"
+ "ldr %[val], [%[cc], %[low_off]] \n\t"
+#if CONFIG_THUMB
+ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t"
+ "ldr %[ptr], [%[ptr]] \n\t"
+#else
+ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t"
+#endif
+ "mul %[tmp1], %[tmp2], %[tmp1] \n\t"
+ "and %[tmp2], %[bits], #7 \n\t"
+ "strh %[bits], [%[cc], %[bits_off]] \n\t"
+ "rev %[ptr], %[ptr] \n\t"
+ "lsl %[tmp1], %[tmp1], #23 \n\t"
+#if CONFIG_THUMB
+ "lsl %[val], %[n] \n\t"
+ "sub %[val], %[tmp1] \n\t"
+#else
+ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t"
+#endif
+ "lsl %[ptr], %[ptr], %[tmp2] \n\t"
+ "orr %[val], %[val], %[ptr], lsr #9 \n\t"
+ "str %[val], [%[cc], %[low_off]] \n\t"
+ : // Outputs
+ [val]"+r"(val),
+ [bits]"=&r"(bits),
+ [ptr]"=&r"(ptr),
+ [tmp1]"=&r"(tmp1),
+ [tmp2]"=&r"(tmp2)
+ : // Inputs
+ [cc]"r"(c),
+ [n]"r"(n),
+ [bits_off]"J"(offsetof(CABACContext, by22.bits)),
+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
+ [range_off]"J"(offsetof(CABACContext, by22.range)),
+ [low_off]"J"(offsetof(CABACContext, low))
+ : // Clobbers
+ "memory"
+ );
+}
+
+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
+{
+ uint32_t last_coeff_abs_level_remaining;
+ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
+ __asm__ volatile (
+ "ldr %[remain], [%[cc], %[low_off]] \n\t"
+ "ldr %[prefix], [%[cc], %[range_off]] \n\t"
+ "bic %[remain], %[remain], #1 \n\t"
+ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
+ "cmp %[prefix], #0 \n\t"
+ "it ne \n\t"
+ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t"
+ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t"
+ "lsl %[remain], %[remain], #1 \n\t"
+ "mvn %[prefix], %[remain] \n\t"
+ "clz %[prefix], %[prefix] \n\t"
+ "rsbs %[n1], %[prefix], #2 \n\t"
+ "bcc 1f \n\t"
+ "adc %[n1], %[rice], %[prefix] \n\t"
+ "add %[tmp2], %[tmp2], %[n1] \n\t"
+ "rsb %[n2], %[n1], #32 \n\t"
+ "and %[tmp1], %[tmp2], #7 \n\t"
+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
+ "lsr %[tmp2], %[tmp2], #3 \n\t"
+ "lsr %[n2], %[remain], %[n2] \n\t"
+ "mul %[n2], %[range], %[n2] \n\t"
+ "ldr %[range], [%[cc], %[low_off]] \n\t"
+ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t"
+ "rsb %[tmp2], %[rice], #31 \n\t"
+ "lsl %[remain], %[remain], %[prefix] \n\t"
+ "lsl %[n2], %[n2], #23 \n\t"
+#if CONFIG_THUMB
+ "lsl %[range], %[n1] \n\t"
+ "sub %[range], %[n2] \n\t"
+#else
+ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t"
+#endif
+ "rev %[ptr], %[ptr] \n\t"
+ "lsl %[n2], %[prefix], %[rice] \n\t"
+#if CONFIG_THUMB
+ "lsr %[remain], %[tmp2] \n\t"
+ "add %[remain], %[n2] \n\t"
+#else
+ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t"
+#endif
+ "b 3f \n\t"
+ "1: \n\t"
+ "add %[n2], %[rice], %[prefix], lsl #1 \n\t"
+ "cmp %[n2], %[peek_bits_plus_2] \n\t"
+ "bhi 2f \n\t"
+ "sub %[n1], %[n2], #2 \n\t"
+ "add %[tmp2], %[tmp2], %[n1] \n\t"
+ "rsb %[n2], %[n1], #32 \n\t"
+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
+ "lsr %[tmp1], %[tmp2], #3 \n\t"
+ "lsr %[n2], %[remain], %[n2] \n\t"
+ "mul %[n2], %[range], %[n2] \n\t"
+ "rsb %[range], %[rice], #34 \n\t"
+ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t"
+ "and %[tmp1], %[tmp2], #7 \n\t"
+ "lsl %[remain], %[remain], %[prefix] \n\t"
+ "ldr %[tmp2], [%[cc], %[low_off]] \n\t"
+ "rsb %[prefix], %[prefix], %[range] \n\t"
+ "orr %[remain], %[remain], #0x80000000 \n\t"
+ "rev %[ptr], %[ptr] \n\t"
+ "lsl %[n2], %[n2], #23 \n\t"
+ "mov %[range], #2 \n\t"
+#if CONFIG_THUMB
+ "lsl %[tmp2], %[n1] \n\t"
+ "sub %[tmp2], %[n2] \n\t"
+#else
+ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t"
+#endif
+ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
+ "lsl %[rice], %[range], %[rice] \n\t"
+ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t"
+#if CONFIG_THUMB
+ "lsr %[remain], %[prefix] \n\t"
+ "add %[remain], %[rice] \n\t"
+#else
+ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
+#endif
+ "b 4f \n\t"
+ "2: \n\t"
+ "add %[n1], %[tmp2], %[prefix] \n\t"
+#if CONFIG_THUMB
+ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t"
+ "ldr %[tmp2], [%[tmp2]] \n\t"
+#else
+ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t"
+#endif
+ "rsb %[tmp1], %[prefix], #32 \n\t"
+ "push {%[rice]} \n\t"
+ "and %[rice], %[n1], #7 \n\t"
+ "lsr %[tmp1], %[remain], %[tmp1] \n\t"
+ "ldr %[ptr], [%[cc], %[low_off]] \n\t"
+ "mul %[remain], %[range], %[tmp1] \n\t"
+ "rev %[tmp2], %[tmp2] \n\t"
+ "rsb %[n2], %[prefix], %[n2] \n\t"
+ "ldr %[tmp1], [%[cc], %[range_off]] \n\t"
+ "lsl %[rice], %[tmp2], %[rice] \n\t"
+ "sub %[tmp2], %[n2], #2 \n\t"
+ "lsl %[remain], %[remain], #23 \n\t"
+#if CONFIG_THUMB
+ "lsl %[ptr], %[prefix] \n\t"
+ "rsb %[remain], %[ptr] \n\t"
+#else
+ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t"
+#endif
+ "orr %[remain], %[remain], %[rice], lsr #9 \n\t"
+ "add %[prefix], %[n1], %[tmp2] \n\t"
+ "bic %[n1], %[remain], #1 \n\t"
+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
+ "cmp %[tmp1], #0 \n\t"
+ "rsb %[rice], %[tmp2], #32 \n\t"
+ "it ne \n\t"
+ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t"
+ "and %[tmp1], %[prefix], #7 \n\t"
+#if CONFIG_THUMB
+ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t"
+ "ldr %[ptr], [%[ptr]] \n\t"
+#else
+ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t"
+#endif
+ "lsl %[n1], %[n1], #1 \n\t"
+ "lsr %[rice], %[n1], %[rice] \n\t"
+ "rsb %[n2], %[n2], #34 \n\t"
+ "mul %[range], %[range], %[rice] \n\t"
+ "pop {%[rice]} \n\t"
+ "rev %[ptr], %[ptr] \n\t"
+ "orr %[n1], %[n1], #0x80000000 \n\t"
+ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t"
+ "mov %[prefix], #2 \n\t"
+ "lsl %[range], %[range], #23 \n\t"
+#if CONFIG_THUMB
+ "lsl %[remain], %[tmp2] \n\t"
+ "rsb %[range], %[remain] \n\t"
+#else
+ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t"
+#endif
+ "lsl %[remain], %[prefix], %[rice] \n\t"
+#if CONFIG_THUMB
+ "lsr %[n1], %[n2] \n\t"
+ "add %[remain], %[n1] \n\t"
+#else
+ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t"
+#endif
+ "3: \n\t"
+ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
+ "orr %[range], %[range], %[ptr], lsr #9 \n\t"
+ "4: \n\t"
+ "str %[range], [%[cc], %[low_off]] \n\t"
+ : // Outputs
+ [remain]"=&r"(last_coeff_abs_level_remaining),
+ [rice]"+r"(rice_param),
+ [prefix]"=&r"(prefix),
+ [n1]"=&r"(n1),
+ [range]"=&r"(range),
+ [n2]"=&r"(n2),
+ [ptr]"=&r"(ptr),
+ [tmp1]"=&r"(tmp1),
+ [tmp2]"=&r"(tmp2)
+ : // Inputs
+ [cc]"r"(c),
+ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
+ [low_off]"J"(offsetof(CABACContext, low)),
+ [range_off]"J"(offsetof(CABACContext, range)),
+ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
+ [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
+ [ptr_off]"J"(offsetof(CABACContext, bytestream))
+ : // Clobbers
+ "cc", "memory"
+ );
+ return last_coeff_abs_level_remaining;
+}
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
new file mode 100644
index 0000000000..978b7b6947
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
@@ -0,0 +1,183 @@
+/*
+ * ARM NEON optimised IDCT functions for HEVC decoding
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+@ Included multiple times from hevc_idct_neon.S
+@ Macros defined there
+
+#define DC_SHIFT (15 - BIT_DEPTH)
+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
+#define TRN_SHIFT (20 - BIT_DEPTH)
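+
+@ For BIT_DEPTH == 8 these evaluate to DC_SHIFT == 7, TRN_SHIFT == 12 and
+@ DC_ADD == 0x41, which folds both rounding terms of the DC path into a
+@ single (c + 1 + 64) >> 7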
+
+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r2, r0, #32
+ mov r3, #64
+ add r1, #DC_ADD
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vst1.16 {q8, q9}, [r0], r3
+ vst1.16 {q8, q9}, [r2], r3
+ vst1.16 {q8, q9}, [r0]
+ vst1.16 {q8, q9}, [r2]
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r2, r0, #32
+ mov r3, #64
+ add r1, #DC_ADD
+ mov ip, #16*16
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+1: vst1.16 {q8, q9}, [r0], r3
+ subs ip, ip, #32
+ vst1.16 {q8, q9}, [r2], r3
+ bhi 1b
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
+ ldrsh r1, [r0]
+ add r2, r0, #32
+ mov r3, #64
+ add r1, #DC_ADD
+ mov ip, #32*32
+ asr r1, #DC_SHIFT
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+1: vst1.16 {q8, q9}, [r0], r3
+ subs ip, ip, #32
+ vst1.16 {q8, q9}, [r2], r3
+ bhi 1b
+ bx lr
+endfunc
+
+
+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
+ vldr.i32 s0, =0x00240053 // 36 and 83
+ vld1.16 {q14, q15}, [r0 :256] // coeffs
+
+ tr4_shift #7
+
+ vzip.16 d28, d29
+ vzip.16 d30, d31
+ vzip.32 q14, q15
+
+ tr4_shift #TRN_SHIFT
+
+ vst4.16 {q14, q15}, [r0 :256]
+ bx lr
+
+ .ltorg
+endfunc
+
+
+
+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
+ vmov.i32 d0, #0x4a // 74
+ vld1.16 {q14, q15}, [r0 :256] // coeffs
+ vmov.i32 d1, #0x1d // 29
+ vmov.i32 d2, #0x37 // 55
+
+ tr4_luma_shift #7
+
+ vzip.16 d28, d29
+ vzip.16 d30, d31
+ vzip.32 q14, q15
+
+ tr4_luma_shift #TRN_SHIFT
+
+ vst4.16 {q14, q15}, [r0 :256]
+ bx lr
+endfunc
+
+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
+ add r2, r0, #16
+ adr r3, tr4f
+ vpush {d8-d15}
+ vld1.16 {d0, d1}, [r3]
+ mov r3, #32
+
+ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \
+ "sub r0, r0, #128-8", \
+ "sub r2, r2, #128-8", \
+ "cmp r1, #4"
+ ble 2f
+
+ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
+ "sub r0, r0, #128+8", \
+ "sub r2, r2, #128+8+16-32", \
+ "mov r3, #64"
+
+ vzip.16 d16, d17
+ vzip.16 d18, d19
+
+ vzip.16 d20, d21
+ vzip.16 d22, d23
+ vzip.16 d28, d29
+ vzip.16 d30, d31
+ vzip.32 q10, q11
+ vzip.32 q14, q15
+1:
+ vzip.16 d24, d25
+ vzip.16 d26, d27
+ vzip.32 q8, q9
+ vzip.32 q12, q13
+
+ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT
+ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
+
+ vpop {d8-d15}
+ bx lr
+
+2: vmov.i64 q10, #0
+ sub r0, r0, #8
+ vmov.i64 q11, #0
+ sub r2, r2, #8+16-32
+ vmov.i64 q14, #0
+ mov r3, #64
+ vmov.i64 q15, #0
+
+ vzip.16 d16, d17
+ vzip.16 d18, d19
+
+ b 1b
+
+endfunc
+
+#undef DC_SHIFT
+#undef DC_ADD
+#undef TRN_SHIFT
+
diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S
new file mode 100644
index 0000000000..161bb0d7c9
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_misc_neon.S
@@ -0,0 +1,267 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Written by John Cox, Ben Avison
+*/
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ rpi_zap_coeff_vals_neon(
+@ uint16_t * buf, [r0]
+@ unsigned int log_n_m2) [r1]
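+@
+@ e.g. (illustrative): log_n_m2 == 0/1/2/3 clears a 4x4/8x8/16x16/32x32 block
+@ of int16_t coefficients (32/128/512/2048 bytes respectively)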
+
+function rpi_zap_coeff_vals_neon, export=1
+ mov ip, #1
+ vmov.i64 q0, #0
+ teq r1, #0
+ vmov.i64 q1, #0
+ beq 2f
+
+ lsl ip, r1 @ 2, 4 or 8
+ add r2, r0, #32
+ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero
+ mov r3, #64
+1: vst1.8 {q0,q1}, [r0:256], r3
+ subs ip, #2
+ vst1.8 {q0,q1}, [r2:256], r3
+ bne 1b
+ bx lr
+
+2: vst1.8 {q0,q1}, [r0:256]
+ bx lr
+endfunc
+
+@ PIC jump tables are more expensive than absolute for A32 code
+.set jent_pic, CONFIG_PIC || CONFIG_THUMB
+
+@ Jump table entry - if in Thumb mode the bottom bit must be set
+@ ? There is probably a real asm instruction to do this but I haven't found it
+.macro jent lab
+.if jent_pic
+T .short ((0 + \lab) - (0 + 98b)) / 2
+A .short (0 + \lab) - (4 + 98b)
+.else
+T .word 1 + \lab
+A .word \lab
+.endif
+.endm
+
+.set expected_next, 0
+
+.macro cpy_compound val, p1, p2, drop_thru=0
+.if \p1 + \p2 != \val
+.error "Bad addition! \p1 + \p2 != \val"
+.endif
+.if expected_next != 0 && expected_next != \val
+.error "Drop thru failure"
+.endif
+\val\():
+ push {r0-r3}
+ bl 100\p1\()b
+ pop {r0-r3}
+ add r0, #\p1
+ add r2, #\p1
+.if \drop_thru == 0
+ b \p2\()b
+.set expected_next, 0
+.else
+.set expected_next, \p2
+.endif
+.endm
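+
+@ e.g. (illustrative): "cpy_compound 24, 16, 8" emits a label 24: that calls
+@ the 16-wide copier at 10016b as a subroutine, advances dst (r0) and src (r2)
+@ by 16, then tail-branches to the 8-wide loop at 8b for the remainder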
+
+@ ff_hevc_cpy_blks8x4_neon(
+@ dst [r0]
+@ dst_stride [r1]
+@ src [r2]
+@ src_stride [r3]
+@ width [sp, #0] (bytes)
+@ height) [sp, #4]
+@
+@ Power of 2 widths are directly coded, all others are done in stripes
+@ We expect the vast majority of calls to be power of 2
+@
+@ Currently has min width of 8, but we could make that 4 without issue
+@ Min height is 4
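+@
+@ Scalar model of the contract (sketch only):
+@   for (y = 0; y != height; ++y)
+@       memcpy(dst + y * dst_stride, src + y * src_stride, width);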
+
+function ff_hevc_rpi_cpy_blks8x4_neon, export=1
+ ldr r12, [sp, #0]
+ push {r11, lr}
+.if jent_pic
+A adr lr, 98f - 2
+.else
+A adr lr, 98f - 4
+.endif
+ lsr r12, #3
+ ldr r11, [sp, #(8 + 4)]
+.if jent_pic
+A lsl r12, #1
+A ldrsh lr, [lr, r12]
+A add pc, lr
+T tbh [pc, r12, lsl #1]
+.else
+ @ A32 only, Thumb is always PIC
+ ldr pc, [lr, r12, lsl #2]
+.endif
+
+98:
+T .short 0 @ unused
+ jent 8f
+ jent 16f
+ jent 24f
+ jent 32f
+ jent 40f
+ jent 48f
+ jent 56f
+ jent 64f
+ jent 72f
+ jent 80f
+ jent 88f
+ jent 96f
+ jent 104f
+ jent 112f
+ jent 120f
+ jent 128f
+
+1008:
+ push {r11, lr}
+8:
+ add lr, r2, r3
+ lsl r3, #1
+ add r12, r0, r1
+ lsl r1, #1
+1:
+ vld1.32 {d0 }, [r2], r3
+ vld1.32 {d1 }, [lr], r3
+ vld1.32 {d2 }, [r2], r3
+ vld1.32 {d3 }, [lr], r3
+ subs r11, #4
+ vst1.32 {d0 }, [r0], r1
+ vst1.32 {d1 }, [r12], r1
+ vst1.32 {d2 }, [r0], r1
+ vst1.32 {d3 }, [r12], r1
+ bgt 1b
+ pop {r11, pc}
+
+10016:
+ push {r11, lr}
+16:
+ add lr, r2, r3
+ lsl r3, #1
+ add r12, r0, r1
+ lsl r1, #1
+1:
+ vld1.32 {q0 }, [r2], r3
+ vld1.32 {q1 }, [lr], r3
+ vld1.32 {q2 }, [r2], r3
+ vld1.32 {q3 }, [lr], r3
+ subs r11, #4
+ vst1.32 {q0 }, [r0], r1
+ vst1.32 {q1 }, [r12], r1
+ vst1.32 {q2 }, [r0], r1
+ vst1.32 {q3 }, [r12], r1
+ bgt 1b
+ pop {r11, pc}
+
+10032:
+ push {r11, lr}
+32:
+ add lr, r2, r3
+ lsl r3, #1
+ add r12, r0, r1
+ lsl r1, #1
+1:
+ vld1.32 {q8, q9 }, [r2], r3
+ vld1.32 {q10, q11}, [lr], r3
+ vld1.32 {q12, q13}, [r2], r3
+ vld1.32 {q14, q15}, [lr], r3
+ subs r11, #4
+ vst1.32 {q8, q9 }, [r0], r1
+ vst1.32 {q10, q11}, [r12], r1
+ vst1.32 {q12, q13}, [r0], r1
+ vst1.32 {q14, q15}, [r12], r1
+ bgt 1b
+ pop {r11, pc}
+
+10064:
+ push {r11, lr}
+64:
+ add lr, r2, #32
+ add r12, r0, #32
+1:
+ vld1.32 {q8, q9 }, [r2], r3
+ vld1.32 {q10, q11}, [lr], r3
+ vld1.32 {q12, q13}, [r2], r3
+ vld1.32 {q14, q15}, [lr], r3
+ subs r11, #2
+ vst1.32 {q8, q9 }, [r0], r1
+ vst1.32 {q10, q11}, [r12], r1
+ vst1.32 {q12, q13}, [r0], r1
+ vst1.32 {q14, q15}, [r12], r1
+ bgt 1b
+ pop {r11, pc}
+
+128:
+ push {r4, r5}
+        @ We could do this with fewer registers if we jump around but I
+        @ have a primitive urge to load sequentially
+ mov r4, #64
+ add lr, r2, #32
+ add r12, r0, #32
+ sub r3, r4
+ sub r1, r4
+1:
+ vld1.32 {q8, q9 }, [r2], r4
+ vld1.32 {q10, q11}, [lr], r4
+ vld1.32 {q12, q13}, [r2], r3
+ vld1.32 {q14, q15}, [lr], r3
+ subs r11, #1
+ vst1.32 {q8, q9 }, [r0], r4
+ vst1.32 {q10, q11}, [r12], r4
+ vst1.32 {q12, q13}, [r0], r1
+ vst1.32 {q14, q15}, [r12], r1
+ bgt 1b
+ pop {r4, r5, r11, pc}
+
+@ Use drop_thru where we can
+cpy_compound 104, 64, 40, 1
+cpy_compound 40, 32, 8
+
+cpy_compound 112, 64, 48, 1
+cpy_compound 48, 32, 16
+
+cpy_compound 120, 64, 56, 1
+cpy_compound 56, 32, 24, 1
+cpy_compound 24, 16, 8
+
+cpy_compound 72, 64, 8
+cpy_compound 80, 64, 16
+cpy_compound 88, 64, 24
+cpy_compound 96, 64, 32
+
+
+endfunc
+
diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h
new file mode 100644
index 0000000000..9d21f6a882
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_misc_neon.h
@@ -0,0 +1,438 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
+#define AVCODEC_ARM_RPI_HEVC_MISC_H
+
+#include "config.h"
+#if HAVE_NEON_INLINE && !CONFIG_THUMB
+
+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
+ int pixel_shift, int height,
+ ptrdiff_t stride_src)
+{
+ const uint8_t *src2 = src + stride_src;
+ stride_src <<= 1;
+ switch (pixel_shift)
+ {
+ case 2:
+ __asm__ volatile (
+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
+ "subs %[height], #4 \n\t"
+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.32 {q0}, [%[dst]]! \n\t"
+ "beq 3f \n\t"
+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.32 {q1}, [%[dst]]! \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vst1.32 {q0}, [%[dst]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vst1.32 {q1}, [%[dst]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [src]"+r"(src),
+ [src2]"+r"(src2),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ case 1:
+ __asm__ volatile (
+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
+ "subs %[height], #4 \n\t"
+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t"
+ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t"
+ "vzip.16 d0, d1 \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.16 {d0}, [%[dst]]! \n\t"
+ "beq 3f \n\t"
+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "vzip.16 d2, d3 \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.16 {d2}, [%[dst]]! \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vzip.16 d0, d1 \n\t"
+ "vst1.16 {d0}, [%[dst]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vzip.16 d2, d3 \n\t"
+ "vst1.16 {d2}, [%[dst]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [src]"+r"(src),
+ [src2]"+r"(src2),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ default:
+ __asm__ volatile (
+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
+ "subs %[height], #8 \n\t"
+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t"
+ "vzip.8 d0, d1 \n\t"
+ "subs %[height], #8 \n\t"
+ "vst1.8 {d0}, [%[dst]]! \n\t"
+ "beq 3f \n\t"
+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
+ "vzip.8 d2, d3 \n\t"
+ "subs %[height], #8 \n\t"
+ "vst1.8 {d2}, [%[dst]]! \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vzip.8 d0, d1 \n\t"
+ "vst1.8 {d0}, [%[dst]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vzip.8 d2, d3 \n\t"
+ "vst1.8 {d2}, [%[dst]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [src]"+r"(src),
+ [src2]"+r"(src2),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ }
+}
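+
+// Scalar model of v2h (sketch only): gather one column of (1 << pixel_shift)-
+// byte pixels from a source with the caller's stride into a contiguous
+// destination:
+//
+//     const size_t sz = (size_t)1 << pixel_shift;
+//     for (int y = 0; y != height; ++y)
+//         memcpy(dst + y * sz, src + y * stride_src, sz);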
+
+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
+ int pixel_shift, int height,
+ ptrdiff_t stride_dst)
+{
+ uint8_t *dst2 = dst + stride_dst;
+ stride_dst <<= 1;
+ switch (pixel_shift)
+ {
+ case 2:
+ __asm__ volatile (
+ "subs %[height], #4 \n\t"
+ "vld1.32 {q0}, [%[src]]! \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.32 {q1}, [%[src]]! \n\t"
+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "beq 3f \n\t"
+ "vld1.32 {q0}, [%[src]]! \n\t"
+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.32 {d1[0]}, [%[dst]] \n\t"
+ "vst1.32 {d1[1]}, [%[dst2]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.32 {d3[0]}, [%[dst]] \n\t"
+ "vst1.32 {d3[1]}, [%[dst2]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [dst]"+r"(dst),
+ [dst2]"+r"(dst2),
+ [src]"+r"(src),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ case 1:
+ __asm__ volatile (
+ "subs %[height], #4 \n\t"
+ "vld1.16 {d0}, [%[src]]! \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.16 {d2}, [%[src]]! \n\t"
+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "beq 3f \n\t"
+ "vld1.16 {d0}, [%[src]]! \n\t"
+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #4 \n\t"
+ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.16 {d0[2]}, [%[dst]] \n\t"
+ "vst1.16 {d0[3]}, [%[dst2]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.16 {d2[2]}, [%[dst]] \n\t"
+ "vst1.16 {d2[3]}, [%[dst2]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [dst]"+r"(dst),
+ [dst2]"+r"(dst2),
+ [src]"+r"(src),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ default:
+ __asm__ volatile (
+ "subs %[height], #8 \n\t"
+ "vld1.8 {d0}, [%[src]]! \n\t"
+ "beq 2f \n\t"
+ "1: \n\t"
+ "vld1.8 {d2}, [%[src]]! \n\t"
+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #8 \n\t"
+ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
+ "beq 3f \n\t"
+ "vld1.8 {d0}, [%[src]]! \n\t"
+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t"
+ "subs %[height], #8 \n\t"
+ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "2: \n\t"
+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d0[6]}, [%[dst]] \n\t"
+ "vst1.8 {d0[7]}, [%[dst2]] \n\t"
+ "b 4f \n\t"
+ "3: \n\t"
+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
+ "vst1.8 {d2[6]}, [%[dst]] \n\t"
+ "vst1.8 {d2[7]}, [%[dst2]] \n\t"
+ "4: \n\t"
+ : // Outputs
+ [dst]"+r"(dst),
+ [dst2]"+r"(dst2),
+ [src]"+r"(src),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ }
+}
+
+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
+ int pixel_shift, int height,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
+{
+ int x, y;
+ switch (pixel_shift)
+ {
+ case 2:
+ __asm__ volatile (
+ "ldr %[x], [%[src]], %[stride_src] \n\t"
+ "ldr %[y], [%[src]], %[stride_src] \n\t"
+ "str %[x], [%[dst]], %[stride_dst] \n\t"
+ "sub %[height], #2 \n\t"
+ "1: \n\t"
+ "ldr %[x], [%[src]], %[stride_src] \n\t"
+ "str %[y], [%[dst]], %[stride_dst] \n\t"
+ "ldr %[y], [%[src]], %[stride_src] \n\t"
+ "subs %[height], #2 \n\t"
+ "str %[x], [%[dst]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "str %[y], [%[dst]] \n\t"
+ : // Outputs
+ [x]"=&r"(x),
+ [y]"=&r"(y),
+ [src]"+r"(src),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src),
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ case 1:
+ __asm__ volatile (
+ "ldrh %[x], [%[src]], %[stride_src] \n\t"
+ "ldrh %[y], [%[src]], %[stride_src] \n\t"
+ "strh %[x], [%[dst]], %[stride_dst] \n\t"
+ "sub %[height], #2 \n\t"
+ "1: \n\t"
+ "ldrh %[x], [%[src]], %[stride_src] \n\t"
+ "strh %[y], [%[dst]], %[stride_dst] \n\t"
+ "ldrh %[y], [%[src]], %[stride_src] \n\t"
+ "subs %[height], #2 \n\t"
+ "strh %[x], [%[dst]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "strh %[y], [%[dst]] \n\t"
+ : // Outputs
+ [x]"=&r"(x),
+ [y]"=&r"(y),
+ [src]"+r"(src),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src),
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ default:
+ __asm__ volatile (
+ "ldrb %[x], [%[src]], %[stride_src] \n\t"
+ "ldrb %[y], [%[src]], %[stride_src] \n\t"
+ "strb %[x], [%[dst]], %[stride_dst] \n\t"
+ "sub %[height], #2 \n\t"
+ "1: \n\t"
+ "ldrb %[x], [%[src]], %[stride_src] \n\t"
+ "strb %[y], [%[dst]], %[stride_dst] \n\t"
+ "ldrb %[y], [%[src]], %[stride_src] \n\t"
+ "subs %[height], #2 \n\t"
+ "strb %[x], [%[dst]], %[stride_dst] \n\t"
+ "bne 1b \n\t"
+ "strb %[y], [%[dst]] \n\t"
+ : // Outputs
+ [x]"=&r"(x),
+ [y]"=&r"(y),
+ [src]"+r"(src),
+ [dst]"+r"(dst),
+ [height]"+r"(height)
+ : // Inputs
+ [stride_src]"r"(stride_src),
+ [stride_dst]"r"(stride_dst)
+ : // Clobbers
+ "cc", "memory"
+ );
+ break;
+ }
+}
+
+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
+ int pixel_shift, int height,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
+{
+ if (stride_dst == 1 << pixel_shift)
+ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
+ else if (stride_src == 1 << pixel_shift)
+ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
+ else
+ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
+}
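+
+// For reference, a plain-C scalar equivalent of the three NEON paths above
+// (a documentation-only sketch, kept out of the build; the name is ours and
+// memcpy would need <string.h>):
+#if 0
+static void ff_hevc_rpi_copy_vert_c(uint8_t *dst, const uint8_t *src,
+                                    int pixel_shift, int height,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src)
+{
+    const size_t n = (size_t)1 << pixel_shift;  // bytes per pel: 1, 2 or 4
+    while (height-- > 0) {
+        memcpy(dst, src, n);                    // one pel per row
+        dst += stride_dst;
+        src += stride_src;
+    }
+}
+#endif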
+
+#endif /* HAVE_NEON_INLINE */
+
+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h
new file mode 100644
index 0000000000..325c26a49b
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_mv_arm.h
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Written by John Cox, Ben Avison
+*/
+
+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
+#define AVCODEC_ARM_RPI_HEVC_MV_H
+
+#if HAVE_ARMV6T2_INLINE
+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)
+{
+ MvXY r;
+ __asm__ (
+ "sadd16 %[r], %[a], %[b] \n\t"
+ : [r]"=r"(r)
+ : [a]"r"(a),
+ [b]"r"(b)
+ :
+ );
+ return r;
+}
+#define mvxy_add mvxy_add_arm
+#endif
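+
+// What the sadd16 above computes, as plain C (a documentation-only sketch;
+// assumes MvXY packs x in bits 0..15 and y in bits 16..31 as int16_t, and,
+// like sadd16, each halfword addition wraps rather than saturating):
+#if 0
+static inline MvXY mvxy_add_c(const MvXY a, const MvXY b)
+{
+    const uint16_t x = (uint16_t)((int16_t)a + (int16_t)b);
+    const uint16_t y = (uint16_t)((int16_t)(a >> 16) + (int16_t)(b >> 16));
+    return (MvXY)(x | ((uint32_t)y << 16));
+}
+#endif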
+
+#if HAVE_ARMV6T2_INLINE
+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)
+{
+ int t;
+ __asm__ (
+ "ssat %[td], #8, %[td] \n\t"
+ "ssat %[tb], #8, %[tb] \n\t"
+ "eor %[t], %[td], %[td], asr #31 \n\t"
+ "adds %[t], %[t], %[td], lsr #31 \n\t"
+ "asr %[t], #1 \n\t"
+ "add %[t], #0x4000 \n\t"
+ "it ne \n\t"
+ "sdivne %[t], %[t], %[td] \n\t"
+ "mov %[td], #32 \n\t"
+ "smlabb %[td], %[t], %[tb], %[td] \n\t"
+ "ssat %[td], #13, %[td], asr #6 \n\t"
+ "mov %[tb], #127 \n\t"
+ "smlatb %[t], %[xy], %[td], %[tb] \n\t"
+ "smlabb %[tb], %[xy], %[td], %[tb] \n\t"
+// This takes the sign of x & y for rounding at the "wrong" point
+// (i.e. after adding 127), but for the range of values (-1..-127)
+// where it does the wrong thing you get the right answer (0) anyway
+ "add %[t], %[t], %[t], lsr #31 \n\t"
+ "add %[xy], %[tb], %[tb], lsr #31 \n\t"
+ "ssat %[t], #16, %[t], asr #8 \n\t"
+ "ssat %[xy], #16, %[xy], asr #8 \n\t"
+ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t"
+ :
+ [t]"=&r"(t),
+ [xy]"+r"(xy),
+ [td]"+r"(td),
+ [tb]"+r"(tb)
+ :
+ :
+ "cc"
+ );
+ return xy;
+}
+#define mv_scale_xy mv_scale_xy_arm
+#endif
+#endif
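+
+// The assembler above follows the HEVC temporal MV scaling formula; a
+// plain-C sketch of the same computation (documentation only; av_clip*
+// and FFABS come from libavutil/common.h and, as in the assembler, td is
+// expected to be non-zero by the time the divide happens):
+#if 0
+static inline int32_t mv_scale_xy_c(int32_t xy, int td, int tb)
+{
+    int16_t x = (int16_t)xy, y = (int16_t)(xy >> 16);
+    td = av_clip_int8(td);
+    tb = av_clip_int8(tb);
+    const int tx    = (0x4000 + (FFABS(td) >> 1)) / td;
+    const int scale = av_clip((tb * tx + 32) >> 6, -4096, 4095);
+    x = av_clip_int16((scale * x + 127 + (scale * x < 0)) >> 8);
+    y = av_clip_int16((scale * y + 127 + (scale * y < 0)) >> 8);
+    return (int32_t)((uint16_t)x | ((uint32_t)(uint16_t)y << 16));
+}
+#endif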
+
+#endif // AVCODEC_ARM_RPI_HEVC_MV_H
+
diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h
new file mode 100644
index 0000000000..62b9326532
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/rpi_hevcdsp.h"
+
+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..88a3b4e5e7
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
@@ -0,0 +1,1634 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
+ vsubl.u8 q0, \Q0a, \P0a
+ vsubl.u8 q1, \P1a, \Q1a
+ vdup.16 d4, r2
+ \I1
+ vshl.i16 q0, #2
+ \I2
+ vadd.i16 q0, q1
+ \I3
+ vmovl.u8 q2, d4
+ \I4
+ vneg.s16 q1, q2
+ \I5
+ vrshr.s16 q0, #3
+ \I6
+ \I7
+ \I8
+ vmin.s16 q0, q2
+ vmovl.u8 q2, \Q0a
+ vmax.s16 q0, q1
+ vaddw.u8 q1, q0, \P0a
+ vsub.i16 q0, q2, q0
+ vqmovun.s16 \P0a, q1
+ vqmovun.s16 \Q0a, q0
+.endm
+
+
+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
+ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
+ lsr r12, r2, #16
+ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
+ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
+ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
+ vshl.i16 q0, #2 @ (q0a - p0a) * 4
+ vshl.i16 q1, #2 @ (q0b - p0b) * 4
+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
+ vdup.16 d4, r2 @ tc0a, tc0b
+ vdup.16 d6, r12 @ tc1a, tc1b
+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
+ \I1
+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
+ \I2
+ vmovl.u8 q2, d4 @ tc0a, tc0b
+ \I3
+ vmovl.u8 q3, d6 @ tc1a, tc1b
+ \I4
+ vmin.s16 q0, q2
+ \I5
+ vneg.s16 q2, q2 @ -tc0a, -tc0b
+ \I6
+ vmin.s16 q1, q3
+ \I7
+ vneg.s16 q3, q3 @ -tc1a, -tc1b
+ vmax.s16 q0, q2 @ delta0a
+ vmovl.u8 q2, \Q0a
+ vmax.s16 q1, q3 @ delta0b
+ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
+ vsub.i16 q0, q2, q0 @ q0a - delta0a
+ vmovl.u8 q2, \Q0b
+ vsub.i16 q2, q1 @ q0b - delta0b
+ vaddw.u8 q1, \P0b @ p0b + delta0b
+ vqmovun.s16 \Q0a, q0
+ vqmovun.s16 \P0a, q3
+ vqmovun.s16 \Q0b, q2
+ vqmovun.s16 \P0b, q1
+.endm
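+
+@ For reference, both bodies above implement the HEVC chroma weak filter on
+@ each p0/q0 pair; the per-pel maths, in C-style pseudocode, is:
+@   delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+@   p0'    = av_clip_pixel(p0 + delta0);
+@   q0'    = av_clip_pixel(q0 - delta0);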
+
+
+@ Preserves r12
+@ Clobbers r2
+@ P0a et al all contain UVUVUVUV
+@ r2 (tc4) contains
+@ [0..7] tc U a
+@ [8..15] tc V a
+
+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
+ vsub.i16 q0, \Q0a, \P0a
+ vsub.i16 q1, \P1a, \Q1a
+ vdup.16 d4, r2
+ \I1
+ vshl.i16 q0, #2
+ \I2
+ vadd.i16 q0, q1
+ \I3
+ vshll.u8 q2, d4, #\bit_depth - 8
+ \I4
+ vneg.s16 q1, q2
+ \I5
+ vrshr.s16 q0, #3
+ \I6
+ \I7
+ \I8
+ vmin.s16 q0, q2
+ vmov.i16 q2, #0
+ vmax.s16 q0, q1
+ vadd.i16 \P0a, q0
+ vsub.i16 \Q0a, q0
+ vmov.i16 q1, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
+ vmin.s16 \P0a, q1
+ vmin.s16 \Q0a, q1
+.endm
+
+@ Clobbers r2, r12
+@ P0a et al all contain UVUVUVUV
+@ r2 (tc4) contains
+@ [0..7] tc U a
+@ [8..15] tc V a
+@ [16..23] tc U b
+@ [24..31] tc V b
+
+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
+ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
+ lsr r12, r2, #16
+ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
+ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
+ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
+ vshl.i16 q0, #2 @ (q0a - p0a) * 4
+ vshl.i16 q1, #2 @ (q0b - p0b) * 4
+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
+ vdup.16 d4, r2 @ tc0a, tc0b
+ vdup.16 d6, r12 @ tc1a, tc1b
+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
+ \I1
+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
+ \I2
+ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
+ \I3
+ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
+ \I4
+ vmin.s16 q0, q2
+ \I5
+ vneg.s16 q2, q2 @ -tc0a, -tc0b
+ \I6
+ vmin.s16 q1, q3
+ \I7
+ vneg.s16 q3, q3 @ -tc1a, -tc1b
+ vmax.s16 q0, q2 @ delta0a
+ vadd.i16 \P0a, q0 @ p0a + delta0a
+ vsub.i16 \Q0a, q0 @ q0a - delta0a
+ vmax.s16 q1, q3 @ delta0b
+ vadd.i16 \P0b, q1 @ p0b + delta0b
+ vsub.i16 \Q0b, q1 @ q0b - delta0b
+ vmov.i16 q2, #0
+ vmov.i16 q3, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
+ vmax.s16 \P0b, q2
+ vmax.s16 \Q0b, q2
+ vmin.s16 \P0a, q3
+ vmin.s16 \Q0a, q3
+ vmin.s16 \P0b, q3
+ vmin.s16 \Q0b, q3
+.endm
+
+
+
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
+.macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+ orrs r3, r12, r3, lsl #16
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
+ ldrd r4, r5, [sp, #32] @ &_no_p
+ ldrb r4, [r4]
+ ldrb r5, [r5]
+ movs r10, r4
+ it ne
+ movne r10, #1
+ cmp r5, #0
+ it ne
+ orrne r10, #2
+.endm
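+
+@ i.e. after this macro r10 = (no_p[0] ? 1 : 0) | (no_q[0] ? 2 : 0), with an
+@ early bx lr taken if both tc words are zero.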
+
+@ Input:
+@ r2 beta (raw: needs shift for bitdepth > 8)
+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
+@
+@ Input & output
+@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
+@ 16-bit: q8-q15
+@
+@ r1 -r1
+@ r10 b1->C, b0->N (r10 junk)
+@
+@ Junks:
+@ r5, r6, r7, r8, r9
+
+.macro m_filter_luma bit_depth, Q11, Q15
+.if \bit_depth == 8
+ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
+ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
+ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
+ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
+ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
+ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
+.endif
+ vadd.i16 q0, q9, \Q11 @ P2 + P0
+.if \bit_depth > 8
+ lsl r3, r3, #(\bit_depth - 8)
+.endif
+ vadd.i16 q1, q14, q12 @ Q2 + Q0
+.if \bit_depth > 8
+ lsl r2, r2, #(\bit_depth - 8)
+.endif
+ vsub.i16 q0, q10 @ P2 - P1 + P0
+ lsr r5, r3, #16
+ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
+.if \bit_depth == 8
+ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
+ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
+.endif
+ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
+ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
+ vmov.i64 q2, #0xffffffff0000
+ vbic q0, q2 @ only dp0(') and dp3(')
+ vbic q1, q2 @ only dq0(') and dq3(')
+ vsra.u64 q0, #16
+ vsra.u64 q1, #16
+ vdup.16 q3, r2 @ beta
+ vdup.16 d14, r3 @ tC[0]
+ vdup.16 d15, r5 @ tC[1]
+ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
+ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
+ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
+ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
+ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
+ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
+ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
+ vshl.s16 q6, q7, #2 @ tC[] * 4
+ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
+ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
+ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
+ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
+ cmp r7, #0
+ beq .Lbypasswrite
+
+ vcgt.s16 q5, q6, q5 @ if < tc25
+ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
+ vand q4, q5
+ vbic d8, d4
+ vbic d9, d4
+ vshr.s16 q3, #2 @ beta_2 = beta >> 2
+ vsra.u64 q4, #16
+ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
+ vshl.i16 q7, #1 @ tc2 = tC[] << 1
+ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
+ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
+ vand d6, d8 @ && beta_2 tests, prime in ms half
+ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
+ vneg.s16 q6, q7 @ -tc2
+ vmovn.i32 d8, q3
+ vshrn.i32 d6, q3, #16
+ vand d6, d8
+ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
+ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
+ vadd.i16 q0, \Q11, q12 @ p0 + q0
+ ands r9, r7, r8
+ beq 1f
+
+ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
+ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
+ lsr r3, r9, #16
+ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
+ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
+ vadd.i16 q0, q8, q9 @ p3 + p2
+ vadd.i16 q5, \Q15, q14 @ q2 + q3
+ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
+ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
+ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
+ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
+ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
+ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
+ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
+ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
+ vrshr.s16 q0, #3 @ scale, with rounding
+ vrshr.s16 q5, #3
+ vrshr.s16 q1, #2
+ vrshr.s16 q4, #2
+ vrshr.s16 q2, #3
+ vrshr.s16 q3, #3
+ vsub.i16 q0, q9 @ find difference
+ vsub.i16 q5, q14
+ vsub.i16 q1, q10
+ vsub.i16 q4, q13
+ vsub.i16 q2, \Q11
+ vsub.i16 q3, q12
+ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
+ vmax.s16 q5, q6
+ vmax.s16 q1, q6
+ vmax.s16 q4, q6
+ vmax.s16 q2, q6
+ vmax.s16 q3, q6
+ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
+ vdup.16 d13, r3
+ vmin.s16 q0, q7
+ vmin.s16 q5, q7
+ vmin.s16 q1, q7
+ vmin.s16 q4, q7
+ vmin.s16 q2, q7
+ vmin.s16 q3, q7
+ vadd.i16 q0, q9 @ apply difference
+ vadd.i16 q5, q14
+ vadd.i16 q1, q10
+ vadd.i16 q4, q13
+ vadd.i16 q2, \Q11
+ vadd.i16 q3, q12
+ vbit q9, q0, q6 @ apply filtered values according to mask
+ vbit q14, q5, q6
+ vbit q10, q1, q6
+ vbit q13, q4, q6
+ vbit \Q11, q2, q6
+ vbit q12, q3, q6
+ vneg.s16 q6, q7 @ restore -tc2
+
+1:
+ bics r9, r7, r8
+ beq 2f
+
+ vsub.i16 q0, q12, \Q11 @ q0 - p0
+ vsub.i16 q1, q13, q10 @ q1 - p1
+ lsr r3, r9, #16
+ vshl.i16 q2, q0, #3
+ lsr r7, r5, #16
+ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
+ lsr r8, r6, #16
+ vshl.i16 q2, q1, #1
+ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
+ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
+ vsub.i16 q5, q3, q4
+ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
+ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
+ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
+ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
+ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
+ vmax.s16 q6, q5 @
+ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
+ vdup.16 q0, r2 @ beta
+ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
+ vshr.s16 q4, #1 @ tc_2 = tc >> 1
+ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
+ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
+ vshr.s16 q2, q0, #1 @ beta >> 1
+ vadd.i16 q2, q0 @ beta + (beta >> 1)
+ vneg.s16 q0, q4 @ -tc_2
+ vabs.s16 q5, q5 @ abs(original delta0)
+ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
+ vmax.s16 q1, q0
+ vmax.s16 q3, q0
+ vshl.s16 q0, q7, #2 @ 8 * tc
+ vadd.i16 q7, q0 @ 10 * tc
+ vdup.16 d0, r9
+ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
+ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
+ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
+ vdup.16 d8, r5 @ dp0 + dp3
+ vdup.16 d9, r7 @ dp0' + dp3'
+ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
+ vdup.16 d10, r6 @ dq0 + dq3
+ vdup.16 d11, r8 @ dq0' + dq3'
+ vand q7, q0 @ AND block and line masks
+ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
+ vadd.i16 q0, q1, q10 @ p1 + deltap1
+ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
+ vadd.i16 q3, q3, q13 @ q1 + deltaq1
+ vadd.i16 q1, \Q11, q6 @ p0 + delta0
+ vsub.i16 q2, q12, q6 @ q0 - delta0
+ vand q4, q7 @ AND nd_p test with block/line masks
+ vand q5, q7 @ AND nd_q test with block/line masks
+ vbit q10, q0, q4
+ vbit \Q11, q1, q7
+ vbit q12, q2, q7
+ vbit q13, q3, q5
+
+2:
+.if \bit_depth == 8
+ vmovn.i16 d16, q8
+ vmovn.i16 d23, \Q15
+ neg r1, r1
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, \Q11
+ lsls r10, #31
+ vqmovun.s16 d20, q12
+ vqmovun.s16 d21, q13
+ vqmovun.s16 d22, q14
+.else
+ vmov.i16 q0, #0
+ vmov.i16 q1, #(1 << \bit_depth - 1)
+ @ q8 & q15 should be unaltered and so don't require clipping
+ neg r1, r1
+ vmax.s16 q9, q0
+ vmax.s16 q10, q0
+ vmax.s16 q11, q0
+ vmax.s16 q12, q0
+ vmax.s16 q13, q0
+ vmax.s16 q14, q0
+ lsls r10, #31
+ vmin.s16 q9, q1
+ vmin.s16 q10, q1
+ vmin.s16 q11, q1
+ vmin.s16 q12, q1
+ vmin.s16 q13, q1
+ vmin.s16 q14, q1
+.endif
+ bx lr
+.endm
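+
+@ For reference, the decisions made above per 4-pel segment (C-style
+@ pseudocode following the macro's own comments):
+@   filter at all only if d0 + d3 < beta (d_i = dp_i + dq_i)
+@   strong filter if, for i in {0, 3}:
+@     d_i << 1 < beta >> 2
+@     && abs(p3_i - p0_i) + abs(q3_i - q0_i) < beta >> 3
+@     && abs(p0_i - q0_i) < tc25, where tc25 = (tc * 5 + 1) >> 1
+@   otherwise weak filter with
+@     delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
+@   applied only where abs(delta0) < 10 * tc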
+
+function hevc_loop_filter_luma_body
+ m_filter_luma 8, q15, q11
+endfunc
+
+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
+@ int *_tc, [r3]
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+
+ sub r4, r0, #4
+ b .Lv_loop_luma_common
+endfunc
+
+@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
+@ uint8_t * pix_r, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
+@ int tc2, [r3]
+@ int no_f, [sp+0]
+@ uint8_t * pix_l) [sp+4]
+
+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
+ cmp r3, #0
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
+ ldr r4, [sp, #36]
+ ldr r10, [sp, #32]
+
+.Lv_loop_luma_common:
+ vpush {d8-d15}
+
+ @ It's slightly faster to do unlaned loads and transpose in the
+ @ 8-bit case, even though it needs more instructions, because
+ @ VLD4.8 is a really slow way to read from memory.
+ vld1.32 {d16[0]}, [r4:32], r1
+ vld1.32 {d20[0]}, [r0:32], r1
+ vld1.32 {d16[1]}, [r4:32], r1
+ vld1.32 {d20[1]}, [r0:32], r1
+ vld1.32 {d17[0]}, [r4:32], r1
+ vld1.32 {d21[0]}, [r0:32], r1
+ vld1.32 {d17[1]}, [r4:32], r1
+ vld1.32 {d21[1]}, [r0:32], r1
+ vld1.32 {d18[0]}, [r4:32], r1
+ vld1.32 {d22[0]}, [r0:32], r1
+ vld1.32 {d18[1]}, [r4:32], r1
+ vld1.32 {d22[1]}, [r0:32], r1
+ vld1.32 {d19[0]}, [r4:32], r1
+ vld1.32 {d23[0]}, [r0:32], r1
+ vld1.32 {d19[1]}, [r4:32]
+ vld1.32 {d23[1]}, [r0:32]
+ vuzp.16 q8, q9
+ vuzp.16 q10, q11
+ vuzp.8 q8, q9
+ vuzp.8 q10, q11
+ vswp d17, d18
+ vswp d21, d22
+
+ bl hevc_loop_filter_luma_body
+
+ add r6, r4, r1
+ add r2, r0, r1
+ lsl r1, #1
+
+ vpop {d8-d15}
+
+ @ no_p[1]
+ bmi 1f
+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
+
+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
+1:
+ @ no_q[1]
+ bcs 1f
+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
+
+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+1:
+ pop {r4-r10,pc}
+
+.Lbypasswrite:
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+endfunc
+
+.macro m_filter_v_luma_16 bit_depth
+ vpush {d8-d15}
+
+ @ Laned loads use slightly fewer instructions than unlaned loads
+ @ plus a transpose. This also means that we can use the same code
+ @ for both split & unsplit deblock.
+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+
+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+
+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+
+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+
+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+
+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+
+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+
+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ add r6, r4, r1
+ add r2, r0, r1
+ lsl r1, #1
+
+ vpop {d8-d15}
+
+ @ p[1]
+ bmi 1f
+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
+1:
+ @ q[1]
+ bcs 1f
+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+1:
+ pop {r4-r10,pc}
+.endm
+
+
+
+
+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
+@ ptrdiff_t stride, [r1]
+@ int beta, [r2]
+@ int32_t *tc, [r3]
+@ uint8_t *no_p, sp[0]
+@ uint8_t *no_q); sp[4]
+@
+@ Src should always be on an 8 byte boundary & all in the same slice
+
+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+ b .Lh_loop_filter_luma_common_8
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
+ cmp r3, #0
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
+ ldr r10, [sp, #32]
+
+.Lh_loop_filter_luma_common_8:
+ sub r4, r0, r1, lsl #2
+ add r0, r4, r1
+ lsl r1, #1
+ vpush {d8-d15}
+
+ vld1.8 {d16}, [r4], r1
+ vld1.8 {d17}, [r0], r1
+ vld1.8 {d18}, [r4], r1
+ vld1.8 {d19}, [r0], r1
+ vld1.8 {d20}, [r4], r1
+ vld1.8 {d21}, [r0], r1
+ vld1.8 {d22}, [r4]
+ vld1.8 {d23}, [r0]
+
+ bl hevc_loop_filter_luma_body
+
+ add r0, r0, r1, lsl #1
+ add r2, r4, r1, lsl #1
+ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
+ vst1.8 {d22}, [r4], r1
+ vst1.8 {d21}, [r6]
+ vst1.8 {d20}, [r4]
+1:
+ @ Q0-Q2
+ bmi 1f
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d18}, [r2]
+ vst1.8 {d17}, [r0]
+1:
+ pop {r4-r10,pc}
+endfunc
+
+
+.macro m_filter_h_luma_16 bit_depth
+ sub r4, r0, r1, lsl #2
+ add r0, r4, r1
+ lsl r1, #1
+ vpush {d8-d15}
+
+ vld1.16 { q8}, [r4], r1
+ vld1.16 { q9}, [r0], r1
+ vld1.16 {q10}, [r4], r1
+ vld1.16 {q11}, [r0], r1
+ vld1.16 {q12}, [r4], r1
+ vld1.16 {q13}, [r0], r1
+ vld1.16 {q14}, [r4]
+ vld1.16 {q15}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
+ add r0, r0, r1, lsl #1
+ add r2, r4, r1, lsl #1
+ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
+ vst1.16 {q14}, [r4], r1
+ vst1.16 {q13}, [r6]
+ vst1.16 {q12}, [r4]
+1:
+ bmi 1f
+ vst1.16 {q11}, [r0], r1
+ vst1.16 {q10}, [r2]
+ vst1.16 { q9}, [r0]
+1:
+ pop {r4-r10,pc}
+.endm
+
+
+@ void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no_f
+@ 0 tl P0
+@ 1 tr P1
+@ 2 bl Q0
+@ 3 br Q1
+@
+@ Probably not worth having the P/Qa-only special case in this direction.
+@ Given the layout we won't save any memory reads or avoid any cache dirtying.
+@ We would save a bit of computation, but I expect the partials to be less
+@ common in the H direction than in V due to how we arrange deblock.
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
+ sub r12, r0, r1
+ cmp r2, #0
+ it eq
+ bxeq lr
+ vld1.8 {d26,d27}, [r0]
+ lsl r1, #1
+ sub r0, r1
+ vld1.8 {d18,d19}, [r12], r1
+ vld1.8 {d16,d17}, [r0], r1
+ vld1.8 {d28,d29}, [r12]
+
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
+ "sub r12, r0, r1, asr #1"
+
+ lsls r3, #29 @ b2 -> N, b3 -> C
+ it pl
+ vstrpl d26, [r0, #0]
+ it cc
+ vstrcc d27, [r0, #8]
+ lsls r3, #2 @ b0 -> N, b1 -> C
+ it pl
+ vstrpl d18, [r12, #0]
+ it cc
+ vstrcc d19, [r12, #8]
+ bx lr
+
+endfunc
+
+
+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ unsigned int no_f); // r3
+@
+@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+@
+@ Macro here; the actual function is near the bottom of this file
+
+.macro m_filter_h_uv_16 bit_depth
+ sub r12, r0, r1
+ cmp r2, #0
+ it eq
+ bxeq lr
+ vld1.16 {q12, q13}, [r0]
+ lsl r1, #1
+ sub r0, r1
+ vld1.16 {q10, q11}, [r12], r1
+ vld1.16 {q8, q9 }, [r0], r1
+ vld1.16 {q14, q15}, [r12]
+
+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
+ "sub r12, r0, r1, asr #1", \
+ "cmp r3, #0"
+
+ bne 1f
+ vst1.16 {q10, q11}, [r12]
+ vst1.16 {q12, q13}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
+1:
+ lsls r3, #29 @ b2 -> N, b3 -> C
+ itt pl
+ vstrpl d24, [r0, #0]
+ vstrpl d25, [r0, #8]
+ itt cc
+ vstrcc d26, [r0, #16]
+ vstrcc d27, [r0, #24]
+ lsls r3, #2 @ b0 -> N, b1 -> C
+ itt pl
+ vstrpl d20, [r12, #0]
+ vstrpl d21, [r12, #8]
+ itt cc
+ vstrcc d22, [r12, #16]
+ vstrcc d23, [r12, #24]
+ bx lr
+.endm
+
+
+@ void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+@ no_f:
+@ 0 tl P0
+@ 1 tr Q0
+@ 2 bl P1
+@ 3 br Q1
+
+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
+ cmp r2, #0
+ it eq
+ bxeq lr
+ push {lr}
+ vld2.16 {d16[0], d18[0]}, [r3], r1
+ vld2.16 {d20[0], d22[0]}, [r0], r1
+
+ cmp r2, #0x10000
+ vld2.16 {d16[1], d18[1]}, [r3], r1
+ vld2.16 {d20[1], d22[1]}, [r0], r1
+
+ vld2.16 {d16[2], d18[2]}, [r3], r1
+ vld2.16 {d20[2], d22[2]}, [r0], r1
+
+ vld2.16 {d16[3], d18[3]}, [r3], r1
+ vld2.16 {d20[3], d22[3]}, [r0], r1
+ blo 10f
+
+ vld2.16 {d17[0], d19[0]}, [r3], r1
+ vld2.16 {d21[0], d23[0]}, [r0], r1
+
+ sub ip, r0, r3
+ vld2.16 {d17[1], d19[1]}, [r3], r1
+ vld2.16 {d21[1], d23[1]}, [r0], r1
+
+ cmp ip, #4
+ vld2.16 {d17[2], d19[2]}, [r3], r1
+ vld2.16 {d21[2], d23[2]}, [r0], r1
+
+ vld2.16 {d17[3], d19[3]}, [r3]
+ vld2.16 {d21[3], d23[3]}, [r0]
+
+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
+ "ldr lr, [sp, #4]", \
+ "neg r1, r1", \
+ "it eq; cmpeq lr, #0", \
+ "add r3, #2", \
+ "add ip, r3, r1", \
+ "add r2, r0, r1", \
+ "lsl r1, #1"
+
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
+ vst2.16 {d19[2], d21[2]}, [ip], r1
+ vst2.16 {d19[1], d21[1]}, [r3], r1
+ vst2.16 {d19[0], d21[0]}, [ip], r1
+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
+ vst2.16 {d18[2], d20[2]}, [ip], r1
+ vst2.16 {d18[1], d20[1]}, [r3]
+ vst2.16 {d18[0], d20[0]}, [ip]
+ pop {pc}
+
+@ Either split or partial
+1:
+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
+ ittt cs
+ addcs r0, r0, r1, lsl #1
+ addcs r2, r2, r1, lsl #1
+ bcs 1f
+ @ Q0b
+ vst1.16 {d21[3]}, [r0], r1
+ vst1.16 {d21[2]}, [r2], r1
+ vst1.16 {d21[1]}, [r0], r1
+ vst1.16 {d21[0]}, [r2], r1
+1:
+ ittt mi
+ addmi r3, r3, r1, lsl #1
+ addmi ip, ip, r1, lsl #1
+ bmi 1f
+ @ P0b
+ vst1.16 {d19[3]}, [r3], r1
+ vst1.16 {d19[2]}, [ip], r1
+ vst1.16 {d19[1]}, [r3], r1
+ vst1.16 {d19[0]}, [ip], r1
+1:
+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
+ bcs 1f
+ @ Q0a
+ vst1.16 {d20[3]}, [r0], r1
+ vst1.16 {d20[2]}, [r2], r1
+ vst1.16 {d20[1]}, [r0]
+ vst1.16 {d20[0]}, [r2]
+1:
+ it mi
+ popmi {pc}
+ @ P0a
+ vst1.16 {d18[3]}, [r3], r1
+ vst1.16 {d18[2]}, [ip], r1
+ vst1.16 {d18[1]}, [r3]
+ vst1.16 {d18[0]}, [ip]
+ pop {pc}
+
+@ Single lump (rather than double)
+10:
+ @ As we have post-incremented r0/r3 in the load, the easiest thing to do
+ @ is to subtract and write forwards, rather than backwards (as above)
+ @ b0 (P0a) -> N, b1 (Q0a) -> C
+
+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
+ "ldr lr, [sp, #4]", \
+ "add r3, #2", \
+ "sub r0, r0, r1, lsl #2", \
+ "sub r3, r3, r1, lsl #2", \
+ "lsls lr, #31", \
+ "add r2, r0, r1", \
+ "add ip, r3, r1", \
+ "lsl r1, #1"
+
+ bcs 3f
+ @ Q0a
+ vst1.16 {d20[0]}, [r0], r1
+ vst1.16 {d20[1]}, [r2], r1
+ vst1.16 {d20[2]}, [r0]
+ vst1.16 {d20[3]}, [r2]
+3:
+ it mi
+ popmi {pc}
+ @ P0a
+ vst1.16 {d18[0]}, [r3], r1
+ vst1.16 {d18[1]}, [ip], r1
+ vst1.16 {d18[2]}, [r3]
+ vst1.16 {d18[3]}, [ip]
+ pop {pc}
+
+endfunc
+
+
+@ void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, // r0
+@ unsigned int stride, // r1
+@ uint32_t tc4, // r2
+@ uint8_t * src_l, // r3
+@ unsigned int no_f); // sp[0]
+@
+
+@ no_f
+@ 0 tl P0a
+@ 1 tr Q0a
+@ 2 bl P0b
+@ 3 br Q0b
+
+@ P1: q8, q12
+@ P0: q9, q13
+@ Q0: q10, q14
+@ Q1: q11, q15
+
+.macro m_filter_v_uv2_16 bit_depth
+ cmp r2, #0
+ it eq
+ bxeq lr
+ push {lr}
+ vld2.32 {d16[0], d18[0]}, [r3], r1
+ vld2.32 {d20[0], d22[0]}, [r0], r1
+
+ cmp r2, #0x10000
+ vld2.32 {d16[1], d18[1]}, [r3], r1
+ vld2.32 {d20[1], d22[1]}, [r0], r1
+
+ vld2.32 {d17[0], d19[0]}, [r3], r1
+ vld2.32 {d21[0], d23[0]}, [r0], r1
+
+ vld2.32 {d17[1], d19[1]}, [r3], r1
+ vld2.32 {d21[1], d23[1]}, [r0], r1
+ blo 10f
+
+ vld2.32 {d24[0], d26[0]}, [r3], r1
+ vld2.32 {d28[0], d30[0]}, [r0], r1
+
+ sub ip, r0, r3
+ vld2.32 {d24[1], d26[1]}, [r3], r1
+ vld2.32 {d28[1], d30[1]}, [r0], r1
+
+ cmp ip, #8
+ vld2.32 {d25[0], d27[0]}, [r3], r1
+ vld2.32 {d29[0], d31[0]}, [r0], r1
+
+ vld2.32 {d25[1], d27[1]}, [r3]
+ vld2.32 {d29[1], d31[1]}, [r0]
+
+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
+ "ldr lr, [sp, #4]", \
+ "neg r1, r1", \
+ "it eq; cmpeq lr, #0", \
+ "add r3, #4", \
+ "add ip, r3, r1", \
+ "add r2, r0, r1", \
+ "lsl r1, #1"
+
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 8 and no_f == 0
+@ so it is worth having this special case
+ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
+ vst2.32 {d27[0], d29[0]}, [ip], r1
+ vst2.32 {d26[1], d28[1]}, [r3], r1
+ vst2.32 {d26[0], d28[0]}, [ip], r1
+ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
+ vst2.32 {d19[0], d21[0]}, [ip], r1
+ vst2.32 {d18[1], d20[1]}, [r3]
+ vst2.32 {d18[0], d20[0]}, [ip]
+ pop {pc}
+
+@ Either split or partial
+1:
+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
+ ittt cs
+ addcs r0, r0, r1, lsl #1
+ addcs r2, r2, r1, lsl #1
+ bcs 1f
+ @ Q0b
+ vst1.32 {d29[1]}, [r0], r1
+ vst1.32 {d29[0]}, [r2], r1
+ vst1.32 {d28[1]}, [r0], r1
+ vst1.32 {d28[0]}, [r2], r1
+1:
+ ittt mi
+ addmi r3, r3, r1, lsl #1
+ addmi ip, ip, r1, lsl #1
+ bmi 1f
+ @ P0b
+ vst1.32 {d27[1]}, [r3], r1
+ vst1.32 {d27[0]}, [ip], r1
+ vst1.32 {d26[1]}, [r3], r1
+ vst1.32 {d26[0]}, [ip], r1
+1:
+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
+ bcs 1f
+ @ Q0a
+ vst1.32 {d21[1]}, [r0], r1
+ vst1.32 {d21[0]}, [r2], r1
+ vst1.32 {d20[1]}, [r0]
+ vst1.32 {d20[0]}, [r2]
+1:
+ it mi
+ popmi {pc}
+ @ P0a
+ vst1.32 {d19[1]}, [r3], r1
+ vst1.32 {d19[0]}, [ip], r1
+ vst1.32 {d18[1]}, [r3]
+ vst1.32 {d18[0]}, [ip]
+ pop {pc}
+
+@ Single lump (rather than double)
+10:
+ @ As we have post-incremented r0/r3 in the load, the easiest thing to do
+ @ is to subtract and write forwards, rather than backwards (as above)
+ @ b0 (P0a) -> N, b1 (Q0a) -> C
+
+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
+ "ldr lr, [sp, #4]", \
+ "add r3, #4", \
+ "sub r0, r0, r1, lsl #2", \
+ "sub r3, r3, r1, lsl #2", \
+ "lsls lr, #31", \
+ "add r2, r0, r1", \
+ "add ip, r3, r1", \
+ "lsl r1, #1"
+
+ bcs 3f
+ @ Q0a
+ vst1.32 {d20[0]}, [r0], r1
+ vst1.32 {d20[1]}, [r2], r1
+ vst1.32 {d21[0]}, [r0]
+ vst1.32 {d21[1]}, [r2]
+3:
+ it mi
+ popmi {pc}
+ @ P0a
+ vst1.32 {d18[0]}, [r3], r1
+ vst1.32 {d18[1]}, [ip], r1
+ vst1.32 {d19[0]}, [r3]
+ vst1.32 {d19[1]}, [ip]
+ pop {pc}
+.endm
+
+
+@ The NEON version is faster under ideal circumstances (i.e. everything in L1),
+@ but in real-world testing it is ~20% slower, presumably due to code size.
+
+#if 0 // NEON version
+
+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+ * int in_inc0, int in_inc1)
+ */
+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
+ mov ip, sp
+ push {a1-a3,v1-v8,lr}
+ ldm ip, {v1-v6}
+ cmp a1, #2
+ bls 2f
+ vpush {d8-d13}
+ sub v5, v5, #10
+ sub v6, v6, #10
+1:
+ vld2.32 {d0[0], d2[0]}, [a3]!
+ vld2.32 {d4[0], d6[0]}, [a4]!
+ vmov.u8 q12, #0
+ ldrb a2, [a3], #1
+ ldrb ip, [a4], #1
+ ldrb v8, [a3], #1
+ ldrb lr, [a4], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d24[0]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d25[0]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d16[0]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d20[0]}, [ip]
+ vld1.32 {d18[0]}, [v8]
+ vld1.32 {d22[0]}, [lr]
+
+ vld2.32 {d0[1], d2[1]}, [a3]!
+ vld2.32 {d4[1], d6[1]}, [a4]!
+ ldrb a2, [a3], #1
+ vmov.u16 d12, #1
+ ldrb ip, [a4], #1
+ vmov.u16 d13, #2
+ ldrb v8, [a3], #1
+ vmov.u16 d27, #4
+ ldrb lr, [a4], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d24[2]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d25[2]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d16[1]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d20[1]}, [ip]
+ vld1.32 {d18[1]}, [v8]
+ vld1.32 {d22[1]}, [lr]
+
+ vld2.32 {d1[0], d3[0]}, [a3]!
+ vld2.32 {d5[0], d7[0]}, [a4]!
+ ldrb a2, [a3], #1
+ ldrb ip, [a4], #1
+ ldrb lr, [a4], #1
+ ldrb v8, [a3], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d24[4]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d25[4]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d17[0]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d21[0]}, [ip]
+ vld1.32 {d19[0]}, [v8]
+ vld1.32 {d23[0]}, [lr]
+
+ vld2.32 {d1[1], d3[1]}, [a3]!
+ vld2.32 {d5[1], d7[1]}, [a4]!
+ ldrb a2, [a3], #1
+ ldrb ip, [a4], #1
+ ldrb v8, [a3], #1
+ ldrb lr, [a4], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d24[6]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d25[6]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d17[1]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d21[1]}, [ip]
+ vld1.32 {d19[1]}, [v8]
+ vld1.32 {d23[1]}, [lr]
+
+ @ So now we have:
+ @ q0.32[i] = curr[i].mv[0]
+ @ q1.32[i] = curr[i].mv[1]
+ @ q2.32[i] = neigh[i].mv[0]
+ @ q3.32[i] = neigh[i].mv[1]
+ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
+ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
+ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
+ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
+ @ d24.16[i] = curr[i].pred_flag
+ @ d25.16[i] = neigh[i].pred_flag
+
+ vtst.16 d28, d24, d12
+ vtst.16 d29, d24, d13
+ vadd.i16 d8, d24, d12
+ vadd.i16 d9, d25, d12
+ vtst.16 d30, d25, d12
+ vtst.16 d31, d25, d13
+ veor d26, d8, d9
+ ldr lr, [sp, #6*8 + 1*4]
+ vmovl.s16 q4, d28
+ vmovl.s16 q5, d29
+ teq lr, #1
+ vmovl.s16 q14, d30
+ it ne
+ lslne v1, lr, #1
+ vmovl.s16 q15, d31
+ it ne
+ rsbne v2, v1, #32
+ vbif q0, q1, q4
+ vbif q2, q3, q14
+ vbif q1, q0, q5
+ vbif q3, q2, q15
+ vabd.s16 q12, q0, q2
+ vabd.s16 q2, q1
+ vabd.s16 q0, q3
+ vabd.s16 q1, q3
+ vbif q8, q9, q4
+ vbif q10, q11, q14
+ vbif q9, q8, q5
+ vbif q11, q10, q15
+ vclt.u16 d6, d24, d27
+ vclt.u16 d8, d2, d27
+ vclt.u16 d7, d25, d27
+ vclt.u16 d9, d3, d27
+ vclt.u16 d2, d0, d27
+ vclt.u16 d0, d4, d27
+ vclt.u16 d3, d1, d27
+ vclt.u16 d1, d5, d27
+ vceq.i32 q12, q10, q8
+ vceq.i32 q10, q9
+ vceq.i32 q8, q11
+ vceq.i32 q9, q11
+ vshrn.i32 d6, q3, #8
+ vshrn.i32 d7, q4, #8
+ vshrn.i32 d8, q1, #8
+ vshrn.i32 d9, q0, #8
+ vmovn.i32 d4, q12
+ vmovn.i32 d2, q10
+ vmovn.i32 d3, q8
+ vmovn.i32 d5, q9
+ vand q2, q3
+ vrev16.8 q3, q3
+ vand q2, q3
+ vand q1, q4
+ vrev16.8 q4, q4
+ vand q1, q4
+ vand d4, d5
+ vand d2, d3
+ vbic d0, d12, d4
+ vshr.u16 d26, #2
+ vbic d0, d2
+ vmov.i16 d1, #0x5555
+ vorr d0, d26
+ bne 10f
+
+ @ Merge results into result word, no duplicates
+ vmov a2, s0
+ vmov v8, s1
+ vmov.u16 ip, d0[1]
+ vmov.u16 lr, d0[3]
+ lsl a2, #30
+ lsl v8, #30
+ lsl ip, #30
+ lsl lr, #30
+ orr a2, ip, a2, lsr #2
+ orr v8, lr, v8, lsr #2
+ orr a2, v8, a2, lsr #4
+ subs a1, #4
+ orr v7, a2, v7, lsr #8
+ bhi 1b
+
+ mov a1, #32
+ ldr a3, [sp, #6*8]
+ vpop {d8-d13}
+ sub a1, a1, a3, lsl #1
+ mov a1, v7, lsr a1
+ pop {a2-a4,v1-v8,pc}
+10:
+ @ Merge results into result word, with duplicates
+ vmul.i16 d0, d1
+ vmov a2, s0
+ vmov v8, s1
+ vmov.u16 ip, d0[1]
+ vmov.u16 lr, d0[3]
+ lsl a2, v2
+ subs a1, #4
+ lsl v8, v2
+ lsl ip, v2
+ lsl lr, v2
+ ldr v2, [sp, #6*8 + 12*4 + 1*4]
+T lsr a2, v1
+T orr a2, ip, a2
+A orr a2, ip, a2, lsr v1
+ lsl ip, v1, #1
+T lsr v8, v1
+T orr v8, lr, v8
+A orr v8, lr, v8, lsr v1
+ lsl lr, v1, #2
+T lsr a2, ip
+T orr a2, v8, a2
+A orr a2, v8, a2, lsr ip
+ ldr v1, [sp, #6*8 + 12*4]
+T lsr v7, lr
+T orr v7, a2, v7
+A orr v7, a2, v7, lsr lr
+ bhi 1b
+
+ mov a1, #32
+ ldrd a3, a4, [sp, #6*8]
+ vpop {d8-d13}
+ mls a1, a3, a4, a1
+ mls a1, a3, a4, a1
+ mov a1, v7, lsr a1
+ pop {a2-a4,v1-v8,pc}
+
+
+2:
+ sub v5, v5, #10
+ sub v6, v6, #10
+ vmov.u8 d16, #0
+ blo 3f
+ vld2.32 {d0[0], d1[0]}, [a3]!
+ vld2.32 {d2[0], d3[0]}, [a4]!
+ ldrb a2, [a3], #1
+ ldrb ip, [a4], #1
+ ldrb lr, [a4], #1
+ ldrb v8, [a3], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d16[0]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d16[4]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d4[0]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d5[0]}, [ip]
+ vld1.32 {d6[0]}, [v8]
+ vld1.32 {d7[0]}, [lr]
+
+3:
+ vld2.32 {d0[1], d1[1]}, [a3]!
+ vld2.32 {d2[1], d3[1]}, [a4]!
+ ldrb a2, [a3], #1
+ vmov.u16 d17, #1
+ ldrb ip, [a4], #1
+ vmov.u16 d18, #2
+ ldrb v8, [a3], #1
+ vmov.u16 d19, #4
+ ldrb lr, [a4], #1
+ add a2, v1, a2, lsl #2
+ vld1.8 {d16[2]}, [a3], v5
+ add ip, v3, ip, lsl #2
+ vld1.8 {d16[6]}, [a4], v6
+ add v8, v2, v8, lsl #2
+ vld1.32 {d4[1]}, [a2]
+ add lr, v4, lr, lsl #2
+ vld1.32 {d5[1]}, [ip]
+ vld1.32 {d6[1]}, [v8]
+ vld1.32 {d7[1]}, [lr]
+
+ @ So now we have:
+ @ d0.32[i] = curr[i].mv[0]
+ @ d1.32[i] = curr[i].mv[1]
+ @ d2.32[i] = neigh[i].mv[0]
+ @ d3.32[i] = neigh[i].mv[1]
+ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
+ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
+ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
+ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
+ @ d16.16[i] = curr[i].pred_flag
+ @ d16.16[2+i] = neigh[i].pred_flag
+
+ vtst.16 d20, d16, d17
+ vtst.16 d22, d16, d18
+ vadd.i16 d30, d16, d17
+ vswp d2, d3
+ ldr lr, [sp, #1*4]
+ vmovl.s16 q10, d20
+ teq lr, #1
+ vmovl.s16 q11, d22
+ it ne
+ lslne v1, lr, #1
+ vbif d0, d1, d20
+ vbif d4, d6, d20
+ vbif d3, d2, d21
+ vbif d5, d7, d21
+ vbif d1, d0, d22
+ vbif d6, d4, d22
+ vbif d2, d3, d23
+ vbif d7, d5, d23
+ vshr.u16 d30, #2
+ vabd.s16 d24, d0, d3
+ vabd.s16 d25, d1, d2
+ vabd.s16 q0, q0, q1
+ vceq.i32 d2, d4, d5
+ vceq.i32 d20, d5, d6
+ vceq.i32 d21, d4, d7
+ vceq.i32 d3, d6, d7
+ vclt.u16 d6, d24, d19
+ vclt.u16 d7, d25, d19
+ vclt.u16 d22, d1, d19
+ vclt.u16 d23, d0, d19
+ vshrn.i32 d6, q3, #8
+ vmovn.i32 d2, q1
+ vshrn.i32 d7, q11, #8
+ vmovn.i32 d3, q10
+ vand q0, q3, q1
+ it ne
+ rsbne v2, v1, #32
+ vrev16.8 q3, q3
+ vand q0, q3
+ vsra.u64 d30, #32
+ vshr.u64 q1, q0, #32
+ vand q0, q1
+ vbic d0, d17, d0
+ vand d30, d30, d17
+ vbic d0, d1
+ vmov.i16 d1, #0x5555
+ vorr d0, d30
+ bne 10f
+
+ @ Construct result word, no duplicates
+ cmp a1, #2
+ vmov.u16 a1, d0[1]
+ vmov.u16 a2, d0[0]
+ it eq
+ orreq a1, a2, a1, lsl #2
+ pop {a2-a4,v1-v8,pc}
+10:
+ @ Construct result word, with duplicates
+ cmp a1, #2
+ vmul.i16 d0, d1
+ vmov a2, s0
+ vmov.u16 a1, d0[1]
+ lsl a2, #16
+ pkhbt a1, a1, a1, lsl #16
+ lsr a2, v2
+ lsr a1, v2
+T itt eq
+T lsleq a1, v1
+T orreq a1, a2, a1
+A orreq a1, a2, a1, lsl v1
+ pop {a2-a4,v1-v8,pc}
+endfunc
+
+
+
+#else // non-NEON version
+
+
+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+ * int in_inc0, int in_inc1)
+ */
+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
+ mov v6, #32
+1: ldmdb ip, {v1-v4}
+ ldrsb v5, [a3, #8] @ curr->ref_idx
+ ldrsb v8, [a3, #9]
+ ldrsb ip, [a4, #8] @ neigh->ref_idx
+ ldrsb lr, [a4, #9]
+ ldr v1, [v1, v5, lsl #2]
+ ldrb v5, [a3, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
+ ldrb v8, [a4, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
+ teq v5, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
+ tst v5, #1
+ itee ne
+ ldrne v5, [a3, #0] @ curr->mv[0]
+ moveq v1, v2
+ ldreq v5, [a3, #4] @ curr->mv[1]
+ tst v8, #1
+ itee ne
+ ldrne v8, [a4, #0] @ neigh->mv[0]
+ moveq v3, v4
+ ldreq v8, [a4, #4] @ neigh->mv[1]
+ teq v1, v3
+ bne 10f
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v8, v5
+ ssub16 v5, v5, v8
+ sel v5, v5, ip
+ ands v5, v5, lr
+ @ drop through
+10: it ne
+ movne v5, #1<<30
+11:
+ sub v6, v6, #2
+T mov v7, v7, lsr #2
+ subs a2, a2, #1
+A orr v7, v5, v7, lsr #2
+T orr v7, v5, v7
+ bhi 11b
+
+ ldrd v3, v4, [sp, #16*4]
+ ldr a2, [sp]
+ add ip, sp, #16*4
+ subs a1, a1, #1
+ add a3, a3, v3
+ add a4, a4, v4
+ bhi 1b
+ mov a1, v7, lsr v6
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
+ bne 10b
+
+ teq v1, v3
+ it eq
+ teqeq v2, v4
+ bne 40f
+ teq v1, v2
+ bne 30f
+
+ ldrd v1, v2, [a3] @ curr->mv
+ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 v5, v1, v3
+ sel v5, v5, ip
+ ands v5, v5, lr
+ bne 25f
+ ssub16 ip, v4, v2
+ ssub16 v5, v2, v4
+ sel v5, v5, ip
+ ands v5, v5, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
+ ssub16 v5, v1, v4
+ sel v5, v5, ip
+ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v3, v2
+ ssub16 v5, v2, v3
+ sel v5, v5, ip
+ ands v5, v5, lr
+ b 10b
+
+30: ldrd v1, v2, [a3] @ curr->mv
+ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
+ ssub16 v5, v1, v3
+ sel v5, v5, ip
+ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v4, v2
+ ssub16 v5, v2, v4
+ sel v5, v5, ip
+ ands v5, v5, lr
+ b 10b
+
+40: teq v1, v4
+ ite eq
+ teqeq v2, v3
+ bne 10b
+
+ ldrd v1, v2, [a3] @ curr->mv
+ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
+90:
+ mov v5, #1<<30
+ b 11b
+endfunc
+
+
+#endif
+
+
+@ =============================================================================
+@
+@ 10 bit
+
+function hevc_loop_filter_luma_body_10
+ m_filter_luma 10, q11, q15
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
+ hevc_loop_filter_luma_start
+ b .Lh_loop_luma_common_10
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
+ cmp r3, #0
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
+ ldr r10, [sp, #32]
+.Lh_loop_luma_common_10:
+ m_filter_h_luma_16 10
+endfunc
+
+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
+ hevc_loop_filter_luma_start
+ sub r4, r0, #8
+ b .Lv_loop_luma_common_10
+endfunc
+
+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
+ cmp r3, #0
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
+ ldr r4, [sp, #36]
+ ldr r10, [sp, #32]
+
+.Lv_loop_luma_common_10:
+ m_filter_v_luma_16 10
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
+ m_filter_h_uv_16 10
+endfunc
+
+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
+ m_filter_v_uv2_16 10
+endfunc
+
diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..7ed5c7dc52
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+/* uses registers q8 - q13 for temp values */
+.macro tr4_luma_shift shift
+ vaddl.s16 q8, d28, d30 // c0 = src0 + src2
+ vaddl.s16 q9, d30, d31 // c1 = src2 + src3
+ vsubl.s16 q10, d28, d31 // c2 = src0 - src3
+ vaddl.s16 q11, d28, d31 // src0 + src3
+
+ vmul.i32 q12, q8, d1[0] // 29 * c0
+ vmul.i32 q13, q10, d2[0] // 55 * c2
+ vmul.i32 q8, q8, d2[0] // 55 * c0
+ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1
+
+ vsubw.s16 q11, q11, d30 // src0 - src2 + src3
+ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1
+ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1
+ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2
+
+ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
+ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3
+ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3
+ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3
+
+ vqrshrn.s32 d28, q12, \shift
+ vqrshrn.s32 d29, q13, \shift
+ vqrshrn.s32 d30, q11, \shift
+ vqrshrn.s32 d31, q8, \shift
+.endm
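+
+@ For reference, the 4-point inverse DST computed above, in plain C terms
+@ (rnd = 1 << (shift - 1); vqrshrn also saturates the result to 16 bits):
+@   c0 = src0 + src2;  c1 = src2 + src3;  c2 = src0 - src3;  c3 = 74 * src1;
+@   dst0 = (29 * c0 + 55 * c1 + c3 + rnd) >> shift;
+@   dst1 = (55 * c2 - 29 * c1 + c3 + rnd) >> shift;
+@   dst2 = (74 * (src0 - src2 + src3) + rnd) >> shift;
+@   dst3 = (55 * c0 + 29 * c2 - c3 + rnd) >> shift;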
+
+/* uses registers q8 - q11 for temp values */
+.macro tr4_shift shift
+ vmull.s16 q9, d29, d0[0] // 83 * src1
+ vmull.s16 q8, d29, d0[1] // 36 * src1
+ vshll.s16 q14, d28, #6 // 64 * src0
+ vshll.s16 q10, d30, #6 // 64 * src2
+ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1
+ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0
+ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1
+ vadd.s32 q14, q11, q9 // e0 + o0
+ vadd.s32 q15, q10, q8 // e1 + o1
+ vsub.s32 q8, q10, q8 // e1 - o1
+ vsub.s32 q9, q11, q9 // e0 - o0
+
+ vqrshrn.s32 d28, q14, \shift
+ vqrshrn.s32 d29, q15, \shift
+ vqrshrn.s32 d30, q8, \shift
+ vqrshrn.s32 d31, q9, \shift
+.endm
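+
+@ Likewise the 4-point butterfly above, in plain C terms:
+@   e0 = 64 * (src0 + src2);     e1 = 64 * (src0 - src2);
+@   o0 = 83 * src1 + 36 * src3;  o1 = 36 * src1 - 83 * src3;
+@   dst0 = (e0 + o0 + rnd) >> shift;  dst1 = (e1 + o1 + rnd) >> shift;
+@   dst2 = (e1 - o1 + rnd) >> shift;  dst3 = (e0 - o0 + rnd) >> shift;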
+
+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \
+ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
+ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \
+ shift, I1, I2, I3
+
+ vmull.s16 q4, \d1, d1[1] // 89 * src1
+ \I1
+ vmull.s16 q5, \d1, d1[0] // 75 * src1
+ \I2
+ vmull.s16 q6, \d1, d1[3] // 50 * src1
+ \I3
+ vmull.s16 q7, \d1, d1[2] // 18 * src1
+ vmlal.s16 q4, \d3, d1[0] // 75 * src3
+ vmlsl.s16 q5, \d3, d1[2] //-18 * src3
+ vmlsl.s16 q6, \d3, d1[1] //-89 * src3
+ vmlsl.s16 q7, \d3, d1[3] //-50 * src3
+
+ // tr4
+ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2)
+ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2)
+
+ vmlal.s16 q4, \d5, d1[3] // 50 * src5
+ vmlsl.s16 q5, \d5, d1[1] //-89 * src5
+ vmlal.s16 q6, \d5, d1[2] // 18 * src5
+ vmlal.s16 q7, \d5, d1[0] // 75 * src5
+
+ vshll.s16 q3, \d0, #6 // 64 * src(0*2)
+ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2)
+ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0
+ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1
+ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0
+ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1
+
+ vmlal.s16 q4, \d7, d1[2] // 18 * src7
+ vmlsl.s16 q5, \d7, d1[3] //-50 * src7
+ vmlal.s16 q6, \d7, d1[0] // 75 * src7
+ vmlsl.s16 q7, \d7, d1[1] //-89 * src7
+
+ vsub.i32 q3, \tmp1, q1 // e0 - o0
+ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0
+ vadd.i32 q1, \tmp0, q2 // e1 + o1
+ vsub.i32 q2, \tmp0, q2 // e1 - o1
+
+ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0]
+ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7]
+ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4]
+ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3]
+ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1]
+ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6]
+ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5]
+ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2]
+ vqrshrn.s32 \d0, \tmp0, #\shift
+ vqrshrn.s32 \d4, \tmp1, #\shift
+ vqrshrn.s32 \d1, q3, #\shift
+ vqrshrn.s32 \d5, q1, #\shift
+ vqrshrn.s32 \d2, q6, #\shift
+ vqrshrn.s32 \d6, q5, #\shift
+ vqrshrn.s32 \d3, q7, #\shift
+ vqrshrn.s32 \d7, q4, #\shift
+.endm
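+
+@ The combine step above is the usual partial butterfly: for j = 0..3,
+@   dst[j]     = (e_8[j] + o_8[j] + rnd) >> shift
+@   dst[7 - j] = (e_8[j] - o_8[j] + rnd) >> shift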
+
+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
+ vld1.16 {\d0}, [r0 :64], r3
+ vld1.16 {\d1}, [r2 :64], r3
+ vld1.16 {\d2}, [r0 :64], r3
+ vld1.16 {\d3}, [r2 :64], r3
+ vld1.16 {\d4}, [r0 :64], r3
+ vld1.16 {\d5}, [r2 :64], r3
+ vld1.16 {\d6}, [r0 :64], r3
+ vld1.16 {\d7}, [r2 :64], r3
+
+ tr8_process \
+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
+ \q01, \q23, 7, "\I1", "\I2", "\I3"
+.endm
+
+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
+ tr8_process \
+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
+ \q01, \q23, \shift
+
+ vzip.16 \d0, \d4
+ vzip.16 \d1, \d5
+ vzip.16 \d2, \d6
+ vzip.16 \d3, \d7
+ vst4.16 {\d0-\d3}, [r0 :128], r3
+ vst4.16 {\d4-\d7}, [r2 :128], r3
+.endm
+
+#define BIT_DEPTH 8
+#include "rpi_hevc_idct_fn_neon.S"
+
+.text
+
+.align 4
+tr4f:
+.word 0x00240053 // d0[1] = 36, d0[0] = 83
+.word 0x00000000
+tr8f:
+.word 0x0059004b // d1[1] = 89, d1[0] = 75
+.word 0x00320012 // d1[3] = 50, d1[2] = 18
+tr16:
+.word 0x005a0057 // d2[1] = 90, d2[0] = 87
+.word 0x00500046 // d2[3] = 80, d2[2] = 70
+.word 0x0039002b // d3[1] = 57, d3[0] = 43
+.word 0x00190009 // d3[3] = 25, d3[2] = 9
+
+#undef BIT_DEPTH
+#define BIT_DEPTH 10
+#include "rpi_hevc_idct_fn_neon.S"
+
diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c
new file mode 100644
index 0000000000..109fa98c29
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/rpi_hevcdsp.h"
+#include "rpi_hevcdsp_arm.h"
+
+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevcdsp_rpi_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
new file mode 100644
index 0000000000..9294ab8010
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/rpi_hevcdsp.h"
+#include "rpi_hevcdsp_arm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/bit_depth_template.c"
+
+// NEON inter pred fns for qpel & epel (non-sand) exist in the git history
+// but have been removed from HEAD as we never use them.
+
+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
+ uint8_t * _pix_l);
+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
+ uint8_t * _pix_l);
+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
+
+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
+
+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+
+
+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+
+
+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_v);
+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride, int dc_u);
+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
+ ptrdiff_t stride);
+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+
+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+
+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height);
+
+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+
+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+
+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+
+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+ int in_inc0, int in_inc1);
+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
+
+
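+// The 48 and 24 wide SAO helpers below have no dedicated asm: they chain
+// the 32+16 and 16+8 wide primitives. The dst/src offsets are in bytes,
+// so they double at 10 bits per sample (and double again for interleaved
+// chroma); the incoming width argument is ignored in favour of the fixed
+// split.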
+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
+}
+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
+}
+
+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+
+#if SAO_FILTER_N == 6
+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
+}
+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
+}
+
+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+
+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+ int eo, int width, int height)
+{
+ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+
+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+#endif
+
+
+
+#if RPI_HEVC_SAO_BUF_STRIDE != 160
+#error SAO edge src stride not 160 - value used in .S
+#endif
+
+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8;
+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8;
+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8;
+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8;
+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8;
+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8;
+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8;
+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8;
+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8;
+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8;
+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8;
+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8;
+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8;
+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8;
+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8;
+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8;
+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8;
+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8;
+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8;
+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8;
+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8;
+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8;
+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8;
+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8;
+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8;
+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8;
+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8;
+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8;
+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8;
+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8;
+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8;
+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8;
+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8;
+#endif
+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8;
+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8;
+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8;
+
+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8;
+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8;
+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8;
+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8;
+#endif
+ }
+ else if (bit_depth == 10) {
+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10;
+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10;
+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10;
+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10;
+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10;
+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10;
+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10;
+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10;
+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10;
+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10;
+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10;
+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10;
+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10;
+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10;
+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10;
+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10;
+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10;
+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10;
+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10;
+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10;
+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10;
+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10;
+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10;
+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10;
+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10;
+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10;
+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10;
+
+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10;
+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10;
+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10;
+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10;
+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10;
+#if SAO_FILTER_N == 6
+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10;
+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10;
+#endif
+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10;
+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10;
+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10;
+
+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10;
+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10;
+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10;
+
+#if SAO_FILTER_N == 6
+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10;
+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10;
+#endif
+ }
+
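+    // The NEON boundary-strength code addresses HEVCRpiMvField fields by
+    // fixed byte offset, so pin the struct layout here.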
+ assert(offsetof(HEVCRpiMvField, mv) == 0);
+ assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
+ assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
+ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
new file mode 100644
index 0000000000..93876d14c0
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
@@ -0,0 +1,620 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+ .arch_extension mp @ enable PLDW
+
+#define BIT_DEPTH 10
+
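+@ Clamp four q registers of s16 lanes to the inclusive range
+@ [Q_MIN, Q_MAX]; used after vqadd to keep results inside the valid
+@ sample range 0..(1 << BIT_DEPTH) - 1.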
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
+
+@ add_residual4x4(
+@ uint16_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
+ add ip, r0, r2
+ vld1.16 {q10, q11}, [r1]
+ lsl r2, #1
+ vld1.16 {d0}, [r0 :64], r2
+ vld1.16 {d1}, [ip :64], r2
+ vld1.16 {d2}, [r0 :64]
+ vld1.16 {d3}, [ip :64]
+ sub r0, r2
+ vqadd.s16 q0, q10
+ sub ip, r2
+ vqadd.s16 q1, q11
+ vmov.i16 q8, #0
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0 :64], r2
+ vst1.16 {d1}, [ip :64], r2
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d3}, [ip :64]
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
+ add ip, r0, r1
+ vdup.16 q15, r2
+ lsl r1, #1
+ vld1.16 {d0}, [r0 :64], r1
+ vld1.16 {d1}, [ip :64], r1
+ vld1.16 {d2}, [r0 :64]
+ vld1.16 {d3}, [ip :64]
+ sub r0, r1
+ vqadd.s16 q0, q15
+ sub ip, r1
+ vqadd.s16 q1, q15
+ vmov.i16 q8, #0
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ vmax.s16 q0, q0, q8
+ vmax.s16 q1, q1, q8
+ vmin.s16 q0, q0, q9
+ vmin.s16 q1, q1, q9
+ vst1.16 {d0}, [r0 :64], r1
+ vst1.16 {d1}, [ip :64], r1
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d3}, [ip :64]
+ bx lr
+
+endfunc
+
+
+@ add_residual8x8(
+@ uint16_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
+ mov r3, #8
+ vmov.i64 q8, #0
+ add ip, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r2, #1
+1:
+ vldm r1!, {q10-q13}
+ vld1.16 {q0}, [r0 :128], r2
+ vld1.16 {q1}, [ip :128], r2
+ vld1.16 {q2}, [r0 :128]
+ vld1.16 {q3}, [ip :128]
+ sub r0, r2
+ vqadd.s16 q0, q10
+ sub ip, r2
+ vqadd.s16 q1, q11
+ subs r3, #4
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0}, [r0 :128], r2
+ vst1.16 {q1}, [ip :128], r2
+ vst1.16 {q2}, [r0 :128], r2
+ vst1.16 {q3}, [ip :128], r2
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual4x4_dc_c(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
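+@
+@ dc_uv packs two 16-bit DC offsets into one word (presumably U in the
+@ low half, as the first sample of an interleaved pair); vdup.32
+@ replicates the pair so the s16 lanes alternate U,V to match the
+@ interleaved chroma layout.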
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
+ mov r3, #4
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual8x8_dc(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r2
+ mov r3, #8
+9:
+ vmov.i16 q8, #0
+ add ip, r0, r1
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r1, #1
+1:
+ vld1.16 {q0}, [r0 :128], r1
+ vld1.16 {q1}, [ip :128], r1
+ vld1.16 {q2}, [r0 :128]
+ vld1.16 {q3}, [ip :128]
+ sub r0, r1
+ vqadd.s16 q0, q15
+ sub ip, r1
+ vqadd.s16 q1, q15
+ subs r3, #4
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0}, [r0 :128], r1
+ vst1.16 {q1}, [ip :128], r1
+ vst1.16 {q2}, [r0 :128], r1
+ vst1.16 {q3}, [ip :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ add_residual16x16(
+@ uint16_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
+ add ip, r0, r2
+ vmov.i16 q8, #0
+ lsl r2, #1
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ mov r3, #16
+1:
+ vldm r1!, {q10-q13}
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0 :128]
+ subs r3, #2
+ vld1.16 {q2, q3}, [ip :128]
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0, q1}, [r0 :128], r2
+ vst1.16 {q2, q3}, [ip :128], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual8x8_dc_c(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
+ mov r3, #8
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual16x16_dc(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
+ vdup.i16 q15, r2
+ mov r3, #16
+9:
+ vmov.i16 q8, #0
+ add ip, r0, r1
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r1, #1
+1:
+ @ For RPI Sand we could guarantee :256 but not for general
+ @ non-RPI allocation. :128 is as good as we can claim
+ vld1.16 {q0, q1}, [r0 :128]
+ subs r3, #2
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q15
+ vld1.16 {q2, q3}, [ip :128]
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0, q1}, [r0 :128], r1
+ vst1.16 {q2, q3}, [ip :128], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+
+@ add_residual32x32(
+@ uint16_t *_dst, [r0]
+@ int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
+ push {lr}
+ mov r3, #32
+ vmov.i16 q8, #0
+ add lr, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ add ip, r0, #32
+1:
+ vldm r1!, {q10-q13}
+ vldm r0, {q0-q3}
+ vqadd.s16 q0, q10
+ pldw [lr]
+ vqadd.s16 q1, q11
+ add lr, r2
+ vqadd.s16 q2, q12
+ subs r3, #1
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0-q1}, [r0], r2
+ vst1.16 {q2-q3}, [ip], r2
+ bne 1b
+ pop {pc}
+
+endfunc
+
+@ add_residual16x16_dc_c(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc_uv) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
+ mov r3, #16
+ vdup.32 q15, r2
+ b 9f
+endfunc
+
+@ add_residual32x32_dc(
+@ uint16_t *_dst, [r0]
+@ ptrdiff_t stride, [r1]
+@ int dc) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r2
+ mov r3, #32
+9:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ add ip, r0, #32
+1:
+ vldm r0, {q0-q3}
+ vqadd.s16 q0, q15
+ subs r3, #1
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst1.16 {q0-q1}, [r0], r1
+ vst1.16 {q2-q3}, [ip], r1
+ bne 1b
+ bx lr
+
+endfunc
+
+@ ============================================================================
+@ U add
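+@
+@ In the interleaved chroma plane the U samples occupy the even 16-bit
+@ lanes. The _u functions add the residual to those lanes and a dup'd DC
+@ offset (the other plane's DC) to the odd V lanes; the _v functions
+@ below swap the two roles.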
+
+@ add_residual4x4_u(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ vld1.16 {q10, q11}, [r1 :256]
+ lsl r2, #1
+ vld2.16 {d0, d2}, [r0 :128], r2
+ vld2.16 {d1, d3}, [ip :128], r2
+ vld2.16 {d4, d6}, [r0 :128]
+ vld2.16 {d5, d7}, [ip :128]
+ sub r0, r2
+ vmov.i16 q8, #0
+ sub ip, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0 :128], r2
+ vst2.16 {d1, d3}, [ip :128], r2
+ vst2.16 {d4, d6}, [r0 :128]
+ vst2.16 {d5, d7}, [ip :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_u(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ mov r3, #8
+ vmov.i16 q8, #0
+ add ip, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r2, #1
+1:
+ vld2.16 {q0, q1}, [r0 :256]
+ subs r3, #2
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q10, q11}, [r1 :256]!
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q15
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_u(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
+ push {lr}
+ vdup.16 q15, r3
+ mov r3, #16
+ vmov.i16 q8, #0
+ add lr, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ add ip, r0, #32
+1:
+ vld2.16 {q0, q1}, [r0 :256]
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q10, q11}, [r1 :256]!
+ vqadd.s16 q0, q10
+ pldw [lr]
+ vqadd.s16 q1, q15
+ add lr, r2
+ vqadd.s16 q2, q11
+ subs r3, #1
+ vqadd.s16 q3, q15
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ pop {pc}
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ vld1.16 {q10, q11}, [r1 :256]
+ lsl r2, #1
+ vld2.16 {d0, d2}, [r0 :128], r2
+ vld2.16 {d1, d3}, [ip :128], r2
+ vld2.16 {d4, d6}, [r0 :128]
+ vld2.16 {d5, d7}, [ip :128]
+ sub r0, r2
+ vmov.i16 q8, #0
+ sub ip, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0 :128], r2
+ vst2.16 {d1, d3}, [ip :128], r2
+ vst2.16 {d4, d6}, [r0 :128]
+ vst2.16 {d5, d7}, [ip :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
+ vdup.16 q15, r3
+ mov r3, #8
+ vmov.i16 q8, #0
+ add ip, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r2, #1
+1:
+ vld2.16 {q0, q1}, [r0 :256]
+ subs r3, #2
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q10, q11}, [r1 :256]!
+ vqadd.s16 q0, q15
+ vqadd.s16 q1, q10
+ vqadd.s16 q2, q15
+ vqadd.s16 q3, q11
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ bx lr
+endfunc
+
+@ add_residual16x16_v(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc) [r3]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
+ push {lr}
+ vdup.16 q15, r3
+ mov r3, #16
+ vmov.i16 q8, #0
+ add lr, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ add ip, r0, #32
+1:
+ vld2.16 {q0, q1}, [r0 :256]
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q10, q11}, [r1 :256]!
+ vqadd.s16 q0, q15
+ pldw [lr]
+ vqadd.s16 q1, q10
+ add lr, r2
+ vqadd.s16 q2, q15
+ subs r3, #1
+ vqadd.s16 q3, q11
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ pop {pc}
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
+ vmov.i16 q8, #0
+ add ip, r0, r2
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ lsl r2, #1
+ vldm r1, {q10-q13}
+ vld2.16 {d0, d2}, [r0 :128], r2
+ vld2.16 {d1, d3}, [ip :128], r2
+ vld2.16 {d4, d6}, [r0 :128]
+ vld2.16 {d5, d7}, [ip :128]
+
+ sub r0, r2
+ vqadd.s16 q0, q10
+ sub ip, r2
+ vqadd.s16 q1, q12
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+
+ vst2.16 {d0, d2}, [r0 :128], r2
+ vst2.16 {d1, d3}, [ip :128], r2
+ vst2.16 {d4, d6}, [r0 :128]
+ vst2.16 {d5, d7}, [ip :128]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
+ push {lr}
+ add ip, r0, r2
+ lsl r2, #1
+ vmov.i16 q8, #0
+ add r3, r1, #(8*8*2) @ Offset to V
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ mov lr, #8
+1:
+ vld1.16 {q10, q11}, [r1 :256]!
+ subs lr, #2
+ vld2.16 {q0, q1}, [r0 :256]
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q12, q13}, [r3 :256]!
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q12
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ pop {pc}
+endfunc
+
+@ add_residual16x16_c(
+@ uint16_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
+ push {r4, lr}
+ vmov.i16 q8, #0
+ add r3, r1, #(16*16*2) @ Offset to V
+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
+ add ip, r0, #32
+ add r4, r0, r2
+ mov lr, #16
+1:
+ vld2.16 {q0, q1}, [r0 :256]
+ vld2.16 {q2, q3}, [ip :256]
+ vld1.16 {q10, q11}, [r1 :256]!
+ vld1.16 {q12, q13}, [r3 :256]!
+ vqadd.s16 q0, q10
+ pldw [r4]
+ vqadd.s16 q1, q12
+ add r4, r2
+ vqadd.s16 q2, q11
+ subs lr, #1
+ vqadd.s16 q3, q13
+ clip16_4 q0, q1, q2, q3, q8, q9
+ vst2.16 {q0, q1}, [r0 :256], r2
+ vst2.16 {q2, q3}, [ip :256], r2
+ bne 1b
+ pop {r4,pc}
+endfunc
+
diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
new file mode 100644
index 0000000000..d9a1d7d98c
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
@@ -0,0 +1,741 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+ .arch_extension mp @ enable PLDW
+
+@ General notes:
+@
+@ The residual is only guaranteed to fit in 16 bits, so we must use the
+@ vmovl, vqadd, vqmovun sequence rather than the cheaper vaddw, vqmovun
+@ (which would suffice only if the residual were clipped to 15 bits,
+@ since the widening add could then never overflow s16).
+@
+@ The DC case is the exception: its transform output is guaranteed small
+@ enough that the first add cannot overflow, so vaddw is used there.
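+@
+@ Worked example (values chosen to show the failure mode): pixel = 255,
+@ residual = +32767. vaddw.u8 gives 32767 + 255 = 33022, which wraps to
+@ -32514 in s16, so the following vqmovun would clamp to 0 instead of
+@ 255. vmovl.u8 + vqadd.s16 saturates the add at 32767 and vqmovun then
+@ clamps to 255 as required. With a 15-bit residual the worst case is
+@ 16383 + 255 = 16638, which still fits in s16.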
+
+@ ============================================================================
+@ Y add
+
+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1
+ add ip, r0, r2
+ vld1.16 {q0, q1}, [r1]
+ lsl r2, #1
+ vld1.32 d4[0], [r0], r2
+ rsb r3, r2, #0
+ vld1.32 d4[1], [ip], r2
+ vld1.32 d5[0], [r0], r3
+ vld1.32 d5[1], [ip], r3
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vqadd.s16 q0, q8
+ vqadd.s16 q1, q9
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r2
+ vst1.32 d0[1], [ip], r2
+ vst1.32 d1[0], [r0]
+ vst1.32 d1[1], [ip]
+ bx lr
+endfunc
+
+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1
+ push {r4, lr}
+ vld1.16 {q0, q1}, [r1]!
+ add ip, r0, r2
+ vld1.8 {d6}, [r0]
+ add r4, r0, r2, lsl #1
+ vld1.8 {d7}, [ip]
+ add lr, ip, r2, lsl #1
+ lsl r2, #1
+ mov r3, #8-2
+ vmovl.u8 q2, d6
+ vmovl.u8 q3, d7
+ vqadd.s16 q2, q0
+ vqadd.s16 q3, q1
+1:
+ vld1.16 {q0, q1}, [r1]!
+ subs r3, #2
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vld1.8 {d6}, [r4], r2
+ vld1.8 {d7}, [lr], r2
+ vst1.8 {d4}, [r0], r2
+ vst1.8 {d5}, [ip], r2
+ vmovl.u8 q2, d6
+ pldw [r4]
+ vmovl.u8 q3, d7
+ vqadd.s16 q2, q0
+ vqadd.s16 q3, q1
+ bne 1b
+
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vst1.8 {d4}, [r0]
+ vst1.8 {d5}, [ip]
+ pop {r4, pc}
+endfunc
+
+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1
+ vld1.16 {q0, q1}, [r1]!
+ add ip, r0, r2
+ vld1.8 {q3}, [r0]
+ mov r3, #16-1
+ vmovl.u8 q2, d6
+ vmovl.u8 q3, d7
+ vqadd.s16 q2, q0
+ vqadd.s16 q3, q1
+1:
+ vld1.16 {q0, q1}, [r1]!
+ subs r3, #1
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vld1.8 {q3}, [ip], r2
+ vst1.8 {q2}, [r0], r2
+ vmovl.u8 q2, d6
+ pldw [ip]
+ vmovl.u8 q3, d7
+ vqadd.s16 q2, q0
+ vqadd.s16 q3, q1
+ bne 1b
+
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vst1.8 {q2}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1
+ vldm r1!, {q0-q3}
+ vld1.8 {q8, q9}, [r0]
+ add ip, r0, r2
+ vmovl.u8 q10, d16
+ mov r3, #32-1
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q10, q0
+ vqadd.s16 q11, q1
+ vqadd.s16 q12, q2
+ vqadd.s16 q13, q3
+1:
+ vldm r1!, {q0-q3}
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q11
+ vqmovun.s16 d22, q12
+ vqmovun.s16 d23, q13
+ vld1.8 {q8, q9}, [ip], r2
+ subs r3, #1
+ vst1.8 {q10, q11}, [r0], r2
+ vmovl.u8 q10, d16
+ pldw [ip]
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q10, q0
+ vqadd.s16 q11, q1
+ vqadd.s16 q12, q2
+ vqadd.s16 q13, q3
+ bne 1b
+
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q11
+ vqmovun.s16 d22, q12
+ vqmovun.s16 d23, q13
+ vst1.8 {q10, q11}, [r0]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
+ add ip, r0, r1
+ vdup.16 q15, r2
+ lsl r1, #1
+ vld1.32 d4[0], [r0], r1
+ rsb r3, r1, #0
+ vld1.32 d4[1], [ip], r1
+ vld1.32 d5[0], [r0], r3
+ vld1.32 d5[1], [ip], r3
+ vaddw.u8 q0, q15, d4
+ vaddw.u8 q1, q15, d5
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d0[1], [ip], r1
+ vst1.32 d1[0], [r0]
+ vst1.32 d1[1], [ip]
+ bx lr
+endfunc
+
+@ ============================================================================
+@ DC Y or C add
+
+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1
+ mov r3, #4-2
+ vdup.32 q15, r2
+ b 1f
+endfunc
+
+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #8-2
+1: vld1.8 d16, [r0]
+ add ip, r0, r1
+ push {r4, lr}
+ vld1.8 d17, [ip]
+ add r4, r0, r1, lsl #1
+ vaddw.u8 q0, q15, d16
+ lsl r1, #1
+ vaddw.u8 q1, q15, d17
+ add lr, ip, r1
+1:
+ vld1.8 {d16}, [r4], r1
+ vld1.8 {d17}, [lr], r1
+ subs r3, #2
+ vqmovun.s16 d4, q0
+ vqmovun.s16 d5, q1
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d5}, [ip], r1
+ bne 1b
+
+ vqmovun.s16 d4, q0
+ vqmovun.s16 d5, q1
+ vst1.8 {d4}, [r0]
+ vst1.8 {d5}, [ip]
+ pop {r4, pc}
+endfunc
+
+
+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1
+ mov r3, #8-1
+ vdup.32 q15, r2
+ b 1f
+endfunc
+
+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #16-1
+1: vld1.8 {q8}, [r0]
+ add ip, r0, r1
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+1:
+ vld1.8 {q8}, [ip], r1
+ subs r3, #1
+ vqmovun.s16 d4, q0
+ vqmovun.s16 d5, q1
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vst1.8 {q2}, [r0], r1
+ bne 1b
+
+ vqmovun.s16 d4, q0
+ vqmovun.s16 d5, q1
+ vst1.8 {q2}, [r0]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1
+ mov r3, #16-1
+ vdup.32 q15, r2
+ b 1f
+endfunc
+
+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
+@ uint8_t * dst, // [r0]
+@ unsigned int stride, // [r1]
+@ int dc) // [r2]
+
+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
+ vdup.16 q15, r2
+ mov r3, #32-1
+1: vld1.8 {q8, q9}, [r0]
+ add ip, r0, r1
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+1:
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d22, q2
+ vqmovun.s16 d23, q3
+ vld1.8 {q8, q9}, [ip], r1
+ subs r3, #1
+ vaddw.u8 q0, q15, d16
+ vaddw.u8 q1, q15, d17
+ vaddw.u8 q2, q15, d18
+ vaddw.u8 q3, q15, d19
+ vst1.8 {q10, q11}, [r0], r1
+ bne 1b
+
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d22, q2
+ vqmovun.s16 d23, q3
+ vst1.8 {q10, q11}, [r0]
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U add
+
+@ add_residual4x4_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
+ add ip, r0, r2
+ vld1.16 {q0, q1}, [r1]
+ lsl r2, #1
+ vld1.8 {d16}, [r0 :64], r2
+ vld1.8 {d17}, [ip :64], r2
+ vld1.8 {d18}, [r0 :64]
+ sub r0, r2
+ vld1.8 {d19}, [ip :64]
+ sub ip, r2
+ vdup.16 q2, r3
+ vdup.16 q3, r3
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0 :64], r2
+ vst1.8 {d1}, [ip :64], r2
+ vst1.8 {d2}, [r0 :64]
+ vst1.8 {d3}, [ip :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ push {r4, lr}
+ vld2.8 {d16, d17}, [r0 :128]
+ lsl r2, #1
+ vld2.8 {d18, d19}, [ip :128]
+ mov r3, #8-2
+ vld1.16 {q0, q1}, [r1 :256]!
+ add r4, r0, r2
+ vmovl.u8 q10, d16
+ add lr, ip, r2
+ vmovl.u8 q11, d18
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d17
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d19
+1:
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d21, q2
+ vld2.8 {d16, d17}, [r4 :128], r2
+ subs r3, #2
+ vqmovun.s16 d22, q1
+ vqmovun.s16 d23, q3
+ vst2.8 {d20, d21}, [r0 :128], r2
+ vld2.8 {d18, d19}, [lr :128], r2
+ vst2.8 {d22, d23}, [ip :128], r2
+ vld1.16 {q0, q1}, [r1 :256]!
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d18
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d17
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d19
+ bne 1b
+
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d21, q2
+ vqmovun.s16 d22, q1
+ vqmovun.s16 d23, q3
+ vst2.8 {d20, d21}, [r0 :128]
+ vst2.8 {d22, d23}, [ip :128]
+ pop {r4, pc}
+endfunc
+
+@ add_residual16x16_u(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_v) [r3]
+
+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ vld2.8 {q8, q9}, [r0 :256]
+ mov r3, #16-1
+ vld1.16 {q0, q1}, [r1 :256]!
+ vmovl.u8 q11, d16
+ vmovl.u8 q12, d17
+ vqadd.s16 q0, q11
+ vaddw.u8 q11, q15, d18
+ vqadd.s16 q1, q12
+ vaddw.u8 q12, q15, d19
+1:
+ vld2.8 {q8, q9}, [ip :256], r2
+ subs r3, #1
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d23, q12
+ vld1.16 {q0, q1}, [r1 :256]!
+ vst2.8 {q10, q11}, [r0 :256], r2
+ vmovl.u8 q11, d16
+ pldw [ip]
+ vmovl.u8 q12, d17
+ vqadd.s16 q0, q11
+ vaddw.u8 q11, q15, d18
+ vqadd.s16 q1, q12
+ vaddw.u8 q12, q15, d19
+ bne 1b
+
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d23, q12
+ vst2.8 {q10, q11}, [r0 :256]
+ bx lr
+endfunc
+
+@ ============================================================================
+@ V add
+
+@ add_residual4x4_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_u) [r3]
+
+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
+ add ip, r0, r2
+ vld1.16 {q2, q3}, [r1]
+ lsl r2, #1
+ vld1.8 {d16}, [r0 :64], r2
+ vld1.8 {d17}, [ip :64], r2
+ vld1.8 {d18}, [r0 :64]
+ sub r0, r2
+ vld1.8 {d19}, [ip :64]
+ sub ip, r2
+ vdup.16 q0, r3
+ vdup.16 q1, r3
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0 :64], r2
+ vst1.8 {d1}, [ip :64], r2
+ vst1.8 {d2}, [r0 :64]
+ vst1.8 {d3}, [ip :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_u) [r3]
+
+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ push {r4, lr}
+ vld2.8 {d16, d17}, [r0 :128]
+ lsl r2, #1
+ vld2.8 {d18, d19}, [ip :128]
+ mov r3, #8-2
+ vld1.16 {q0, q1}, [r1 :256]!
+ add r4, r0, r2
+ vmovl.u8 q10, d17
+ add lr, ip, r2
+ vmovl.u8 q11, d19
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d16
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d18
+1:
+ vqmovun.s16 d20, q2
+ vqmovun.s16 d21, q0
+ vld2.8 {d16, d17}, [r4 :128], r2
+ subs r3, #2
+ vqmovun.s16 d22, q3
+ vqmovun.s16 d23, q1
+ vst2.8 {d20, d21}, [r0 :128], r2
+ vld2.8 {d18, d19}, [lr :128], r2
+ vst2.8 {d22, d23}, [ip :128], r2
+ vld1.16 {q0, q1}, [r1 :256]!
+ vmovl.u8 q10, d17
+ vmovl.u8 q11, d19
+ vqadd.s16 q0, q10
+ vaddw.u8 q2, q15, d16
+ vqadd.s16 q1, q11
+ vaddw.u8 q3, q15, d18
+ bne 1b
+
+ vqmovun.s16 d20, q2
+ vqmovun.s16 d21, q0
+ vqmovun.s16 d22, q3
+ vqmovun.s16 d23, q1
+ vst2.8 {d20, d21}, [r0 :128]
+ vst2.8 {d22, d23}, [ip :128]
+ pop {r4, pc}
+endfunc
+
+@ add_residual16x16_v(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride, [r2]
+@ int dc_u) [r3]
+
+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
+ vdup.16 q15, r3
+ add ip, r0, r2
+ vld2.8 {q8, q9}, [r0 :256]
+ mov r3, #16-1
+ vld1.16 {q0, q1}, [r1 :256]!
+ vmovl.u8 q11, d18
+ vmovl.u8 q12, d19
+ vqadd.s16 q0, q11
+ vaddw.u8 q11, q15, d16
+ vqadd.s16 q1, q12
+ vaddw.u8 q12, q15, d17
+1:
+ vld2.8 {q8, q9}, [ip :256], r2
+ subs r3, #1
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d22, q0
+ vqmovun.s16 d21, q12
+ vqmovun.s16 d23, q1
+ vld1.16 {q0, q1}, [r1 :256]!
+ vst2.8 {q10, q11}, [r0 :256], r2
+ vmovl.u8 q11, d18
+ pldw [ip]
+ vmovl.u8 q12, d19
+ vqadd.s16 q0, q11
+ vaddw.u8 q11, q15, d16
+ vqadd.s16 q1, q12
+ vaddw.u8 q12, q15, d17
+ bne 1b
+
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d22, q0
+ vqmovun.s16 d21, q12
+ vqmovun.s16 d23, q1
+ vst2.8 {q10, q11}, [r0 :256]
+ bx lr
+endfunc
+
+@ ============================================================================
+@ U & V add
+
+@ add_residual4x4_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
+ add ip, r0, r2
+ vld1.16 {q0, q1}, [r1]! @ all of U
+ lsl r2, #1
+ vld1.8 {d16}, [r0 :64], r2
+ rsb r3, r2, #0
+ vld1.8 {d17}, [ip :64], r2
+ vld1.16 {q2, q3}, [r1] @ all of V
+ vld1.8 {d18}, [r0 :64], r3
+ vld1.8 {d19}, [ip :64], r3
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vzip.16 q0, q2
+ vzip.16 q1, q3
+ vqadd.s16 q0, q10
+ vqadd.s16 q2, q11
+ vqadd.s16 q1, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q2
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q3
+ vst1.8 {d0}, [r0 :64], r2
+ vst1.8 {d1}, [ip :64], r2
+ vst1.8 {d2}, [r0 :64]
+ vst1.8 {d3}, [ip :64]
+ bx lr
+endfunc
+
+@ add_residual8x8_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
+ vld2.8 {d16, d17}, [r0 :128]
+ add r3, r1, #(8*8*2) @ Offset to V
+ vld1.16 {q0}, [r1 :128]!
+ add ip, r0, r2
+ vld1.16 {q1}, [r3 :128]!
+ vmovl.u8 q10, d16
+ push {lr}
+ vmovl.u8 q8, d17
+ mov lr, #8-1
+ vqadd.s16 q10, q0
+ vqadd.s16 q1, q8
+1:
+ vld2.8 {d16, d17}, [ip :128], r2
+ subs lr, #1
+ vld1.16 {q0}, [r1 :128]!
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q1
+ vld1.16 {q1}, [r3 :128]!
+ vst2.8 {d20, d21}, [r0 :128], r2
+ vmovl.u8 q10, d16
+ pldw [ip]
+ vmovl.u8 q8, d17
+ vqadd.s16 q10, q0
+ vqadd.s16 q1, q8
+ bne 1b
+
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q1
+ vst2.8 {d20, d21}, [r0 :128]
+ pop {pc}
+endfunc
+
+@ add_residual16x16_c(
+@ uint8_t *_dst, [r0]
+@ const int16_t *res, [r1]
+@ ptrdiff_t stride) [r2]
+
+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
+ vld2.8 {q8, q9}, [r0 :256]
+ add r3, r1, #(16*16*2) @ Offset to V
+ vld1.16 {q0, q1}, [r1 :256]!
+ add ip, r0, r2
+ vld1.16 {q2, q3}, [r3 :256]!
+ vmovl.u8 q10, d16
+ push {lr}
+ vmovl.u8 q8, d17
+ mov lr, #16-1
+ vmovl.u8 q11, d18
+ vmovl.u8 q9, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q8
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q9
+1:
+ vld2.8 {q8, q9}, [ip :256], r2
+ subs lr, #1
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d22, q2
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d23, q3
+ vld1.16 {q0, q1}, [r1 :256]!
+ vst2.8 {d20-d23}, [r0 :256], r2
+ vld1.16 {q2, q3}, [r3 :256]!
+ vmovl.u8 q10, d16
+ pldw [ip]
+ vmovl.u8 q8, d17
+ vmovl.u8 q11, d18
+ vmovl.u8 q9, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q8
+ vqadd.s16 q2, q11
+ vqadd.s16 q3, q9
+ bne 1b
+
+ vqmovun.s16 d20, q0
+ vqmovun.s16 d22, q2
+ vqmovun.s16 d21, q1
+ vqmovun.s16 d23, q3
+ vst2.8 {d20-d23}, [r0 :256]
+ pop {pc}
+endfunc
+
+@ 32x32 chroma never occurs so NIF (not implemented)
+
+@ ============================================================================
diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
new file mode 100644
index 0000000000..b56e0f9644
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
@@ -0,0 +1,2245 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.set EDGE_SRC_STRIDE, 160
+
+@ PIC jump tables are fractionally more expensive than absolute in our code
+.set jent_pic, CONFIG_PIC
+
+
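+@ 8-bit band filter for 64 bytes of source held in q8-q11.
+@ The u8 source is biased by +128 (vadd.i8 with \Q_K128) so that it can
+@ be treated as s8, the s8 offsets are applied with a saturating add
+@ (vqadd.s8) and the bias is then removed again (vsub.i8), giving a
+@ free clip to 0..255.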
+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
+ vshr.u8 q12, q8, #3
+ \I1
+ vadd.i8 q8, \Q_K128
+ \I2
+ vshr.u8 q13, q9, #3
+ \I3
+ vadd.i8 q9, \Q_K128
+ \I4
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+
+ vqadd.s8 q8, q12
+ vshr.u8 q12, q10, #3
+ vadd.i8 q10, \Q_K128
+ vqadd.s8 q9, q13
+ vshr.u8 q13, q11, #3
+ vadd.i8 q11, \Q_K128
+
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT0, d25
+ vtbl.8 d26, \XLAT1, d26
+ vtbl.8 d27, \XLAT1, d27
+ vqadd.s8 q10, q12
+ vsub.i8 q8, \Q_K128
+ vqadd.s8 q11, q13
+ vsub.i8 q9, \Q_K128
+ vsub.i8 q10, \Q_K128
+ vsub.i8 q11, \Q_K128
+.endm
+
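+@ As sao_band_64b_8 but processing 16 bytes per iteration: the per-row
+@ load (L1-L5) and store (S1-S4) instructions are supplied by the
+@ caller and the loop runs until the caller's "subs" goes negative.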
+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
+ \L1
+ \L2
+ \L3
+ \L4
+ \L5
+ vadd.i8 q12, q8, \Q_K128
+ vshr.u8 q8, #3
+ vtbl.8 d16, \XLAT0, d16
+ vtbl.8 d17, \XLAT1, d17
+ vqadd.s8 q12, q8
+ bmi 2f
+1: \L1
+ \L2
+ \L3
+ \L4
+ \L5
+ vsub.i8 q13, q12, \Q_K128
+ vadd.i8 q12, q8, \Q_K128
+ vshr.u8 q8, #3
+ \S1
+ \S2
+ \S3
+ \S4
+ vtbl.8 d16, \XLAT0, d16
+ vtbl.8 d17, \XLAT1, d17
+ vqadd.s8 q12, q8
+ bpl 1b
+2: vsub.i8 q13, q12, \Q_K128
+ \S1
+ \S2
+ \S3
+ \S4
+.endm
+
+
+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+ vmax.s16 \Q0, \Q_MIN
+ vmax.s16 \Q1, \Q_MIN
+ vmax.s16 \Q2, \Q_MIN
+ vmax.s16 \Q3, \Q_MIN
+ vmin.s16 \Q0, \Q_MAX
+ vmin.s16 \Q1, \Q_MAX
+ vmin.s16 \Q2, \Q_MAX
+ vmin.s16 \Q3, \Q_MAX
+.endm
+
+@ Clobbers q12, q13
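+@ >8 bit band filter: the band class is sample >> (bit_depth - 5)
+@ (vshrn), the looked-up s8 offset is widened and added (vaddw.s8) and
+@ the result is clipped to [0, (1 << bit_depth) - 1] via Q_MIN/Q_MAX.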
+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
+ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
+ \I1
+ vtbl.8 d24, \XLAT0, d24
+ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
+ vtbl.8 d25, \XLAT1, d25
+ \I2
+ vtbl.8 d26, \XLAT0, d26
+ vtbl.8 d27, \XLAT1, d27
+ vaddw.s8 \Q0, d24
+ vaddw.s8 \Q1, d25
+ vaddw.s8 \Q2, d26
+ vaddw.s8 \Q3, d27
+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
+.endm
+
+@ Clobbers q10, q11, q12
+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
+ \L1
+ \L2
+ \L3
+ \L4
+ \L5
+ vshrn.i16 d24, \Q0, #\bit_depth - 5
+ vshrn.i16 d25, \Q1, #\bit_depth - 5
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vaddw.s8 q10, \Q0, d24
+ vaddw.s8 q11, \Q1, d25
+ bmi 2f
+1: \L1
+ \L2
+ \L3
+ \L4
+ \L5
+ vmax.s16 q10, \Q_MIN
+ vmax.s16 q11, \Q_MIN
+ vshrn.i16 d24, \Q0, #\bit_depth - 5
+ vshrn.i16 d25, \Q1, #\bit_depth - 5
+ vmin.s16 q10, \Q_MAX
+ vmin.s16 q11, \Q_MAX
+ \S1
+ \S2
+ \S3
+ \S4
+ vtbl.8 d24, \XLAT0, d24
+ vtbl.8 d25, \XLAT1, d25
+ vaddw.s8 q10, \Q0, d24
+ vaddw.s8 q11, \Q1, d25
+ bpl 1b
+2: vmax.s16 q10, \Q_MIN
+ vmax.s16 q11, \Q_MIN
+ vmin.s16 q10, \Q_MAX
+ vmin.s16 q11, \Q_MAX
+ \S1
+ \S2
+ \S3
+ \S4
+.endm
+
+
+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
+@ so we are quite safe stuffing it into a byte array
+@ There may be a subsequent shift left by log2_sao_offset_scale_luma/chroma
+@ (7.4.3.3.2 and eqn 7-70) but we should still be safe to at least 12 bits
+@ of precision
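+@
+@ As a rough C reference for the band functions below (a sketch only;
+@ the helper name and explicit clip are illustrative, not part of the
+@ patch):
+@
+@   static inline uint8_t band_pel(uint8_t src, const int8_t xlat[32])
+@   {
+@       // band class = top 5 bits of the sample (src >> 3 at 8 bit)
+@       int v = src + xlat[src >> 3];
+@       return v < 0 ? 0 : v > 255 ? 255 : v;
+@   }
+@
+@ xlat[] is zero apart from the 4 entries starting at sao_left_class,
+@ which hold sao_offset_val[1..4].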
+
+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
+@ array via the stack
+@ Given that sao_left_class > 28 can cause wrap we can't just poke
+@ all 4 bytes in at once
+@
+@ It also loads other common regs
+
+@ Beware that the offset read here overreads by 6 bytes so the source must be sized appropriately
+function band_load_y
+ ldr ip, [sp, #16] @ &sao_offset_val[0]
+ ldr r4, [sp, #20] @ sao_left_class
+ vmov.i64 d4, #0
+ vmov.i64 q0, #0
+ pld [r1]
+ vld2.8 {q8}, [ip]
+ sub ip, sp, #8*5
+ vmov.i64 q1, #0
+ add r4, ip, r4
+ vpush {d0-d4} @ Put zero array on stack
+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
+ ldr ip, [ip, #8*5 + 28] @ height
+ vst1.32 {d16[0]}, [r4]
+ add r4, r1, r3
+ vpop {d0-d4} @ Pop modified array
+ sub ip, ip, #1
+ vorr d0, d0, d4
+ bx lr
+endfunc
+
+@ Beware that the offset reads here overread by 6 bytes so the sources must be sized appropriately
+function band_load_c
+ ldr ip, [sp, #16] @ &sao_offset_val1[0]
+ ldr r4, [sp, #20] @ sao_left_class1
+ vmov.i64 d24, #0
+ vmov.i64 q10, #0
+ pld [r1]
+ vld2.8 {q8}, [ip]
+ sub ip, sp, #8*5
+ vmov.i64 q11, #0
+ add r4, ip, r4
+ ldr ip, [sp, #24] @ &sao_offset_val2[0]
+ vpush {d20-d24} @ Put zero array on stack
+ vld2.8 {q9}, [ip]
+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
+ ldr ip, [sp, #8*5 + 28] @ sao_left_class2
+ vst1.32 {d16[0]}, [r4]
+ add ip, sp, ip
+ vshr.u64 d18, d18, #8 @ 1st interesting val is [1]
+ vldmia sp, {d0-d3} @ Load modified array
+ vldr d16, [sp, #8*4]
+ add r4, r1, r3
+ vstmia sp, {d20-d24} @ Put zero array on stack (again)
+ vst1.32 {d18[0]}, [ip]
+ vorr d0, d0, d16
+ vldmia sp, {d4-d7} @ Load modified array
+ vldr d18, [sp, #8*4]
+ ldr ip, [sp, #8*5 + 36] @ height
+ add sp, sp, #8*5
+ vorr d4, d4, d18
+ sub ip, ip, #1
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_64_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_band_64_neon_8, export=1
+ push {r4-r6, lr}
+ vmov.u8 q15, #128
+ bl band_load_y
+
+1: vldmia r1, {q8-q11}
+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
+ "pld [r4]", \
+ "subs ip, #1", \
+ "it ne; addne r4, r3", \
+ "add r1, r3"
+ vstmia r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_32_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_band_32_neon_8, export=1
+ push {r4-r6, lr}
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ vmov.u8 q15, #128
+ bl band_load_y
+
+1: vld1.8 { q8, q9 }, [r1, :128], r3
+ subs ip, #2
+ vld1.8 {q10, q11}, [r6, :128], r3
+
+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
+
+ vst1.8 { q8, q9 }, [r0, :128], r2
+ vst1.8 {q10, q11}, [r5, :128], r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_16_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_band_16_neon_8, export=1
+ push {r4-r6, lr}
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ vmov.u8 q15, #128
+ bl band_load_y
+
+1: vld1.8 { q8}, [r1, :128], r3
+ subs ip, #4
+ vld1.8 { q9}, [r6, :128], r3
+ vld1.8 {q10}, [r1, :128], r3
+ vld1.8 {q11}, [r6, :128], r3
+
+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
+
+ vst1.8 { q8}, [r0, :128], r2
+ vst1.8 { q9}, [r5, :128], r2
+ vst1.8 {q10}, [r0, :128], r2
+ vst1.8 {q11}, [r5, :128], r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_8_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_band_8_neon_8, export=1
+ ldr ip, [sp, #8] @ width
+ push {r4-r6, lr}
+ vmov.u8 q15, #128
+ cmp ip, #8
+ bl band_load_y
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ blt 4f
+
+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
+ "vld1.8 {d16}, [r1, :64], r3", \
+ "subs ip, #2", \
+ "vld1.8 {d17}, [r6, :64], r3", \
+ "", \
+ "", \
+ "vst1.8 {d26}, [r0, :64], r2", \
+ "vst1.8 {d27}, [r5, :64], r2"
+ pop {r4-r6, pc}
+4:
+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
+ "vld1.32 {d16[0]}, [r1, :32], r3", \
+ "subs ip, #4", \
+ "vld1.32 {d16[1]}, [r6, :32], r3", \
+ "vld1.32 {d17[0]}, [r1, :32], r3", \
+ "vld1.32 {d17[1]}, [r6, :32], r3", \
+ "vst1.32 {d26[0]}, [r0, :32], r2", \
+ "vst1.32 {d26[1]}, [r5, :32], r2", \
+ "vst1.32 {d27[0]}, [r0, :32], r2", \
+ "vst1.32 {d27[1]}, [r5, :32], r2"
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_32_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
+ push {r4-r6, lr}
+ add r5, r0, #32
+ add r6, r1, #32
+ vmov.u8 q15, #128
+ bl band_load_c
+
+1: vld2.8 { q8, q9 }, [r1, :128], r3
+ subs ip, #1
+ vld2.8 {q10, q11}, [r6, :128], r3
+
+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
+ "pld [r4]", \
+ "it ne; addne r4, r3"
+
+ vst2.8 { q8, q9 }, [r0, :128], r2
+ vst2.8 {q10, q11}, [r5, :128], r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_16_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
+ push {r4-r6, lr}
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ vmov.u8 q15, #128
+ bl band_load_c
+
+1: vld2.8 { q8, q9 }, [r1, :128], r3
+ subs ip, #2
+ vld2.8 {q10, q11}, [r6, :128], r3
+
+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15
+
+ vst2.8 { q8, q9 }, [r0, :128], r2
+ vst2.8 {q10, q11}, [r5, :128], r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_8_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
+ ldr ip, [sp, #16] @ width
+ push {r4-r6, lr}
+ vmov.u8 q15, #128
+ cmp ip, #8
+ bl band_load_c
+ blt 4f
+
+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
+ "vld2.8 {d16-d17}, [r1, :128], r3", \
+ "subs ip, #1", \
+ "", \
+ "", \
+ "", \
+ "vst2.8 {d26-d27}, [r0, :128], r2"
+ pop {r4-r6, pc}
+4:
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
+ "vld1.8 {d16}, [r1, :64], r3", \
+ "subs ip, #2", \
+ "vld1.8 {d17}, [r6, :64], r3", \
+ "vuzp.8 d16, d17", \
+ "", \
+ "vzip.8 d26, d27", \
+ "vst1.8 {d26}, [r0, :64], r2", \
+ "vst1.8 {d27}, [r5, :64], r2"
+ pop {r4-r6, pc}
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_64_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_64_16 bit_depth
+ push {r4-r6, lr}
+ vmov.i64 q2, #0
+ vmov.i16 q3, #(1 << \bit_depth) - 1
+ bl band_load_y
+ vpush {q4-q7}
+
+1: vldm r1, {q4-q11}
+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
+ "subs ip, #1", \
+ "add r1, r3"
+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
+ vstm r0, {q4-q11}
+ add r0, r2
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_64_neon_10, export=1
+ band_64_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_32_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_32_16 bit_depth
+ push {r4-r6, lr}
+ vmov.i64 q2, #0
+ vmov.i16 q3, #(1 << \bit_depth) - 1
+ bl band_load_y
+
+1: vldm r1, {q8-q11}
+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
+ "subs ip, #1", \
+ "add r1, r3"
+ vstm r0, {q8-q11}
+ add r0, r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_32_neon_10, export=1
+ band_32_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_16_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_16_16 bit_depth
+ push {r4-r6, lr}
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ vmov.i64 q14, #0
+ vmov.i16 q15, #(1 << \bit_depth) - 1
+ bl band_load_y
+
+1: vld1.16 { q8, q9 }, [r1, :128], r3
+ subs r12, #2
+ vld1.16 {q10, q11}, [r6, :128], r3
+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
+ vst1.16 { q8, q9 }, [r0, :128], r2
+ vst1.16 {q10, q11}, [r5, :128], r2
+ bpl 1b
+
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_16_neon_10, export=1
+ band_16_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_8_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+.macro band_8_16 bit_depth
+ ldr ip, [sp, #8] @ width
+ push {r4-r6, lr}
+ vmov.i64 q14, #0
+ cmp ip, #8
+ vmov.i16 q15, #(1 << \bit_depth) - 1
+ bl band_load_y
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ blt 4f
+
+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
+ "vld1.16 {q8}, [r1, :128], r3", \
+ "subs ip, #2", \
+ "vld1.16 {q9}, [r6, :128], r3", \
+ "", \
+ "", \
+ "vst1.16 {q10}, [r0, :128], r2", \
+ "vst1.16 {q11}, [r5, :128], r2"
+ pop {r4-r6, pc}
+4:
+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
+ "vld1.16 {d16}, [r1, :64], r3", \
+ "subs ip, #4", \
+ "vld1.16 {d17}, [r6, :64], r3", \
+ "vld1.16 {d18}, [r1, :64], r3", \
+ "vld1.16 {d19}, [r6, :64], r3", \
+ "vst1.16 {d20}, [r0, :64], r2", \
+ "vst1.16 {d21}, [r5, :64], r2", \
+ "vst1.16 {d22}, [r0, :64], r2", \
+ "vst1.16 {d23}, [r5, :64], r2"
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_8_neon_10, export=1
+ band_8_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_32_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_32_16 bit_depth
+ push {r4-r6, lr}
+ add r5, r0, #32
+ add r6, r1, #32
+ sub r2, #64
+ sub r3, #64
+ vmov.i64 q14, #0
+ vmov.i16 q15, #(1 << \bit_depth) - 1
+ bl band_load_c
+ mov lr, #64
+ vpush {q4-q7}
+
+1: vld2.16 { q4, q5 }, [r1, :128], lr
+ subs ip, #1
+ vld2.16 { q6, q7 }, [r6, :128], lr
+ vld2.16 { q8, q9 }, [r1, :128], r3
+ vld2.16 {q10, q11}, [r6, :128], r3
+
+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+ "pld [r4]", \
+ "it ne; addne r4, r3"
+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
+
+ vst2.16 { q4, q5 }, [r0, :128], lr
+ vst2.16 { q6, q7 }, [r5, :128], lr
+ vst2.16 { q8, q9 }, [r0, :128], r2
+ vst2.16 {q10, q11}, [r5, :128], r2
+
+ bpl 1b
+
+ vpop {q4-q7}
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
+ band_c_32_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_16_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_16_16 bit_depth
+ push {r4-r6, lr}
+ add r5, r0, #32
+ add r6, r1, #32
+ vmov.i64 q14, #0
+ vmov.i16 q15, #(1 << \bit_depth) - 1
+ bl band_load_c
+
+1: vld2.16 { q8, q9 }, [r1, :128], r3
+ subs ip, #1
+ vld2.16 {q10, q11}, [r6, :128], r3
+
+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
+
+ vst2.16 { q8, q9 }, [r0, :128], r2
+ vst2.16 {q10, q11}, [r5, :128], r2
+
+ bpl 1b
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
+ band_c_16_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_8_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+.macro band_c_8_16 bit_depth
+ ldr ip, [sp, #16] @ width
+ push {r4-r6, lr}
+ vmov.i64 q14, #0
+ cmp ip, #8
+ vmov.i16 q15, #(1 << \bit_depth) - 1
+ bl band_load_c
+ blt 4f
+
+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+ "vld2.16 {q8,q9}, [r1, :128], r3", \
+ "subs ip, #1", \
+ "", \
+ "", \
+ "", \
+ "vst2.16 {q10,q11}, [r0, :128], r2"
+ pop {r4-r6, pc}
+4:
+ add r5, r0, r2
+ add r6, r1, r3
+ lsl r2, #1
+ lsl r3, #1
+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+ "vld2.16 {d16,d18}, [r1, :128], r3", \
+ "subs ip, #2", \
+ "vld2.16 {d17,d19}, [r6, :128], r3", \
+ "", \
+ "", \
+ "vst2.16 {d20,d22}, [r0, :128], r2", \
+ "vst2.16 {d21,d23}, [r5, :128], r2"
+ pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
+ band_c_8_16 10
+endfunc
+
+
+@ =============================================================================
+@ SAO EDGE
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
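+@
+@ Rough C reference for the classification below (a sketch; sign() and
+@ clip() are illustrative):
+@
+@   int idx = 2 + sign(c - a) + sign(c - b);  // 0..4, sign() in {-1,0,1}
+@   dst = clip(c + xlat[idx]);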
+
+function edge_64b_body_8
+
+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q1
+ vcgt.u8 q14, q6, q2
+ vcgt.u8 q15, q7, q3
+
+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
+ vcgt.u8 q1, q5
+ vcgt.u8 q2, q6
+ vcgt.u8 q3, q7
+
+ vsub.s8 q0, q12 @ a = sign(c-a)
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
+ vcgt.u8 q13, q5, q9
+ vcgt.u8 q14, q6, q10
+ vcgt.u8 q15, q7, q11
+
+ vsub.s8 q0, q12
+ vsub.s8 q1, q13
+ vsub.s8 q2, q14
+ vsub.s8 q3, q15
+
+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
+ vcgt.u8 q13, q9, q5
+ vcgt.u8 q14, q10, q6
+ vcgt.u8 q15, q11, q7
+
+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
+ vadd.s8 q1, q13
+ vmov.u8 q12, #2
+ vadd.s8 q2, q14
+ vadd.s8 q3, q15
+
+ vadd.s8 q0, q12
+ vadd.s8 q1, q12
+
+ vld1.8 {d26, d27}, [r5]
+
+ vadd.s8 q2, q12
+ vuzp.8 q0, q1
+ vmov.u8 q15, #128
+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
+
+ vtbl.8 d0, {d26}, d0
+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d1, {d26}, d1
+ vadd.s8 q14, q5, q15
+
+ vtbl.8 d2, {d27}, d2
+ vuzp.8 q2, q3
+
+ vtbl.8 d3, {d27}, d3
+
+ vtbl.8 d4, {d26}, d4
+ vzip.8 q0, q1
+
+ vtbl.8 d5, {d26}, d5
+ vqadd.s8 q0, q12
+ vqadd.s8 q1, q14
+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
+
+ vtbl.8 d6, {d27}, d6
+ vtbl.8 d7, {d27}, d7
+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
+ vzip.8 q2, q3
+
+ vsub.s8 q0, q15
+ vqadd.s8 q2, q12
+ vqadd.s8 q3, q14
+ vsub.s8 q1, q15
+ vsub.s8 q2, q15
+ vsub.s8 q3, q15
+
+ bx lr
+endfunc
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ r4 upper clip value
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27. For Y d26=d27
+
+function edge_64b_body_16
+
+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q1
+ vcgt.u16 q14, q6, q2
+ vcgt.u16 q15, q7, q3
+
+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
+ vcgt.u16 q1, q1, q5
+ vcgt.u16 q2, q2, q6
+ vcgt.u16 q3, q3, q7
+
+ vsub.s16 q0, q0, q12 // a = sign(c-a)
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
+ vcgt.u16 q13, q5, q9
+ vcgt.u16 q14, q6, q10
+ vcgt.u16 q15, q7, q11
+
+ vsub.s16 q0, q0, q12
+ vsub.s16 q1, q1, q13
+ vsub.s16 q2, q2, q14
+ vsub.s16 q3, q3, q15
+
+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
+ vcgt.u16 q13, q9, q5
+ vcgt.u16 q14, q10, q6
+ vcgt.u16 q15, q11, q7
+
+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
+ vadd.s16 q1, q1, q13
+ vadd.s16 q2, q2, q14
+ vadd.s16 q3, q3, q15
+
+ vmov.u8 q12, #2
+
+ vmovn.s16 d0, q0
+ vmovn.s16 d1, q1
+ vmovn.s16 d2, q2
+ vmovn.s16 d3, q3
+
+ vldr d26, [r5]
+
+ vuzp.8 q0, q1
+
+ vldr d27, [r5, #8]
+
+ vadd.s8 q0, q0, q12
+ vadd.s8 q1, q1, q12
+
+ vmov.i64 q12, #0
+
+ vtbl.8 d0, {d26}, d0
+ vtbl.8 d1, {d26}, d1
+ vtbl.8 d2, {d27}, d2
+ vtbl.8 d3, {d27}, d3
+
+ vdup.i16 q13, r4
+
+ vzip.8 q0, q1
+
+ @ Avoid overwrite whilst widening
+ vaddw.s8 q2, q6, d2
+ vaddw.s8 q3, q7, d3
+ vaddw.s8 q1, q5, d1
+ vaddw.s8 q0, q4, d0
+
+ @ now clip
+ clip16_4 q2, q3, q1, q0, q12, q13
+
+ bx lr
+endfunc
+
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3, q9, q10
+@
+@ d16, d17 (q8) xlat U, V
+@ q14.u8 #2
+@ q15.u8 #128
+
+function edge_16b_body_8
+ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0
+ vadd.u8 q9, q14, q9
+ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0
+ vsub.u8 q9, q9, q0
+ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0
+ vadd.u8 q9, q9, q0
+ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0
+ vsub.u8 q0, q9, q0
+
+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
+
+ vuzp.8 d0, d1
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+ vqadd.s8 q0, q3
+ vsub.s8 q0, q15
+
+ bx lr
+endfunc
+
+@ a <- c <- b
+@ a in q0
+@ c in q1
+@ b in q2
+@ Temp q3
+@
+@ q12, #0
+@ d16, d17 xlat U, V
+@ q14.u8 #2
+@ q15.u16 max
+function edge_16b_body_16
+ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0
+ vadd.u16 q9, q14, q9
+ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0
+ vsub.u16 q9, q9, q0
+ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0
+ vadd.u16 q9, q9, q0
+ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0
+ vsub.u16 q0, q9, q0
+
+ vmovn.s16 d0, q0
+ @ d1 will have random contents that we transform but
+ @ that doesn't matter as we then discard them
+ vuzp.8 d0, d1
+
+ vtbl.8 d0, {d16}, d0
+ vtbl.8 d1, {d17}, d1
+
+ vzip.8 d0, d1
+
+ vaddw.s8 q0, q1, d0
+
+ @ now clip
+ vmax.s16 q0, q12
+ vmin.s16 q0, q15
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
+@ int eo, [sp, #sp_base + 0]
+@ int width, [sp, #sp_base + 4]
+@ int height) [sp, #sp_base + 8]
+
+@ Jumps via jump_tab with
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ EDGE_SRC_STRIDE [r3]
+@ (1 << \bit_depth) - 1 [r4]
+@ * xlat_table [r5] // setup_64b only
+@ int height [r12]
+@
+@ 0 [q12] // > 8 bit
+@ 2 [q14]
+@ 128 [q15] // = 8 bit
+@ r4 [q15] // > 8 bit
+
+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
+
+@ Build translate registers
+@ As translate values can only be 0-4 we don't care about junk in the rest
+@ of the register
+.if \is_chroma
+ ldr ip, [sp, #0]
+ push {r4-r6, lr} @ 16 bytes
+ vld1.8 {d16[2]}, [r3]
+ add r3, r3, #2
+ vld1.8 {d17[2]}, [ip]
+ add ip, ip, #2
+ vld1.8 {d16[0]}, [r3]
+ add r3, r3, #2
+ vld1.8 {d17[0]}, [ip]
+ add ip, ip, #2
+ vld1.8 {d16[1]}, [r3]
+ add r3, r3, #2
+ vld1.8 {d17[1]}, [ip]
+ add ip, ip, #2
+ vld1.8 {d16[3]}, [r3]
+ add r3, r3, #2
+ vld1.8 {d17[3]}, [ip]
+ add ip, ip, #2
+ vld1.8 {d16[4]}, [r3]
+ vld1.8 {d17[4]}, [ip]
+ movw r3, EDGE_SRC_STRIDE
+.set sp_base, 20
+.else
+ add ip, r3, #4
+ vld1.8 {d16[1]}, [r3]
+ add r3, r3, #2
+ vld1.8 {d17[0]}, [ip]
+ add ip, ip, #2
+ vld1.8 {d16[0]}, [r3]
+ add r3, r3, #6
+ vld1.8 {d17[1]}, [ip]
+ vld1.8 {d16[2]}, [r3]
+ movw r3, EDGE_SRC_STRIDE
+ push {r4-r6, lr} @ 16 bytes
+ vzip.8 d16, d17
+ vmov d17, d16
+.set sp_base, 16
+.endif
+
+@ If setup_64b we need the xlat table on the stack
+.if \setup_64b
+ sub r5, sp, #16
+.endif
+
+@ Get jump address
+@ We have a special case for width 4 as the calling code doesn't detect it
+@ If we may have w4 then we add a 2nd jump table after the 1st
+.if \check_w4
+ ldr r12, [sp, #sp_base + 4] @ width
+ adr r6, \jump_tab
+ ldr lr, [sp, #sp_base + 0] @ eo
+ cmp r12, #8
+ it lt
+ addlt r6, #16
+.else
+ ldr lr, [sp, #sp_base + 0] @ eo
+ adr r6, \jump_tab
+.endif
+
+ ldr r12, [sp, #sp_base + 8] @ height
+
+.if \bit_depth > 8
+ movw r4, (1 << \bit_depth) - 1
+.endif
+.if \setup_16b
+.if \bit_depth > 8
+ vmov.i64 q12, #0
+ vdup.16 q15, r4
+ vmov.u16 q14, #2
+.else
+ vmov.u8 q15, #128
+ vmov.u8 q14, #2
+.endif
+.endif
+
+@ If setup_64b we need q4-q7 saved.
+.if \setup_64b
+ vpush {q4-q8} @ 80 bytes, q8 pushed first
+.set sp_base, sp_base + 80
+.endif
+
+ ldr r6, [r6, lr, lsl #2]
+
+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
+.if \do2
+ push {r0, r1, r6, r12}
+.if jent_pic
+ bl 98f
+.else
+ blx r6
+.endif
+ pop {r0, r1, r6, r12}
+
+ add r0, #64
+ add r1, #64
+.endif
+
+.if jent_pic
+ bl 98f
+.else
+ blx r6
+.endif
+
+@ Tidy up & return
+.if \setup_64b
+ vpop {q4-q8} @ spurious but harmless load of q8
+.endif
+ pop {r4-r6, pc}
+
+.if jent_pic && !\xjump
+@ Magic label - used as 98b in jent macro
+98:
+ add pc, r6
+.endif
+.endm
+
+
+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
+.endm
+
+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
+.endm
+
+
+.macro edge_64b_e0, body_fn, pb
+ sub r1, #8
+ mov r6, lr
+1: vldm r1, {d7-d16}
+ // load a
+ vext.8 q0, q3, q4, #(16 - \pb)
+ add r1, r3
+ vext.8 q1, q4, q5, #(16 - \pb)
+ subs r12, #1
+ vext.8 q2, q5, q6, #(16 - \pb)
+ vext.8 q3, q6, q7, #(16 - \pb)
+ pld [r1]
+ // load b
+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
+ pld [r1, #64]
+ vext.8 q8, q4, q5, #\pb
+ vext.8 q9, q5, q6, #\pb
+ vext.8 q10, q6, q7, #\pb
+ bl \body_fn
+ vstm r0, {q0-q3}
+ add r0, r0, r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_32bx2_e0, body_fn, pb
+ add r6, r1, r3
+ push {r7,lr}
+ sub r1, #8
+ add r7, r0, r2
+ lsl r2, #1
+1: vldmia r1, {d7-d12}
+ // load a
+ vext.8 q0, q3, q4, #16 - \pb
+ add r1, r1, r3, lsl #1
+ vext.8 q1, q4, q5, #16 - \pb
+ subs r12, #2
+ // load b
+ vext.8 q8, q4, q5, #\pb
+ vext.8 q9, q5, q6, #\pb
+ vldr d25, [r6, #-8]
+ vldmia r6, {d12-d15}
+ vldr d26, [r6, #32]
+ // load a
+ vext.8 q2, q12, q6, #16 - \pb
+ add r6, r6, r3, lsl #1
+ vext.8 q3, q6, q7, #16 - \pb
+ // load b
+ vext.8 q10, q6, q7, #\pb
+ vext.8 q11, q7, q13, #\pb
+ bl \body_fn
+ vst1.8 {q0-q1}, [r0, :256], r2
+ vst1.8 {q2-q3}, [r7, :256], r2
+ bgt 1b
+ pop {r7,pc}
+.endm
+
+.macro edge_16b_e0, body_fn, pb
+ sub r1, #8
+ mov r6, lr
+1: vldmia r1, {d1-d4}
+ add r1, r3
+ subs r12, #1
+ vext.8 q0, q0, q1, #16 - \pb
+ vext.8 q2, q1, q2, #\pb
+
+ bl \body_fn
+ vst1.8 {q0}, [r0, :128], r2
+ bgt 1b
+ bx r6
+.endm
+
+.macro edge_8bx2_e0, body_fn, pb
+ add r6, r1, r3
+ push {r7,lr}
+ sub r1, #8
+ add r7, r0, r2
+ lsl r2, #1
+1: vldmia r1, {d1-d2}
+ vldmia r6, {d3-d4}
+ vldr d6, [r1, #16]
+ subs r12, #2
+ vldr d7, [r6, #-8]
+ add r1, r1, r3, lsl #1
+ vext.8 d0, d1, d2, #8 - \pb
+ add r6, r6, r3, lsl #1
+ vext.8 d5, d3, d4, #\pb
+ vext.8 d4, d2, d6, #\pb
+ vext.8 d1, d7, d3, #8 - \pb
+
+ bl \body_fn
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r7, :64], r2
+ bgt 1b
+ pop {r7,pc}
+.endm
+
+.macro edge_4bx4_e0, body_fn, pb
+ add r6, r1, r3
+ push {r7,lr}
+ add r7, r0, r2
+ lsl r2, #1
+
+ tst r1, #4
+ bne 2f
+1: // r1 (and assumed r6) are 64-bit aligned
+ vldr d2, [r1]
+ vldr d0, [r1, #-8]
+ add r1, r1, r3, lsl #1
+ vldr d20, [r6]
+ subs r12, #4
+ vldr d18, [r6, #-8]
+ add r6, r6, r3, lsl #1
+ vldr d3, [r1]
+ vshr.u64 d4, d2, #\pb * 8
+ vldr d1, [r1, #-8]
+ add r1, r1, r3, lsl #1
+ vldr d21, [r6]
+ vext.8 d0, d0, d2, #8 - \pb
+ vldr d19, [r6,#-8]
+ add r6, r6, r3, lsl #1
+ vshr.u64 d22, d20, #\pb * 8
+ vext.8 d18, d18, d20, #8 - \pb
+ vshr.u64 d5, d3, #\pb * 8
+ vext.8 d1, d1, d3, #8 - \pb
+ vshr.u64 d23, d21, #\pb * 8
+ vext.8 d19, d19, d21, #8 - \pb
+ vsli.64 q1, q10, #32
+ vsli.64 q2, q11, #32
+ vsli.64 q0, q9, #32
+
+ bl \body_fn
+ vst1.32 {d0[0]}, [r0, :32], r2
+ vst1.32 {d0[1]}, [r7, :32], r2
+ vst1.32 {d1[0]}, [r0, :32], r2
+ vst1.32 {d1[1]}, [r7, :32], r2
+ bgt 1b
+ pop {r7,pc}
+
+2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned
+ vldr d20, [r1, #-4]
+ vldr d22, [r1, #4]
+ add r1, r1, r3, lsl #1
+ vldr d2, [r6, #-4]
+ subs r12, #4
+ vldr d4, [r6, #4]
+ add r6, r6, r3, lsl #1
+ vldr d21, [r1, #-4]
+ vshl.i64 d18, d20, #\pb * 8
+ vldr d23, [r1, #4]
+ add r1, r1, r3, lsl #1
+ vldr d3, [r6, #-4]
+ vext.8 d22, d20, d22, #\pb
+ vldr d5, [r6, #4]
+ add r6, r6, r3, lsl #1
+ vshl.i64 d0, d2, #\pb * 8
+ vext.8 d4, d2, d4, #\pb
+ vshl.i64 d19, d21, #\pb * 8
+ vext.8 d23, d21, d23, #\pb
+ vshl.i64 d1, d3, #\pb * 8
+ vext.8 d5, d3, d5, #\pb
+ vsri.64 q1, q10, #32
+ vsri.64 q0, q9, #32
+ vsri.64 q2, q11, #32
+
+ bl \body_fn
+ vst1.32 {d0[0]}, [r0, :32], r2
+ vst1.32 {d0[1]}, [r7, :32], r2
+ vst1.32 {d1[0]}, [r0, :32], r2
+ vst1.32 {d1[1]}, [r7, :32], r2
+ bgt 2b
+ pop {r7,pc}
+.endm
+
+
+.macro edge_64b_e1, body_fn
+ sub r1, r3
+ push {lr}
+ add r6, r1, #32
+ // load a
+ vld1.8 {q0-q1}, [r1, :256], r3
+ vld1.8 {q2-q3}, [r6, :256], r3
+ // load c
+ vld1.8 {q4-q5}, [r1, :256], r3
+ vld1.8 {q6-q7}, [r6, :256], r3
+1: // load b
+ vld1.8 {q8-q9}, [r1, :256], r3
+ subs r12, #1
+ vld1.8 {q10-q11}, [r6, :256], r3
+ bl \body_fn
+ vstm r0, {q0-q3}
+ // copy c to a
+ vmov.64 q0, q4
+ pld [r1, r3]
+ vmov.64 q1, q5
+ it le
+ pople {lr}
+ vmov.64 q2, q6
+ it le
+ bxle lr
+ vmov.64 q3, q7
+ add r0, r0, r2
+ // copy b to c
+ vmov.64 q4, q8
+ vmov.64 q5, q9
+ vmov.64 q6, q10
+ vmov.64 q7, q11
+ b 1b
+.endm
+
+.macro edge_32bx2_e1, body_fn
+ sub r6, r1, r3
+ vld1.8 {q2-q3}, [r1, :256], r3
+ vld1.8 {q0-q1}, [r6, :256]
+ mov r6, lr
+
+1: @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vld1.8 {q8-q9}, [r1, :256], r3
+ subs r12, #2
+ vmov q4, q2
+ vmov q5, q3
+ vld1.8 {q10-q11}, [r1, :256], r3
+ vmov q6, q8
+ vmov q7, q9
+
+ bl \body_fn
+
+ vst1.8 {q0-q1}, [r0, :256], r2
+ // copy b to a
+ vmov q0, q8
+ vmov q1, q9
+ vst1.8 {q2-q3}, [r0, :256], r2
+ vmov q2, q10
+ it le
+ bxle r6
+ vmov q3, q11
+ b 1b
+.endm
+
+.macro edge_16b_e1, body_fn
+ sub r6, r1, r3
+ // load c
+ vld1.8 {q1}, [r1, :128], r3
+ // load a
+ vld1.8 {q0}, [r6, :128]
+ mov r6, lr
+1: // load b
+ vld1.8 {q2}, [r1, :128], r3
+ bl \body_fn
+ vst1.8 {q0}, [r0, :128], r2
+ subs r12, #1
+ // copy c to a
+ vmov.64 q0, q1
+ it le
+ bxle r6
+ // copy b to c
+ vmov.64 q1, q2
+ b 1b
+.endm
+
+.macro edge_8bx2_e1, body_fn
+ sub r6, r1, r3
+ lsl r3, #1
+ push {r7, lr}
+ vld1.8 {d1}, [r1, :64], r3
+ vld1.8 {d0}, [r6, :64], r3
+ add r7, r0, r2
+ lsl r2, #1
+1: @ Given the data duplication here we could obviously do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ vld1.8 {d4}, [r6, :64], r3
+ vmov d2, d1
+ vld1.8 {d5}, [r1, :64], r3
+ subs r12, #2
+ vmov d3, d4
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r7, :64], r2
+
+ // copy b to a
+ vmov q0, q2
+ bgt 1b
+ pop {r7, pc}
+.endm
+
+.macro edge_4bx4_e1, body_fn
+ sub r6, r1, r3
+ lsl r3, #1
+ push {r7, lr}
+ vld1.32 {d0[1]}, [r1, :32], r3
+ add r7, r0, r2
+ vld1.32 {d0[0]}, [r6, :32], r3
+ lsl r2, #1
+ vld1.32 {d4[1]}, [r1, :32], r3
+ vld1.32 {d4[0]}, [r6, :32], r3
+ vld1.32 {d5[1]}, [r1, :32], r3
+ vld1.32 {d5[0]}, [r6, :32], r3
+ vmov d1, d4
+ vext.32 d2, d0, d4, #1
+ subs r12, #4
+ vmov d22, d5
+ vext.32 d3, d4, d5, #1
+ b 2f
+
+1: vst1.32 {d0[0]}, [r0, :32], r2
+ vext.32 d2, d22, d4, #1
+ vst1.32 {d0[1]}, [r7, :32], r2
+ vmov d0, d22
+ vst1.32 {d1[0]}, [r0, :32], r2
+ vext.32 d3, d4, d5, #1
+ vst1.32 {d1[1]}, [r7, :32], r2
+ vmov d1, d4
+ vmov d22, d5
+2: @ Given the data duplication here we could probably do better than
+ @ using the generic body_fn but it almost certainly isn't worth it
+ bl \body_fn
+ ble 3f
+ vld1.32 {d4[0]}, [r6, :32], r3
+ subs r12, #4
+ vld1.32 {d4[1]}, [r1, :32], r3
+ vld1.32 {d5[0]}, [r6, :32], r3
+ vld1.32 {d5[1]}, [r1, :32], r3
+ b 1b
+
+3: vst1.32 {d0[0]}, [r0, :32], r2
+ vst1.32 {d0[1]}, [r7, :32], r2
+ vst1.32 {d1[0]}, [r0, :32]
+ vst1.32 {d1[1]}, [r7, :32]
+ pop {r7, pc}
+.endm
+
+.macro edge_64b_e2, body_fn, pb
+ push {lr}
+ sub r6, r1, r3
+ // load c and a
+ vld1.8 {q4-q5}, [r1, :128]
+ vldr d25, [r6, #-8]
+ vldmia r6, {d16-d23}
+ vext.8 q0, q12, q8, #16 - \pb
+ add r6, r1, #32
+ vext.8 q1, q8, q9, #16 - \pb
+ add r1, r1, r3
+ vext.8 q2, q9, q10, #16 - \pb
+ vld1.8 {q6-q7}, [r6, :128]
+ sub r6, r1, r3
+ vext.8 q3, q10, q11, #16 - \pb
+
+1: // load b
+ vldmia r1, {d16-d24}
+ vext.8 q8, q8, q9, #\pb
+ pld [r1, r3]
+ vext.8 q9, q9, q10, #\pb
+ subs r12, #1
+ vext.8 q10, q10, q11, #\pb
+ vext.8 q11, q11, q12, #\pb
+ bl \body_fn
+ // next a is mostly available in c
+ vldr d25, [r6, #-8]
+ vstmia r0, {q0-q3}
+ vext.8 q3, q6, q7, #16 - \pb
+ it le
+ pople {lr}
+ vext.8 q2, q5, q6, #16 - \pb
+ it le
+ bxle lr
+ vext.8 q1, q4, q5, #16 - \pb
+ add r6, r6, r3
+ vext.8 q0, q12, q4, #16 - \pb
+ add r0, r0, r2
+ // next c is mostly available in b
+ vldr d8, [r1]
+ vext.8 d9, d16, d17, #8 - \pb
+ vext.8 q5, q8, q9, #16 - \pb
+ add r1, r1, r3
+ vext.8 q6, q9, q10, #16 - \pb
+ pld [r6, #-8]
+ vext.8 q7, q10, q11, #16 - \pb
+ b 1b
+.endm
+
+.macro edge_32bx2_e2, body_fn, pb
+ sub r6, r1, r3
+ push {r7, lr}
+ add r7, r0, r2
+ lsl r2, #1
+ // load a and first 32b of c
+ vld1.8 {q4-q5}, [r1, :256]
+ vldr d25, [r6, #-8]
+ vld1.8 {q13-q14}, [r6, :256]
+ vldr d31, [r1, #-8]
+ add r6, r6, r3, lsl #1
+ vext.8 q0, q12, q13, #16 - \pb
+ add r1, r1, r3, lsl #1
+ vext.8 q1, q13, q14, #16 - \pb
+ vext.8 q2, q15, q4, #16 - \pb
+ vext.8 q3, q4, q5, #16 - \pb
+1:
+ // load second 32b of c and second 32b of b
+ vldmia r6, {d12-d16}
+ vldmia r1, {d20-d24}
+ // first 32b of b is mostly available in second 32b of c
+ vext.8 q9, q7, q8, #\pb
+ subs r12, #2
+ vext.8 q8, q6, q7, #\pb
+ vext.8 q10, q10, q11, #\pb
+ vext.8 q11, q11, q12, #\pb
+
+ bl \body_fn
+
+ vst1.8 {q0-q1}, [r0, :256], r2
+ vst1.8 {q2-q3}, [r7, :256], r2
+ ble 2f
+
+ vldr d25, [r6, #-8]
+ add r6, r6, r3, lsl #1
+ vldr d8, [r1]
+ vext.8 d9, d20, d21, #8 - \pb
+ vldr d31, [r1, #-8]
+ add r1, r1, r3, lsl #1
+ // first 32b of a is mostly available in second 32b of c
+ vext.8 q1, q6, q7, #16 - \pb
+ vext.8 q0, q12, q6, #16 - \pb
+ // first 32b of c is mostly available in second 32b of b
+ vext.8 q5, q10, q11, #16 - \pb
+ // second 32b of a is mostly available in first 32b of c
+ vext.8 q2, q15, q4, #16 - \pb
+ vext.8 q3, q4, q5, #16 - \pb
+ b 1b
+
+2: pop {r7, pc}
+.endm
+
+.macro edge_16b_e2, body_fn, pb
+ push {lr}
+ sub r6, r1, r3
+ vld1.8 {q1}, [r1, :128], r3
+ vldr d19, [r6, #-8]
+ vld1.8 {q10}, [r6, :128], r3
+
+1: vldmia r1, {d4-d6}
+ vext.8 q0, q9, q10, #16 - \pb
+ subs r12, #1
+ vext.8 q2, q2, q3, #\pb
+ bl \body_fn
+ vst1.8 {q0}, [r0, :128], r2
+ ble 2f
+ vmov q10, q1
+ vldr d2, [r1]
+ add r1, r1, r3
+ vldr d19, [r6, #-8]
+ add r6, r6, r3
+ vext.8 d3, d4, d5, #8 - \pb
+ b 1b
+
+2: pop {pc}
+.endm
+
+.macro edge_8bx2_e2, body_fn, pb
+ sub r6, r1, r3
+ push {r7, lr}
+ add r7, r0, r2
+ lsl r2, #1
+ vldr d18, [r6, #-8]
+ vldr d19, [r6]
+ add r6, r6, r3, lsl #1
+ vldr d20, [r1, #-8]
+ vldr d2, [r1]
+ add r1, r1, r3, lsl #1
+ vldmia r6, {d3-d4}
+ vld1.8 {d21-d22}, [r1, :128]
+
+1: vext.8 d0, d18, d19, #8 - \pb
+ vext.8 d4, d3, d4, #\pb
+ vext.8 d1, d20, d2, #8 - \pb
+ subs r12, #2
+ vext.8 d5, d21, d22, #\pb
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r7, :64], r2
+ ble 2f
+
+ vldr d18, [r6, #-8]
+ add r6, r6, r3, lsl #1
+ vldr d20, [r1, #-8]
+ vmov d19, d3
+ vldr d2, [r1]
+ add r1, r1, r3, lsl #1
+ vldmia r6, {d3-d4}
+ vld1.8 {d21-d22}, [r1, :128]
+ b 1b
+
+2: pop {r7, pc}
+.endm
+
+.macro edge_4bx4_e2, body_fn, pb
+ sub r6, r1, r3
+ push {r7-r9, lr}
+ add r8, r1, r3
+ sub r6, r6, #\pb
+ add r8, r8, #\pb
+ add r7, r0, r2
+ lsl r2, #1
+
+1: vld1.32 {d0[0]}, [r6], r3
+ subs r12, #4
+ vld1.32 {d2[0]}, [r1], r3
+ vld1.32 {d4[0]}, [r8], r3
+ vld1.32 {d0[1]}, [r6], r3
+ vld1.32 {d2[1]}, [r1], r3
+ vld1.32 {d4[1]}, [r8], r3
+ vld1.32 {d1[0]}, [r6], r3
+ vld1.32 {d3[0]}, [r1], r3
+ vld1.32 {d5[0]}, [r8], r3
+ vld1.32 {d1[1]}, [r6], r3
+ vld1.32 {d3[1]}, [r1], r3
+ vld1.32 {d5[1]}, [r8], r3
+
+ bl \body_fn
+
+ vst1.32 {d0[0]}, [r0, :32], r2
+ vst1.32 {d0[1]}, [r7, :32], r2
+ vst1.32 {d1[0]}, [r0, :32], r2
+ vst1.32 {d1[1]}, [r7, :32], r2
+ bgt 1b
+
+ pop {r7-r9,pc}
+.endm
+
+.macro edge_64b_e3, body_fn, pb
+ push {lr}
+ sub r6, r1, r3
+ // load c and a
+ vld1.8 {q4-q5}, [r1, :128]
+ vldmia r6, {d16-d24}
+ vext.8 q0, q8, q9, #\pb
+ add r6, r1, #32
+ vext.8 q1, q9, q10, #\pb
+ add r1, r1, r3
+ vext.8 q2, q10, q11, #\pb
+ vld1.8 {q6-q7}, [r6, :128]
+ sub r6, r1, r3
+ vext.8 q3, q11, q12, #\pb
+
+1: // load b
+ vldr d17, [r1, #-8]
+ vldmia r1, {d18-d25}
+ vext.8 q8, q8, q9, #16 - \pb
+ pld [r1, r3]
+ vext.8 q9, q9, q10, #16 - \pb
+ subs r12, #1
+ vext.8 q10, q10, q11, #16 - \pb
+ vext.8 q11, q11, q12, #16 - \pb
+ bl \body_fn
+ // next a is mostly available in c
+ vldr d24, [r6, #64]
+ vstmia r0, {q0-q3}
+ vext.8 q0, q4, q5, #\pb
+ it le
+ pople {lr}
+ vext.8 q1, q5, q6, #\pb
+ it le
+ bxle lr
+ vext.8 q2, q6, q7, #\pb
+ add r6, r6, r3
+ vext.8 q3, q7, q12, #\pb
+ add r0, r0, r2
+ // next c is mostly available in b
+ vext.8 d14, d22, d23, #\pb
+ vldr d15, [r1, #56]
+ vext.8 q4, q8, q9, #\pb
+ add r1, r1, r3
+ vext.8 q5, q9, q10, #\pb
+ vext.8 q6, q10, q11, #\pb
+ b 1b
+.endm
+
+.macro edge_32bx2_e3, body_fn, pb
+ sub r6, r1, r3
+ push {r7, lr}
+ add r7, r0, r2
+ lsl r2, #1
+ // load a and first 32b of c
+ vldmia r1, {d8-d12}
+ vldmia r6, {d24-d28}
+ vext.8 q2, q4, q5, #\pb
+ add r6, r6, r3, lsl #1
+ vext.8 q3, q5, q6, #\pb
+ add r1, r1, r3, lsl #1
+ vext.8 q0, q12, q13, #\pb
+ vext.8 q1, q13, q14, #\pb
+1:
+ // load second 32b of c and second 32b of b
+ vldr d25, [r6, #-8]
+ subs r12, #2
+ vldmia r6, {d12-d15}
+ vldr d27, [r1, #-8]
+ vldmia r1, {d20-d23}
+ // first 32b of b is mostly available in second 32b of c
+ vext.8 q8, q12, q6, #16 - \pb
+ vext.8 q9, q6, q7, #16 - \pb
+ vext.8 q11, q10, q11, #16 - \pb
+ vext.8 q10, q13, q10, #16 - \pb
+
+ bl \body_fn
+
+ vst1.8 {q0-q1}, [r0, :256], r2
+ vst1.8 {q2-q3}, [r7, :256], r2
+ ble 2f
+
+ vldr d24, [r6, #32]
+ add r6, r6, r3, lsl #1
+ vldr d11, [r1, #24]
+ vext.8 d10, d22, d23, #\pb
+ vldr d30, [r1, #32]
+ add r1, r1, r3, lsl #1
+ // first 32b of a is mostly available in second 32b of c
+ vext.8 q0, q6, q7, #\pb
+ vext.8 q1, q7, q12, #\pb
+ // first 32b of c is mostly available in second 32b of b
+ vext.8 q4, q10, q11, #\pb
+ // second 32b of a is mostly available in first 32b of c
+ vext.8 q3, q5, q15, #\pb
+ vext.8 q2, q4, q5, #\pb
+ b 1b
+
+2: pop {r7, pc}
+.endm
+
+.macro edge_16b_e3, body_fn, pb
+ push {lr}
+ sub r6, r1, r3
+ vld1.8 {q1}, [r1, :128], r3
+ vldmia r6, {d18-d20}
+ add r6, r6, r3
+
+1: vldr d5, [r1, #-8]
+ vld1.8 {q3}, [r1, :128]
+ subs r12, #1
+ vext.8 q0, q9, q10, #\pb
+ vext.8 q2, q2, q3, #16 - \pb
+ bl \body_fn
+ vst1.8 {q0}, [r0, :128], r2
+ ble 2f
+ vmov q9, q1
+ vldr d3, [r1, #8]
+ add r1, r1, r3
+ vldr d20, [r6, #16]
+ add r6, r6, r3
+ vext.8 d2, d4, d5, #\pb
+ b 1b
+
+2: pop {pc}
+.endm
+
+.macro edge_8bx2_e3, body_fn, pb
+ sub r6, r1, r3
+ push {r7, lr}
+ add r7, r0, r2
+ lsl r2, #1
+ vld1.8 {d18-d19}, [r6]
+ add r6, r6, r3, lsl #1
+ vldr d20, [r1, #8]
+ vldr d2, [r1]
+ add r1, r1, r3, lsl #1
+ vldr d4, [r6, #-8]
+ vldr d3, [r6]
+ vldr d21, [r1, #-8]
+ vldr d22, [r1]
+
+1: vext.8 d0, d18, d19, #\pb
+ vext.8 d4, d4, d3, #8 - \pb
+ vext.8 d1, d2, d20, #\pb
+ subs r12, #2
+ vext.8 d5, d21, d22, #8 - \pb
+
+ bl \body_fn
+
+ vst1.8 {d0}, [r0, :64], r2
+ vst1.8 {d1}, [r7, :64], r2
+ ble 2f
+
+ vldr d19, [r6, #8]
+ add r6, r6, r3, lsl #1
+ vldr d20, [r1, #8]
+ vmov d18, d3
+ vldr d2, [r1]
+ add r1, r1, r3, lsl #1
+ vldr d4, [r6, #-8]
+ vldr d3, [r6]
+ vldr d21, [r1, #-8]
+ vldr d22, [r1]
+ b 1b
+
+2: pop {r7, pc}
+.endm
+
+.macro edge_4bx4_e3, body_fn, pb
+ @ e3 is the same as e2 but with the X offset reversed
+ edge_4bx4_e2 \body_fn, (-\pb)
+.endm
+
+@ Jump table entry - if in Thumb mode the bottom bit must be set
+@ ? There is probably a real assembler directive to do this but I haven't found it
+.macro jent lab
+.if jent_pic
+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
+@ simpler and clearer in the code to stick with .word
+T .word (0 + \lab) - (4 + 98b)
+A .word (0 + \lab) - (8 + 98b)
+.else
+T .word 1 + \lab
+A .word \lab
+.endif
+.endm
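+
+@ In PIC mode the table holds offsets relative to the magic 98: label
+@ (an "add pc, r6"): dispatch is then "bl 98f" and the -4/-8 above
+@ allow for PC reading 4 (Thumb) or 8 (ARM) bytes ahead of the add.
+@ In non-PIC mode the table holds absolute addresses for use with blx,
+@ hence the bottom bit for Thumb.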
+
+.macro edge_64b_bodies, body_fn, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+
+0: edge_64b_e0 \body_fn, \pb
+10: edge_64b_e1 \body_fn
+20: edge_64b_e2 \body_fn, \pb
+30: edge_64b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_bodies, body_fn, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+
+0: edge_32bx2_e0 \body_fn, \pb
+10: edge_32bx2_e1 \body_fn
+20: edge_32bx2_e2 \body_fn, \pb
+30: edge_32bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_16b_bodies, body_fn, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+.endm
+
+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+ jent 5f
+ jent 15f
+ jent 25f
+ jent 35f
+
+0: edge_32bx2_e0 \body_fn_64b, \pb
+10: edge_32bx2_e1 \body_fn_64b
+20: edge_32bx2_e2 \body_fn_64b, \pb
+30: edge_32bx2_e3 \body_fn_64b, \pb
+5: edge_16b_e0 \body_fn_16b, \pb
+15: edge_16b_e1 \body_fn_16b
+25: edge_16b_e2 \body_fn_16b, \pb
+35: edge_16b_e3 \body_fn_16b, \pb
+.endm
+
+.macro edge_16b_8bx2_bodies, body_fn, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+ jent 5f
+ jent 15f
+ jent 25f
+ jent 35f
+
+0: edge_16b_e0 \body_fn, \pb
+10: edge_16b_e1 \body_fn
+20: edge_16b_e2 \body_fn, \pb
+30: edge_16b_e3 \body_fn, \pb
+5: edge_8bx2_e0 \body_fn, \pb
+15: edge_8bx2_e1 \body_fn
+25: edge_8bx2_e2 \body_fn, \pb
+35: edge_8bx2_e3 \body_fn, \pb
+.endm
+
+.macro edge_8bx2_4bx4_bodies, body_fn, pb
+ jent 0f
+ jent 10f
+ jent 20f
+ jent 30f
+ jent 5f
+ jent 15f
+ jent 25f
+ jent 35f
+
+0: edge_8bx2_e0 \body_fn, \pb
+10: edge_8bx2_e1 \body_fn
+20: edge_8bx2_e2 \body_fn, \pb
+30: edge_8bx2_e3 \body_fn, \pb
+5: edge_4bx4_e0 \body_fn, \pb
+15: edge_4bx4_e1 \body_fn
+25: edge_4bx4_e2 \body_fn, \pb
+35: edge_4bx4_e3 \body_fn, \pb
+.endm
+
+@ void ff_hevc_rpi_sao_edge_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_8_neon_8, export=1
+ edge_16b_init 8, 0, 1, 99f
+99:
+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_16_neon_8, export=1
+ edge_16b_init 8, 0, 0, 99f
+99:
+ edge_16b_bodies edge_16b_body_8, 1
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_32_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 1
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_64_neon_8(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_64_neon_8, export=1
+ edge_64b_init 8, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 1
+endfunc
+
+@ ff_hevc_rpi_sao_edge_c_8_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
+ edge_16b_init 8, 1, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_8, 2
+endfunc
+
+@ ff_hevc_rpi_sao_edge_c_16_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_8, 2
+endfunc
+
+@ ff_hevc_rpi_sao_edge_c_32_neon_8(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
+ edge_64b_init 8, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_8, 2
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_8_neon_10, export=1
+ edge_16b_init 10, 0, 1, 99f
+99:
+ edge_16b_8bx2_bodies edge_16b_body_16, 2
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_16_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_32bx2_bodies edge_64b_body_16, 2
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_64_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+@ We simply split the 64 case into 2 vertical stripes
+@ and call the fns for w32
+@
+@ Calling code will always have src != dst so we don't have to worry
+@ about edge effects
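+@ (do2=1 makes edge_xxb_init run the selected body twice, stepping
+@ src/dst on by 64 bytes between passes; xjump=1 suppresses the local
+@ magic label, and as no 99: table follows here the 99f/98f references
+@ resolve to those of the w32 function below.)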
+
+function ff_hevc_rpi_sao_edge_64_neon_10, export=1
+ edge_64b_init 10, 0, 1, 99f, xjump=1
+endfunc
+
+@ void ff_hevc_rpi_sao_edge_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ int stride_dst, [r2]
+@ int16_t *_sao_offset_val, [r3]
+@ int eo, [sp, #0]
+@ int width, [sp, #4]
+@ int height) [sp, #8]
+
+function ff_hevc_rpi_sao_edge_32_neon_10, export=1
+ edge_64b_init 10, 0, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 2
+endfunc
+
+@ ff_hevc_rpi_sao_edge_c_8_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
+99:
+ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
+endfunc
+
+@ ff_hevc_rpi_sao_edge_c_32_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
+ edge_64b_init 10, 1, 1, 99f, xjump=1
+endfunc
+
+
+@ ff_hevc_rpi_sao_edge_c_16_neon_10(
+@ uint8_t *_dst, [r0]
+@ const uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ const int16_t *_sao_offset_val_u, [r3]
+@ const int16_t *_sao_offset_val_v, [sp, #0]
+@ int eo, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
+ edge_64b_init 10, 1, 0, 99f
+99:
+ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h
new file mode 100644
index 0000000000..36a23a5bf9
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_arm.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
+#define AVCODEC_ARM_HEVCPRED_ARM_H
+
+#include "libavcodec/rpi_hevcpred.h"
+
+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
+
diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c
new file mode 100644
index 0000000000..80724d4cf3
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+
+#include "libavcodec/rpi_hevcpred.h"
+#include "rpi_hevcpred_arm.h"
+
+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevc_rpi_pred_init_neon(c, bit_depth);
+}
+
diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c
new file mode 100644
index 0000000000..21e7700174
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rpi_hevcpred_arm.h"
+
+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
+
+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+
+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+
+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
+
+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+
+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+
+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
+{
+ switch (bit_depth)
+ {
+ case 8:
+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;
+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8: interleaved CbCr makes an 8-bit chroma pair one 16-bit unit
+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
+
+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
+
+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
+
+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
+
+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
+
+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
+ break;
+ case 10:
+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
+ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
+
+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
+
+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
+
+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
+
+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
+
+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
+ break;
+ default:
+ break;
+ }
+}
+
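For reference, a minimal C sketch of how a decoder is expected to reach these routines once the table above has been filled in; HEVCRpiPredContext and the pred_angular field come from the init code, while the wrapper name and its callers are hypothetical:

    #include <stddef.h>
    #include <stdint.h>
    #include "libavcodec/rpi_hevcpred.h"

    /* Hedged sketch: indices 0..3 select the 4/8/16/32-sample variants,
     * i.e. log2(block size) - 2, matching the assignments above. */
    static void predict_angular(const HEVCRpiPredContext *c,
                                uint8_t *src, const uint8_t *top,
                                const uint8_t *left, ptrdiff_t stride,
                                int log2_size, int mode)
    {
        c->pred_angular[log2_size - 2](src, top, left, stride, mode);
    }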
diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
new file mode 100644
index 0000000000..fa8f67cf03
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
@@ -0,0 +1,2984 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+/*
+ * General angular pred
+ *
+ * Horizontal (10) & Vertical (26) cases have their own file
+ * and are not dealt with properly here (luma filtering is missing)
+ *
+ * The inv_angle calculations are annoying - if it wasn't for the +128
+ * rounding step then the result would simply be the loop counter :-(
+ */
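The projection that makes inv_angle necessary can be sketched in C. The asm below avoids the per-step multiply by keeping a running accumulator (r8/r10, stepped by inv_angle, index taken with asr #8); sign conventions are simplified here and the function and buffer names are illustrative:

    #include <stdint.h>

    /* Build the extended reference by projecting the other edge through
     * the inverse angle. For the angle 32 / inv_angle 256 pair the index
     * is exactly y -- the "+128 rounding step" remark above. */
    static void extend_ref(uint8_t *ref, const uint8_t *side,
                           int inv_angle, int count)
    {
        for (int y = 1; y <= count; y++)
            ref[-y] = side[(y * inv_angle + 128) >> 8];
    }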
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.text
+
+@ Horizontal Patch functions
+@ These need a transpose before store so exist as smaller patches
+@ Patches can be called repeatedly without any intermediate setup
+@ to generate a horizontal block
+@
+@ It is almost certainly the case that larger patch fns could be built
+@ and that they would be a little faster, but the small fns would still
+@ be needed, and code size (or at least instruction cache size) is an
+@ issue given how much code we already have here
+
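What "transpose before store" means, as a hedged scalar model (names illustrative): each pass of a patch fn computes one column of the block, and the vzip.8/.16/.32 cascade in store_tran_8x8_8 below has the net effect of writing those columns out in raster order:

    #include <stddef.h>
    #include <stdint.h>

    /* Net effect of store_tran_8x8_8: col[i] is the i-th 8-sample
     * column produced by the patch loop. */
    static void store_tran_8x8_model(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t col[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int y = 0; y < 8; y++)
                dst[y * stride + i] = col[i][y];
    }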
+@ Generate 8x8 luma 8 patch
+@
+@ r3 Out stride
+@ r4 Angle add
+@ r7 Inv angle (_up only)
+@
+@ In/Out (updated)
+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
+@ r2 Left ptr - updated
+@ r10 Inv angle accumulator (_up only)
+@ r12 32 - angle frac (_down) or angle frac (_up)
+@ d0 Older reference samples
+@ d1=r8+r9 Newer reference samples
+@ d2 32 - angle frac
+@ d3 Angle frac
+@ q2 Partially computed next result (_up only)
+@
+@ Temps
+@ r5 Loop counter
+@ r6
+@ r7 (_down only)
+@ r11 (_up only)
+@ q2, q8-q11
+
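The per-sample arithmetic of the patch itself, under the same caveats; the asm fuses the fraction wrap into the flag-setting subs r12, r4 / addmi r12, #32 pair, and the reference window slides one sample whenever the fraction wraps:

    #include <stdint.h>

    /* Hedged scalar model of one 8x8 "down" patch: a 2-tap filter
     * between adjacent reference samples with weights summing to 32,
     * rounded as in vrshrn #5. One column per outer pass. */
    static void patch_h_down_8x8_model(uint8_t out[8][8],
                                       const uint8_t *ref, int angle)
    {
        int frac = angle;
        for (int i = 0; i < 8; i++) {
            for (int y = 0; y < 8; y++)
                out[y][i] = ((32 - frac) * ref[y] + frac * ref[y + 1] + 16) >> 5;
            frac += angle;
            if (frac > 32) {
                frac -= 32;
                ref++; /* slide the reference window */
            }
        }
    }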
+patch_h_down_8x8_8:
+ ldrd r8, r9, [r2] @ Left
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r6
+ lsr r8, #8
+ vdup.8 d2, r12
+ orr r8, r8, r9, lsl #24
+ ldr r9, [r2, #5]!
+ vmov d1, r8, r9
+ // drop through...
+patch_h_down_8x8_8_continue:
+ mov r5, #8
+1:
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ vext.8 q8, q8, q9, #8
+ itt mi
+ lsrmi r7, r8, #8
+ vmovmi d0, r8, r9
+ vdup.8 d2, r12
+ vext.8 q9, q9, q10, #8
+ it mi
+ orrmi r8, r7, r9, lsl #24
+ vext.8 q10, q10, q11, #8
+ it mi
+ ldrmi r9, [r2, #1]!
+ vmov d22, d23
+ vrshrn.u16 d23, q2, #5
+ it mi
+ vmovmi d1, r8, r9
+ subs r5, #1
+ vdup.8 d3, r6
+ bne 1b
+ // drop through...
+store_tran_8x8_8:
+ vzip.8 d16, d17
+ add r6, r0, r3
+ vzip.8 d18, d19
+ lsl r3, #1
+ vzip.8 d20, d21
+ add r5, r0, r3
+ vzip.8 d22, d23
+ vzip.16 q8, q9
+ vzip.16 q10, q11
+ vzip.32 q8, q10
+ vzip.32 q9, q11
+ vst1.8 {d16}, [r0]!
+ vst1.8 {d17}, [r6], r3
+ vst1.8 {d20}, [r5], r3
+ vst1.8 {d21}, [r6], r3
+ vst1.8 {d18}, [r5], r3
+ vst1.8 {d19}, [r6], r3
+ vst1.8 {d22}, [r5]
+ asr r3, #1
+ vst1.8 {d23}, [r6]
+
+ bx lr
+
+patch_h_up_8x8_8:
+ ldrd r8, r9, [r2]
+ rsb r6, r4, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r4
+ lsr r11, r8, #24
+ vdup.8 d2, r6
+ ldr r8, [r2, #-1]!
+ orr r9, r11, r9, lsl #8
+ vmov d1, r8, r9
+ mov r12, r4
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+patch_h_up_8x8_8_continue:
+ mov r5, #8
+1:
+ add r12, r4
+ mov r11, #0
+ cmp r12, #33
+ it cs
+ addcs r10, r7
+ vext.8 q8, q8, q9, #8
+ itt cs
+ subcs r12, #32
+ tstcs r10, #1<<31
+ rsb r6, r12, #32
+ it eq
+ asreq r11, r10, #8
+ it cs
+ vmovcs d0, r8, r9
+ vdup.8 d2, r6
+ it cs
+ lsrcs r6, r8, #24
+ vext.8 q9, q9, q10, #8
+ itt cs
+ orrcs r9, r6, r9, lsl #8
+ ldrbcs r11, [r1, r11]
+ vdup.8 d3, r12
+ vext.8 q10, q10, q11, #8
+ it hi
+ ldrbhi r11, [r2, #-1]!
+ vmov d22, d23
+ vrshrn.u16 d23, q2, #5
+ itt cs
+ orrcs r8, r11, r8, lsl #8
+ vmovcs d1, r8, r9
+ vmull.u8 q2, d0, d2
+ subs r5, #1
+ vmlal.u8 q2, d1, d3
+ bne 1b
+
+ b store_tran_8x8_8
+
+
+.macro ADRT reg, val
+@ adr in T32 has enough range but not in A32
+A adrl \reg, \val
+T adr \reg, \val
+.endm
+
+@ ff_hevc_rpi_pred_angular_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
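The cmp/bge ladder that follows splits the 33 modes into the four cases labelled in the comments; restated as hedged C (function name illustrative, boundaries taken from the compares):

    /* Mode 2..34 -> which reference edge the code walks. */
    static const char *angular_case(int mode)
    {
        if (mode >= 26) return "right of vertical (top refs only)";
        if (mode >= 18) return "left of vertical (top refs, left fixup)";
        if (mode >= 10) return "up of horizontal (left refs, top fixup)";
        return "down of horizontal (left refs only)";
    }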
+function ff_hevc_rpi_pred_angular_4_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r8, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ ldr lr, [r2], #1 @ Left
+ rsb r12, r6, #32
+ vmov s0, lr
+ vdup.8 d3, r6
+ ldr lr, [r2], #1
+ vdup.8 d2, r12
+ vmov s2, lr
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ itt mi
+ vmovmi s0, lr
+ ldrmi lr, [r2], #1
+ vdup.8 d2, r12
+ it mi
+ vmovmi s2, lr
+ vdup.8 d3, r6
+ mov r5, #2
+1:
+ vrshrn.u16 d20, q2, #5
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ vext.64 q8, q8, q9, #1
+ it mi
+ vmovmi s0, lr
+ vext.64 q9, q9, q10, #1
+ it mi
+ ldrmi lr, [r2], #1
+ vdup.8 d2, r12
+ it mi
+ vmovmi s2, lr
+ subs r5, #1
+ vdup.8 d3, r6
+ bne 1b
+
+ vrshrn.u16 d20, q2, #5
+ vmull.u8 q2, d0, d2
+ add r12, r0, r3
+ vmlal.u8 q2, d1, d3
+ lsl r3, #1
+ vext.64 q8, q8, q9, #1
+ vext.64 q9, q9, q10, #1
+ vrshrn.u16 d20, q2, #5
+
+98:
+ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
+ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
+ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0]
+ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12]
+ pop {r4-r8, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ rsb r12, r6, #32
+ ldr lr, [r2] @ Left
+ ldrb r2, [r2, #-1] @ Top-left
+ vmov s0, lr
+ vdup.8 d2, r12
+ vdup.8 d3, r6
+ orr lr, r2, lr, lsl #8
+ vmov s2, lr
+ sub r8, r7, #128
+ mov r5, #3
+2:
+ vmull.u8 q2, d0, d2
+ subs r12, r4
+ vmlal.u8 q2, d1, d3
+T it mi
+ addmi r12, #32
+T asr r6, r8, #8
+T it mi
+T ldrbmi r2, [r1, r6]
+A ldrbmi r2, [r1, r8, asr #8]
+ rsb r6, r12, #32
+ vdup.8 d2, r12
+ ittt mi
+ vmovmi s0, lr
+ orrmi lr, r2, lr, lsl #8
+ vmovmi s2, lr
+ vrshrn.u16 d20, q2, #5
+ vdup.8 d3, r6
+ it mi
+ addmi r8, r7
+ subs r5, #1
+ vext.64 q8, q8, q9, #1
+ vext.64 q9, q9, q10, #1
+ bne 2b
+
+ vmull.u8 q2, d0, d2
+ add r12, r0, r3
+ vmlal.u8 q2, d1, d3
+ lsl r3, #1
+ vrshrn.u16 d20, q2, #5
+ b 98b
+
+@ Left of vertical - works down left
+18:
+ ldrh r7, [r7]
+ rsb r12, r6, #32
+ ldr lr, [r1] @ Top
+ ldrb r1, [r2, #-1] @ Top-left
+ vmov s0, lr
+ vdup.8 d2, r12
+ vdup.8 d3, r6
+ orr lr, r1, lr, lsl #8
+ vmov s2, lr
+ sub r8, r7, #128
+ mov r5, #3
+2:
+ vmull.u8 q2, d0, d2
+ subs r12, r4
+ vmlal.u8 q2, d1, d3
+T it mi
+ addmi r12, #32
+T asr r6, r8, #8
+T it mi
+T ldrbmi r1, [r2, r6]
+A ldrbmi r1, [r2, r8, asr #8]
+ rsb r6, r12, #32
+ vdup.8 d2, r12
+ ittt mi
+ vmovmi s0, lr
+ orrmi lr, r1, lr, lsl #8
+ vmovmi s2, lr
+ vrshrn.u16 d4, q2, #5
+ vdup.8 d3, r6
+ it mi
+ addmi r8, r7
+ subs r5, #1
+ vst1.32 {d4[0]}, [r0], r3
+ bne 2b
+
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vrshrn.u16 d4, q2, #5
+ vst1.32 {d4[0]}, [r0]
+
+ pop {r4-r8, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ ldr lr, [r1], #1 @ Top
+ rsb r12, r6, #32
+ vmov s0, lr
+ vdup.8 d3, r6
+ ldr lr, [r1], #1
+ vdup.8 d2, r12
+ vmov s2, lr
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ itt mi
+ vmovmi s0, lr
+ ldrmi lr, [r1], #1
+ vdup.8 d2, r12
+ it mi
+ vmovmi s2, lr
+ vdup.8 d3, r6
+ mov r5, #2
+1:
+ vrshrn.u16 d6, q2, #5
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ vst1.32 {d6[0]}, [r0], r3
+ itt mi
+ vmovmi s0, lr
+ ldrmi lr, [r1], #1
+ vdup.8 d2, r12
+ it mi
+ vmovmi s2, lr
+ subs r5, #1
+ vdup.8 d3, r6
+ bne 1b
+
+ vrshrn.u16 d6, q2, #5
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vst1.32 {d6[0]}, [r0], r3
+ vrshrn.u16 d6, q2, #5
+ vst1.32 {d6[0]}, [r0]
+
+ pop {r4-r8, pc}
+
+endfunc
+
+
+
+@ ff_hevc_rpi_pred_angular_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_8_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ bl patch_h_down_8x8_8
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ bl patch_h_up_8x8_8
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ ldrb lr, [r2, #-1] @ Top-left
+ ldrh r7, [r7]
+ vmov d0, r8, r9
+ lsl r9, r9, #8
+ vdup.8 d2, r12
+ orr r9, r9, r8, lsr #24
+ orr r8, lr, r8, lsl #8
+ vmov d1, r8, r9
+ sub r1, r7, #128
+ mov r5, #7
+1:
+ vdup.8 d3, r6
+ vmull.u8 q2, d0, d2
+ subs r12, r12, r4
+ vmlal.u8 q2, d1, d3
+ ittt mi
+ addmi lr, r2, r1, asr #8
+ addmi r12, r12, #32
+ vmovmi d0, r8, r9
+ rsb r6, r12, #32
+ itt mi
+ lslmi r9, r9, #8
+ ldrbmi lr, [lr]
+ vdup.8 d2, r12
+ vrshrn.u16 d4, q2, #5
+ itttt mi
+ orrmi r9, r9, r8, lsr #24
+ orrmi r8, lr, r8, lsl #8
+ vmovmi d1, r8, r9
+ addmi r1, r1, r7
+ subs r5, r5, #1
+ vst1.8 {d4}, [r0], r3
+ bne 1b
+
+ vdup.8 d3, r6
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vrshrn.u16 d4, q2, #5
+ vst1.8 {d4}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r6
+ mov r5, #7
+ lsr r8, #8
+ vdup.8 d2, r12
+ orr r8, r8, r9, lsl #24
+ ldr r9, [r1, #5]!
+ vmov d1, r8, r9
+1:
+ vmull.u8 q2, d0, d2
+ subs r12, r4
+ vmlal.u8 q2, d1, d3
+ it mi
+ addmi r12, #32
+ rsb r6, r12, #32
+ itt mi
+ vmovmi d0, r8, r9
+ lsrmi r8, #8
+ vdup.8 d2, r12
+ itt mi
+ orrmi r8, r8, r9, lsl #24
+ ldrmi r9, [r1, #1]!
+ vrshrn.u16 d6, q2, #5
+ it mi
+ vmovmi d1, r8, r9
+ vdup.8 d3, r6
+ subs r5, #1
+ vst1.8 {d6}, [r0], r3
+ bne 1b
+
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vrshrn.u16 d6, q2, #5
+ vst1.8 {d6}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_16_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r1, r2 @ save r2 - r1 unused by patch_down
+
+ bl patch_h_down_8x8_8
+ bl patch_h_down_8x8_8_continue
+
+ add r2, r1, #8 @ restore r2, but 8 rows further down left
+ sub r0, #16
+ mov r6, r4
+ add r0, r0, r3, lsl #3
+
+ bl patch_h_down_8x8_8
+ bl patch_h_down_8x8_8_continue
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+
+ push {r2}
+ bl patch_h_up_8x8_8
+ bl patch_h_up_8x8_8_continue
+ pop {r2}
+
+ sub r0, #16
+ mov r10, #-128
+ add r2, #8
+ add r0, r0, r3, lsl #3
+ sub r10, r10, r7, lsl #3
+
+ bl patch_h_up_8x8_8
+ bl patch_h_up_8x8_8_continue
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.8 {q9}, [r1]
+ sub r1, r2, #1
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ vdup.8 d6, r6
+ vext.8 q8, q9, q9, #15
+ sub r8, r7, #128
+ vld1.8 {d16[0]}, [r1]
+ vdup.8 d7, r12
+ mov r5, #15
+1:
+ vmull.u8 q0, d18, d7
+ subs r12, r4
+ vmlal.u8 q0, d16, d6
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d19, d7
+ it cc
+ addcc r1, r2, r8, asr #8
+ vmlal.u8 q1, d17, d6
+ rsb r6, r12, #32
+ vext.8 q10, q8, q8, #15
+ sub r5, #1
+ vld1.8 {d20[0]}, [r1]
+ it cc
+ addcc r8, r7
+ vmov q11, q8
+ teq r5, #0
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmull.u8 q0, d22, d7
+ subs r12, r4
+ vmlal.u8 q0, d20, d6
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d23, d7
+ it cc
+ addcc r1, r2, r8, asr #8
+ vmlal.u8 q1, d21, d6
+ rsb r6, r12, #32
+ vext.8 q8, q10, q10, #15
+ sub r5, #1
+ vld1.8 {d16[0]}, [r1]
+ it cc
+ addcc r8, r7
+ vmov q9, q10
+ teq r5, #0
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmull.u8 q0, d22, d7
+ vmlal.u8 q0, d20, d6
+ vmull.u8 q1, d23, d7
+ vmlal.u8 q1, d21, d6
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmull.u8 q0, d18, d7
+ vmlal.u8 q0, d16, d6
+ vmull.u8 q1, d19, d7
+ vmlal.u8 q1, d17, d6
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ vld1.8 {q9}, [r1]!
+ rsb r12, r6, #32
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vext.8 q8, q9, q9, #1
+ vld1.8 {d17[7]}, [r1]!
+ mov r5, #15
+1:
+ vmull.u8 q0, d16, d6
+ subs r12, r4
+ vmlal.u8 q0, d18, d7
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d17, d6
+ rsb r6, r12, #32
+ vmlal.u8 q1, d19, d7
+ sub r5, #1
+ vext.8 q10, q8, q8, #1
+ teq r5, #0
+ vld1.8 {d21[7]}, [r1]
+ it cc
+ addcc r1, #1
+ vmov q11, q8
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmull.u8 q0, d20, d6
+ subs r12, r4
+ vmlal.u8 q0, d22, d7
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d21, d6
+ rsb r6, r12, #32
+ vmlal.u8 q1, d23, d7
+ sub r5, #1
+ vext.8 q8, q10, q10, #1
+ teq r5, #0
+ vld1.8 {d17[7]}, [r1]
+ it cc
+ addcc r1, #1
+ vmov q9, q10
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmull.u8 q0, d20, d6
+ vmlal.u8 q0, d22, d7
+ vmull.u8 q1, d21, d6
+ vmlal.u8 q1, d23, d7
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmull.u8 q0, d16, d6
+ vmlal.u8 q0, d18, d7
+ vmull.u8 q1, d17, d6
+ vmlal.u8 q1, d19, d7
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_32_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_32_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r10, #4
+ mov r1, r2
+1:
+ bl patch_h_down_8x8_8
+ bl patch_h_down_8x8_8_continue
+ bl patch_h_down_8x8_8_continue
+ bl patch_h_down_8x8_8_continue
+
+ add r2, r1, #8 @ restore r2, but 8 rows further down left
+ add r1, r1, #8
+ mov r6, r4
+ sub r0, #32
+ subs r10, #1
+ add r0, r0, r3, lsl #3
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ vmov.i8 d6, #1<<2
+1:
+ push {r2,r10}
+ bl patch_h_up_8x8_8
+ bl patch_h_up_8x8_8_continue
+ bl patch_h_up_8x8_8_continue
+ bl patch_h_up_8x8_8_continue
+ pop {r2,r10}
+
+ vmov r8, s12
+ sub r0, #32
+ add r2, #8
+ add r0, r0, r3, lsl #3
+ sub r10, r10, r7, lsl #3
+ vshr.u8 d6, #1
+ teq r8, #0
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.8 {q0-q1}, [r1]
+ sub r9, r2, #1
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ mov r8, #-128
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ mov r5, #32
+1:
+ vld1.8 {d17[7]}, [r9]
+ add r8, r7
+ vmov q2, q0
+ vmov q3, q1
+ add r9, r2, r8, asr #8
+ vext.8 q1, q0, q1, #15
+ vext.8 q0, q8, q0, #15
+2:
+ vmull.u8 q10, d4, d19
+ subs r12, r4
+ vmlal.u8 q10, d0, d18
+ it cc
+ addcc r12, #32
+ vmull.u8 q11, d5, d19
+ rsb r6, r12, #32
+ vmlal.u8 q11, d1, d18
+ sub r5, #1
+ vmull.u8 q12, d6, d19
+ teq r5, #0
+ vmlal.u8 q12, d2, d18
+ vmull.u8 q13, d7, d19
+ vmlal.u8 q13, d3, d18
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ vrshrn.u16 d20, q10, #5
+ vrshrn.u16 d21, q11, #5
+ vrshrn.u16 d22, q12, #5
+ vrshrn.u16 d23, q13, #5
+ vst1.8 {q10-q11}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.8 {q0-q1}, [r1]!
+ rsb r12, r6, #32
+ vld1.8 {d16[0]}, [r5]
+ mov r5, #32
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+1:
+ vmov q2, q0
+ add r1, #1
+ vmov q3, q1
+ vext.8 q0, q0, q1, #1
+ vext.8 q1, q1, q8, #1
+2:
+ vmull.u8 q10, d0, d18
+ subs r12, r4
+ vmlal.u8 q10, d4, d19
+ it cc
+ addcc r12, #32
+ vmull.u8 q11, d1, d18
+ rsb r6, r12, #32
+ vmlal.u8 q11, d5, d19
+ sub r5, #1
+ vmull.u8 q12, d2, d18
+ teq r5, #0
+ vmlal.u8 q12, d6, d19
+ vmull.u8 q13, d3, d18
+ vmlal.u8 q13, d7, d19
+ vld1.8 {d16[0]}, [r1]
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ vrshrn.u16 d20, q10, #5
+ vrshrn.u16 d21, q11, #5
+ vrshrn.u16 d22, q12, #5
+ vrshrn.u16 d23, q13, #5
+ vst1.8 {q10-q11}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ Chroma 8 bit 4x4 patch fns
+ .text
+
+patch_h_down_c_4x4_8:
+ ldrd r8, r9, [r2] @ Left
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r6
+ lsr r8, #16
+ vdup.8 d2, r12
+ orr r8, r8, r9, lsl #16
+ ldr r9, [r2, #6]!
+ vmov d1, r8, r9
+ // drop through...
+patch_h_down_c_4x4_8_continue:
+ mov r5, #4
+1:
+ subs r12, r4
+ vmull.u8 q2, d0, d2
+ it mi
+ addmi r12, #32
+ vmlal.u8 q2, d1, d3
+ rsb r6, r12, #32
+ vext.8 q8, q8, q9, #8
+ it mi
+ lsrmi r7, r8, #16
+ vmov d18, d19
+ it mi
+ vmovmi d0, r8, r9
+ vdup.8 d2, r12
+ it mi
+ orrmi r8, r7, r9, lsl #16
+ vrshrn.u16 d19, q2, #5
+ itt mi
+ ldrmi r9, [r2, #2]!
+ vmovmi d1, r8, r9
+ subs r5, #1
+ vdup.8 d3, r6
+ bne 1b
+ // drop through...
+store_tran_c_4x4_8:
+ vzip.16 d16, d17
+ add r6, r0, r3
+ vzip.16 d18, d19
+ lsl r3, #1
+ vzip.32 q8, q9
+ add r5, r0, r3
+ vst1.16 {d16}, [r0]!
+ vst1.16 {d17}, [r6], r3
+ vst1.16 {d18}, [r5]
+ asr r3, #1
+ vst1.16 {d19}, [r6]
+
+ bx lr
+
+patch_h_up_c_4x4_8:
+ ldrd r8, r9, [r2]
+ rsb r6, r4, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r4
+ lsr r11, r8, #16
+ vdup.8 d2, r6
+ ldr r8, [r2, #-2]!
+ orr r9, r11, r9, lsl #16
+ vmov d1, r8, r9
+ mov r12, r4
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+patch_h_up_c_4x4_8_continue:
+ mov r5, #4
+1:
+ add r12, r4
+ cmp r12, #33
+ it cs
+ addcs r10, r7
+ mov r11, #0
+ itt cs
+ subcs r12, #32
+ tstcs r10, #1<<31
+ rsb r6, r12, #32
+ it eq
+ asreq r11, r10, #7
+ it cs
+ vmovcs d0, r8, r9
+ it eq
+ biceq r11, #1
+ vdup.8 d2, r6
+ it cs
+ lsrcs r6, r8, #16
+ vdup.8 d3, r12
+ vext.8 q8, q8, q9, #8
+ itt cs
+ orrcs r9, r6, r9, lsl #16
+ ldrhcs r11, [r1, r11]
+ vmov d18, d19
+ it hi
+ ldrhhi r11, [r2, #-2]!
+ vrshrn.u16 d19, q2, #5
+ itt cs
+ orrcs r8, r11, r8, lsl #16
+ vmovcs d1, r8, r9
+ vmull.u8 q2, d0, d2
+ subs r5, #1
+ vmlal.u8 q2, d1, d3
+ bne 1b
+
+ b store_tran_c_4x4_8
+
+
+@ ff_hevc_rpi_pred_angular_c_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ bl patch_h_down_c_4x4_8
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ bl patch_h_up_c_4x4_8
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ ldrh lr, [r2, #-2] @ Top-left
+ ldrh r7, [r7]
+ vmov d0, r8, r9
+ lsl r9, r9, #16
+ vdup.8 d2, r12
+ orr r9, r9, r8, lsr #16
+ orr r8, lr, r8, lsl #16
+ vmov d1, r8, r9
+ sub r1, r7, #128
+ mov r5, #3
+1:
+ vdup.8 d3, r6
+ vmull.u8 q2, d0, d2
+ subs r12, r12, r4
+ vmlal.u8 q2, d1, d3
+ itttt mi
+ addmi lr, r2, r1, asr #7
+ bicmi lr, #1
+ addmi r12, r12, #32
+ vmovmi d0, r8, r9
+ rsb r6, r12, #32
+ itt mi
+ lslmi r9, r9, #16
+ ldrhmi lr, [lr]
+ vdup.8 d2, r12
+ vrshrn.u16 d4, q2, #5
+ itttt mi
+ orrmi r9, r9, r8, lsr #16
+ orrmi r8, lr, r8, lsl #16
+ vmovmi d1, r8, r9
+ addmi r1, r1, r7
+ subs r5, r5, #1
+ vst1.16 {d4}, [r0], r3
+ bne 1b
+
+ vdup.8 d3, r6
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vrshrn.u16 d4, q2, #5
+ vst1.16 {d4}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.8 d3, r6
+ mov r5, #3
+ lsr r8, #16
+ vdup.8 d2, r12
+ orr r8, r8, r9, lsl #16
+ ldr r9, [r1, #6]!
+ vmov d1, r8, r9
+1:
+ vmull.u8 q2, d0, d2
+ subs r12, r4
+ vmlal.u8 q2, d1, d3
+ it mi
+ addmi r12, #32
+ rsb r6, r12, #32
+ itt mi
+ vmovmi d0, r8, r9
+ lsrmi r8, #16
+ vdup.8 d2, r12
+ itt mi
+ orrmi r8, r8, r9, lsl #16
+ ldrmi r9, [r1, #2]!
+ vrshrn.u16 d6, q2, #5
+ it mi
+ vmovmi d1, r8, r9
+ vdup.8 d3, r6
+ subs r5, #1
+ vst1.16 {d6}, [r0], r3
+ bne 1b
+
+ vmull.u8 q2, d0, d2
+ vmlal.u8 q2, d1, d3
+ vrshrn.u16 d6, q2, #5
+ vst1.16 {d6}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r1, r2 @ save r2 - r1 unused by patch_down
+
+ bl patch_h_down_c_4x4_8
+ bl patch_h_down_c_4x4_8_continue
+
+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
+ sub r0, #16
+ mov r6, r4
+ add r0, r0, r3, lsl #2
+
+ bl patch_h_down_c_4x4_8
+ bl patch_h_down_c_4x4_8_continue
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+
+ push {r2}
+ bl patch_h_up_c_4x4_8
+ bl patch_h_up_c_4x4_8_continue
+ pop {r2}
+
+ sub r0, #16
+ mov r10, #-128
+ add r2, #8
+ add r0, r0, r3, lsl #2
+ sub r10, r10, r7, lsl #2
+
+ bl patch_h_up_c_4x4_8
+ bl patch_h_up_c_4x4_8_continue
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.8 {q9}, [r1]
+ sub r1, r2, #2
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ vdup.8 d6, r6
+ vext.8 q8, q9, q9, #14
+ sub r8, r7, #128
+ vld1.16 {d16[0]}, [r1]
+ vdup.8 d7, r12
+ mov r5, #7
+1:
+ subs r12, r4
+ vmull.u8 q0, d18, d7
+ it cc
+ asrcc r1, r8, #8
+ vmlal.u8 q0, d16, d6
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d19, d7
+ it cc
+ addcc r1, r2, r1, lsl #1
+ vmlal.u8 q1, d17, d6
+ rsb r6, r12, #32
+ vext.8 q10, q8, q8, #14
+ sub r5, #1
+ vld1.16 {d20[0]}, [r1]
+ it cc
+ addcc r8, r7
+ vmov q11, q8
+ teq r5, #0
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ subs r12, r4
+ vmull.u8 q0, d22, d7
+ it cc
+ asrcc r1, r8, #8
+ vmlal.u8 q0, d20, d6
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d23, d7
+ it cc
+ addcc r1, r2, r1, lsl #1
+ vmlal.u8 q1, d21, d6
+ rsb r6, r12, #32
+ vext.8 q8, q10, q10, #14
+ sub r5, #1
+ vld1.16 {d16[0]}, [r1]
+ it cc
+ addcc r8, r7
+ vmov q9, q10
+ teq r5, #0
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmull.u8 q0, d22, d7
+ vmlal.u8 q0, d20, d6
+ vmull.u8 q1, d23, d7
+ vmlal.u8 q1, d21, d6
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmull.u8 q0, d18, d7
+ vmlal.u8 q0, d16, d6
+ vmull.u8 q1, d19, d7
+ vmlal.u8 q1, d17, d6
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ vld1.8 {q9}, [r1]!
+ rsb r12, r6, #32
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vext.8 q8, q9, q9, #2
+ vld1.16 {d17[3]}, [r1]!
+ mov r5, #7
+1:
+ vmull.u8 q0, d16, d6
+ subs r12, r4
+ vmlal.u8 q0, d18, d7
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d17, d6
+ rsb r6, r12, #32
+ vmlal.u8 q1, d19, d7
+ sub r5, #1
+ vext.8 q10, q8, q8, #2
+ teq r5, #0
+ vld1.16 {d21[3]}, [r1]
+ it cc
+ addcc r1, #2
+ vmov q11, q8
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmull.u8 q0, d20, d6
+ subs r12, r4
+ vmlal.u8 q0, d22, d7
+ it cc
+ addcc r12, #32
+ vmull.u8 q1, d21, d6
+ rsb r6, r12, #32
+ vmlal.u8 q1, d23, d7
+ sub r5, #1
+ vext.8 q8, q10, q10, #2
+ teq r5, #0
+ vld1.16 {d17[3]}, [r1]
+ it cc
+ addcc r1, #2
+ vmov q9, q10
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vdup.8 d6, r6
+ vdup.8 d7, r12
+ vst1.8 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmull.u8 q0, d20, d6
+ vmlal.u8 q0, d22, d7
+ vmull.u8 q1, d21, d6
+ vmlal.u8 q1, d23, d7
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmull.u8 q0, d16, d6
+ vmlal.u8 q0, d18, d7
+ vmull.u8 q1, d17, d6
+ vmlal.u8 q1, d19, d7
+ vrshrn.u16 d0, q0, #5
+ vrshrn.u16 d1, q1, #5
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r10, #4
+ mov r1, r2
+1:
+ bl patch_h_down_c_4x4_8
+ bl patch_h_down_c_4x4_8_continue
+ bl patch_h_down_c_4x4_8_continue
+ bl patch_h_down_c_4x4_8_continue
+
+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
+ add r1, r1, #4*2
+ mov r6, r4
+ sub r0, #32
+ subs r10, #1
+ add r0, r0, r3, lsl #2
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ vmov.i8 d6, #1<<2
+1:
+ push {r2, r10}
+ bl patch_h_up_c_4x4_8
+ bl patch_h_up_c_4x4_8_continue
+ bl patch_h_up_c_4x4_8_continue
+ bl patch_h_up_c_4x4_8_continue
+ pop {r2, r10}
+
+ vmov r8, s12
+ sub r0, #32
+ add r2, #8
+ add r0, r0, r3, lsl #2
+ sub r10, r10, r7, lsl #2
+ vshr.u8 d6, #1
+ teq r8, #0
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.8 {q0-q1}, [r1]
+ sub r9, r2, #2
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ mov r8, #-128
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ mov r5, #16
+1:
+ vld1.16 {d17[3]}, [r9]
+ add r8, r7
+ vmov q2, q0
+ vmov q3, q1
+ asr r9, r8, #8
+ vext.8 q1, q0, q1, #14
+ add r9, r2, r9, lsl #1
+ vext.8 q0, q8, q0, #14
+2:
+ vmull.u8 q10, d4, d19
+ subs r12, r4
+ vmlal.u8 q10, d0, d18
+ it cc
+ addcc r12, #32
+ vmull.u8 q11, d5, d19
+ rsb r6, r12, #32
+ vmlal.u8 q11, d1, d18
+ sub r5, #1
+ vmull.u8 q12, d6, d19
+ teq r5, #0
+ vmlal.u8 q12, d2, d18
+ vmull.u8 q13, d7, d19
+ vmlal.u8 q13, d3, d18
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ vrshrn.u16 d20, q10, #5
+ vrshrn.u16 d21, q11, #5
+ vrshrn.u16 d22, q12, #5
+ vrshrn.u16 d23, q13, #5
+ vst1.8 {q10-q11}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.8 {q0-q1}, [r1]!
+ rsb r12, r6, #32
+ vld1.16 {d16[0]}, [r5]
+ mov r5, #16
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+1:
+ vmov q2, q0
+ add r1, #2
+ vmov q3, q1
+ vext.8 q0, q0, q1, #2
+ vext.8 q1, q1, q8, #2
+2:
+ vmull.u8 q10, d0, d18
+ subs r12, r4
+ vmlal.u8 q10, d4, d19
+ it cc
+ addcc r12, #32
+ vmull.u8 q11, d1, d18
+ rsb r6, r12, #32
+ vmlal.u8 q11, d5, d19
+ sub r5, #1
+ vmull.u8 q12, d2, d18
+ teq r5, #0
+ vmlal.u8 q12, d6, d19
+ vmull.u8 q13, d3, d18
+ vmlal.u8 q13, d7, d19
+ vld1.16 {d16[0]}, [r1]
+ vdup.8 d18, r6
+ vdup.8 d19, r12
+ vrshrn.u16 d20, q10, #5
+ vrshrn.u16 d21, q11, #5
+ vrshrn.u16 d22, q12, #5
+ vrshrn.u16 d23, q13, #5
+ vst1.8 {q10-q11}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+endfunc
+
+@------------------------------------------------------------------------------
+@ Data
+
+ .text
+ .balign 64
+angle_2:
+ .byte 32
+ .byte 26, 21, 17, 13, 9, 5, 2, 0
+ @ Sign inverted from standard's table
+ .byte 2, 5, 9, 13, 17, 21, 26, 32
+ .byte 26, 21, 17, 13, 9, 5, 2, 0
+ @ Standard sign
+ .byte 2, 5, 9, 13, 17, 21, 26, 32
+
+ .balign 2
+
+ @ Sign inverted from standard's table
+inv_angle:
+ .short 4096, 1638, 910, 630, 482, 390, 315
+ .short 256
+ .short 315, 390, 482, 630, 910, 1638, 4096
+
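Every entry point above applies the same biases when indexing these tables (the "ADRT r4, angle_2 - 2" / "ADRT r7, inv_angle - 11*2" arithmetic); a hedged C mirror, with extern declarations standing in for the asm labels:

    #include <stdint.h>

    extern const int8_t angle_2[33];     /* modes 2..34 */
    extern const uint16_t inv_angle[15]; /* modes 11..25 only */

    static inline int angle_of(int mode)     { return angle_2[mode - 2]; }
    static inline int inv_angle_of(int mode) { return inv_angle[mode - 11]; }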
+@------------------------------------------------------------------------------
+@
+@ 10 bit fns
+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code,
+@ but it runs out of register width for 12+ bit
+
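The 12+ bit limit follows from simple headroom arithmetic: the two filter taps always sum to 32 and vmul.u16/vmla.u16 accumulate in 16 bits, so the worst case is max_sample * 32. Sketched as C11 checks:

    _Static_assert(1023 * 32 == 32736, "10 bit fits in u16");
    _Static_assert(2047 * 32 == 65504, "11 bit still fits in u16");
    /* 4095 * 32 == 131040 > 65535: 12 bit would overflow */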
+ .text
+ .balign 64
+
+patch_h_down_4x4_10:
+ ldrd r8, r9, [r2] @ Left
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.16 d3, r6
+ lsr r8, #16
+ vdup.16 d2, r12
+ orr r8, r8, r9, lsl #16
+ ldr r9, [r2, #6]!
+ vmov d1, r8, r9
+ // drop through...
+patch_h_down_4x4_10_continue:
+ mov r5, #4
+1:
+ subs r12, r4
+ vmul.u16 d4, d0, d2
+ it mi
+ addmi r12, #32
+ vmla.u16 d4, d1, d3
+ rsb r6, r12, #32
+ vext.16 q8, q8, q9, #4
+ it mi
+ lsrmi r7, r8, #16
+ vmov d18, d19
+ it mi
+ vmovmi d0, r8, r9
+ vdup.16 d2, r12
+ it mi
+ orrmi r8, r7, r9, lsl #16
+ vrshr.u16 d19, d4, #5
+ itt mi
+ ldrmi r9, [r2, #2]!
+ vmovmi d1, r8, r9
+ subs r5, #1
+ vdup.16 d3, r6
+ bne 1b
+ // drop through...
+store_tran_4x4_10:
+ vzip.16 d16, d17
+ add r6, r0, r3
+ vzip.16 d18, d19
+ lsl r3, #1
+ vzip.32 q8, q9
+ add r5, r0, r3
+ vst1.16 {d16}, [r0]!
+ vst1.16 {d17}, [r6], r3
+ vst1.16 {d18}, [r5]
+ asr r3, #1
+ vst1.16 {d19}, [r6]
+
+ bx lr
+
+patch_h_up_4x4_10:
+ ldrd r8, r9, [r2]
+ rsb r6, r4, #32
+ vmov d0, r8, r9
+ vdup.16 d3, r4
+ lsr r11, r8, #16
+ vdup.16 d2, r6
+ ldr r8, [r2, #-2]!
+ orr r9, r11, r9, lsl #16
+ vmov d1, r8, r9
+ mov r12, r4
+ vmul.u16 d4, d0, d2
+ vmla.u16 d4, d1, d3
+patch_h_up_4x4_10_continue:
+ mov r5, #4
+1:
+ add r12, r4
+ cmp r12, #33
+ it cs
+ addcs r10, r7
+ mov r11, #0
+ itt cs
+ subcs r12, #32
+ tstcs r10, #1<<31
+ rsb r6, r12, #32
+ it eq
+ asreq r11, r10, #7
+ it cs
+ vmovcs d0, r8, r9
+ it eq
+ biceq r11, #1
+ vdup.16 d2, r6
+ it cs
+ lsrcs r6, r8, #16
+ vdup.16 d3, r12
+ vext.16 q8, q8, q9, #4
+ itt cs
+ orrcs r9, r6, r9, lsl #16
+ ldrhcs r11, [r1, r11]
+ vmov d18, d19
+ it hi
+ ldrhhi r11, [r2, #-2]!
+ vrshr.u16 d19, d4, #5
+ itt cs
+ orrcs r8, r11, r8, lsl #16
+ vmovcs d1, r8, r9
+ vmul.u16 d4, d0, d2
+ subs r5, #1
+ vmla.u16 d4, d1, d3
+ bne 1b
+
+ b store_tran_4x4_10
+
+
+@ ff_hevc_rpi_pred_angular_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_4_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ bl patch_h_down_4x4_10
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ bl patch_h_up_4x4_10
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ ldrh lr, [r2, #-2] @ Top-left
+ ldrh r7, [r7]
+ vmov d0, r8, r9
+ lsl r9, r9, #16
+ vdup.16 d2, r12
+ orr r9, r9, r8, lsr #16
+ orr r8, lr, r8, lsl #16
+ vmov d1, r8, r9
+ sub r1, r7, #128
+ mov r5, #3
+1:
+ sel lr, lr, lr @ force pipeline 0 on Cortex-A53
+ vdup.16 d3, r6
+ vmul.u16 d4, d0, d2
+ subs r12, r12, r4
+ vmla.u16 d4, d1, d3
+ itttt mi
+ addmi lr, r2, r1, asr #7
+ bicmi lr, #1
+ addmi r12, r12, #32
+ vmovmi d0, r8, r9
+ rsb r6, r12, #32
+ itt mi
+ lslmi r9, r9, #16
+ ldrhmi lr, [lr]
+ vdup.16 d2, r12
+ vrshr.u16 d4, d4, #5
+ itttt mi
+ orrmi r9, r9, r8, lsr #16
+ orrmi r8, lr, r8, lsl #16
+ vmovmi d1, r8, r9
+ addmi r1, r1, r7
+ subs r5, r5, #1
+ vst1.16 {d4}, [r0], r3
+ bne 1b
+
+ vdup.16 d3, r6
+ nop @ force next insn into pipeline 0 to enable
+ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53
+ vmla.u16 d4, d1, d3
+ vrshr.u16 d4, d4, #5
+ vst1.16 {d4}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ ldrd r8, r9, [r1] @ Top
+ rsb r12, r6, #32
+ vmov d0, r8, r9
+ vdup.16 d3, r6
+ lsr r8, #16
+ vdup.16 d2, r12
+ orr r8, r8, r9, lsl #16
+ ldr r9, [r1, #6]!
+ vmov d1, r8, r9
+ mov r5, #3
+1:
+ vmul.u16 d4, d0, d2
+ subs r12, r4
+ vmla.u16 d4, d1, d3
+ it mi
+ addmi r12, #32
+ rsb r6, r12, #32
+ itt mi
+ vmovmi d0, r8, r9
+ lsrmi r8, #16
+ vdup.16 d2, r12
+ itt mi
+ orrmi r8, r8, r9, lsl #16
+ ldrmi r9, [r1, #2]!
+ vrshr.u16 d4, d4, #5
+ it mi
+ vmovmi d1, r8, r9
+ vdup.16 d3, r6
+ subs r5, #1
+ vst1.16 {d4}, [r0], r3
+ bne 1b
+
+ vmul.u16 d4, d0, d2
+ vmla.u16 d4, d1, d3
+ vrshr.u16 d4, d4, #5
+ vst1.16 {d4}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_8_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r1, r2 @ save r2 - r1 unused by patch_down
+
+ bl patch_h_down_4x4_10
+ bl patch_h_down_4x4_10_continue
+
+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
+ sub r0, #16
+ mov r6, r4
+ add r0, r0, r3, lsl #2
+
+ bl patch_h_down_4x4_10
+ bl patch_h_down_4x4_10_continue
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+
+ push {r2}
+ bl patch_h_up_4x4_10
+ bl patch_h_up_4x4_10_continue
+ pop {r2}
+
+ sub r0, #16
+ mov r10, #-128
+ add r2, #8
+ add r0, r0, r3, lsl #2
+ sub r10, r10, r7, lsl #2
+
+ bl patch_h_up_4x4_10
+ bl patch_h_up_4x4_10_continue
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.16 {q9}, [r1]
+ sub r1, r2, #2
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ vdup.16 q2, r6
+ vext.16 q8, q9, q9, #7
+ sub r8, r7, #128
+ vld1.16 {d16[0]}, [r1]
+ vdup.16 q3, r12
+ mov r5, #7
+1:
+ vmul.u16 q0, q9, q3
+ subs r12, r4
+ vmla.u16 q0, q8, q2
+ ittt cc
+ asrcc r1, r8, #8
+ addcc r12, #32
+ addcc r1, r2, r1, lsl #1
+ vext.16 q10, q8, q8, #7
+ rsb r6, r12, #32
+ vmov q11, q8
+ sub r5, #1
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r8, r7
+ vld1.16 {d20[0]}, [r1]
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmul.u16 q0, q11, q3
+ subs r12, r4
+ vmla.u16 q0, q10, q2
+ ittt cc
+ asrcc r1, r8, #8
+ addcc r12, #32
+ addcc r1, r2, r1, lsl #1
+ vext.16 q8, q10, q10, #7
+ rsb r6, r12, #32
+ vmov q9, q10
+ sub r5, #1
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r8, r7
+ vld1.16 {d16[0]}, [r1]
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmul.u16 q0, q11, q3
+ vmla.u16 q0, q10, q2
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmul.u16 q0, q9, q3
+ vmla.u16 q0, q8, q2
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ vld1.16 {q9}, [r1]!
+ rsb r12, r6, #32
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vext.16 q8, q9, q9, #1
+ vld1.16 {d17[3]}, [r1]!
+ mov r5, #7
+1:
+ vmul.u16 q0, q8, q2
+ subs r12, r4
+ vmla.u16 q0, q9, q3
+ it cc
+ addcc r12, #32
+ vext.16 q10, q8, q8, #1
+ rsb r6, r12, #32
+ vld1.16 {d21[3]}, [r1]
+ sub r5, #1
+ vmov q11, q8
+ teq r5, #0
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r1, #2
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmul.u16 q0, q10, q2
+ subs r12, r4
+ vmla.u16 q0, q11, q3
+ it cc
+ addcc r12, #32
+ vext.16 q8, q10, q10, #1
+ rsb r6, r12, #32
+ vld1.16 {d17[3]}, [r1]
+ sub r5, #1
+ vmov q9, q10
+ teq r5, #0
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r1, #2
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmul.u16 q0, q10, q2
+ vmla.u16 q0, q11, q3
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r11, pc}
+4:
+ bcc 3b
+5:
+ vmul.u16 q0, q8, q2
+ vmla.u16 q0, q9, q3
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_16_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r10, #4
+ mov r1, r2
+1:
+ bl patch_h_down_4x4_10
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+
+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
+ add r1, r1, #4*2
+ mov r6, r4
+ sub r0, #32
+ subs r10, #1
+ add r0, r0, r3, lsl #2
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ ldrh r7, [r7]
+ mov r10, #-128
+ vmov.i8 d6, #1<<2
+1:
+ push {r2, r10}
+ bl patch_h_up_4x4_10
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ pop {r2, r10}
+
+ vmov r8, s12
+ sub r0, #32
+ add r2, #8
+ add r0, r0, r3, lsl #2
+ sub r10, r10, r7, lsl #2
+ vshr.u8 d6, #1
+ teq r8, #0
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.16 {q0-q1}, [r1]
+ sub r9, r2, #2
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ mov r8, #-128
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ mov r5, #16
+1:
+ vld1.16 {d17[3]}, [r9]
+ add r8, r7
+ vmov q2, q0
+ vmov q3, q1
+ asr r9, r8, #8
+ vext.16 q1, q0, q1, #7
+ add r9, r2, r9, lsl #1
+ vext.16 q0, q8, q0, #7
+2:
+ vmul.u16 q11, q2, q10
+ subs r12, r4
+ vmla.u16 q11, q0, q9
+ it cc
+ addcc r12, #32
+ vmul.u16 q12, q3, q10
+ rsb r6, r12, #32
+ vmla.u16 q12, q1, q9
+ sub r5, #1
+ teq r5, #0
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ vrshr.u16 q11, q11, #5
+ vrshr.u16 q12, q12, #5
+ vst1.16 {q11-q12}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.16 {q0-q1}, [r1]!
+ rsb r12, r6, #32
+ vld1.16 {d16[0]}, [r5]
+ mov r5, #16
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+1:
+ vmov q2, q0
+ add r1, #2
+ vmov q3, q1
+ vext.16 q0, q0, q1, #1
+ vext.16 q1, q1, q8, #1
+2:
+ vmul.u16 q11, q0, q9
+ subs r12, r4
+ vmla.u16 q11, q2, q10
+ it cc
+ addcc r12, #32
+ vmul.u16 q12, q1, q9
+ rsb r6, r12, #32
+ vmla.u16 q12, q3, q10
+ sub r5, #1
+ vld1.16 {d16[0]}, [r1]
+ teq r5, #0
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ vrshr.u16 q11, q11, #5
+ vrshr.u16 q12, q12, #5
+ vst1.16 {q11-q12}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r11, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_32_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_32_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #1
+ vpush {d8}
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ add sp, #8
+ mov r10, #8
+ mov r1, r2
+1:
+ bl patch_h_down_4x4_10
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+ bl patch_h_down_4x4_10_continue
+
+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
+ add r1, r1, #4*2
+ mov r6, r4
+ sub r0, #64
+ subs r10, #1
+ add r0, r0, r3, lsl #2
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Up of Horizontal - works down up
+10:
+ add sp, #8
+ ldrh r7, [r7]
+ mov r10, #-128
+ vmov.i8 d6, #1<<6
+1:
+ push {r2, r10}
+ bl patch_h_up_4x4_10
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ bl patch_h_up_4x4_10_continue
+ pop {r2, r10}
+
+ vmov r8, s12
+ sub r0, #64
+ add r2, #8
+ add r0, r0, r3, lsl #2
+ sub r10, r10, r7, lsl #2
+ vshr.u8 d6, #1
+ teq r8, #0
+ bne 1b
+
+ pop {r4-r11, pc}
+
+@ Left of vertical - works down left
+18:
+ add r5, r1, #32
+ vld1.16 {q1-q2}, [r1]
+ rsb r12, r6, r6, lsl #16
+ vld1.16 {q3-q4}, [r5]
+ sub r9, r2, #2
+ rsb r4, r12, #0
+ rsb r12, r12, #32 << 16
+ ldrh r7, [r7]
+ mov r8, #-128
+ vmov d0, d9
+ vmov s2, r12
+ add r10, r0, #32
+ mov r5, #32
+1:
+ vld1.16 {d1[3]}, [r9]
+ add r8, r7
+ vmov q11, q4
+ vmov q10, q3
+ asr r9, r8, #8
+ vmov q9, q2
+ add r9, r2, r9, lsl #1
+ vmov q8, q1
+ vext.16 q4, q3, q4, #7
+ vext.16 q3, q2, q3, #7
+ vext.16 q2, q1, q2, #7
+ vext.16 q1, q0, q1, #7
+2:
+ vmul.u16 q12, q8, d1[1]
+ adds r12, r4
+ vmla.u16 q12, q1, d1[0]
+ it cc
+ addcc r12, #32 << 16
+ vmul.u16 q13, q9, d1[1]
+ it cc
+ subcc r12, #32
+ vmla.u16 q13, q2, d1[0]
+ sub r5, #1
+ vmul.u16 q14, q10, d1[1]
+ teq r5, #0
+ vmla.u16 q14, q3, d1[0]
+ vmul.u16 q15, q11, d1[1]
+ vmla.u16 q15, q4, d1[0]
+ vmov s2, r12
+ vrshr.u16 q12, q12, #5
+ vrshr.u16 q13, q13, #5
+ vrshr.u16 q14, q14, #5
+ vrshr.u16 q15, q15, #5
+ vst1.16 {q12-q13}, [r0], r3
+ vst1.16 {q14-q15}, [r10], r3
+ bhi 2b
+ bne 1b
+
+ vpop {d8}
+ vmov d9, d0
+ pop {r4-r11, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.16 {q1-q2}, [r1]
+ rsb r12, r6, r6, lsl #16
+ vld1.16 {q3-q4}, [r5]
+ add r1, r1, #64
+ rsb r4, r12, #0
+ rsb r12, r12, #32 << 16
+ vmov d1, d9
+ vmov s1, r12
+ add r10, r0, #32
+ mov r5, #32
+1:
+ vld1.16 {d0[0]}, [r1]!
+ vmov q8, q1
+ vmov q9, q2
+ vmov q10, q3
+ vmov q11, q4
+ vext.16 q1, q1, q2, #1
+ vext.16 q2, q2, q3, #1
+ vext.16 q3, q3, q4, #1
+ vext.16 q4, q4, q0, #1
+2:
+ vmul.u16 q12, q1, d0[2]
+ adds r12, r4
+ vmla.u16 q12, q8, d0[3]
+ it cc
+ addcc r12, #32 << 16
+ vmul.u16 q13, q2, d0[2]
+ it cc
+ subcc r12, #32
+ vmla.u16 q13, q9, d0[3]
+ sub r5, #1
+ vmul.u16 q14, q3, d0[2]
+ teq r5, #0
+ vmla.u16 q14, q10, d0[3]
+ vmul.u16 q15, q4, d0[2]
+ vmla.u16 q15, q11, d0[3]
+ vmov s1, r12
+ vrshr.u16 q12, q12, #5
+ vrshr.u16 q13, q13, #5
+ vrshr.u16 q14, q14, #5
+ vrshr.u16 q15, q15, #5
+ vst1.16 {q12-q13}, [r0], r3
+ vst1.16 {q14-q15}, [r10], r3
+ bhi 2b
+ bne 1b
+
+ vpop {d8}
+ vmov d9, d1
+ pop {r4-r11, pc}
+
+endfunc
+
+
+
+@ Generate 4x4 chroma patch
+@
+@ In (const)
+@ r1 Up ptr (_up only)
+@ r3 Out stride
+@ r4 Angle add
+@ r7 Inv angle (_up only)
+@
+@ In/Out (updated)
+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
+@ r2 Left ptr - updated
+@ r6 Angle frac (init to r4 + 32)
+@ r8 Inv angle accumulator
+@ q2 Cur Line - load before 1st call for down - set by _up
+@ q8 Cur Line - load before 1st call for up - set by _down
+@
+@ Temps
+@ r5 Loop counter
+@ r12
+@ d0, q1, q12-q15
+
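+@ Rough scalar model of the weighted-blend loop below (illustrative names
+@ only - fetch_pel() is a stand-in, and the NEON code handles 4 chroma
+@ pel-pairs per instruction):
+@
+@   int inv = 32 - frac;                          // r12; frac starts in r6
+@   for (int i = 0; i != 4; i++) {                // r5
+@       out[i] = (cur*inv + next*frac + 16) >> 5; // vmul/vmla + vrshr #5
+@       inv -= step;                              // subs r12, r4
+@       if (inv < 0) {                            // carry clear
+@           inv += 32;
+@           cur = next; next = fetch_pel();       // back to 1: for a new pel
+@       }
+@       frac = 32 - inv;                          // rsb r6, r12, #32
+@   }
+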
+patch_h_down_c_4x4_10:
+ vld1.16 {q12}, [r2]!
+ rsb r12, r6, #32
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ mov r5, #4
+1:
+ vmov q13, q12
+ vext.16 q12, q12, q12, #2
+ vld1.32 {d25[1]}, [r2]!
+patch_h_down_c_4x4_10_continue:
+2:
+ vmov q8, q9
+ subs r12, r4
+ vmul.u16 q0, q13, q3
+ it cc
+ addcc r12, #32
+ vmla.u16 q0, q12, q2
+ rsb r6, r12, #32
+ vmov q9, q10
+ sub r5, #1
+ vmov q10, q11
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vrshr.u16 q11, q0, #5
+ bhi 2b
+ bne 1b
+
+ bcs 3f
+ vmov q13, q12
+ vext.16 q12, q12, q12, #2
+ vld1.32 {d25[1]}, [r2]!
+3:
+
+store_tran_c_4x4_10:
+T add r6, r0, r3
+ vzip.32 q8, q10
+A add r6, r0, r3
+T lsl r3, #1
+ vzip.32 q9, q11
+A add r5, r0, r3, lsl #1
+T add r5, r0, r3
+ vst2.32 {d16,d18}, [r0]!
+A lsl r3, #1
+ vst2.32 {d17,d19}, [r6], r3
+ asr r3, #1
+ vst2.32 {d20,d22}, [r5]
+ mov r5, #4
+ vst2.32 {d21,d23}, [r6]
+ bx lr
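+
+@ In scalar terms store_tran_c_4x4_10 writes the four accumulated rows out
+@ transposed (one pel-pair = one uint32_t):
+@   dst[y*stride + x] = row[x][y]   for x,y in 0..3
+@ via the vzip.32/vst2.32 pairs - a sketch of the effect, not of the exact
+@ register shuffle.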
+
+patch_h_up_c_4x4_10:
+ vld1.16 {q1}, [r2]
+ rsb r12, r6, #32
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ mov r5, #4
+1:
+ adds r8, r7
+ vmov q12, q1
+ it mi
+ ldrmi r6, [r2, #-4]!
+ vext.16 q1, q1, q1, #6
+ itt pl
+ asrpl r6, r8, #8
+ ldrpl r6, [r1, r6, lsl #2]
+ vmov s4, r6
+patch_h_up_c_4x4_10_continue:
+2:
+ vmov q8, q9
+ subs r12, r4
+ vmul.u16 q0, q12, q3
+ it cc
+ addcc r12, #32
+ vmla.u16 q0, q1, q2
+ rsb r6, r12, #32
+ vmov q9, q10
+ sub r5, #1
+ vmov q10, q11
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vrshr.u16 q11, q0, #5
+ bhi 2b
+ bne 1b
+
+ bcs store_tran_c_4x4_10
+ adds r8, r7
+ vmov q12, q1
+ it mi
+ ldrmi r6, [r2, #-4]!
+ vext.16 q1, q1, q1, #6
+ itt pl
+ asrpl r6, r8, #8
+ ldrpl r6, [r1, r6, lsl #2]
+ vmov s4, r6
+ b store_tran_c_4x4_10
+
+
+@ ff_hevc_rpi_pred_angular_c_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
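+@ Mode dispatch used below - just a restatement of the compare ladder at
+@ the top of each angular function, for orientation:
+@
+@   if (mode >= 26)       right of vertical (works along top)
+@   else if (mode >= 18)  left of vertical
+@   else if (mode >= 10)  up of horizontal
+@   else                  down of horizontal (modes 2..9)
+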
+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r8, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #2
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ bl patch_h_down_c_4x4_10
+ pop {r4-r8, pc}
+
+@ Up of Horizontal - works down the block, reading up the left edge
+10:
+ ldrh r7, [r7]
+ mov r8, #-128
+ sub r8, r7
+ bl patch_h_up_c_4x4_10
+ pop {r4-r8, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.16 {q9}, [r1]
+ sub r1, r2, #4
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ vdup.16 q2, r6
+ vext.16 q8, q9, q9, #6
+ sub r8, r7, #128
+ vld1.32 {d16[0]}, [r1]
+ vdup.16 q3, r12
+ mov r5, #3
+1:
+ vmul.u16 q0, q9, q3
+ subs r12, r4
+ vmla.u16 q0, q8, q2
+ ittt cc
+ asrcc r1, r8, #8
+ addcc r12, #32
+ addcc r1, r2, r1, lsl #2
+ vext.16 q10, q8, q8, #6
+ rsb r6, r12, #32
+ vmov q11, q8
+ sub r5, #1
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r8, r7
+ vld1.32 {d20[0]}, [r1]
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmul.u16 q0, q11, q3
+ subs r12, r4
+ vmla.u16 q0, q10, q2
+ ittt cc
+ asrcc r1, r8, #8
+ addcc r12, #32
+ addcc r1, r2, r1, lsl #2
+ vext.16 q8, q10, q10, #6
+ rsb r6, r12, #32
+ vmov q9, q10
+ sub r5, #1
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r8, r7
+ vld1.32 {d16[0]}, [r1]
+ teq r5, #0
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmul.u16 q0, q11, q3
+ vmla.u16 q0, q10, q2
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r8, pc}
+4:
+ bcc 3b
+5:
+ vmul.u16 q0, q9, q3
+ vmla.u16 q0, q8, q2
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r8, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ vld1.16 {q9}, [r1]!
+ rsb r12, r6, #32
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vext.16 q8, q9, q9, #2
+ vld1.32 {d17[1]}, [r1]!
+ mov r5, #3
+1:
+ vmul.u16 q0, q8, q2
+ subs r12, r4
+ vmla.u16 q0, q9, q3
+ it cc
+ addcc r12, #32
+ vext.16 q10, q8, q8, #2
+ rsb r6, r12, #32
+ vld1.32 {d21[1]}, [r1]
+ sub r5, #1
+ vmov q11, q8
+ teq r5, #0
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r1, #4
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 1b
+ beq 4f
+2:
+ vmul.u16 q0, q10, q2
+ subs r12, r4
+ vmla.u16 q0, q11, q3
+ it cc
+ addcc r12, #32
+ vext.16 q8, q10, q10, #2
+ rsb r6, r12, #32
+ vld1.32 {d17[1]}, [r1]
+ sub r5, #1
+ vmov q9, q10
+ teq r5, #0
+ vrshr.u16 q0, q0, #5
+ it cc
+ addcc r1, #4
+ vdup.16 q2, r6
+ vdup.16 q3, r12
+ vst1.16 {q0}, [r0], r3
+ bhi 2b
+ bne 1b
+ bcc 5f
+3:
+ vmul.u16 q0, q10, q2
+ vmla.u16 q0, q11, q3
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r8, pc}
+4:
+ bcc 3b
+5:
+ vmul.u16 q0, q8, q2
+ vmla.u16 q0, q9, q3
+ vrshr.u16 q0, q0, #5
+ vst1.16 {q0}, [r0]
+
+ pop {r4-r8, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r8, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #2
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ mov r1, r2 @ save r2 - r1 unused by patch_down
+
+ bl patch_h_down_c_4x4_10
+ bl patch_h_down_c_4x4_10_continue
+
+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
+ sub r0, #32
+ mov r6, r4
+ add r0, r0, r3, lsl #2
+
+ bl patch_h_down_c_4x4_10
+ bl patch_h_down_c_4x4_10_continue
+
+ pop {r4-r8, pc}
+
+@ Up of Horizontal - works down the block, reading up the left edge
+10:
+ ldrh r7, [r7]
+ mov r8, #-128
+ sub r8, r7
+
+ push {r2, r8}
+ bl patch_h_up_c_4x4_10
+ bl patch_h_up_c_4x4_10_continue
+ pop {r2, r8}
+
+ sub r0, #32
+ mov r6, r4
+ add r2, #16
+ sub r8, r8, r7, lsl #2
+ add r0, r0, r3, lsl #2
+
+ bl patch_h_up_c_4x4_10
+ bl patch_h_up_c_4x4_10_continue
+
+ pop {r4-r8, pc}
+
+@ Left of vertical - works down left
+18:
+ vld1.16 {q0-q1}, [r1]
+ sub r9, r2, #4
+ rsb r12, r6, #32
+ ldrh r7, [r7]
+ mov r8, #-128
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ mov r5, #8
+1:
+ vld1.32 {d17[1]}, [r9]
+ add r8, r7
+ vmov q2, q0
+ vmov q3, q1
+ asr r9, r8, #8
+ vext.16 q1, q0, q1, #6
+ add r9, r2, r9, lsl #2
+ vext.16 q0, q8, q0, #6
+2:
+ vmul.u16 q11, q2, q10
+ subs r12, r4
+ vmla.u16 q11, q0, q9
+ it cc
+ addcc r12, #32
+ vmul.u16 q12, q3, q10
+ rsb r6, r12, #32
+ vmla.u16 q12, q1, q9
+ sub r5, #1
+ teq r5, #0
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ vrshr.u16 q11, q11, #5
+ vrshr.u16 q12, q12, #5
+ vst1.16 {q11-q12}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r8, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.16 {q0-q1}, [r1]!
+ rsb r12, r6, #32
+ vld1.32 {d16[0]}, [r5]
+ mov r5, #8
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+1:
+ vmov q2, q0
+ add r1, #4
+ vmov q3, q1
+ vext.16 q0, q0, q1, #2
+ vext.16 q1, q1, q8, #2
+2:
+ vmul.u16 q11, q0, q9
+ subs r12, r4
+ vmla.u16 q11, q2, q10
+ it cc
+ addcc r12, #32
+ vmul.u16 q12, q1, q9
+ rsb r6, r12, #32
+ vmla.u16 q12, q3, q10
+ sub r5, #1
+ vld1.32 {d16[0]}, [r1]
+ teq r5, #0
+ vdup.16 q9, r6
+ vdup.16 q10, r12
+ vrshr.u16 q11, q11, #5
+ vrshr.u16 q12, q12, #5
+ vst1.16 {q11-q12}, [r0], r3
+ bhi 2b
+ bne 1b
+
+ pop {r4-r8, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride [r3]
+@ unsigned int mode [sp, #0] 2..34
+
+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
+ ldr r12, [sp]
+ push {r4-r10, lr}
+ ADRT r4, angle_2 - 2
+ ADRT r7, inv_angle - 11*2
+ add r7, r7, r12, lsl #1
+ lsl r3, #2
+ vpush {d8}
+ ldrsb r6, [r4, r12]
+ cmp r12, #26
+ ldrsb r4, [r4, r12]
+ bge 26f
+ cmp r12, #18
+ bge 18f
+ cmp r12, #10
+ bge 10f
+
+@ Down of Horizontal - works down left
+ add sp, #8
+ mov r10, #4
+ mov r1, r2
+1:
+ bl patch_h_down_c_4x4_10
+ bl patch_h_down_c_4x4_10_continue
+ bl patch_h_down_c_4x4_10_continue
+ bl patch_h_down_c_4x4_10_continue
+
+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
+ add r1, r1, #4*4
+ mov r6, r4
+ sub r0, #64
+ subs r10, #1
+ add r0, r0, r3, lsl #2
+ bne 1b
+
+ pop {r4-r10, pc}
+
+@ Up of Horizontal - works down the block, reading up the left edge
+10:
+ add sp, #8
+ mov r10, #4
+ ldrh r7, [r7]
+ mov r8, #-128
+ sub r8, r7
+2:
+ push {r2, r8}
+ bl patch_h_up_c_4x4_10
+ bl patch_h_up_c_4x4_10_continue
+ bl patch_h_up_c_4x4_10_continue
+ bl patch_h_up_c_4x4_10_continue
+ pop {r2, r8}
+
+ sub r0, #64
+ mov r6, r4
+ add r2, #16
+ sub r8, r8, r7, lsl #2
+ add r0, r0, r3, lsl #2
+ subs r10, #1
+ bne 2b
+
+ pop {r4-r10, pc}
+
+@ Left of vertical - works down left
+18:
+ add r5, r1, #32
+ vld1.16 {q1-q2}, [r1]
+ rsb r12, r6, r6, lsl #16
+ vld1.16 {q3-q4}, [r5]
+ sub r9, r2, #4
+ rsb r4, r12, #0
+ rsb r12, r12, #32 << 16
+ ldrh r7, [r7]
+ mov r8, #-128
+ vmov d0, d9
+ vmov s2, r12
+ add r10, r0, #32
+ mov r5, #16
+1:
+ vld1.32 {d1[1]}, [r9]
+ add r8, r7
+ vmov q11, q4
+ vmov q10, q3
+ asr r9, r8, #8
+ vmov q9, q2
+ add r9, r2, r9, lsl #2
+ vmov q8, q1
+ vext.16 q4, q3, q4, #6
+ vext.16 q3, q2, q3, #6
+ vext.16 q2, q1, q2, #6
+ vext.16 q1, q0, q1, #6
+2:
+ vmul.u16 q12, q8, d1[1]
+ adds r12, r4
+ vmla.u16 q12, q1, d1[0]
+ it cc
+ addcc r12, #32 << 16
+ vmul.u16 q13, q9, d1[1]
+ it cc
+ subcc r12, #32
+ vmla.u16 q13, q2, d1[0]
+ sub r5, #1
+ vmul.u16 q14, q10, d1[1]
+ teq r5, #0
+ vmla.u16 q14, q3, d1[0]
+ vmul.u16 q15, q11, d1[1]
+ vmla.u16 q15, q4, d1[0]
+ vmov s2, r12
+ vrshr.u16 q12, q12, #5
+ vrshr.u16 q13, q13, #5
+ vrshr.u16 q14, q14, #5
+ vrshr.u16 q15, q15, #5
+ vst1.16 {q12-q13}, [r0], r3
+ vst1.16 {q14-q15}, [r10], r3
+ bhi 2b
+ bne 1b
+
+ vpop {d8}
+ vmov d9, d0
+ pop {r4-r10, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+ add r5, r1, #32
+ vld1.16 {q1-q2}, [r1]
+ rsb r12, r6, r6, lsl #16
+ vld1.16 {q3-q4}, [r5]
+ add r1, r1, #64
+ rsb r4, r12, #0
+ rsb r12, r12, #32 << 16
+ vmov d1, d9
+ vmov s1, r12
+ add r10, r0, #32
+ mov r5, #16
+1:
+ vld1.32 {d0[0]}, [r1]!
+ vmov q8, q1
+ vmov q9, q2
+ vmov q10, q3
+ vmov q11, q4
+ vext.16 q1, q1, q2, #2
+ vext.16 q2, q2, q3, #2
+ vext.16 q3, q3, q4, #2
+ vext.16 q4, q4, q0, #2
+2:
+ vmul.u16 q12, q1, d0[2]
+ adds r12, r4
+ vmla.u16 q12, q8, d0[3]
+ it cc
+ addcc r12, #32 << 16
+ vmul.u16 q13, q2, d0[2]
+ it cc
+ subcc r12, #32
+ vmla.u16 q13, q9, d0[3]
+ sub r5, #1
+ vmul.u16 q14, q3, d0[2]
+ teq r5, #0
+ vmla.u16 q14, q10, d0[3]
+ vmul.u16 q15, q4, d0[2]
+ vmla.u16 q15, q11, d0[3]
+ vmov s1, r12
+ vrshr.u16 q12, q12, #5
+ vrshr.u16 q13, q13, #5
+ vrshr.u16 q14, q14, #5
+ vrshr.u16 q15, q15, #5
+ vst1.16 {q12-q13}, [r0], r3
+ vst1.16 {q14-q15}, [r10], r3
+ bhi 2b
+ bne 1b
+
+ vpop {d8}
+ vmov d9, d1
+ pop {r4-r10, pc}
+
+endfunc
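+
+@ Note on the 16-wide angular paths above: both interpolation weights are
+@ carried in one GPR, packed as
+@   r12 = ((32 - frac) << 16) | frac
+@ (built by the rsb pair, then moved into a d-reg as two u16 scalars for
+@ vmul/vmla). A single "adds r12, r4" steps frac up and 32-frac down
+@ together, and when the fraction wraps (signalled through the carry flag)
+@ the addcc/subcc pair renormalizes both halves by 32.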
diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
new file mode 100644
index 0000000000..df8c1c25b9
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
@@ -0,0 +1,705 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+
+@ ff_hevc_rpi_pred_dc_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_4_neon_8, export=1
+
+ @ Average the els of top & left
+ ldr r2, [r2]
+ vld1.32 {d0[0]}, [r1]
+ mov r1, #2
+ vmov s1, r2
+ vmov s2, r2
+ vmov.i16 q2, #3
+ add r2, r0, r3
+ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
+ lsl r3, #1
+ vmovl.u8 q0, d0
+ vmov.i64 d7, #0xffff
+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+ @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vmov.i64 d7, #0xff
+ vpadd.i16 d6, d6 @ 1 (all the same)
+ vrshr.u16 d6, #3
+ vmla.i16 q0, q2, d6[0]
+ vdup.8 d6, d6[0]
+ vrshrn.i16 d0, q0, #2
+
+ @ Store top line
+ vst1.32 {d0[0]}, [r0], r3
+
+ @ Store the rest
+ vshr.u64 d1, d0, #5*8
+ vshr.u64 d2, d0, #6*8
+ vshr.u64 d3, d0, #7*8
+ vbif d1, d6, d7
+ vbif d2, d6, d7
+ vst1.32 {d1[0]}, [r2], r3
+ vbif d3, d6, d7
+ vst1.32 {d2[0]}, [r0]
+ vst1.32 {d3[0]}, [r2]
+
+ bx lr
+endfunc
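+
+@ For reference, the scalar form of the DC prediction computed by these
+@ functions (standard HEVC; the smoothing applies to luma with size < 32):
+@
+@   dc = (sum(top[0..N-1]) + sum(left[0..N-1]) + N) >> (log2(N) + 1);
+@   pred[0][0]         = (left[0] + 2*dc + top[0] + 2) >> 2;
+@   pred[x][0], x > 0  = (top[x]  + 3*dc + 2) >> 2;
+@   pred[0][y], y > 0  = (left[y] + 3*dc + 2) >> 2;
+@   all other pels     = dc;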
+
+
+@ ff_hevc_rpi_pred_dc_c_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {d0}, [r1]
+ vld1.8 {d1}, [r2]
+A add r2, r0, r3, lsl #1
+A lsl r3, #2
+T lsl r3, #1
+T add r2, r0, r3
+T lsl r3, #1
+ vaddl.u8 q0, d0, d1
+ vadd.i16 d0, d1 @ d0 has 2 val pairs
+ vpadd.i32 d2, d0, d0 @ This adds U & V separately
+ vpadd.i32 d3, d0, d0
+ vrshrn.u16 d0, q1, #3
+
+ @ Store
+ vst1.8 {d0}, [r0], r3
+ vst1.8 {d0}, [r2], r3
+ vst1.8 {d0}, [r0]
+ vst1.8 {d0}, [r2]
+
+ bx lr
+endfunc
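+
+@ Note on the chroma (_c_) variants: U and V are interleaved, so after the
+@ widening adds the 16-bit lanes alternate U,V,U,V. vpadd.i32 then adds
+@ 32-bit lanes, i.e. (U,V)+(U,V) pairwise, keeping the two channel sums
+@ separate; each channel gets its own dc and chroma is never smoothed.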
+
+
+@ ff_hevc_rpi_pred_dc_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_8_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {d0}, [r1]
+ mov r1, #2
+ vld1.8 {d16}, [r2]
+ vmov.i16 q2, #3
+ vmov.i64 d7, #0xffff
+ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
+ vmovl.u8 q0, d0
+ vadd.i16 d6, d2, d3 @ d6 has 4 vals
+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+ @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vmov.i64 d7, #0xff
+ vmovl.u8 q1, d16
+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
+ vpadd.i16 d6, d6 @ 1 (all the same)
+ vrshr.u16 d6, #4
+ vmla.i16 q1, q2, d6[0]
+ vmla.i16 q0, q2, d6[0]
+ vdup.8 d6, d6[0]
+ vrshrn.i16 d2, q1, #2
+ vrshrn.i16 d0, q0, #2
+
+ @ Store top line
+ vst1.8 {d0}, [r0], r3
+
+ @ Store the rest
+ vshr.u64 d2, #8
+ vbit d6, d2, d7
+ vshr.u64 d2, #8
+ vst1.8 {d6}, [r0], r3
+ mov r1, #6
+1:
+ vbit d6, d2, d7
+ vshr.u64 d2, #8
+ vst1.8 {d6}, [r0], r3
+ subs r1, #2
+ vbit d6, d2, d7
+ vshr.u64 d2, #8
+ vst1.8 {d6}, [r0], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_c_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {q0}, [r1]
+ mov r1, #8
+ vld1.8 {q1}, [r2]
+T lsl r3, #1
+ vaddl.u8 q0, d0, d1
+A add r2, r0, r3, lsl #1
+A lsl r3, #2
+T add r2, r0, r3
+T lsl r3, #1
+ vaddl.u8 q1, d2, d3
+ vadd.i16 q1, q0
+ vadd.i16 d3, d2 @ d3 has 2 val pairs
+        vpadd.i32  d2, d3, d3         @ This adds U & V separately
+ vpadd.i32 d3, d3, d3
+ vrshrn.u16 d0, q1, #4
+ vrshrn.u16 d1, q1, #4
+
+ @ Store
+1:
+ vst1.8 {q0}, [r0], r3
+ subs r1, #4
+ vst1.8 {q0}, [r2], r3
+ vst1.8 {q0}, [r0], r3
+ vst1.8 {q0}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_16_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {q8}, [r1]
+ mov r1, #2
+ vld1.8 {q9}, [r2]
+ vaddl.u8 q10, d16, d17
+ vaddl.u8 q11, d16, d18
+ vaddl.u8 q0, d18, d19
+ vmov.i16 q1, #3
+ vadd.i16 q10, q0
+ vmovl.u8 q0, d18
+ vadd.i16 d20, d21
+ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+ @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vmovl.u8 q2, d16
+ vmovl.u8 q9, d19
+ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
+ vmov.i64 d7, #0xffff
+ vmovl.u8 q8, d17
+ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
+ vmov.i64 d7, #0xff
+ vpadd.i16 d20, d20 @ 1 (all the same)
+ vrshr.u16 d21, d20, #5
+ vrshr.u16 d20, d20, #5
+ vmla.i16 q0, q10, d2[1]
+ vmla.i16 q9, q10, d2[1]
+ vmla.i16 q2, q10, q1
+ vmla.i16 q8, q10, d2[1]
+ vdup.8 q1, d20[0]
+ vrshrn.i16 d0, q0, #2
+ vrshrn.i16 d1, q9, #2
+ vrshrn.i16 d4, q2, #2
+ vrshrn.i16 d5, q8, #2
+ vext.8 q0, q0, q0, #1
+
+ @ Store top line
+ vst1.8 {q2}, [r0], r3
+
+ @ Store the rest
+ mov r1, #15
+1:
+ vbit d2, d0, d7
+ vext.8 q0, q0, q0, #1
+ subs r1, #1
+ vst1.8 {q1}, [r0], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_c_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {q0-q1}, [r1]
+ mov r1, #16
+ vld1.8 {q2-q3}, [r2]
+T lsl r3, #1
+ vaddl.u8 q0, d0, d1
+A add r2, r0, r3, lsl #1
+T add r2, r0, r3
+ vaddl.u8 q1, d2, d3
+A lsl r3, #2
+T lsl r3, #1
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q1
+ vadd.i16 q2, q3
+ vadd.i16 q0, q2
+ vadd.i16 d0, d1 @ d0 has 2 val pairs
+ vpadd.i32 d4, d0, d0 @ This adds U & V separately
+ vpadd.i32 d5, d0, d0
+ vrshrn.u16 d0, q2, #5
+ vrshrn.u16 d1, q2, #5
+ vrshrn.u16 d2, q2, #5
+ vrshrn.u16 d3, q2, #5
+
+ @ Store
+1:
+ vst1.8 {q0-q1}, [r0], r3
+ subs r1, #2
+ vst1.8 {q0-q1}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_32_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_32_neon_8, export=1
+
+ @ Average the els of top & left
+ vld1.8 {q0-q1}, [r1]
+ mov r1, #32
+ vld1.8 {q2-q3}, [r2]
+ add r2, r0, r3
+ vaddl.u8 q0, d0, d1
+ lsl r3, #1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q1
+ vadd.i16 q2, q3
+ vadd.i16 q0, q2
+ vadd.i16 d0, d1 @ d0 has 4 vals
+ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
+ vpadd.i16 d4, d0, d0 @ 1 (all the same)
+ vpadd.i16 d5, d0, d0
+ vrshrn.u16 d0, q2, #6
+ vrshrn.u16 d1, q2, #6
+ vrshrn.u16 d2, q2, #6
+ vrshrn.u16 d3, q2, #6
+
+ @ Store
+1:
+ vst1.8 {q0-q1}, [r0], r3
+ subs r1, #2
+ vst1.8 {q0-q1}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ -----------------------------------------------------------------------------
+@
+@ 10 Bit versions
+@
+@ There is no actual bit depth dependency in this code except that at higher
+@ depths our intermediate results would overflow the 16 bits they are stored
+@ in. All these functions are good to 10 bits - with the worst case being
+@ in dc_32 where we use all 16 bits.
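+@
+@ Worked example of that headroom claim: dc_32 sums 32 top + 32 left
+@ samples, so the 10-bit worst case is 64 * 1023 = 65472, which still fits
+@ in u16 (65535); at 12 bits, 64 * 4095 = 262080 would not.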
+
+
+@ ff_hevc_rpi_pred_dc_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_4_neon_10, export=1
+
+ @ Average the els of top & left
+ vld1.16 {d0}, [r1]
+ mov r1, #2
+ vld1.16 {d1}, [r2]
+T lsl r3, #1
+ vmov.i16 q2, #3
+A add r2, r0, r3, lsl #1
+T add r2, r0, r3
+ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
+A lsl r3, #2
+T lsl r3, #1
+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
+ vmov.i64 d7, #0xffff
+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+ @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
+ vpadd.i16 d6, d6 @ 1 (all the same)
+ vrshr.u16 d6, #3
+ vmla.i16 q0, q2, d6[0]
+ vrshr.u16 q0, #2
+
+ @ Store top line
+ vst1.16 {d0}, [r0], r3
+
+ @ Store the rest
+ vshr.u64 d3, d1, #1*16
+ vshr.u64 d4, d1, #2*16
+ vshr.u64 d5, d1, #3*16
+ vbif d3, d6, d7
+ vbif d4, d6, d7
+ vst1.16 {d3}, [r2], r3
+ vbif d5, d6, d7
+ vst1.16 {d4}, [r0]
+ vst1.16 {d5}, [r2]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_c_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
+
+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
+
+ @ Average the els of top & left
+ vld1.8 {q0}, [r1]
+ vld1.8 {q1}, [r2]
+A add r2, r0, r3, lsl #2
+A lsl r3, #3
+T lsl r3, #2
+T add r2, r0, r3
+T lsl r3, #1
+ vadd.i16 q0, q1
+ vadd.i16 d0, d1 @ d0 has 2 val pairs
+ vpadd.i32 d2, d0, d0 @ This adds U & V separately
+ vpadd.i32 d3, d0, d0
+ vrshr.u16 q0, q1, #3
+
+ vst1.16 {q0}, [r0], r3
+ vst1.16 {q0}, [r2], r3
+ vst1.16 {q0}, [r0]
+ vst1.16 {q0}, [r2]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_8_neon_10, export=1
+
+ @ Average the els of top & left
+ vld1.16 {q0}, [r1]
+ mov r1, #2
+ vld1.16 {q8}, [r2]
+T lsl r3, #1
+ vmov.i16 q2, #3
+A add r2, r0, r3, lsl #1
+T add r2, r0, r3
+ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
+A lsl r3, #2
+T lsl r3, #1
+ vmov.i64 d7, #0xffff
+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
+ vadd.i16 d6, d2, d3 @ d6 has 4 vals
+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+ @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
+ vpadd.i16 d6, d6 @ 1 (all the same)
+ vrshr.u16 d6, #4
+ vmla.i16 q8, q2, d6[0]
+ vmla.i16 q0, q2, d6[0]
+ vdup.16 q2, d6[0]
+ vdup.16 q9, d6[0]
+ vrshr.u16 q8, q8, #2
+ vrshr.u16 q0, q0, #2
+ vext.16 q1, q8, q8, #1
+
+ @ Store top line
+ vst1.16 {q0}, [r0], r3
+
+ @ Store the rest
+ vbit d18, d2, d7
+ vst1.16 {q9}, [r2], r3
+ mov r1, #6
+1:
+ vext.16 q8, q8, q8, #2
+ subs r1, #2
+ vext.16 q1, q1, q1, #2
+ vbit d4, d16, d7
+ vst1.16 {q2}, [r0], r3
+ vbit d18, d2, d7
+ vst1.16 {q9}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_c_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
+
+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
+
+ @ Average the els of top & left
+ vld1.16 {q0-q1}, [r1]
+ mov r1, #8
+ vld1.16 {q2-q3}, [r2]
+T lsl r3, #2
+ vadd.i16 q1, q0
+A add r2, r0, r3, lsl #2
+A lsl r3, #3
+T add r2, r0, r3
+T lsl r3, #1
+ vadd.i16 q2, q3
+ vadd.i16 q1, q2
+ vadd.i16 d3, d2 @ d3 has 2 val pairs
+        vpadd.i32  d2, d3, d3         @ This adds U & V separately
+ vpadd.i32 d3, d3, d3
+ vrshr.u16 q0, q1, #4
+ vrshr.u16 q1, q1, #4
+
+ @ Store
+1:
+ vst1.8 {q0-q1}, [r0], r3
+ subs r1, #2
+ vst1.8 {q0-q1}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_dc_16_neon_10, export=1
+
+ @ Average the els of top & left
+ vld1.16 {q8-q9}, [r1]
+ mov r1, #2
+ vld1.16 {q10-q11}, [r2]
+ lsl r3, #1 @ stride given in pels
+ vadd.i16 q0, q8, q9
+ vadd.i16 q1, q10, q11
+ vmov.i16 q3, #3
+ vadd.i16 q1, q0
+ vadd.i16 d0, d16, d20
+ vmov.i64 d31, #0xffff
+ vadd.i16 d3, d2
+ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
+
+ @ top line gets some smoothing
+ @ (top[i] + 3*dc + 2) >> 2
+ @ as does left
+        @ top_line[0] is extra special
+ @ (top[0] + left[0] + 2*dc + 2) >> 2
+
+ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7]
+ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same)
+ vpadd.i16 d3, d3 @ 1 (all the same)
+ vrshr.u16 d2, d3, #5
+ vrshr.u16 d3, d3, #5
+ vmov q0, q1
+ vmla.i16 q10, q1, d6[1]
+ vmla.i16 q11, q1, d6[1]
+ vmla.i16 q8, q1, q3
+ vmla.i16 q9, q1, d6[1]
+ vrshr.u16 q2, q10, #2
+ vrshr.u16 q3, q11, #2
+ vrshr.u16 q8, #2
+ vrshr.u16 q9, #2
+ vext.16 q2, q2, q2, #1
+ mov r1, #7<<29
+
+ @ Store top line
+ vst1.16 {q8-q9}, [r0], r3
+
+ @ Store the rest
+1:
+ vbit d0, d4, d31
+ vext.16 q2, q2, q2, #1
+ subs r1, #1<<29
+ vst1.16 {q0-q1}, [r0], r3
+ bne 1b
+1:
+ vbit d0, d6, d31
+ vext.16 q3, q3, q3, #1
+ subs r1, #1<<29
+ vst1.16 {q0-q1}, [r0], r3
+ bne 1b
+
+ bx lr
+endfunc
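+
+@ The #7<<29 counter in the two loops above does double duty: the first
+@ loop decrements r1 seven times down to zero (7 rows from q2), then the
+@ second runs until r1 wraps back round to zero - 8 more iterations
+@ (8 rows from q3) - covering the 15 rows that follow the top line.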
+
+
+@ ff_hevc_rpi_pred_dc_c_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
+
+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
+
+ @ Average the els of top & left
+ vldm r1, {q0-q3}
+ vldm r2, {q8-q11}
+ vadd.i16 q0, q1
+ mov r1, #16
+ vadd.i16 q2, q3
+ add r2, r0, #32
+ vadd.i16 q8, q9
+ lsl r3, #2
+ vadd.i16 q10, q11
+ vadd.u16 q0, q2
+ vadd.u16 q8, q10
+ vadd.i16 q0, q8
+ vadd.i16 d0, d1 @ d0 has 2 val pairs
+ vpadd.i32 d4, d0, d0 @ This adds U & V separately
+ vpadd.i32 d5, d0, d0
+ vrshr.u16 q0, q2, #5
+ vrshr.u16 q1, q2, #5
+
+ @ Store
+1:
+ vst1.16 {q0-q1}, [r0], r3
+ subs r1, #1
+ vst1.16 {q0-q1}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_dc_32_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3] (In pels)
+
+function ff_hevc_rpi_pred_dc_32_neon_10, export=1
+
+ @ Average the els of top & left
+ @ With 10 bits we are (just) safe from overflow in i16
+ vldm r1, {q0-q3}
+ vldm r2, {q8-q11}
+ vadd.i16 q0, q1
+ mov r1, #32
+ vadd.i16 q2, q3
+ add r2, r0, #32
+ vadd.i16 q8, q9
+ lsl r3, #1
+ vadd.i16 q10, q11
+ vadd.u16 q0, q2
+ vadd.u16 q8, q10
+ vadd.i16 q0, q8
+ vadd.i16 d0, d1 @ d0 has 4 vals
+ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
+ vpadd.i16 d4, d0, d0 @ 1 (all the same)
+ vpadd.i16 d5, d0, d0
+ vrshr.u16 q0, q2, #6
+ vrshr.u16 q1, q2, #6
+
+ @ Store
+1:
+ vst1.16 {q0-q1}, [r0], r3
+ subs r1, #1
+ vst1.16 {q0-q1}, [r2], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
new file mode 100644
index 0000000000..f6969d3591
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
@@ -0,0 +1,881 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ All functions have the call
+@
+@ int ff_hevc_rpi_intra_filter_N_neon_PW(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+@
+@ Assumptions:
+@ (these wouldn't apply to all frame layouts but do apply to sand, so beware
+@ if reusing this code)
+@
+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
+@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
+@ N==8,PW=8 (chroma always PW>8) but have to cope with larger sizes
+@
+@ We always have at least 64 pixels of H frame width rounding - this lets us
+@ load UR without having to worry about exactly how many pixels are actually
+@ within the frame. As partial loads will only occur very occasionally this
+@ should be a win in nearly all cases.
+@
+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
+@ so we do no maths on the contents
+@
+@ No filtering in 32bit fns as they are chroma only
+
+
+.equ AVAIL_UR, 1
+.equ AVAIL_U, 2
+.equ AVAIL_UL, 4
+.equ AVAIL_L, 8
+.equ AVAIL_DL, 16
+
+.equ FILTER_LIGHT, 0x40
+.equ FILTER_STRONG, 0x80
+
+.equ AVAIL_S_UR_N_U_C, 32 - 1
+.equ AVAIL_S_U_N_UL_C, 32 - 2
+.equ AVAIL_S_UL_N_L_C, 32 - 3
+.equ AVAIL_S_L_N_DL_C, 32 - 4
+
+.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
+
+@ On entry
+@ r2 req
+@ r3 avail
+@ [sp, #sp_offset...] args
+@
+@ On Exit:
+@
+@ Extend values:
+@ d_l scalar contains value for L & DL
+@ if DL avail then this is is DL[0] so we don't need to load that
+@ d_ul scalar containing value for UL
+@ d_u scalar containing value for U
+@ d_ur scalar containing value for UR
+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
+@ This means that L-light-filter works even if nreq DL (we never filter
+@ req-DL without req-L, but we do filter req-L without req-DL)
+@ If UR avail then d_ur == a_ur so U-filter good too
+@
+@ Data load pointers (only load if req & avail):
+@ r4 DL + stride
+@ r10 L
+@ r6 U
+@ r5 UR
+@
+@ Others:
+@ r2 req
+@ r7 req & avail
+@ r3 L + stride
+@ r8 DL + stride * 2
+@ r9 stride * 2
+@ cs Load U
+@ mi Load UR
+@
+@ Clobbered:
+@ r12
+
+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
+
+.equ src_l\@, \sp_offset + 0
+.equ src_u\@, \sp_offset + 4
+.equ src_ur\@, \sp_offset + 8
+.equ stride\@, \sp_offset + 12
+.equ pw\@, (1 << \pw_s) @ pel width in bytes
+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes
+
+@ r9 stride
+@ r7 = ab_ul, r6 = a_u, r5 = a_ur
+@ r4 = b_dl, r10 = b_l, r8 = b_u
+
+ ldr r5, [sp, #src_ur\@]
+ lsl r12, r3, #AVAIL_S_U_DL_CPSR
+ ldr r10, [sp, #src_l\@]
+ ldr r9, [sp, #stride\@]
+ ldr r6, [sp, #src_u\@]
+
+ @ This is quite a slow instruction but it replaces
+ @ a decent number of tests that yield a max of 2 flags/op
+ @ It is annoying we can't branch on Q!
+ @ If L navail (ne) then DL must be navail (pl)
+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur
+
+ mov r4, r5
+ sub r7, r10, r9
+ it vs
+ movvs r4, r6
+ add r8, r6, #b_size\@ - pw\@
+ it cs
+ movcs r4, r7
+ ite ne
+ movne r10, r4
+ addeq r4, r7, r9, lsl #\log2_s
+ it cc
+ movcc r7, r10
+ it mi
+ addmi r4, r10, r9, lsl #\log2_s
+ vld1.\d_type {\d_ul}, [r7]
+ itt vc
+ movvc r8, r7
+ movvc r6, r7
+ vld1.\d_type {\d_l }, [r4], r9
+ tst r3, #AVAIL_UR
+ vld1.\d_type {\d_u }, [r6]
+ it eq
+ moveq r5, r8
+ and r7, r2, r3
+ add r8, r4, r9
+ vld1.\d_type {\d_ur}, [r5]
+ lsls r12, r7, #AVAIL_S_UR_N_U_C
+ add r3, r10, r9
+ lsl r9, #1
+.endm
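+
+@ Worked example of the msr trick above: with avail = AVAIL_L|AVAIL_U
+@ (0x0a), the lsl by AVAIL_S_U_DL_CPSR (27) moves avail bits 0..4 up to
+@ CPSR bits 27..31, so the single msr sets
+@   N = DL (0), Z = L (1), C = UL (0), V = U (1), Q = UR (0)
+@ after which each availability test is a single conditional op
+@ (mi/eq/cs/vs) with no further compares.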
+
+
+
+@ int ff_hevc_rpi_intra_filter_4_neon_8(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set pw_s, 0
+.set pw, (1 << pw_s)
+.set log2_s, 2
+
+function ff_hevc_rpi_intra_filter_4_neon_8, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
+
+ it cs
+ vldrcs s2, [r6]
+ ite pl
+ vmovpl s3, s4
+ vldrmi s3, [r5]
+
+ lsls r7, #AVAIL_S_L_N_DL_C
+ add r12, r0, #-pw
+ bpl 1f
+
+ vld1.8 {d0[0]}, [r10], r9
+ vld1.8 {d0[1]}, [r3], r9
+ vld1.8 {d0[2]}, [r10]
+ vld1.8 {d0[3]}, [r3]
+1:
+ bcc 1f
+ vld1.8 {d0[5]}, [r4], r9
+ vld1.8 {d0[6]}, [r8]
+ vld1.8 {d0[7]}, [r4]
+1:
+ vstr d1, [r1] @ Up
+ vst1.8 {d31[7]}, [r12]
+ vstr d0, [r0] @ Left
+ pop {r4-r10, pc}
+endfunc
+
+
+@ int ff_hevc_rpi_intra_filter_4_neon_16(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set pw_s, 1
+.set pw, (1 << pw_s)
+.set log2_s, 2
+
+function ff_hevc_rpi_intra_filter_4_neon_16, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
+
+ it cs
+ vldrcs d2, [r6]
+ it mi
+ vldrmi d3, [r5]
+ lsls r7, #AVAIL_S_L_N_DL_C
+ add r12, r0, #-pw
+ bpl 1f
+ vld1.16 {d0[0]}, [r10], r9
+ vld1.16 {d0[1]}, [r3], r9
+ vld1.16 {d0[2]}, [r10]
+ vld1.16 {d0[3]}, [r3]
+1:
+ bcc 1f
+ vld1.16 {d1[1]}, [r4], r9
+ vld1.16 {d1[2]}, [r8]
+ vld1.16 {d1[3]}, [r4]
+1:
+ vst1.16 {q1}, [r1] @ Up
+ vst1.16 {d31[3]}, [r12]
+ vst1.16 {q0}, [r0] @ Left
+ pop {r4-r10, pc}
+endfunc
+
+
+@ int ff_hevc_rpi_intra_filter_8_neon_8(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set pw_s, 0
+.set pw, (1 << pw_s)
+.set log2_s, 3
+
+function ff_hevc_rpi_intra_filter_8_neon_8, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
+
+ it cs
+ vldrcs d4, [r6]
+ it mi
+ vldrmi d5, [r5]
+
+ lsls r7, #AVAIL_S_L_N_DL_C
+ bpl 1f
+ vld1.8 {d0[0]}, [r10], r9
+ vld1.8 {d0[1]}, [r3], r9
+ vld1.8 {d0[2]}, [r10], r9
+ vld1.8 {d0[3]}, [r3], r9
+ vld1.8 {d0[4]}, [r10], r9
+ vld1.8 {d0[5]}, [r3], r9
+ vld1.8 {d0[6]}, [r10]
+ vld1.8 {d0[7]}, [r3]
+1:
+ bcc 1f
+ vld1.8 {d1[1]}, [r4], r9
+ vld1.8 {d1[2]}, [r8], r9
+ vld1.8 {d1[3]}, [r4], r9
+ vld1.8 {d1[4]}, [r8], r9
+ vld1.8 {d1[5]}, [r4], r9
+ vld1.8 {d1[6]}, [r8]
+ vld1.8 {d1[7]}, [r4]
+1:
+ tst r2, #FILTER_LIGHT
+ add r12, r0, #-pw
+ beq 10f
+
+ @ Luma light filter
+ vext.8 q8, q15, q2, #15
+ vext.8 q12, q15, q0, #15
+ vaddl.u8 q9, d17, d5
+ vaddl.u8 q8, d16, d4
+ vaddl.u8 q13, d25, d1
+ vaddl.u8 q12, d24, d0
+ vmov.u8 r3, d5[7] @ Save final pel
+ vmov.u8 r2, d1[7] @ Save final pel
+
+ vext.16 q2, q8, q9, #1
+ vext.16 q3, q9, q9, #1
+ vext.16 q0, q12, q13, #1
+ vext.16 q1, q13, q13, #1
+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
+ vadd.u16 q2, q8
+ vadd.u16 q3, q9
+ vadd.u16 q0, q12
+ vadd.u16 q1, q13
+
+ vrshrn.u16 d4, q2, #2
+ vrshrn.u16 d5, q3, #2
+ vrshrn.u16 d0, q0, #2
+ vrshrn.u16 d1, q1, #2
+ vrshr.u16 d30, #2
+ vmov.u8 d5[7], r3 @ Restore final pel
+ vmov.u8 d1[7], r2 @ Restore final pel
+ vdup.u8 d31, d30[0] @ d31[3] = d30[0]
+
+10:
+ vst1.8 {q2 }, [r1] @ Up
+ vst1.8 {d31[7]}, [r12] @ Up-left
+ vst1.8 {q0 }, [r0] @ Left
+ pop {r4-r10, pc}
+endfunc
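+
+@ The "light" filter applied above is the standard HEVC [1 2 1] reference
+@ sample smoothing; in scalar terms:
+@
+@   corner = (left[0] + 2*topleft + top[0] + 2) >> 2;        // d30/d31
+@   f[i]   = (p[i-1] + 2*p[i] + p[i+1] + 2) >> 2;            // vext + adds
+@
+@ with the last pel of each edge left unfiltered (hence the save/restore
+@ through r2/r3).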
+
+
+@ int ff_hevc_rpi_intra_filter_8_neon_16(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set ur_size, sp_base + 16
+.set dl_size, sp_base + 20
+.set pw_s, 1
+.set pw, (1 << pw_s)
+.set log2_s, 3
+.set p_size, (1 << log2_s) @ size in pels
+
+function ff_hevc_rpi_intra_filter_8_neon_16, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
+
+ it cs
+ vldmcs r6, {d4, d5}
+ ldr r12, [sp, #ur_size]
+ bpl 1f
+ cmp r12, #4
+ vldm r5, {d6, d7}
+ bgt 1f
+ vdup.16 d7, d6[3]
+1:
+ lsls r12, r7, #AVAIL_S_L_N_DL_C
+ vdup.16 q1, d0[0]
+ bpl 1f
+ vld1.16 {d0[0]}, [r10], r9
+ vld1.16 {d0[1]}, [r3], r9
+ vld1.16 {d0[2]}, [r10], r9
+ vld1.16 {d0[3]}, [r3], r9
+ vld1.16 {d1[0]}, [r10], r9
+ vld1.16 {d1[1]}, [r3], r9
+ vld1.16 {d1[2]}, [r10]
+ vld1.16 {d1[3]}, [r3]
+1:
+ bcc 1f
+ ldr r12, [sp, #dl_size]
+ vld1.16 {d2[1]}, [r4], r9
+ cmp r12, #p_size
+ vld1.16 {d2[2]}, [r8], r9
+ vld1.16 {d2[3]}, [r4], r9
+ blt 2f
+ vld1.16 {d3[0]}, [r8], r9
+ vld1.16 {d3[1]}, [r4], r9
+ vld1.16 {d3[2]}, [r8]
+ vld1.16 {d3[3]}, [r4]
+ b 1f
+2:
+ vdup.16 d3, d2[3]
+1:
+ tst r2, #FILTER_LIGHT
+ add r12, r0, #-pw
+ beq 10f
+
+ @ Luma light filter
+ vext.16 q9, q2, q3, #7
+ vext.16 q8, q15, q2, #7
+ vext.16 q13, q0, q1, #7
+ vext.16 q12, q15, q0, #7
+ vadd.u16 q9, q3
+ vadd.u16 q8, q2
+ vadd.u16 q13, q1
+ vadd.u16 q12, q0
+ vmov.u16 r3, d7[3] @ Save final pel
+ vmov.u16 r2, d3[3] @ Save final pel
+
+ vext.16 q2, q8, q9, #1
+ vext.16 q3, q9, q9, #1
+ vext.16 q0, q12, q13, #1
+ vext.16 q1, q13, q13, #1
+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
+ vadd.u16 q2, q8
+ vadd.u16 q3, q9
+ vadd.u16 q0, q12
+ vadd.u16 q1, q13
+
+ vrshr.u16 q2, #2
+ vrshr.u16 q3, #2
+ vrshr.u16 q0, #2
+ vrshr.u16 q1, #2
+ vrshr.u16 d30, #2
+ vmov.u16 d7[3], r3 @ Restore final pel
+ vmov.u16 d3[3], r2 @ Restore final pel
+ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
+
+10:
+ vst1.16 {q2, q3}, [r1] @ Up
+ vst1.16 {d31[3]}, [r12] @ Up-left
+ vst1.16 {q0, q1}, [r0] @ Left
+ pop {r4-r10, pc}
+endfunc
+
+@ int ff_hevc_rpi_intra_filter_16_neon_16(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set ur_size, sp_base + 16
+.set dl_size, sp_base + 20
+.set pw_s, 1
+.set pw, (1 << pw_s)
+.set log2_s, 4
+.set p_size, (1 << log2_s) @ size in pels
+
+function ff_hevc_rpi_intra_filter_16_neon_16, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
+
+ vdup.16 q9, d16[0]
+ vdup.16 q11, d20[0]
+
+ it cs
+ vldmcs r6, {d16-d19}
+ ldr r12, [sp, #ur_size]
+ bpl 1f
+ cmp r12, #12
+ @ Given chroma frame layout, if UR exists then it is always legit to
+ @ load all of it even if most of it is outside the frame.
+ vldm r5, {d20-d23}
+ bgt 1f
+ bge 4f
+ cmp r12, #8
+ bge 3f
+ vdup.16 d21, d20[3]
+3: vdup.16 d22, d21[3]
+4: vdup.16 d23, d22[3]
+
+1:
+ lsls r7, #AVAIL_S_L_N_DL_C
+ ldr r12, [sp, #dl_size]
+ vdup.16 q1, d0[0]
+ vdup.16 q2, d0[0]
+ vdup.16 q3, d0[0]
+ bpl 1f
+ vld1.16 {d0[0]}, [r10], r9
+ vld1.16 {d0[1]}, [r3], r9
+ vld1.16 {d0[2]}, [r10], r9
+ vld1.16 {d0[3]}, [r3], r9
+ vld1.16 {d1[0]}, [r10], r9
+ vld1.16 {d1[1]}, [r3], r9
+ vld1.16 {d1[2]}, [r10], r9
+ vld1.16 {d1[3]}, [r3], r9
+ vld1.16 {d2[0]}, [r10], r9
+ vld1.16 {d2[1]}, [r3], r9
+ vld1.16 {d2[2]}, [r10], r9
+ vld1.16 {d2[3]}, [r3], r9
+ vld1.16 {d3[0]}, [r10], r9
+ vld1.16 {d3[1]}, [r3], r9
+ vld1.16 {d3[2]}, [r10]
+ vld1.16 {d3[3]}, [r3]
+1:
+ bcc 1f
+ vld1.16 {d4[1]}, [r4], r9
+ cmp r12, #4
+ vld1.16 {d4[2]}, [r8], r9
+ vld1.16 {d4[3]}, [r4], r9
+ ble 2f
+ vld1.16 {d5[0]}, [r8], r9
+ vld1.16 {d5[1]}, [r4], r9
+ cmp r12, #12
+ vld1.16 {d5[2]}, [r8], r9
+ vld1.16 {d5[3]}, [r4], r9
+ blt 3f
+ vld1.16 {d6[0]}, [r8], r9
+ vld1.16 {d6[1]}, [r4], r9
+ vld1.16 {d6[2]}, [r8], r9
+ vld1.16 {d6[3]}, [r4], r9
+ ble 4f
+ vld1.16 {d7[0]}, [r8], r9
+ vld1.16 {d7[1]}, [r4], r9
+ vld1.16 {d7[2]}, [r8]
+ vld1.16 {d7[3]}, [r4]
+ b 1f
+2: vdup.16 d5, d4[3]
+3: vdup.16 d6, d5[3]
+4: vdup.16 d7, d6[3]
+1:
+ tst r2, #FILTER_LIGHT
+ add r12, r0, #-pw
+ beq 10f
+
+ vpush {q5}
+ @ Luma light filter
+ @ Left
+ vext.16 q5, q2, q3, #7
+ vext.16 q14, q1, q2, #7
+ vext.16 q13, q0, q1, #7
+ vext.16 q12, q15, q0, #7
+
+ vadd.u16 q5, q3
+ vadd.u16 q14, q2
+ vadd.u16 q13, q1
+ vadd.u16 q12, q0
+ vmov.u16 r2, d7[3] @ Save final pel
+
+ vext.16 q0, q12, q13, #1
+ vext.16 q1, q13, q14, #1
+ vext.16 q2, q14, q5, #1
+ vext.16 q3, q5, q5, #1
+
+ vmov d30, d24 @ d30[0] = l[0] + ul
+ vadd.u16 q0, q12
+ vadd.u16 q1, q13
+ vadd.u16 q2, q14
+ vadd.u16 q3, q5
+
+ vrshr.u16 q0, #2
+ vrshr.u16 q1, #2
+ vrshr.u16 q2, #2
+ vrshr.u16 q3, #2
+
+ @ Up
+ vext.16 q5, q10, q11, #7
+ vext.16 q14, q9, q10, #7
+ vext.16 q13, q8, q9, #7
+ vext.16 q12, q15, q8, #7
+
+ vadd.u16 q5, q11
+ vadd.u16 q14, q10
+ vadd.u16 q13, q9
+ vadd.u16 q12, q8
+ vmov.u16 r3, d23[3] @ Save final pel
+
+ vext.16 q8, q12, q13, #1
+ vext.16 q9, q13, q14, #1
+ vext.16 q10, q14, q5, #1
+ vext.16 q11, q5, q5, #1
+
+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0]
+ vadd.u16 q8, q12
+ vadd.u16 q9, q13
+ vadd.u16 q10, q14
+ vadd.u16 q11, q5
+
+ vrshr.u16 q8, #2
+ vrshr.u16 q9, #2
+ vrshr.u16 q10, #2
+ vrshr.u16 q11, #2
+
+ @ Misc
+ vrshr.u16 d30, #2
+ vmov.u16 d7[3], r2 @ Restore final pel
+ vmov.u16 d23[3], r3 @ Restore final pel
+ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
+ vpop {q5}
+
+10:
+ vstm r1, {d16-d23} @ Up
+ vst1.16 {d31[3]}, [r12] @ Up-left
+ vstm r0, { d0-d7 } @ Left
+ pop {r4-r10, pc}
+endfunc
+
+@ int ff_hevc_rpi_intra_filter_4_neon_32(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set pw_s, 2
+.set pw, (1 << pw_s)
+.set log2_s, 2
+
+function ff_hevc_rpi_intra_filter_4_neon_32, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
+
+ it cs
+ vldmcs r6, {d4, d5}
+ it mi
+ vldmmi r5, {d6, d7}
+ lsls r7, #AVAIL_S_L_N_DL_C
+ vdup.32 q1, d0[0]
+ add r12, r0, #-pw
+ bpl 1f
+ vld1.32 {d0[0]}, [r10], r9
+ vld1.32 {d0[1]}, [r3], r9
+ vld1.32 {d1[0]}, [r10]
+ vld1.32 {d1[1]}, [r3]
+1:
+ bcc 1f
+ vld1.32 {d2[1]}, [r4], r9
+ vld1.32 {d3[0]}, [r8]
+ vld1.32 {d3[1]}, [r4]
+1:
+ vst1.32 {q2, q3 }, [r1] @ Up
+ vst1.32 {d31[1]}, [r12]
+ vst1.32 {q0, q1 }, [r0] @ Left
+ pop {r4-r10, pc}
+endfunc
+
+
+@ int ff_hevc_rpi_intra_filter_8_neon_32(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set ur_size, sp_base + 16
+.set dl_size, sp_base + 20
+.set pw_s, 2
+.set pw, (1 << pw_s)
+.set log2_s, 3
+.set p_size, (1 << log2_s) @ size in pels
+
+function ff_hevc_rpi_intra_filter_8_neon_32, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
+
+ vdup.32 q9, d16[0]
+ vdup.32 q11, d20[0]
+
+ it cs
+ vldmcs r6, {q8, q9 }
+ ldr r12, [sp, #ur_size]
+ bpl 1f
+ cmp r12, #p_size
+ vldm r5, {q10, q11}
+ bge 1f
+ vdup.32 q11, d21[1]
+1:
+ lsls r7, #AVAIL_S_L_N_DL_C
+ vdup.32 q1, d0[0]
+ vdup.32 q2, d0[0]
+ vdup.32 q3, d0[0]
+ bpl 1f
+ vld1.32 {d0[0]}, [r10], r9
+ vld1.32 {d0[1]}, [r3], r9
+ vld1.32 {d1[0]}, [r10], r9
+ vld1.32 {d1[1]}, [r3], r9
+ vld1.32 {d2[0]}, [r10], r9
+ vld1.32 {d2[1]}, [r3], r9
+ vld1.32 {d3[0]}, [r10]
+ vld1.32 {d3[1]}, [r3]
+1:
+ bcc 1f
+ ldr r12, [sp, #dl_size]
+ vld1.32 {d4[1]}, [r4], r9
+ cmp r12, #p_size
+ vld1.32 {d5[0]}, [r8], r9
+ vld1.32 {d5[1]}, [r4], r9
+ blt 2f
+ vld1.32 {d6[0]}, [r8], r9
+ vld1.32 {d6[1]}, [r4], r9
+ vld1.32 {d7[0]}, [r8]
+ vld1.32 {d7[1]}, [r4]
+ b 1f
+2:
+ vdup.32 q3, d5[1]
+1:
+ add r12, r0, #-pw
+ vstm r1, { q8-q11} @ Up
+ vst1.32 {d31[1]}, [r12]
+ vstm r0, { q0-q3 } @ Left
+ pop {r4-r10, pc}
+endfunc
+
+
+@ int ff_hevc_rpi_intra_filter_16_neon_32(
+@ pixel * const left, [r0]
+@ pixel * const top, [r1]
+@ const unsigned int req, [r2]
+@ const unsigned int avail, [r3]
+@ const pixel * const src_l, [sp, #0]
+@ const pixel * const src_u, [sp, #4]
+@ const pixel * const src_ur, [sp, #8]
+@ const unsigned int stride, [sp, #12] (pels)
+@ const unsigned int top_right_size, [sp, #16]
+@ const unsigned int down_left_size) [sp, #20]
+
+.set sp_base, 8*4
+.set ur_size, sp_base + 16
+.set dl_size, sp_base + 20
+.set pw_s, 2
+.set pw, (1 << pw_s)
+.set log2_s, 4
+.set p_size, (1 << log2_s) @ size in pels
+
+function ff_hevc_rpi_intra_filter_16_neon_32, export=1
+ push {r4-r10, lr}
+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
+
+        @ Once we get this big we have run out of NEON regs to store
+        @ everything at once, so do it in pieces
+
+ @ Up (have)
+ it cs
+ vldmcs r6, { q0-q3 }
+ ldr r12, [sp, #ur_size]
+ it mi
+ vldmmi r5, { q8-q11}
+ it cs
+ vstmcs r1, { q0-q3 }
+ bpl 1f
+ cmp r12, #12
+ add lr, r1, #(pw << log2_s)
+ bgt 2f
+ cmp r12, #8
+ bge 3f
+ vdup.16 q9, d17[1]
+4: vdup.16 d10, d19[1]
+3: vdup.16 q11, d21[1]
+2: vstm lr, { q8-q11}
+1:
+
+ @ Left (have)
+ add lr, r0, #-pw
+ lsls r12, r7, #AVAIL_S_L_N_DL_C
+ vst1.32 {d30[1]}, [lr] @ UL
+ bpl 1f
+ vld1.32 { d0[0]}, [r10], r9
+ vld1.32 { d0[1]}, [r3], r9
+ vld1.32 { d1[0]}, [r10], r9
+ vld1.32 { d1[1]}, [r3], r9
+ vld1.32 { d2[0]}, [r10], r9
+ vld1.32 { d2[1]}, [r3], r9
+ vld1.32 { d3[0]}, [r10], r9
+ vld1.32 { d3[1]}, [r3], r9
+ vld1.32 { d4[0]}, [r10], r9
+ vld1.32 { d4[1]}, [r3], r9
+ vld1.32 { d5[0]}, [r10], r9
+ vld1.32 { d5[1]}, [r3], r9
+ vld1.32 { d6[0]}, [r10], r9
+ vld1.32 { d6[1]}, [r3], r9
+ vld1.32 { d7[0]}, [r10]
+ vld1.32 { d7[1]}, [r3]
+ vstm r0, { q0-q3 }
+1:
+ bcc 1f
+ ldr r12, [sp, #dl_size]
+ vdup.32 d16, d30[0] @ d16[0] = d30[0]
+ add lr, r0, #(pw << log2_s)
+ vld1.32 {d16[1]}, [r4], r9
+ cmp r12, #4
+ vld1.32 {d17[0]}, [r8], r9
+ vld1.32 {d17[1]}, [r4], r9
+ ble 2f
+ vld1.32 {d18[0]}, [r8], r9
+ vld1.32 {d18[1]}, [r4], r9
+ cmp r12, #12
+ vld1.32 {d19[0]}, [r8], r9
+ vld1.32 {d19[1]}, [r4], r9
+ blt 3f
+ vld1.32 {d20[0]}, [r8], r9
+ vld1.32 {d20[1]}, [r4], r9
+ vld1.32 {d21[0]}, [r8], r9
+ vld1.32 {d21[1]}, [r4], r9
+ ble 4f
+ vld1.32 {d22[0]}, [r8], r9
+ vld1.32 {d22[1]}, [r4], r9
+ vld1.32 {d23[0]}, [r8]
+ vld1.32 {d23[1]}, [r4]
+ b 5f
+2: vdup.32 q9, d17[1]
+3: vdup.32 q10, d19[1]
+4: vdup.32 q11, d21[1]
+5: vstm lr, { q8-q11}
+1:
+ eors r7, r2
+ beq 99f
+
+ lsls r12, r7, #AVAIL_S_UR_N_U_C
+ vdup.32 q0, d31[0]
+ vdup.32 q1, d31[0]
+ vdup.32 q2, d31[0]
+ vdup.32 q3, d31[0]
+ add lr, r1, #(pw << log2_s)
+ vdup.32 q8, d31[1]
+ vdup.32 q9, d31[1]
+ vdup.32 q10, d31[1]
+ vdup.32 q11, d31[1]
+ it cs
+ vstmcs r1, { q0-q3 }
+ it mi
+ vstmmi lr, { q8-q11}
+
+ lsls r7, #AVAIL_S_L_N_DL_C
+ vdup.32 q0, d30[0]
+ vdup.32 q1, d30[0]
+ vdup.32 q2, d30[0]
+ vdup.32 q3, d30[0]
+ add lr, r0, #(pw << log2_s)
+ it mi
+ vstmmi r0, { q0-q3 }
+ it cs
+ vstmcs lr, { q0-q3 }
+
+99:
+ pop {r4-r10, pc}
+endfunc
+
+
+
+
diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
new file mode 100644
index 0000000000..56819ae439
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
@@ -0,0 +1,920 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+/*
+ * Horizontal & Vertical special cases of angular intra pred
+ *
+ * Split out because:
+ * Vertical, at least, is relatively common
+ * Much simpler code than the general angular case
+ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
+ *
+ * *** Currently luma filtering is mandatory where it occurs, but there are
+ * cases where it should be turned off (rdpcm & an extension sps flag).
+ * These don't occur in the standard conformance suite for Main Profile
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
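+@ In scalar terms the two special cases handled in this file are:
+@
+@   vertical:   pred[x][y] = top[x];
+@     (luma, size < 32:  pred[0][y] = clip(top[0] + ((left[y] - topleft) >> 1)))
+@   horizontal: pred[x][y] = left[y];
+@     (luma, size < 32:  pred[x][0] = clip(left[0] + ((top[x] - topleft) >> 1)))
+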
+@ ff_hevc_rpi_pred_vertical_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.32 {d0[0]}, [r2 :32] @ Left
+ add r2, r0, r3
+ vld1.8 {d1[]}, [r1]
+ lsl r3, #1
+ vdup.8 d4, ip
+ vmov.i8 d2, #128
+ vhsub.u8 d4, d0, d4
+ veor d1, d2
+ vld1.32 {d0[0]}, [r1 :32] @ Top
+ vqadd.s8 d1, d4
+ vmov.i64 d3, #0xff
+ vmov d4, d0
+ veor d5, d1, d2
+ veor d1, d1, d2
+ vbit d0, d1, d3
+ vshr.u64 d5, #8
+ vst1.32 {d0[0]}, [r0], r3
+ vshr.u64 d1, #16
+ vbit d4, d5, d3
+ vshr.u64 d5, #16
+ vst1.32 {d4[0]}, [r2], r3
+ vbit d0, d1, d3
+ vst1.32 {d0[0]}, [r0]
+ vbit d4, d5, d3
+ vst1.32 {d4[0]}, [r2]
+
+ bx lr
+endfunc
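+
+@ The veor #128 / vqadd.s8 / veor sequence above implements the clip with
+@ signed saturation: x ^ 0x80 rebases u8 values onto s8, vqadd.s8 then
+@ saturates at what maps back to 0/255, and the final veor removes the
+@ bias. The (left - topleft) >> 1 delta is signed, so a plain vqadd.u8
+@ would not do.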
+
+
+@ ff_hevc_rpi_pred_vertical_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {d0}, [r2 :64] @ Left
+ vmov.i8 d1, #128
+ vld1.8 {d2[]}, [r1]
+ vld1.8 {d3}, [r1 :64] @ Top
+ vdup.8 d4, ip
+ vhsub.u8 d4, d0, d4
+ veor d2, d1
+ vmov.i64 d0, #0xff
+ mov r1, #8
+ vqadd.s8 d2, d4, d2
+ veor d1, d2, d1
+1:
+ vbit d3, d1, d0
+ vshr.u64 d1, #8
+ vst1.8 {d3}, [r0 :64], r3
+ subs r1, #2
+ vbit d3, d1, d0
+ vshr.u64 d1, #8
+ vst1.8 {d3}, [r0 :64], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {q0}, [r2 :128] @ Left
+ vdup.8 q1, ip
+ vld1.8 {d4[],d5[]}, [r1]
+ vhsub.u8 q0, q1
+ vmov.i8 q1, #128
+ veor q2, q1
+ vmov.i64 d16, #0xff
+ vqadd.s8 q0, q2
+ vld1.8 {q3}, [r1 :128] @ Top
+ mov r1, #16
+ veor q0, q1
+ vmov q1, q3
+ vext.8 q2, q0, q0, #1
+1:
+ vbit d2, d0, d16
+ vbit d6, d4, d16
+ vext.8 q0, q0, q0, #2
+ subs r1, #2
+ vst1.8 {q1}, [r0 :128], r3
+ vext.8 q2, q2, q2, #2
+ vst1.8 {q3}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_32_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
+ vld1.8 {q0, q1 }, [r1 :128] @ Up
+ add r2, r0, r3
+ lsl r3, #1
+ mov r1, #16
+1:
+ vst1.8 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.8 {q0, q1 }, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
+ vld1.16 {d0 }, [r1 :64] @ Up
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+
+ vst1.16 {d0 }, [r0 :64], r3
+ vst1.16 {d0 }, [r2 :64], r3
+ vst1.16 {d0 }, [r0 :64]
+ vst1.16 {d0 }, [r2 :64]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
+ vld1.16 {q0 }, [r1 :128] @ Up
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+ mov r1, #4
+1:
+ vst1.16 {q0 }, [r0 :128], r3
+ subs r1, #2
+ vst1.16 {q0 }, [r2 :128], r3
+ vst1.16 {q0 }, [r0 :128], r3
+ vst1.16 {q0 }, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
+ vld1.16 {q0, q1 }, [r1 :128] @ Up
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+ mov r1, #8
+1:
+ vst1.16 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q0, q1 }, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+@ ? Might be faster as simple ARM code
+
+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.32 {d0[0]}, [r1 :32] @ Top
+ add r1, r2, #3
+ vld1.8 {d1[]}, [r2]!
+ vdup.8 d2, ip
+ vmov.i8 d3, #128
+ vhsub.u8 d0, d2
+ veor d1, d3
+ vld1.8 {d2[]}, [r2]!
+ add ip, r0, r3
+ vqadd.s8 d0, d0, d1
+ lsl r3, #1
+ vld1.8 {d1[]}, [r2]
+ vld1.8 {d4[]}, [r1]
+ veor d0, d3
+ vst1.32 {d0[0]}, [r0 :32], r3
+ vst1.32 {d2[0]}, [ip :32], r3
+ vst1.32 {d1[0]}, [r0 :32]
+ vst1.32 {d4[0]}, [ip :32]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {d0}, [r1 :64] @ Top
+ vmov.i8 d1, #128
+ vld1.8 {d2[]}, [r2]!
+ mov r1, #8-2
+ vdup.8 d3, ip
+ vhsub.u8 d0, d3
+ veor d2, d1
+ vqadd.s8 d0, d2
+ vld1.8 {d2[]}, [r2]!
+ veor d0, d1
+ vst1.8 {d0}, [r0], r3
+1:
+ vld1.8 {d0[]}, [r2]!
+ subs r1, #2
+ vst1.8 {d2}, [r0 :64], r3
+ vld1.8 {d2[]}, [r2]!
+ vst1.8 {d0}, [r0 :64], r3
+ bne 1b
+
+ vst1.8 {d2}, [r0 :64]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {q0}, [r1 :64] @ Top
+ mov r1, #16-2
+ vld1.8 {d4[],d5[]}, [r2]!
+ vdup.8 q3, ip
+ vhsub.u8 q0, q3
+ vmov.i8 q1, #128
+ veor q2, q1
+ vqadd.s8 q0, q2
+ vld1.8 {d4[],d5[]}, [r2]!
+ veor q0, q1
+ vst1.8 {q0}, [r0], r3
+1:
+ vld1.8 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.8 {q2}, [r0 :64], r3
+ vld1.8 {d4[],d5[]}, [r2]!
+ vst1.8 {q0}, [r0 :64], r3
+ bne 1b
+
+ vst1.8 {q2}, [r0 :64]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_32_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
+ vld1.8 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ mov r1, #32-2
+ vld1.8 {d2[],d3[]}, [r2]!
+ vst1.8 {q0}, [r0 :128], r3
+ vst1.8 {q0}, [ip :128], r3
+1:
+ vld1.8 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.8 {q1}, [r0 :128], r3
+ vst1.8 {q1}, [ip :128], r3
+ vld1.8 {d2[],d3[]}, [r2]!
+ vst1.8 {q0}, [r0 :128], r3
+ vst1.8 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.8 {q1}, [r0 :128]
+ vst1.8 {q1}, [ip :128]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
+ add r1, r2, #2
+ vld1.16 {d0[]}, [r2]
+ add r2, #4
+ vld1.16 {d1[]}, [r1]
+ add r1, #4
+ vld1.16 {d2[]}, [r2]
+A add r2, r0, r3, lsl #1
+T lsl r3, #1
+T add r2, r0, r3
+ vld1.16 {d3[]}, [r1]
+A lsl r3, #2
+T lsl r3, #1
+ vst1.16 {d0}, [r0 :64], r3
+ vst1.16 {d1}, [r2 :64], r3
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d3}, [r2 :64]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
+ vld1.16 {d0[],d1[]}, [r2]!
+ lsl r3, #1
+ vld1.16 {d2[],d3[]}, [r2]!
+ mov r1, #8-2
+ vst1.16 {q0}, [r0 :64], r3
+1:
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :64], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :64], r3
+ bne 1b
+
+ vst1.16 {q1}, [r0 :64]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
+ vld1.16 {d0[],d1[]}, [r2]!
+ lsl r3, #1
+ add ip, r0, #16
+ mov r1, #16-2
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+1:
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :128], r3
+ vst1.16 {q1}, [ip :128], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.16 {q1}, [r0 :128]
+ vst1.16 {q1}, [ip :128]
+ bx lr
+endfunc
+
+
+@------------------------------------------------------------------------------
+@
+@ 10 Bit
+@ Has clipping constants so 10-bit only but could easily be macroed up to
+@ 14-bit before we run out of bits
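+@ (The #0x3ff clip constants below are (1 << 10) - 1, the 10-bit sample maximum.)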
+
+
+@ ff_hevc_rpi_pred_vertical_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {d0}, [r2 :64] @ Left
+ vmov.i16 d2, #0
+ vld1.16 {d1[]}, [r1]
+T lsl r3, #1
+ vdup.16 d4, ip
+ vmov.i16 d3, #0x3ff
+ vld1.16 {d5}, [r1 :64] @ Top
+ vhsub.u16 d4, d0, d4
+ vmov.i64 d0, #0xffff
+A add r2, r0, r3, lsl #1
+T add r2, r0, r3
+ vadd.i16 d1, d1, d4
+ vmov d6, d5
+ vmax.s16 d1, d1, d2
+ vmin.s16 d2, d1, d3
+ vmin.s16 d1, d1, d3
+ vbit d5, d1, d0
+A lsl r3, #2
+T lsl r3, #1
+ vshr.u64 d2, #16
+ vshr.u64 d1, #32
+ vbit d6, d2, d0
+ vst1.16 {d5}, [r0], r3
+ vshr.u64 d2, #32
+ vst1.16 {d6}, [r2], r3
+ vbit d5, d1, d0
+ vst1.16 {d5}, [r0]
+ vbit d6, d2, d0
+ vst1.16 {d6}, [r2]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0}, [r2 :128] @ Left
+ lsl r3, #1
+ vdup.16 q1, ip
+ vld1.16 {d4[],d5[]}, [r1]
+ vhsub.u16 q0, q0, q1
+ vmov.i16 q1, #0
+ vadd.i16 q0, q2
+ vmov.i16 q2, #0x3ff
+ vld1.16 {q3}, [r1 :128] @ Top
+ mov r1, #8
+ vmax.s16 q0, q1
+ vmov q1, q3
+ vmin.s16 q0, q2
+ vmov.i64 d16, #0xffff
+ vext.16 q2, q0, q0, #1
+1:
+ vbit d2, d0, d16
+ vbit d6, d4, d16
+ vext.16 q0, q0, q0, #2
+ subs r1, #2
+ vst1.16 {q1}, [r0 :128], r3
+ vext.16 q2, q2, q2, #2
+ vst1.16 {q3}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0-q1}, [r2 :128] @ Left
+T lsl r3, #1
+ vdup.16 q2, ip
+A add r2, r0, r3, lsl #1
+T add r2, r0, r3
+ vld1.16 {d6[],d7[]}, [r1]
+A lsl r3, #2
+T lsl r3, #1
+ vhsub.u16 q0, q2
+ vhsub.u16 q1, q2
+ vadd.i16 q0, q3
+ vadd.i16 q1, q3
+ vmov.i16 q2, #0
+ vld1.16 {q8-q9}, [r1 :128] @ Top
+ mov r1, #0
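+ @ Counter trick: from 0, four subtractions of 1<<30 return r1 to zero,
+ @ so each of the two loops below runs 4 times (2 rows per iteration)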
+ vmov.i16 q3, #0x3ff
+ vmax.s16 q0, q2
+ vmax.s16 q1, q2
+ vmin.s16 q0, q3
+ vmin.s16 q1, q3
+ vmov q10, q8
+ vmov q11, q9
+ vext.16 q2, q0, q1, #1
+ vext.16 q3, q1, q1, #1
+ vmov.i64 d24, #0xffff
+1:
+ vbit d16, d0, d24
+ vbit d20, d4, d24
+ vext.16 q0, q0, q0, #2
+ subs r1, #1<<30
+ vst1.16 {q8-q9}, [r0 :128], r3
+ vext.16 q2, q2, q2, #2
+ vst1.16 {q10-q11}, [r2 :128], r3
+ bne 1b
+1:
+ vbit d16, d2, d24
+ vbit d20, d6, d24
+ vext.16 q1, q1, q1, #2
+ subs r1, #1<<30
+ vst1.16 {q8-q9}, [r0 :128], r3
+ vext.16 q3, q3, q3, #2
+ vst1.16 {q10-q11}, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_32_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
+ vldm r1, { q0-q3 } @ Up
+ lsl r3, #1
+ mov r1, #32
+ add r2, r0, #32
+1:
+ vst1.16 {q0-q1}, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q2-q3}, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
+ vld1.16 {q0 }, [r1 :128] @ Up
+ add r2, r0, r3, lsl #2
+ lsl r3, #3
+
+ vst1.16 {q0 }, [r0 :128], r3
+ vst1.16 {q0 }, [r2 :128], r3
+ vst1.16 {q0 }, [r0 :128]
+ vst1.16 {q0 }, [r2 :128]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
+ vld1.16 {q0, q1 }, [r1 :128] @ Up
+ add r2, r0, r3, lsl #2
+ lsl r3, #3
+ mov r1, #4
+1:
+ vst1.16 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q0, q1 }, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_vertical_c_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
+ vldm r1, { q0-q3 } @ Up
+ lsl r3, #2
+ mov r1, #16
+ add r2, r0, #32
+1:
+ vst1.16 {q0-q1}, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q2-q3}, [r2 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+@ ff_hevc_rpi_pred_horizontal_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {d0}, [r1 :64] @ Top
+ vmov.i16 d1, #0
+ vld1.16 {d2[]}, [r2]!
+T lsl r3, #1
+ vdup.16 d3, ip
+ vmov.i16 d4, #0x3ff
+ vhsub.u16 d0, d3
+A add ip, r0, r3, lsl #1
+T add ip, r0, r3
+ vld1.16 {d3[]}, [r2]!
+A lsl r3, #2
+T lsl r3, #1
+ vadd.i16 d0, d2
+ vld1.16 {d2[]}, [r2]!
+ vmax.s16 d0, d1
+ vld1.16 {d1[]}, [r2]
+ vmin.s16 d0, d4
+ vst1.16 {d0}, [r0 :64], r3
+ vst1.16 {d3}, [ip :64], r3
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d1}, [ip :64]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0}, [r1 :128] @ Top
+ lsl r3, #1
+ vdup.16 q1, ip
+ mov r1, #8-2
+ vhsub.u16 q0, q1
+ vld1.16 {d2[],d3[]}, [r2]!
+ vmov.i16 q2, #0
+ vadd.i16 q0, q1
+ vmov.i16 q1, #0x3ff
+ vmax.s16 q0, q2
+ vld1.16 {d4[],d5[]}, [r2]!
+ vmin.s16 q0, q1
+ vst1.16 {q0}, [r0 :128], r3
+1:
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q2}, [r0 :128], r3
+ vld1.16 {d4[],d5[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ bne 1b
+
+ vst1.16 {q2}, [r0 :128]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0-q1}, [r1 :128] @ Top
+ lsl r3, #1
+ vdup.16 q2, ip
+ add ip, r0, r3
+ vhsub.u16 q0, q2
+ add ip, #16
+ vhsub.u16 q1, q2
+ mov r1, #16-2
+ vld1.16 {d4[],d5[]}, [r2]!
+ vmov.i16 q3, #0
+ vadd.i16 q0, q2
+ vadd.i16 q1, q2
+ vmov.i16 q2, #0x3ff
+ vmax.s16 q0, q3
+ vmax.s16 q1, q3
+ vld1.16 {d6[],d7[]}, [r2]!
+ vmin.s16 q0, q2
+ vmin.s16 q1, q2
+ vst1.16 {q0-q1}, [r0 :128], r3
+1:
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q3}, [r0 :128], r3
+ vst1.16 {q3}, [ip :128], r3
+ vld1.16 {d6[],d7[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.16 {q3}, [r0 :128]
+ vst1.16 {q3}, [ip :128]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_32_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
+ vld1.16 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ push {lr}
+ mov lr, #32
+ vld1.16 {d2[],d3[]}, [r2]!
+ lsl r3, #1
+ vst1.16 {q0}, [r0 :128], lr
+ sub r3, #32
+ vst1.16 {q0}, [ip :128], lr
+ mov r1, #32-2
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+1:
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :128], lr
+ vst1.16 {q1}, [ip :128], lr
+ vst1.16 {q1}, [r0 :128], r3
+ vst1.16 {q1}, [ip :128], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], lr
+ vst1.16 {q0}, [ip :128], lr
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.16 {q1}, [r0 :128], lr
+ vst1.16 {q1}, [ip :128], lr
+ vst1.16 {q1}, [r0 :128]
+ vst1.16 {q1}, [ip :128]
+ pop {pc}
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
+ add r1, r2, #4
+ vld1.32 {d0[],d1[]}, [r2]
+ add r2, #8
+ vld1.32 {d2[],d3[]}, [r1]
+ add r1, #8
+ vld1.32 {d4[],d5[]}, [r2]
+A add r2, r0, r3, lsl #2
+T lsl r3, #2
+T add r2, r0, r3
+ vld1.32 {d6[],d7[]}, [r1]
+A lsl r3, #3
+T lsl r3, #1
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q1}, [r2 :128], r3
+ vst1.32 {q2}, [r0 :128]
+ vst1.32 {q3}, [r2 :128]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
+ vld1.32 {d0[],d1[]}, [r2]!
+ lsl r3, #2
+ add ip, r0, #16
+ mov r1, #8-2
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+1:
+ vld1.32 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.32 {q1}, [r0 :128], r3
+ vst1.32 {q1}, [ip :128], r3
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.32 {q1}, [r0 :128]
+ vst1.32 {q1}, [ip :128]
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
+ vld1.32 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ push {lr}
+ mov lr, #32
+ vld1.32 {d2[],d3[]}, [r2]!
+ lsl r3, #2
+ vst1.32 {q0}, [r0 :128], lr
+ sub r3, #32
+ vst1.32 {q0}, [ip :128], lr
+ mov r1, #16-2
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+1:
+ vld1.32 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.32 {q1}, [r0 :128], lr
+ vst1.32 {q1}, [ip :128], lr
+ vst1.32 {q1}, [r0 :128], r3
+ vst1.32 {q1}, [ip :128], r3
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], lr
+ vst1.32 {q0}, [ip :128], lr
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.32 {q1}, [r0 :128], lr
+ vst1.32 {q1}, [ip :128], lr
+ vst1.32 {q1}, [r0 :128]
+ vst1.32 {q1}, [ip :128]
+ pop {pc}
+endfunc
+
+
+
diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
new file mode 100644
index 0000000000..af8c4c03f0
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
@@ -0,0 +1,1043 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ Planar intra pred (8.4.4.2.4)
+@
+@ predSamples[ x ][ y ] =
+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
+@ ( x + 1 ) * p[ nTbS ][ -1 ] +
+@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
+@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
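+@
+@ The implementations below evaluate this incrementally, as the per-function
+@ register notes indicate: with
+@   acc[x] = nTbS * p[x][-1] + (x+1) * p[nTbS][-1]    (set up once)
+@   add[x] = p[-1][nTbS] - p[x][-1]                   (set up once)
+@ each row first adds add[x] into acc[x], then computes
+@   predSamples[x][y] = (acc[x] + (nTbS-1-x) * p[-1][y]) >> (Log2(nTbS) + 1)
+@ with the rounding term supplied by the rounding-shift instructions.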
+
+@ All 10-bit functions would work with 9
+
+
+@ ff_hevc_rpi_pred_planar_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_4_neon_8, export=1
+
+ vld1.8 {d0}, [r1] @ Top
+ adr ip, nb_3_0_1_4
+ vld1.8 {d1}, [r2] @ Left
+ vmov.i64 d2, #0xffffffff
+ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4}
+ add r1, r0, r3
+ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3}
+ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4}
+ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4}
+ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0}
+ vshll.u8 q8, d4, #2
+ lsl r3, #1
+ vsubl.u8 q2, d5, d4
+ vmlal.u8 q8, d0, d3
+ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0}
+ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1}
+ vshl.s16 q9, q2, #1
+ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1}
+ vadd.i16 d16, d4
+ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2}
+ vadd.i16 d17, d18
+ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3}
+ vadd.i16 q2, q8, q9
+ vmlal.u8 q8, d0, d6
+ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3}
+ vmlal.u8 q2, d0, d7
+ vrshrn.i16 d0, q8, #3
+ vst1.32 d0[0], [r0 :32], r3
+ vst1.32 d0[1], [r1 :32], r3
+ vrshrn.i16 d0, q2, #3
+ vst1.32 d0[0], [r0 :32]
+ vst1.32 d0[1], [r1 :32]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_4_neon_10, export=1
+ @ Load from bytes & expand later - at the very least this uses less
+ @ memory than having a 16-bit table
+ vld1.16 {q0}, [r1 :64] @ Top
+ adr ip, nbh_3_0_1_4
+ vldr d2, [r2, #8] @ Left (lower)
+ vldr d3, [ip, #8] @ {1,2,3,4}
+T lsl r3, #1
+ vshl.s16 d4, d0, #2
+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4}
+ vldr d5, [r2] @ Left (upper)
+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4}
+ vldr d6, [ip] @ {3,2,1,0}
+ vmla.i16 d4, d3, d1 @ Acc set up
+ vsub.i16 d0, d2, d0 @ Add set up
+ vmov d7, d6
+ vdup.16 d2, d5[0]
+ vdup.16 d3, d5[1]
+ vdup.16 d16, d5[2]
+ vadd.i16 d18, d0, d4
+ vshl.s16 d0, #1 @ x2
+ vadd.i16 d19, d0, d4
+ vdup.16 d17, d5[3]
+ vadd.i16 d4, d0, d18
+A add r1, r0, r3, lsl #1
+T add r1, r0, r3
+ vadd.i16 d5, d0, d19
+A lsl r3, #2
+T lsl r3, #1
+ vmla.i16 q9, q1, q3
+ vmla.i16 q2, q8, q3
+ vrshr.u16 q0, q9, #3
+ vst1.16 {d0}, [r0], r3
+ vrshr.u16 d2, d4, #3
+ vst1.16 {d1}, [r1], r3
+ vrshr.u16 d3, d5, #3
+ vst1.16 {d2}, [r0]
+ vst1.16 {d3}, [r1]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_8_neon_8, export=1
+
+ vld1.8 {q0}, [r1] @ Top
+ adr ip, nb_7_0_1_8
+ vldr d2, [r2, #8] @ Left (lower)
+ mov r1, #8
+ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8}
+ vshll.u8 q2, d0, #3
+ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8}
+ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8}
+ vldr d6, [r2] @ Left (upper)
+ vmlal.u8 q2, d3, d1
+ vsubl.u8 q0, d2, d0
+ vldr d7, [ip] @ {7,6,5,4,3,2,1,0}
+
+@ u8 7..0 [1] d7
+@ u8 left[y] [1] d6
+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
+@ u16 add [2] q0 = p[-1][nTbS] - p[x][-1]
+
+ vdup.8 d2, d6[0]
+ vadd.i16 q2, q0
+ vdup.8 d3, d6[1]
+ vadd.i16 q8, q2, q0
+1:
+ vmlal.u8 q2, d7, d2
+ subs r1, #2
+ vadd.i16 q9, q8, q0
+ vmlal.u8 q8, d7, d3
+ vdup.8 d2, d6[2]
+ vdup.8 d3, d6[3]
+ vrshrn.i16 d20, q2, #4
+ vshr.u64 d6, #16
+ vmov q2, q9
+ vst1.8 {d20}, [r0], r3
+ vrshrn.i16 d20, q8, #4
+ vadd.i16 q8, q2, q0
+ vst1.8 {d20}, [r0], r3
+ bne 1b
+
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_8_neon_10, export=1
+
+ adr ip, nb_7_0_1_8
+ vld1.16 {q0}, [r1 :128]! @ Top (left)
+ lsl r3, #1
+ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
+ add ip, r2, #16
+ vld1.16 {d4[],d5[]}, [r1] @ Top (right)
+ mov r1, #8-2
+ vshl.s16 q3, q0, #3
+ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8}
+ vld1.16 {d18[],d19[]}, [ip] @ Left (lower)
+ vmla.i16 q3, q8, q2 @ Acc set up
+ vsub.i16 q0, q9, q0 @ Add set up
+ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0}
+ vadd.i16 q2, q3, q0
+
+@ u16 7..0 [1] q1
+@ u32 left[y] [1] [r2]
+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
+@ u16 add [1] q0 = p[-1][nTbS] - p[x][-1]
+
+ vld1.16 {d6[],d7[]}, [r2]!
+ vadd.i16 q8, q2, q0
+ vld1.16 {d18[],d19[]}, [r2]!
+ vmla.i16 q2, q1, q3
+ vadd.i16 q3, q8, q0
+ vmla.i16 q8, q1, q9
+1:
+ vrshr.u16 q9, q2, #4
+ subs r1, #2
+ vmov q2, q3
+ vrshr.u16 q10, q8, #4
+ vld1.16 {d6[],d7[]}, [r2]!
+ vst1.16 {q9}, [r0 :128], r3
+ vadd.i16 q8, q2, q0
+ vld1.16 {d18[],d19[]}, [r2]!
+ vmla.i16 q2, q1, q3
+ vadd.i16 q3, q8, q0
+ vmla.i16 q8, q1, q9
+ vst1.16 {q10}, [r0 :128], r3
+ bne 1b
+
+ vrshr.u16 q9, q2, #4
+ add r3, r0
+ vrshr.u16 q10, q8, #4
+ vst1.16 {q9}, [r0 :128]
+ vst1.16 {q10}, [r3 :128]
+
+ bx lr
+endfunc
+
+
+@------------------------------------------------------------------------------
+@
+@ Data - has to be in two lumps to ensure we can always reach using adr
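+@ (adr encodes a limited PC-relative offset - on the order of +/-255 bytes in
+@ ARM state, up to 4095 in Thumb-2 - so the tables sit between their users.)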
+
+ .balign 64
+
+nb_31_0_1_32:
+ .byte 31, 30, 29, 28, 27, 26, 25, 24
+ .byte 23, 22, 21, 20, 19, 18, 17, 16
+nb_15_0_1_16:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+ .byte 1, 2, 3, 4, 5, 6, 7, 8
+ .byte 9, 10, 11, 12, 13, 14, 15, 16
+ .byte 17, 18, 19, 20, 21, 22, 23, 24
+ .byte 25, 26, 27, 28, 29, 30, 31, 32
+
+ @ should be back on a 64-byte boundary here
+
+ @ These could be extracted from the above array, but are separated
+ @ out for better (16 byte) alignment
+nb_3_0_1_4:
+ .byte 3, 2, 1, 0, 3, 2, 1, 0
+ .byte 1, 2, 3, 4, 1, 2, 3, 4
+nb_7_0_1_8:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+ .byte 1, 2, 3, 4, 5, 6, 7, 8
+nbh_3_0_1_4:
+ .short 3, 2, 1, 0, 1, 2, 3, 4
+
+@------------------------------------------------------------------------------
+
+
+@ ff_hevc_rpi_pred_planar_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_16_neon_8, export=1
+
+ adr ip, nb_15_0_1_16 + 16
+ vld1.8 {q0}, [r1 :128]! @ Top (left)
+ add r2, #16
+ vld1.8 {q1}, [ip :128] @ {1,2,3...16}
+ vld1.8 {d4[]}, [r1] @ Top (right)
+ sub ip, #16
+ vshll.u8 q3, d0, #4
+ mov r1, #16
+ vshll.u8 q8, d1, #4
+ vld1.8 {d5[]}, [r2] @ Left (lower)
+ sub r2, #16
+ vmlal.u8 q3, d2, d4
+ vmlal.u8 q8, d3, d4 @ Acc set up
+ vsubl.u8 q1, d5, d0
+ vsubl.u8 q0, d5, d1 @ Add set up
+ vld1.8 {q2}, [ip :128] @ {15,14,13...0}
+
+@ u8 15..0 [1] q2
+@ u8 left[y] [1] [r2]
+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
+@ u16 add [2] q1,q0 = p[-1][nTbS] - p[x][-1]
+
+ vadd.i16 q3, q1
+ vadd.i16 q8, q0
+1:
+ vadd.i16 q10, q3, q1
+ subs r1, #2
+ vld1.8 {d18[]}, [r2]!
+ vadd.i16 q11, q8, q0
+ vld1.8 {d19[]}, [r2]!
+ vmlal.u8 q3, d4, d18
+ vmlal.u8 q8, d5, d18
+ vadd.i16 q12, q10, q1
+ vmlal.u8 q10, d4, d19
+ vadd.i16 q13, q11, q0
+ vmlal.u8 q11, d5, d19
+ vrshrn.u16 d18, q3, #5
+ vrshrn.u16 d19, q8, #5
+ vmov q3, q12
+ vst1.8 {q9}, [r0 :128], r3
+ vrshrn.u16 d18, q10, #5
+ vrshrn.u16 d19, q11, #5
+ vmov q8, q13
+ vst1.8 {q9}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_16_neon_10, export=1
+
+ @ Load from bytes & expand later - at the very least this uses less
+ @ memory than having a 16-bit table
+ adr ip, nb_15_0_1_16 + 16
+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
+ add r2, #32
+ vld1.8 {q2}, [ip :128] @ {1,2,3...16}
+ lsl r3, #1
+ vld1.16 {d6[],d7[]}, [r1] @ Top (right)
+ sub ip, #16
+ vmovl.u8 q8, d4
+ mov r1, #16
+ vshl.i16 q9, q0, #4
+ vmovl.u8 q2, d5
+ vshl.i16 q10, q1, #4
+ vld1.16 {d22[],d23[]}, [r2] @ Left (lower)
+ sub r2, #32
+ vld1.8 {q12}, [ip] @ {15,14,13...0}
+ vmla.i16 q9, q8, q3
+ vmla.i16 q10, q2, q3 @ Acc set up
+ vsub.i16 q0, q11, q0
+ vsub.i16 q1, q11, q1 @ Add set up
+ vadd.i16 q2, q9, q0
+ vadd.i16 q3, q10, q1
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+
+@ u16 15..0 [2] q8,q9
+@ u32 left[y] [2] [r2]
+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
+@ u16 add [2] q0,q1 = p[-1][nTbS] - p[x][-1]
+
+1:
+ vadd.i16 q10, q2, q0
+ subs r1, #2
+ vld1.16 {d24[],d25[]}, [r2]!
+ vadd.i16 q11, q3, q1
+ vld1.16 {d28[],d29[]}, [r2]!
+ vmla.i16 q2, q8, q12
+ vmla.i16 q3, q9, q12
+ vadd.i16 q12, q10, q0
+ vmla.i16 q10, q8, q14
+ vadd.i16 q13, q11, q1
+ vmla.i16 q11, q9, q14
+ vrshr.u16 q14, q2, #5
+ vrshr.u16 q15, q3, #5
+ vmov q2, q12
+ vst1.16 {q14-q15}, [r0 :128], r3
+ vrshr.u16 q14, q10, #5
+ vrshr.u16 q15, q11, #5
+ vmov q3, q13
+ vst1.16 {q14-q15}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_32_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_32_neon_8, export=1
+
+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
+ adr ip, nb_31_0_1_32 + 32
+ vpush {d8-d12}
+ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32}
+ add r2, #32
+ vld1.8 {d8[]}, [r1] @ Top (right)
+ sub ip, #32
+ vshll.u8 q8, d0, #5
+ mov r1, #32
+ vld1.8 {d9[]}, [r2] @ Left (lower)
+ sub r2, #32
+ vshll.u8 q9, d1, #5
+ vshll.u8 q10, d2, #5
+ vshll.u8 q11, d3, #5
+ vmlal.u8 q8, d4, d8
+ vsubl.u8 q12, d9, d0
+ vmlal.u8 q9, d5, d8
+ vsubl.u8 q13, d9, d1
+ vmlal.u8 q10, d6, d8
+ vsubl.u8 q14, d9, d2
+ vmlal.u8 q11, d7, d8 @ Acc set up
+ vsubl.u8 q15, d9, d3 @ Add set up
+ vadd.i16 q8, q12
+ vadd.i16 q9, q13
+ vadd.i16 q10, q14
+ vadd.i16 q11, q15
+ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0}
+
+@ u8 31..0 [2] q4,q5
+@ u8 left[y] [2] [r2]
+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
+@ u16 add [4] q12-q15 = p[-1][nTbS] - p[x][-1]
+
+ vld1.8 {d12[]}, [r2]!
+ vadd.i16 q0, q8, q12
+ b 2f
+1:
+ vld1.8 {d12[]}, [r2]!
+ vrshrn.u16 d3, q1, #6
+ vrshrn.u16 d2, q0, #6
+ vadd.i16 q0, q8, q12
+ vrshrn.u16 d4, q2, #6
+ vrshrn.u16 d5, q3, #6
+ vst1.8 {q1-q2}, [r0 :128], r3
+2: vadd.i16 q1, q9, q13
+ subs r1, #2
+ vadd.i16 q2, q10, q14
+ vadd.i16 q3, q11, q15
+ vmlal.u8 q8, d8, d12
+ vmlal.u8 q9, d9, d12
+ vmlal.u8 q10, d10, d12
+ vmlal.u8 q11, d11, d12
+ vld1.8 {d12[]}, [r2]!
+ vrshrn.u16 d19, q9, #6
+ vrshrn.u16 d18, q8, #6
+ vadd.i16 q8, q0, q12
+ vrshrn.u16 d20, q10, #6
+ vrshrn.u16 d21, q11, #6
+ vst1.8 {q9-q10}, [r0 :128], r3
+ vadd.i16 q9, q1, q13
+ vadd.i16 q10, q2, q14
+ vadd.i16 q11, q3, q15
+ vmlal.u8 q0, d8, d12
+ vmlal.u8 q1, d9, d12
+ vmlal.u8 q2, d10, d12
+ vmlal.u8 q3, d11, d12
+
+ bne 1b
+
+ vpop {d8-d12}
+
+ vrshrn.u16 d3, q1, #6
+ vrshrn.u16 d2, q0, #6
+ vrshrn.u16 d4, q2, #6
+ vrshrn.u16 d5, q3, #6
+ vst1.8 {q1-q2}, [r0 :128]
+
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_32_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_32_neon_10, export=1
+
+ @ Load from bytes & expand later - at the very least this uses less
+ @ memory than having a 16-bit table
+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
+ adr ip, nb_31_0_1_32 + 32
+ vpush {q4-q7}
+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
+ add r2, #64
+ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32}
+T lsl r3, #1
+ vld1.16 {d8[],d9[]}, [r1] @ Top (right)
+ sub ip, #32
+ vmovl.u8 q12, d28
+ mov r1, #32
+ vmovl.u8 q13, d29
+ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0}
+ vmovl.u8 q14, d30
+ vmovl.u8 q15, d31
+ vld1.16 {d10[],d11[]}, [r2] @ Left (lower)
+ sub r2, #64
+ vshl.i16 q8, q0, #5
+ vshl.i16 q9, q1, #5
+ vshl.i16 q10, q2, #5
+ vshl.i16 q11, q3, #5
+ vmla.i16 q8, q12, q4
+ vsub.i16 q0, q5, q0
+ vmla.i16 q9, q13, q4
+ vsub.i16 q1, q5, q1
+ vmla.i16 q10, q14, q4
+ vmov.u16 ip, d0[0]
+ vsub.i16 q2, q5, q2
+ vmla.i16 q11, q15, q4 @ Acc set up
+ vsub.i16 q3, q5, q3 @ Add set up
+ vadd.i16 q8, q0
+ vadd.i16 q9, q1
+ vadd.i16 q10, q2
+ vadd.i16 q11, q3
+ vmovl.u8 q4, d12
+ vmovl.u8 q5, d13
+ vmovl.u8 q6, d14
+ vmovl.u8 q7, d15
+
+@ u16 31..0 [4] q4-q7
+@ u16 left[y] [4] [r2]
+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
+@ u16 add [4] q0-q3 = p[-1][nTbS] - p[x][-1]
+
+ vadd.i16 q12, q8, q0
+A sub r0, r0, r3, lsl #1
+T sub r0, r3
+1:
+ vld1.16 {d0[0]}, [r2]!
+A add r0, r0, r3, lsl #1
+T add r0, r3
+ vadd.i16 q13, q9, q1
+ subs r1, #2
+ vadd.i16 q14, q10, q2
+ vadd.i16 q15, q11, q3
+ vmla.i16 q8, q4, d0[0]
+ vmla.i16 q9, q5, d0[0]
+ vmla.i16 q10, q6, d0[0]
+ vmla.i16 q11, q7, d0[0]
+ vmov.16 d0[0], ip
+ vrshr.u16 q8, #6
+ vrshr.u16 q9, #6
+ vrshr.u16 q10, #6
+ vrshr.u16 q11, #6
+ vstm r0, {q8-q11}
+ vadd.i16 q8, q12, q0
+A add r0, r0, r3, lsl #1
+T add r0, r3
+ vld1.16 {d0[0]}, [r2]!
+ vadd.i16 q9, q13, q1
+ vadd.i16 q10, q14, q2
+ vadd.i16 q11, q15, q3
+ vmla.i16 q12, q4, d0[0]
+ vmla.i16 q13, q5, d0[0]
+ vmla.i16 q14, q6, d0[0]
+ vmla.i16 q15, q7, d0[0]
+ vmov.16 d0[0], ip
+ vrshr.u16 q12, #6
+ vrshr.u16 q13, #6
+ vrshr.u16 q14, #6
+ vrshr.u16 q15, #6
+ vstm r0, {q12-q15}
+ vadd.i16 q12, q8, q0
+ bne 1b
+
+ vpop {q4-q7}
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_c_4_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
+
+ vld1.8 {q0}, [r1] @ Top
+ adr ip, nbx2_3_0_1_4
+ vldr d2, [r2, #8] @ Left (lower)
+ mov r1, #4
+ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4}
+ lsl r3, #1
+ vshll.u8 q2, d0, #2
+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4}
+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4}
+ vldr d6, [r2] @ Left (upper)
+ vmlal.u8 q2, d3, d1
+ vsubl.u8 q0, d2, d0
+ vldr d7, [ip] @ {3,3,2,2,1,1,0,0}
+
+@ u8 3..0 [1] d7
+@ u8 left[y] [1] d6
+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
+@ u16 add [2] q0 = p[-1][nTbS] - p[x][-1]
+
+ vdup.16 d2, d6[0]
+ vadd.i16 q2, q0
+ vdup.16 d3, d6[1]
+ vadd.i16 q8, q2, q0
+1:
+ vmlal.u8 q2, d7, d2
+ subs r1, #2
+ vadd.i16 q9, q8, q0
+ vmlal.u8 q8, d7, d3
+ vdup.16 d2, d6[2]
+ vdup.16 d3, d6[3]
+ vrshrn.i16 d20, q2, #3
+ vmov q2, q9
+ vst1.8 {d20}, [r0], r3
+ vrshrn.i16 d20, q8, #3
+ vadd.i16 q8, q2, q0
+ vst1.8 {d20}, [r0], r3
+ bne 1b
+
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_c_4_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
+
+ adr ip, nbx2_3_0_1_4
+ vld1.16 {q0}, [r1 :128]! @ Top (left)
+ lsl r3, #2
+ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
+ add ip, r2, #16
+ vld1.32 {d4[],d5[]}, [r1] @ Top (right)
+ vshl.s16 q3, q0, #2
+ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4}
+ vld1.32 {d18[],d19[]}, [ip] @ Left (lower)
+ vmla.i16 q3, q8, q2 @ Acc set up
+ vsub.i16 q0, q9, q0 @ Add set up
+ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0}
+ vadd.i16 q2, q3, q0
+
+@ u16 3..0 [1] q1
+@ u32 left[y] [1] [r2]
+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
+@ u16 add [1] q0 = p[-1][nTbS] - p[x][-1]
+
+ vld1.32 {d6[],d7[]}, [r2]!
+ vadd.i16 q8, q2, q0
+ vld1.32 {d18[],d19[]}, [r2]!
+ vmla.i16 q2, q1, q3
+ vadd.i16 q3, q8, q0
+ vmla.i16 q8, q1, q9
+
+ vrshr.u16 q9, q2, #3
+ vmov q2, q3
+ vrshr.u16 q10, q8, #3
+ vld1.32 {d6[],d7[]}, [r2]!
+ vst1.16 {q9}, [r0 :128], r3
+ vadd.i16 q8, q2, q0
+ vld1.32 {d18[],d19[]}, [r2]!
+ vmla.i16 q2, q1, q3
+ vadd.i16 q3, q8, q0
+ vmla.i16 q8, q1, q9
+ vst1.16 {q10}, [r0 :128], r3
+
+ vrshr.u16 q9, q2, #3
+ add r3, r0
+ vrshr.u16 q10, q8, #3
+ vst1.16 {q9}, [r0 :128]
+ vst1.16 {q10}, [r3 :128]
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_c_8_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
+
+ adr ip, nbx2_7_0_1_8 + 16
+ vld1.8 {q0}, [r1 :128]! @ Top (left)
+ add r2, #16
+ vld1.8 {q1}, [ip :128] @ {1,1,2,2,3,3...8,8}
+ lsl r3, #1
+ vld1.16 {d4[]}, [r1] @ Top (right)
+ sub ip, #16
+ vshll.u8 q3, d0, #3
+ mov r1, #8
+ vshll.u8 q8, d1, #3
+ vld1.16 {d5[]}, [r2] @ Left (lower)
+ sub r2, #16
+ vmlal.u8 q3, d2, d4
+ vmlal.u8 q8, d3, d4 @ Acc set up
+ vsubl.u8 q1, d5, d0
+ vsubl.u8 q0, d5, d1 @ Add set up
+ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0}
+
+@ u8 7..0 [1] q2
+@ u8 left[y] [1] [r2]
+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
+@ u16 add [2] q1,q0 = p[-1][nTbS] - p[x][-1]
+
+ vadd.i16 q3, q1
+ vadd.i16 q8, q0
+1:
+ vadd.i16 q10, q3, q1
+ subs r1, #2
+ vld1.16 {d18[]}, [r2]!
+ vadd.i16 q11, q8, q0
+ vld1.16 {d19[]}, [r2]!
+ vmlal.u8 q3, d4, d18
+ vmlal.u8 q8, d5, d18
+ vadd.i16 q12, q10, q1
+ vmlal.u8 q10, d4, d19
+ vadd.i16 q13, q11, q0
+ vmlal.u8 q11, d5, d19
+ vrshrn.u16 d18, q3, #4
+ vrshrn.u16 d19, q8, #4
+ vmov q3, q12
+ vst1.8 {q9}, [r0 :128], r3
+ vrshrn.u16 d18, q10, #4
+ vrshrn.u16 d19, q11, #4
+ vmov q8, q13
+ vst1.8 {q9}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+
+endfunc
+
+
+@------------------------------------------------------------------------------
+@
+@ Data - has to be in two lumps to ensure we can always reach using adr
+
+ .balign 64
+
+nbx2_15_0_1_16:
+ .byte 15, 15, 14, 14, 13, 13, 12, 12
+ .byte 11, 11, 10, 10, 9, 9, 8, 8
+nbx2_7_0_1_8:
+ .byte 7, 7, 6, 6, 5, 5, 4, 4
+ .byte 3, 3, 2, 2, 1, 1, 0, 0
+ .byte 1, 1, 2, 2, 3, 3, 4, 4
+ .byte 5, 5, 6, 6, 7, 7, 8, 8
+ .byte 9, 9, 10, 10, 11, 11, 12, 12
+ .byte 13, 13, 14, 14, 15, 15, 16, 16
+
+ @ should be back on a 64-byte boundary here
+
+nbx2_3_0_1_4:
+ .byte 3, 3, 2, 2, 1, 1, 0, 0
+ .byte 1, 1, 2, 2, 3, 3, 4, 4
+
+@------------------------------------------------------------------------------
+
+
+@ ff_hevc_rpi_pred_planar_c_8_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
+
+ @ Load from bytes & expand later - at the very least this uses less
+ @ memory than having a 16-bit table
+ adr ip, nbx2_7_0_1_8 + 16
+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
+ add r2, #32
+ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8}
+ lsl r3, #2
+ vld1.32 {d6[],d7[]}, [r1] @ Top (right)
+ sub ip, #16
+ vmovl.u8 q8, d4
+ mov r1, #8
+ vshl.i16 q9, q0, #3
+ vmovl.u8 q2, d5
+ vshl.i16 q10, q1, #3
+ vld1.32 {d22[],d23[]}, [r2] @ Left (lower)
+ sub r2, #32
+ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0}
+ vmla.i16 q9, q8, q3
+ vmla.i16 q10, q2, q3 @ Acc set up
+ vsub.i16 q0, q11, q0
+ vsub.i16 q1, q11, q1 @ Add set up
+ vadd.i16 q2, q9, q0
+ vadd.i16 q3, q10, q1
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+
+@ u16 7..0 [2] q8,q9
+@ u32 left[y] [2] [r2]
+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
+@ u16 add [2] q0,q1 = p[-1][nTbS] - p[x][-1]
+
+1:
+ vadd.i16 q10, q2, q0
+ subs r1, #2
+ vld1.32 {d24[],d25[]}, [r2]!
+ vadd.i16 q11, q3, q1
+ vld1.32 {d28[],d29[]}, [r2]!
+ vmla.i16 q2, q8, q12
+ vmla.i16 q3, q9, q12
+ vadd.i16 q12, q10, q0
+ vmla.i16 q10, q8, q14
+ vadd.i16 q13, q11, q1
+ vmla.i16 q11, q9, q14
+ vrshr.u16 q14, q2, #4
+ vrshr.u16 q15, q3, #4
+ vmov q2, q12
+ vst1.16 {q14-q15}, [r0 :128], r3
+ vrshr.u16 q14, q10, #4
+ vrshr.u16 q15, q11, #4
+ vmov q3, q13
+ vst1.16 {q14-q15}, [r0 :128], r3
+ bne 1b
+
+ bx lr
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_c_16_neon_8
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
+
+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
+ adr ip, nbx2_15_0_1_16 + 32
+ vpush {d8-d12}
+ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16}
+ add r2, #32
+ vld1.16 {d8[]}, [r1] @ Top (right)
+ sub ip, #32
+ vshll.u8 q8, d0, #4
+ mov r1, #16
+ vld1.16 {d9[]}, [r2] @ Left (lower)
+ sub r2, #32
+ vshll.u8 q9, d1, #4
+ lsl r3, #1
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vmlal.u8 q8, d4, d8
+ vsubl.u8 q12, d9, d0
+ vmlal.u8 q9, d5, d8
+ vsubl.u8 q13, d9, d1
+ vmlal.u8 q10, d6, d8
+ vsubl.u8 q14, d9, d2
+ vmlal.u8 q11, d7, d8 @ Acc set up
+ vsubl.u8 q15, d9, d3 @ Add set up
+ vadd.i16 q8, q12
+ vadd.i16 q9, q13
+ vadd.i16 q10, q14
+ vadd.i16 q11, q15
+ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0}
+
+@ u8 15..0 [2] q4,q5
+@ u8 left[y] [2] [r2]
+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
+@ u16 add [4] q12-q15 = p[-1][nTbS] - p[x][-1]
+
+ vld1.16 {d12[]}, [r2]!
+ vadd.i16 q0, q8, q12
+ b 2f
+1:
+ vld1.16 {d12[]}, [r2]!
+ vrshrn.u16 d3, q1, #5
+ vrshrn.u16 d2, q0, #5
+ vadd.i16 q0, q8, q12
+ vrshrn.u16 d4, q2, #5
+ vrshrn.u16 d5, q3, #5
+ vst1.8 {q1-q2}, [r0 :128], r3
+2: vadd.i16 q1, q9, q13
+ subs r1, #2
+ vadd.i16 q2, q10, q14
+ vadd.i16 q3, q11, q15
+ vmlal.u8 q8, d8, d12
+ vmlal.u8 q9, d9, d12
+ vmlal.u8 q10, d10, d12
+ vmlal.u8 q11, d11, d12
+ vld1.16 {d12[]}, [r2]!
+ vrshrn.u16 d19, q9, #5
+ vrshrn.u16 d18, q8, #5
+ vadd.i16 q8, q0, q12
+ vrshrn.u16 d20, q10, #5
+ vrshrn.u16 d21, q11, #5
+ vst1.8 {q9-q10}, [r0 :128], r3
+ vadd.i16 q9, q1, q13
+ vadd.i16 q10, q2, q14
+ vadd.i16 q11, q3, q15
+ vmlal.u8 q0, d8, d12
+ vmlal.u8 q1, d9, d12
+ vmlal.u8 q2, d10, d12
+ vmlal.u8 q3, d11, d12
+
+ bne 1b
+
+ vpop {d8-d12}
+
+ vrshrn.u16 d3, q1, #5
+ vrshrn.u16 d2, q0, #5
+ vrshrn.u16 d4, q2, #5
+ vrshrn.u16 d5, q3, #5
+ vst1.8 {q1-q2}, [r0 :128]
+
+ bx lr
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_planar_c_16_neon_10
+@ uint8_t *_src, [r0]
+@ const uint8_t *_top, [r1]
+@ const uint8_t *_left, [r2]
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
+
+ @ Load from bytes & expand later - at the very least this uses less
+ @ memory than having a 16-bit table
+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
+ adr ip, nbx2_15_0_1_16 + 32
+ vpush {q4-q7}
+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
+ add r2, #64
+ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
+T lsl r3, #2
+ vld1.32 {d8[],d9[]}, [r1] @ Top (right)
+ sub ip, #32
+ vmovl.u8 q12, d28
+ mov r1, #16
+ vmovl.u8 q13, d29
+ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0}
+ vmovl.u8 q14, d30
+ vmovl.u8 q15, d31
+ vld1.32 {d10[],d11[]}, [r2] @ Left (lower)
+ sub r2, #64
+ vshl.i16 q8, q0, #4
+ vshl.i16 q9, q1, #4
+ vshl.i16 q10, q2, #4
+ vshl.i16 q11, q3, #4
+ vmla.i16 q8, q12, q4
+ vsub.i16 q0, q5, q0
+ vmla.i16 q9, q13, q4
+ vpush {q0}
+ vsub.i16 q1, q5, q1
+ vmla.i16 q10, q14, q4
+ vsub.i16 q2, q5, q2
+ vmla.i16 q11, q15, q4 @ Acc set up
+ vsub.i16 q3, q5, q3 @ Add set up
+ vadd.i16 q8, q0
+ vadd.i16 q9, q1
+ vadd.i16 q10, q2
+ vadd.i16 q11, q3
+ vmovl.u8 q4, d12
+ vmovl.u8 q5, d13
+ vmovl.u8 q6, d14
+ vmovl.u8 q7, d15
+
+@ u16 15..0 [4] q4-q7
+@ u16 left[y] [4] [r2]
+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
+@ u16 add [4] q0-q3 = p[-1][nTbS] - p[x][-1]
+
+ vadd.i16 q12, q8, q0
+A sub r0, r0, r3, lsl #2
+T sub r0, r3
+1:
+ vld1.32 {d0[],d1[]}, [r2]!
+A add r0, r0, r3, lsl #2
+T add r0, r3
+ vadd.i16 q13, q9, q1
+ subs r1, #2
+ vadd.i16 q14, q10, q2
+ vadd.i16 q15, q11, q3
+ vmla.i16 q8, q4, q0
+ vmla.i16 q9, q5, q0
+ vmla.i16 q10, q6, q0
+ vmla.i16 q11, q7, q0
+ vld1.16 {q0}, [sp]
+ vrshr.u16 q8, #5
+ vrshr.u16 q9, #5
+ vrshr.u16 q10, #5
+ vrshr.u16 q11, #5
+ vstm r0, {q8-q11}
+ vadd.i16 q8, q12, q0
+A add r0, r0, r3, lsl #2
+T add r0, r3
+ vld1.32 {d0[],d1[]}, [r2]!
+ vadd.i16 q9, q13, q1
+ vadd.i16 q10, q14, q2
+ vadd.i16 q11, q15, q3
+ vmla.i16 q12, q4, q0
+ vmla.i16 q13, q5, q0
+ vmla.i16 q14, q6, q0
+ vmla.i16 q15, q7, q0
+ vld1.16 {q0}, [sp]
+ vrshr.u16 q12, #5
+ vrshr.u16 q13, #5
+ vrshr.u16 q14, #5
+ vrshr.u16 q15, #5
+ vstm r0, {q12-q15}
+ vadd.i16 q12, q8, q0
+ bne 1b
+
+ vpop {q3-q7}
+ bx lr
+
+endfunc
diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index 914abfb6cc..d83d21d202 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
/* TODO: merge even and odd cases (or even merge all four calls to this
* function) in order to have only aligned reads from 'in' array
* and reduce number of load instructions */
- vld1.16 {d16, d17}, [r0, :64]!
- vld1.16 {d20, d21}, [r2, :128]!
+ vld1.16 {d4, d5}, [r0, :64]!
+ vld1.16 {d8, d9}, [r2, :128]!
- vmull.s16 q0, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmull.s16 q1, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
+ vmull.s16 q0, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmull.s16 q1, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
- vmlal.s16 q0, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmlal.s16 q1, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
+ vmlal.s16 q0, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q1, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
- vmlal.s16 q0, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmlal.s16 q1, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
+ vmlal.s16 q0, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q1, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
- vmlal.s16 q0, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmlal.s16 q1, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
+ vmlal.s16 q0, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q1, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
- vmlal.s16 q0, d16, d20
- vmlal.s16 q1, d17, d21
+ vmlal.s16 q0, d4, d8
+ vmlal.s16 q1, d5, d9
vpadd.s32 d0, d0, d1
vpadd.s32 d1, d2, d3
vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE
- vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vld1.16 {d2, d3, d4, d5}, [r2, :128]!
vdup.i32 d1, d0[1] /* TODO: can be eliminated */
vdup.i32 d0, d0[0] /* TODO: can be eliminated */
- vmull.s16 q10, d16, d0
- vmull.s16 q11, d17, d0
- vmlal.s16 q10, d18, d1
- vmlal.s16 q11, d19, d1
+ vmull.s16 q3, d2, d0
+ vmull.s16 q4, d3, d0
+ vmlal.s16 q3, d4, d1
+ vmlal.s16 q4, d5, d1
- vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */
- vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */
+ vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */
+ vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */
vst1.32 {d0, d1}, [r1, :128]
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
/* TODO: merge even and odd cases (or even merge all four calls to this
* function) in order to have only aligned reads from 'in' array
* and reduce number of load instructions */
- vld1.16 {d16, d17}, [r0, :64]!
- vld1.16 {d20, d21}, [r2, :128]!
-
- vmull.s16 q12, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmull.s16 q13, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
- vmull.s16 q14, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmull.s16 q15, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
-
- vmlal.s16 q12, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmlal.s16 q13, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
- vmlal.s16 q14, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmlal.s16 q15, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
-
- vmlal.s16 q12, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmlal.s16 q13, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
- vmlal.s16 q14, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmlal.s16 q15, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
-
- vmlal.s16 q12, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmlal.s16 q13, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
- vmlal.s16 q14, d18, d22
- vld1.16 {d16, d17}, [r0, :64]!
- vmlal.s16 q15, d19, d23
- vld1.16 {d20, d21}, [r2, :128]!
-
- vmlal.s16 q12, d16, d20
- vld1.16 {d18, d19}, [r0, :64]!
- vmlal.s16 q13, d17, d21
- vld1.16 {d22, d23}, [r2, :128]!
-
- vmlal.s16 q14, d18, d22
- vmlal.s16 q15, d19, d23
-
- vpadd.s32 d0, d24, d25
- vpadd.s32 d1, d26, d27
- vpadd.s32 d2, d28, d29
- vpadd.s32 d3, d30, d31
+ vld1.16 {d4, d5}, [r0, :64]!
+ vld1.16 {d8, d9}, [r2, :128]!
+
+ vmull.s16 q6, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmull.s16 q7, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
+ vmull.s16 q8, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmull.s16 q9, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
+
+ vmlal.s16 q6, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q7, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
+ vmlal.s16 q8, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
+
+ vmlal.s16 q6, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q7, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
+ vmlal.s16 q8, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
+
+ vmlal.s16 q6, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q7, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
+ vmlal.s16 q8, d6, d10
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d11
+ vld1.16 {d8, d9}, [r2, :128]!
+
+ vmlal.s16 q6, d4, d8
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q7, d5, d9
+ vld1.16 {d10, d11}, [r2, :128]!
+
+ vmlal.s16 q8, d6, d10
+ vmlal.s16 q9, d7, d11
+
+ vpadd.s32 d0, d12, d13
+ vpadd.s32 d1, d14, d15
+ vpadd.s32 d2, d16, d17
+ vpadd.s32 d3, d18, d19
vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE
vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE
@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
vdup.i32 d1, d0[1] /* TODO: can be eliminated */
vdup.i32 d0, d0[0] /* TODO: can be eliminated */
- vld1.16 {d16, d17}, [r2, :128]!
- vmull.s16 q12, d16, d0
- vld1.16 {d18, d19}, [r2, :128]!
- vmull.s16 q13, d17, d0
- vmull.s16 q14, d18, d0
- vmull.s16 q15, d19, d0
-
- vld1.16 {d16, d17}, [r2, :128]!
- vmlal.s16 q12, d16, d1
- vld1.16 {d18, d19}, [r2, :128]!
- vmlal.s16 q13, d17, d1
- vmlal.s16 q14, d18, d1
- vmlal.s16 q15, d19, d1
-
- vld1.16 {d16, d17}, [r2, :128]!
- vmlal.s16 q12, d16, d2
- vld1.16 {d18, d19}, [r2, :128]!
- vmlal.s16 q13, d17, d2
- vmlal.s16 q14, d18, d2
- vmlal.s16 q15, d19, d2
-
- vld1.16 {d16, d17}, [r2, :128]!
- vmlal.s16 q12, d16, d3
- vld1.16 {d18, d19}, [r2, :128]!
- vmlal.s16 q13, d17, d3
- vmlal.s16 q14, d18, d3
- vmlal.s16 q15, d19, d3
-
- vpadd.s32 d0, d24, d25 /* TODO: can be eliminated */
- vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */
- vpadd.s32 d2, d28, d29 /* TODO: can be eliminated */
- vpadd.s32 d3, d30, d31 /* TODO: can be eliminated */
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmull.s16 q6, d4, d0
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmull.s16 q7, d5, d0
+ vmull.s16 q8, d6, d0
+ vmull.s16 q9, d7, d0
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q6, d4, d1
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q7, d5, d1
+ vmlal.s16 q8, d6, d1
+ vmlal.s16 q9, d7, d1
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q6, d4, d2
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q7, d5, d2
+ vmlal.s16 q8, d6, d2
+ vmlal.s16 q9, d7, d2
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q6, d4, d3
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q7, d5, d3
+ vmlal.s16 q8, d6, d3
+ vmlal.s16 q9, d7, d3
+
+ vpadd.s32 d0, d12, d13 /* TODO: can be eliminated */
+ vpadd.s32 d1, d14, d15 /* TODO: can be eliminated */
+ vpadd.s32 d2, d16, d17 /* TODO: can be eliminated */
+ vpadd.s32 d3, d18, d19 /* TODO: can be eliminated */
vst1.32 {d0, d1, d2, d3}, [r1, :128]
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 2cca784f5a..48cb816b70 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -19,6 +19,7 @@
#include <stdint.h>
#include "libavutil/attributes.h"
+#include "libavutil/intreadwrite.h"
#include "libavcodec/vc1dsp.h"
#include "vc1dsp.h"
@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
+
void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
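+/* NEON inner loop: copies escape-free bytes from src to dst and returns how
+ * many trailing bytes it left unprocessed (inferred from its use in the
+ * wrapper below). */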
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+ /* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+ int dsize = 0;
+ while (size >= 4)
+ {
+ int found = 0;
+ while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+ {
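+            /* AV_RL32 is a little-endian load: this matches the byte
+             * sequence 00 00 03 xx with xx <= 0x03, a candidate escape. */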
+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ if (!found)
+ {
+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+ dst += skip;
+ src += skip;
+ size -= skip;
+ dsize += skip;
+ while (!found && size >= 4)
+ {
+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ }
+ if (found)
+ {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ ++src;
+ size -= 3;
+ dsize += 2;
+ }
+ }
+ while (size > 0)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ return dsize;
+}
+
#define FN_ASSIGN(X, Y) \
dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
+ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
+ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
+ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+
dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
FN_ASSIGN(1, 0);
FN_ASSIGN(2, 0);
@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
}
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 93f043bf08..96014fbebc 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
vst1.32 {d1[1]}, [r0,:32]
bx lr
endfunc
+
+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of lower block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
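+@
+@ A scalar sketch of the per-pair decision implemented below, reconstructed
+@ from the inline comments (P1..P8 are the eight pels crossing the edge,
+@ P4|P5 the block boundary; sgn(x) is -1 for x < 0, else 0); the same
+@ decision is shared by the h/8/16 variants that follow:
+@   a0 = (2*P3 - 5*P4 + 5*P5 - 2*P6 + 4) >> 3
+@   a1 = |(2*P1 - 5*P2 + 5*P3 - 2*P4 + 4) >> 3|
+@   a2 = |(2*P5 - 5*P6 + 5*P7 - 2*P8 + 4) >> 3|
+@   a3 = min(a1, a2);  clip = |P4 - P5| >> 1
+@   if (clip != 0 && |a0| < pq && a3 < |a0|) {
+@       d   = min((5 * (|a0| - a3)) >> 3, clip);
+@       P4 -= d * (sgn(P4 - P5) - sgn(a0));
+@       P5 += d * (sgn(P4 - P5) - sgn(a0));
+@   }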
+function ff_vc1_v_loop_filter4_neon, export=1
+ sub r3, r0, r1, lsl #2
+ vldr d0, .Lcoeffs
+ vld1.32 {d1[0]}, [r0], r1 @ P5
+ vld1.32 {d2[0]}, [r3], r1 @ P1
+ vld1.32 {d3[0]}, [r3], r1 @ P2
+ vld1.32 {d4[0]}, [r0], r1 @ P6
+ vld1.32 {d5[0]}, [r3], r1 @ P3
+ vld1.32 {d6[0]}, [r0], r1 @ P7
+ vld1.32 {d7[0]}, [r3] @ P4
+ vld1.32 {d16[0]}, [r0] @ P8
+ vshll.u8 q9, d1, #1 @ 2*P5
+ vdup.16 d17, r2 @ pq
+ vshll.u8 q10, d2, #1 @ 2*P1
+ vmovl.u8 q11, d3 @ P2
+ vmovl.u8 q1, d4 @ P6
+ vmovl.u8 q12, d5 @ P3
+ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2
+ vmovl.u8 q11, d6 @ P7
+ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6
+ vshll.u8 q2, d5, #1 @ 2*P3
+ vmovl.u8 q3, d7 @ P4
+ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7
+ vmovl.u8 q11, d16 @ P8
+ vmla.i16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3
+ vmovl.u8 q12, d1 @ P5
+ vmls.i16 d4, d6, d0[1] @ 2*P3-5*P4
+ vmls.i16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8
+ vsub.i16 d1, d6, d24 @ P4-P5
+ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4
+ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5
+ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6
+ vabs.s16 d2, d1
+ vrshr.s16 d3, d18, #3
+ vrshr.s16 d5, d20, #3
+ vshr.s16 d2, d2, #1 @ clip
+ vrshr.s16 d4, d4, #3
+ vabs.s16 d3, d3 @ a2
+ vshr.s16 d1, d1, #8 @ clip_sign
+ vabs.s16 d5, d5 @ a1
+ vceq.i16 d7, d2, #0 @ test clip == 0
+ vabs.s16 d16, d4 @ a0
+ vshr.s16 d4, d4, #8 @ a0_sign
+ vcge.s16 d18, d5, d3 @ test a1 >= a2
+ vcge.s16 d17, d16, d17 @ test a0 >= pq
+ vbsl d18, d3, d5 @ a3
+ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign
+ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq
+ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 d5, d18, d16 @ test a3 >= a0
+ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
+ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0
+ vmov.32 r0, d4[1] @ move to gp reg
+ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ vcge.s16 d4, d0, d2
+ tst r0, #1
+ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
+ vbsl d4, d2, d0 @ FFMIN(d, clip)
+ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ vqmovun.s16 d0, q3
+ vqmovun.s16 d1, q12
+ vst1.32 {d0[0]}, [r3], r1
+ vst1.32 {d1[0]}, [r3]
+1: bx lr
+endfunc
+
+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of right block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
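+@
+@ (The vtrn sequence below transposes the block so that this horizontal-edge
+@ case can reuse the vertical filter's arithmetic unchanged.)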
+function ff_vc1_h_loop_filter4_neon, export=1
+ sub r3, r0, #4 @ where to start reading
+ vldr d0, .Lcoeffs
+ vld1.32 {d2}, [r3], r1
+ sub r0, r0, #1 @ where to start writing
+ vld1.32 {d4}, [r3], r1
+ vld1.32 {d3}, [r3], r1
+ vld1.32 {d5}, [r3]
+ vdup.16 d1, r2 @ pq
+ vtrn.8 q1, q2
+ vtrn.16 d2, d3 @ P1, P5, P3, P7
+ vtrn.16 d4, d5 @ P2, P6, P4, P8
+ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5
+ vmovl.u8 q8, d4 @ P2, P6
+ vmovl.u8 q9, d3 @ P3, P7
+ vmovl.u8 q2, d5 @ P4, P8
+ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6
+ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7
+ vmovl.u8 q1, d2 @ P1, P5
+ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
+ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
+ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later
+ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4
+ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5
+ vsub.i16 d3, d4, d2 @ P4-P5
+ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6
+ vrshr.s16 q3, q3, #3
+ vabs.s16 d5, d3
+ vshr.s16 d3, d3, #8 @ clip_sign
+ vrshr.s16 d16, d20, #3
+ vabs.s16 q3, q3 @ a1, a2
+ vshr.s16 d5, d5, #1 @ clip
+ vabs.s16 d17, d16 @ a0
+ vceq.i16 d18, d5, #0 @ test clip == 0
+ vshr.s16 d16, d16, #8 @ a0_sign
+ vcge.s16 d19, d6, d7 @ test a1 >= a2
+ vcge.s16 d1, d17, d1 @ test a0 >= pq
+ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign
+ vbsl d19, d7, d6 @ a3
+ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq
+ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 d6, d19, d17 @ test a3 >= a0
+ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
+ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0
+ vmov.32 r2, d3[1] @ move to gp reg
+ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ vcge.s16 d3, d0, d5
+ tst r2, #1
+ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
+ vbsl d3, d5, d0 @ FFMIN(d, clip)
+ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d0, q2
+ vst2.8 {d0[0], d1[0]}, [r0], r1
+ vst2.8 {d0[1], d1[1]}, [r0], r1
+ vst2.8 {d0[2], d1[2]}, [r0], r1
+ vst2.8 {d0[3], d1[3]}, [r0]
+1: bx lr
+endfunc
+
+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of lower block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter8_neon, export=1
+ sub r3, r0, r1, lsl #2
+ vldr d0, .Lcoeffs
+ vld1.32 {d1}, [r0 :64], r1 @ P5
+ vld1.32 {d2}, [r3 :64], r1 @ P1
+ vld1.32 {d3}, [r3 :64], r1 @ P2
+ vld1.32 {d4}, [r0 :64], r1 @ P6
+ vld1.32 {d5}, [r3 :64], r1 @ P3
+ vld1.32 {d6}, [r0 :64], r1 @ P7
+ vshll.u8 q8, d1, #1 @ 2*P5
+ vshll.u8 q9, d2, #1 @ 2*P1
+ vld1.32 {d7}, [r3 :64] @ P4
+ vmovl.u8 q1, d3 @ P2
+ vld1.32 {d20}, [r0 :64] @ P8
+ vmovl.u8 q11, d4 @ P6
+ vdup.16 q12, r2 @ pq
+ vmovl.u8 q13, d5 @ P3
+ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2
+ vmovl.u8 q1, d6 @ P7
+ vshll.u8 q2, d5, #1 @ 2*P3
+ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6
+ vmovl.u8 q3, d7 @ P4
+ vmovl.u8 q10, d20 @ P8
+ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7
+ vmovl.u8 q1, d1 @ P5
+ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3
+ vsub.i16 q13, q3, q1 @ P4-P5
+ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4
+ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8
+ vabs.s16 q10, q13
+ vshr.s16 q13, q13, #8 @ clip_sign
+ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4
+ vshr.s16 q10, q10, #1 @ clip
+ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5
+ vrshr.s16 q8, q8, #3
+ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6
+ vceq.i16 q11, q10, #0 @ test clip == 0
+ vrshr.s16 q9, q9, #3
+ vabs.s16 q8, q8 @ a2
+ vabs.s16 q9, q9 @ a1
+ vrshr.s16 q2, q2, #3
+ vcge.s16 q14, q9, q8 @ test a1 >= a2
+ vabs.s16 q15, q2 @ a0
+ vshr.s16 q2, q2, #8 @ a0_sign
+ vbsl q14, q8, q9 @ a3
+ vcge.s16 q8, q15, q12 @ test a0 >= pq
+ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign
+ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 q12, q14, q15 @ test a3 >= a0
+ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq
+ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
+ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0
+ vshl.i64 q11, q9, #16
+ vmov.32 r0, d18[1] @ move to gp reg
+ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ vmov.32 r2, d19[1]
+ vshr.s64 q9, q11, #48
+ vcge.s16 q11, q0, q10
+ vorr q8, q8, q9
+ and r0, r0, r2
+ vbsl q11, q10, q0 @ FFMIN(d, clip)
+ tst r0, #1
+ bne 1f @ none of the 8 pixel pairs should be updated in this case
+ vbic q0, q11, q8 @ set each d to zero if it should not be filtered
+ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ vqmovun.s16 d0, q3
+ vqmovun.s16 d1, q1
+ vst1.32 {d0}, [r3 :64], r1
+ vst1.32 {d1}, [r3 :64]
+1: bx lr
+endfunc
+
+.align 5
+.Lcoeffs:
+.quad 0x00050002
+
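The literal above packs both filter multipliers into halfword lanes of d0, so a single vldr feeds every x2 and x5 scalar multiply in these routines; in C terms (illustrative only):

#include <stdint.h>

static const uint64_t coeffs = 0x00050002;           /* .Lcoeffs, loaded by vldr d0 */
static const uint16_t lane0  = 0x00050002 & 0xffff;  /* 2: vmla/vmls ..., d0[0]     */
static const uint16_t lane1  = 0x00050002 >> 16;     /* 5: vmla/vmls ..., d0[1]     */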
+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of right block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter8_neon, export=1
+ push {lr}
+ sub r3, r0, #4 @ where to start reading
+ vldr d0, .Lcoeffs
+ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
+ sub r0, r0, #1 @ where to start writing
+ vld1.32 {d4}, [r3], r1
+ add r12, r0, r1, lsl #2
+ vld1.32 {d3}, [r3], r1
+ vld1.32 {d5}, [r3], r1
+ vld1.32 {d6}, [r3], r1
+ vld1.32 {d16}, [r3], r1
+ vld1.32 {d7}, [r3], r1
+ vld1.32 {d17}, [r3]
+ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
+ vdup.16 q9, r2 @ pq
+ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
+ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
+ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
+ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
+ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
+ vtrn.32 d2, d6 @ P1, P5
+ vtrn.32 d4, d16 @ P2, P6
+ vtrn.32 d3, d7 @ P3, P7
+ vtrn.32 d5, d17 @ P4, P8
+ vshll.u8 q10, d2, #1 @ 2*P1
+ vshll.u8 q11, d6, #1 @ 2*P5
+ vmovl.u8 q12, d4 @ P2
+ vmovl.u8 q13, d16 @ P6
+ vmovl.u8 q14, d3 @ P3
+ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2
+ vmovl.u8 q12, d7 @ P7
+ vshll.u8 q1, d3, #1 @ 2*P3
+ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6
+ vmovl.u8 q2, d5 @ P4
+ vmovl.u8 q8, d17 @ P8
+ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7
+ vmovl.u8 q3, d6 @ P5
+ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3
+ vsub.i16 q12, q2, q3 @ P4-P5
+ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4
+ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8
+ vabs.s16 q8, q12
+ vshr.s16 q12, q12, #8 @ clip_sign
+ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4
+ vshr.s16 q8, q8, #1 @ clip
+ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5
+ vrshr.s16 q11, q11, #3
+ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6
+ vceq.i16 q13, q8, #0 @ test clip == 0
+ vrshr.s16 q10, q10, #3
+ vabs.s16 q11, q11 @ a2
+ vabs.s16 q10, q10 @ a1
+ vrshr.s16 q1, q1, #3
+ vcge.s16 q14, q10, q11 @ test a1 >= a2
+ vabs.s16 q15, q1 @ a0
+ vshr.s16 q1, q1, #8 @ a0_sign
+ vbsl q14, q11, q10 @ a3
+ vcge.s16 q9, q15, q9 @ test a0 >= pq
+ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign
+ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 q11, q14, q15 @ test a3 >= a0
+ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq
+ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
+ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0
+ vmov.32 r2, d20[1] @ move to gp reg
+ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
+ vmov.32 r3, d21[1]
+ vcge.s16 q10, q0, q8
+ and r14, r2, r3
+ vbsl q10, q8, q0 @ FFMIN(d, clip)
+ tst r14, #1
+ bne 2f @ none of the 8 pixel pairs should be updated in this case
+ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+ vqmovun.s16 d1, q3
+ vqmovun.s16 d0, q2
+ tst r2, #1
+ bne 1f @ none of the first 4 pixel pairs should be updated if so
+ vst2.8 {d0[0], d1[0]}, [r0], r1
+ vst2.8 {d0[1], d1[1]}, [r0], r1
+ vst2.8 {d0[2], d1[2]}, [r0], r1
+ vst2.8 {d0[3], d1[3]}, [r0]
+1: tst r3, #1
+ bne 2f @ none of the second 4 pixel pairs should be updated if so
+ vst2.8 {d0[4], d1[4]}, [r12], r1
+ vst2.8 {d0[5], d1[5]}, [r12], r1
+ vst2.8 {d0[6], d1[6]}, [r12], r1
+ vst2.8 {d0[7], d1[7]}, [r12]
+2: pop {pc}
+endfunc
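These exported routines would be plugged into the decoder's DSP table; a plausible sketch of the init hook (member names follow libavcodec/vc1dsp.h; the exact init function, and the 4-pel variants defined earlier in this file, are assumptions here rather than something shown in this hunk):

#include "vc1dsp.h"   /* VC1DSPContext; path assumes the libavcodec tree */

static void sketch_vc1dsp_init_neon(VC1DSPContext *dsp)
{
    /* assumed hookup; prototypes come from the asm exports above */
    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
}

The loop-filter callers then invoke whichever implementation the init selected.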
+
+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of lower block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter16_neon, export=1
+ vpush {d8-d15}
+ sub r3, r0, r1, lsl #2
+ vldr d0, .Lcoeffs
+ vld1.64 {q1}, [r0 :128], r1 @ P5
+ vld1.64 {q2}, [r3 :128], r1 @ P1
+ vld1.64 {q3}, [r3 :128], r1 @ P2
+ vld1.64 {q4}, [r0 :128], r1 @ P6
+ vld1.64 {q5}, [r3 :128], r1 @ P3
+ vld1.64 {q6}, [r0 :128], r1 @ P7
+ vshll.u8 q7, d2, #1 @ 2*P5[0..7]
+ vshll.u8 q8, d4, #1 @ 2*P1[0..7]
+ vld1.64 {q9}, [r3 :128] @ P4
+ vmovl.u8 q10, d6 @ P2[0..7]
+ vld1.64 {q11}, [r0 :128] @ P8
+ vmovl.u8 q12, d8 @ P6[0..7]
+ vdup.16 q13, r2 @ pq
+ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
+ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7]
+ vshll.u8 q10, d3, #1 @ 2*P5[8..15]
+ vmovl.u8 q3, d7 @ P2[8..15]
+ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
+ vmovl.u8 q4, d9 @ P6[8..15]
+ vmovl.u8 q14, d10 @ P3[0..7]
+ vmovl.u8 q15, d12 @ P7[0..7]
+ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15]
+ vshll.u8 q3, d10, #1 @ 2*P3[0..7]
+ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]
+ vmovl.u8 q6, d13 @ P7[8..15]
+ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+ vmovl.u8 q14, d18 @ P4[0..7]
+ vmovl.u8 q9, d19 @ P4[8..15]
+ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+ vmovl.u8 q15, d11 @ P3[8..15]
+ vshll.u8 q5, d11, #1 @ 2*P3[8..15]
+ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7]
+ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+ vmovl.u8 q15, d22 @ P8[0..7]
+ vmovl.u8 q11, d23 @ P8[8..15]
+ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+ vmovl.u8 q6, d2 @ P5[0..7]
+ vmovl.u8 q1, d3 @ P5[8..15]
+ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15]
+ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7]
+ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+ vrshr.s16 q8, q8, #3
+ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+ vrshr.s16 q7, q7, #3
+ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+ vabs.s16 q11, q15
+ vabs.s16 q8, q8 @ a1[0..7]
+ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+ vshr.s16 q15, q15, #8 @ clip_sign[0..7]
+ vrshr.s16 q2, q2, #3
+ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+ vabs.s16 q7, q7 @ a2[0..7]
+ vrshr.s16 q10, q10, #3
+ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15]
+ vshr.s16 q11, q11, #1 @ clip[0..7]
+ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7]
+ vabs.s16 q2, q2 @ a1[8..15]
+ vrshr.s16 q3, q3, #3
+ vabs.s16 q10, q10 @ a2[8..15]
+ vbsl q4, q7, q8 @ a3[0..7]
+ vabs.s16 q7, q12
+ vshr.s16 q8, q12, #8 @ clip_sign[8..15]
+ vrshr.s16 q5, q5, #3
+ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8..15]
+ vshr.s16 q7, q7, #1 @ clip[8..15]
+ vbsl q12, q10, q2 @ a3[8..15]
+ vabs.s16 q2, q3 @ a0[0..7]
+ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0
+ vshr.s16 q3, q3, #8 @ a0_sign[0..7]
+ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7]
+ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq
+ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq
+ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7]
+ vabs.s16 q4, q5 @ a0[8..15]
+ vshr.s16 q5, q5, #8 @ a0_sign[8..15]
+ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq
+ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15]
+ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0
+ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+ vmov.32 r0, d4[1] @ move to gp reg
+ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq
+ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vmov.32 r2, d5[1]
+ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15]
+ vshl.i64 q2, q2, #16
+ vcge.s16 q12, q15, q11
+ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+ vshr.s64 q2, q2, #48
+ and r0, r0, r2
+ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7])
+ vshl.i64 q11, q4, #16
+ vmov.32 r2, d8[1]
+ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+ vorr q2, q10, q2
+ vmov.32 r12, d9[1]
+ vshr.s64 q4, q11, #48
+ vcge.s16 q10, q0, q7
+ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+ vorr q4, q8, q4
+ and r2, r2, r12
+ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15])
+ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
+ and r0, r0, r2
+ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+ tst r0, #1
+ bne 1f @ none of the 16 pixel pairs should be updated in this case
+ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
+ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
+ vqmovun.s16 d4, q14
+ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
+ vqmovun.s16 d0, q6
+ vqmovun.s16 d5, q9
+ vqmovun.s16 d1, q1
+ vst1.64 {q2}, [r3 :128], r1
+ vst1.64 {q0}, [r3 :128]
+1: vpop {d8-d15}
+ bx lr
+endfunc
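A point worth noting in the 16-pair version: the per-lane "don't filter" masks (clip == 0 || a0 >= pq || a3 >= a0) are reduced into general-purpose registers (the vmov/and/tst sequence) so that a fully quiescent edge skips the stores, while a partially filtered edge relies on vbic having zeroed d in the masked lanes. Roughly, in C (illustrative):

static void writeback_sketch(const int dont_filter[16], int d[16],
                             int P4[16], int P5[16])
{
    int all_masked = 1;
    for (int i = 0; i < 16; i++)
        all_masked &= dont_filter[i];  /* asm: vmov to r0/r2/r12, and, tst */
    if (all_masked)
        return;                        /* bne 1f: no vst1 issued at all    */
    for (int i = 0; i < 16; i++) {
        if (dont_filter[i])
            d[i] = 0;                  /* asm: vbic                        */
        P4[i] -= d[i];                 /* sign handling as in the scalar   */
        P5[i] += d[i];                 /* sketch further up                */
    }
}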
+
+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
+@ On entry:
+@ r0 -> top-left pel of right block
+@ r1 = row stride, bytes
+@ r2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter16_neon, export=1
+ push {r4-r6,lr}
+ vpush {d8-d15}
+ sub r3, r0, #4 @ where to start reading
+ vldr d0, .Lcoeffs
+ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
+ sub r0, r0, #1 @ where to start writing
+ vld1.32 {d3}, [r3], r1
+ add r4, r0, r1, lsl #2
+ vld1.32 {d10}, [r3], r1
+ vld1.32 {d11}, [r3], r1
+ vld1.32 {d16}, [r3], r1
+ vld1.32 {d4}, [r3], r1
+ vld1.32 {d8}, [r3], r1
+ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
+ vld1.32 {d14}, [r3], r1
+ vld1.32 {d5}, [r3], r1
+ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
+ vld1.32 {d6}, [r3], r1
+ vld1.32 {d12}, [r3], r1
+ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
+ vld1.32 {d13}, [r3], r1
+ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
+ vld1.32 {d1}, [r3], r1
+ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
+ vld1.32 {d7}, [r3], r1
+ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
+ vld1.32 {d9}, [r3], r1
+ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
+ vld1.32 {d15}, [r3]
+ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
+ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
+ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
+ vdup.16 q9, r2 @ pq
+ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
+ vtrn.32 d2, d16 @ P1[0..7], P5[0..7]
+ vtrn.16 d5, d12 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
+ vtrn.16 d6, d13 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
+ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
+ vtrn.32 d3, d4 @ P2[0..7], P6[0..7]
+ vshll.u8 q10, d2, #1 @ 2*P1[0..7]
+ vtrn.32 d10, d8 @ P3[0..7], P7[0..7]
+ vshll.u8 q11, d16, #1 @ 2*P5[0..7]
+ vtrn.32 d11, d14 @ P4[0..7], P8[0..7]
+ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
+ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
+ vmovl.u8 q1, d3 @ P2[0..7]
+ vmovl.u8 q12, d4 @ P6[0..7]
+ vtrn.32 d5, d1 @ P1[8..15], P5[8..15]
+ vtrn.32 d6, d7 @ P2[8..15], P6[8..15]
+ vtrn.32 d12, d9 @ P3[8..15], P7[8..15]
+ vtrn.32 d13, d15 @ P4[8..15], P8[8..15]
+ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]
+ vmovl.u8 q1, d10 @ P3[0..7]
+ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
+ vshll.u8 q13, d1, #1 @ 2*P5[8..15]
+ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
+ vmovl.u8 q14, d6 @ P2[8..15]
+ vmovl.u8 q3, d7 @ P6[8..15]
+ vmovl.u8 q15, d8 @ P7[0..7]
+ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+ vmovl.u8 q1, d12 @ P3[8..15]
+ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15]
+ vmovl.u8 q4, d9 @ P7[8..15]
+ vshll.u8 q14, d10, #1 @ 2*P3[0..7]
+ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15]
+ vmovl.u8 q5, d11 @ P4[0..7]
+ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+ vshll.u8 q15, d12, #1 @ 2*P3[8..15]
+ vmovl.u8 q6, d13 @ P4[8..15]
+ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+ vmovl.u8 q1, d14 @ P8[0..7]
+ vmovl.u8 q7, d15 @ P8[8..15]
+ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+ vmovl.u8 q4, d16 @ P5[0..7]
+ vmovl.u8 q8, d1 @ P5[8..15]
+ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7]
+ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15]
+ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7]
+ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+ vrshr.s16 q10, q10, #3
+ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15]
+ vrshr.s16 q11, q11, #3
+ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+ vrshr.s16 q2, q2, #3
+ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+ vabs.s16 q10, q10 @ a1[0..7]
+ vrshr.s16 q13, q13, #3
+ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+ vabs.s16 q3, q11 @ a2[0..7]
+ vabs.s16 q2, q2 @ a1[8..15]
+ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+ vabs.s16 q11, q1
+ vabs.s16 q12, q13 @ a2[8..15]
+ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7]
+ vshr.s16 q1, q1, #8 @ clip_sign[0..7]
+ vrshr.s16 q15, q15, #3
+ vshr.s16 q11, q11, #1 @ clip[0..7]
+ vrshr.s16 q14, q14, #3
+ vbsl q13, q3, q10 @ a3[0..7]
+ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8..15]
+ vabs.s16 q10, q15 @ a0[8..15]
+ vshr.s16 q15, q15, #8 @ a0_sign[8..15]
+ vbsl q3, q12, q2 @ a3[8..15]
+ vabs.s16 q2, q14 @ a0[0..7]
+ vabs.s16 q12, q7
+ vshr.s16 q7, q7, #8 @ clip_sign[8..15]
+ vshr.s16 q14, q14, #8 @ a0_sign[0..7]
+ vshr.s16 q12, q12, #1 @ clip[8..15]
+ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15]
+ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15]
+ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq
+ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq
+ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7]
+ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7]
+ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0
+ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq
+ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0
+ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq
+ vcge.s16 q14, q13, q12
+ vmov.32 r2, d4[1] @ move to gp reg
+ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+ vmov.32 r3, d5[1]
+ vcge.s16 q2, q0, q11
+ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15])
+ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7])
+ vmov.32 r5, d6[1]
+ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+ vmov.32 r6, d7[1]
+ and r12, r2, r3
+ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
+ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
+ and r14, r5, r6
+ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
+ and r12, r12, r14
+ vqmovun.s16 d4, q6
+ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
+ tst r12, #1
+ bne 4f @ none of the 16 pixel pairs should be updated in this case
+ vqmovun.s16 d2, q5
+ vqmovun.s16 d3, q4
+ vqmovun.s16 d5, q8
+ tst r2, #1
+ bne 1f
+ vst2.8 {d2[0], d3[0]}, [r0], r1
+ vst2.8 {d2[1], d3[1]}, [r0], r1
+ vst2.8 {d2[2], d3[2]}, [r0], r1
+ vst2.8 {d2[3], d3[3]}, [r0]
+1: add r0, r4, r1, lsl #2
+ tst r3, #1
+ bne 2f
+ vst2.8 {d2[4], d3[4]}, [r4], r1
+ vst2.8 {d2[5], d3[5]}, [r4], r1
+ vst2.8 {d2[6], d3[6]}, [r4], r1
+ vst2.8 {d2[7], d3[7]}, [r4]
+2: add r4, r0, r1, lsl #2
+ tst r5, #1
+ bne 3f
+ vst2.8 {d4[0], d5[0]}, [r0], r1
+ vst2.8 {d4[1], d5[1]}, [r0], r1
+ vst2.8 {d4[2], d5[2]}, [r0], r1
+ vst2.8 {d4[3], d5[3]}, [r0]
+3: tst r6, #1
+ bne 4f
+ vst2.8 {d4[4], d5[4]}, [r4], r1
+ vst2.8 {d4[5], d5[5]}, [r4], r1
+ vst2.8 {d4[6], d5[6]}, [r4], r1
+ vst2.8 {d4[7], d5[7]}, [r4]
+4: vpop {d8-d15}
+ pop {r4-r6,pc}
+endfunc
+
+@ Copy at most the specified number of bytes from source to destination buffer,
+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
+@ On entry:
+@ r0 -> source buffer
+@ r1 = max number of bytes to copy
+@ r2 -> destination buffer, optimally 8-byte aligned
+@ On exit:
+@ r0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+ @ Offset by 48 to screen out cases that are too short for us to handle,
+ @ and also make it easy to test for loop termination, or to determine
+ @ whether we need an odd number of half-iterations of the loop.
+ subs r1, r1, #48
+ bmi 90f
+
+ @ Set up useful constants
+ vmov.i32 q0, #0x3000000
+ vmov.i32 q1, #0x30000
+
+ tst r1, #16
+ bne 1f
+
+ vld1.8 {q8, q9}, [r0]!
+ vbic q12, q8, q0
+ vext.8 q13, q8, q9, #1
+ vext.8 q14, q8, q9, #2
+ vext.8 q15, q8, q9, #3
+ veor q12, q12, q1
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ add r1, r1, #16
+ b 3f
+
+1: vld1.8 {q10, q11}, [r0]!
+ vbic q12, q10, q0
+ vext.8 q13, q10, q11, #1
+ vext.8 q14, q10, q11, #2
+ vext.8 q15, q10, q11, #3
+ veor q12, q12, q1
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ @ Drop through...
+2: vmov q8, q11
+ vld1.8 {q9}, [r0]!
+ vorr q13, q12, q13
+ vorr q15, q14, q15
+ vbic q12, q8, q0
+ vorr q3, q13, q15
+ vext.8 q13, q8, q9, #1
+ vext.8 q14, q8, q9, #2
+ vext.8 q15, q8, q9, #3
+ veor q12, q12, q1
+ vorr d6, d6, d7
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ vmov r3, r12, d6
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ orrs r3, r3, r12
+ bne 90f
+ vst1.64 {q10}, [r2]!
+3: vmov q10, q9
+ vld1.8 {q11}, [r0]!
+ vorr q13, q12, q13
+ vorr q15, q14, q15
+ vbic q12, q10, q0
+ vorr q3, q13, q15
+ vext.8 q13, q10, q11, #1
+ vext.8 q14, q10, q11, #2
+ vext.8 q15, q10, q11, #3
+ veor q12, q12, q1
+ vorr d6, d6, d7
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ vmov r3, r12, d6
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ orrs r3, r3, r12
+ bne 91f
+ vst1.64 {q8}, [r2]!
+ subs r1, r1, #32
+ bpl 2b
+
+90: add r0, r1, #48
+ bx lr
+
+91: sub r1, r1, #16
+ b 90b
+endfunc
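For reference, the scalar unescape this helper accelerates: VC-1 start-code emulation prevention inserts 0x03 after two zero bytes whenever the following byte is 0x00-0x03, and unescaping drops that byte again. The NEON code flags a little-endian 32-bit word w as a potential escape when (w & ~0x03000000) == 0x00030000 — exactly the byte pattern 00 00 03 0x with x <= 3 — checking all four byte alignments via vext, copying only 16-byte chunks proven escape-free, and returning the tail length for the caller to finish in scalar code. A C sketch of that scalar part (modelled on vc1_unescape_buffer(); illustrative):

#include <stdint.h>

static int vc1_unescape_sketch(const uint8_t *src, int size, uint8_t *dst)
{
    int dsize = 0;
    for (int i = 0; i < size; i++, src++) {
        if (i >= 2 && i < size - 1 &&
            src[0] == 0x03 && !src[-1] && !src[-2] && src[1] < 4)
            continue;                /* drop the escape byte of 00 00 03 0x */
        dst[dsize++] = *src;
    }
    return dsize;
}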
diff --git a/libavcodec/av1.h b/libavcodec/av1.h
index 951a18ecb2..0f99ae4829 100644
--- a/libavcodec/av1.h
+++ b/libavcodec/av1.h
@@ -114,13 +114,6 @@ enum {
AV1_WARP_MODEL_TRANSLATION = 1,
AV1_WARP_MODEL_ROTZOOM = 2,
AV1_WARP_MODEL_AFFINE = 3,
- AV1_WARP_PARAM_REDUCE_BITS = 6,
-
- AV1_DIV_LUT_BITS = 8,
- AV1_DIV_LUT_PREC_BITS = 14,
- AV1_DIV_LUT_NUM = 257,
-
- AV1_MAX_LOOP_FILTER = 63,
};
diff --git a/libavcodec/av1dec.c b/libavcodec/av1dec.c
index a3301f454f..a75d6744d3 100644
--- a/libavcodec/av1dec.c
+++ b/libavcodec/av1dec.c
@@ -28,34 +28,6 @@
#include "internal.h"
#include "profiles.h"
-/**< same with Div_Lut defined in spec 7.11.3.7 */
-static const uint16_t div_lut[AV1_DIV_LUT_NUM] = {
- 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
- 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
- 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
- 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
- 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
- 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
- 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
- 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
- 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
- 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
- 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
- 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
- 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
- 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
- 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
- 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
- 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
- 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
- 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
- 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
- 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
- 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
- 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
- 8240, 8224, 8208, 8192
-};
-
static uint32_t inverse_recenter(int r, uint32_t v)
{
if (v > 2 * r)
@@ -125,70 +97,6 @@ static void read_global_param(AV1DecContext *s, int type, int ref, int idx)
-mx, mx + 1, r) << prec_diff) + round;
}
-static uint64_t round_two(uint64_t x, uint16_t n)
-{
- if (n == 0)
- return x;
- return ((x + ((uint64_t)1 << (n - 1))) >> n);
-}
-
-static int64_t round_two_signed(int64_t x, uint16_t n)
-{
- return ((x<0) ? -((int64_t)round_two(-x, n)) : (int64_t)round_two(x, n));
-}
-
-/**
- * Resolve divisor process.
- * see spec 7.11.3.7
- */
-static int16_t resolve_divisor(uint32_t d, uint16_t *shift)
-{
- int32_t e, f;
-
- *shift = av_log2(d);
- e = d - (1 << (*shift));
- if (*shift > AV1_DIV_LUT_BITS)
- f = round_two(e, *shift - AV1_DIV_LUT_BITS);
- else
- f = e << (AV1_DIV_LUT_BITS - (*shift));
-
- *shift += AV1_DIV_LUT_PREC_BITS;
-
- return div_lut[f];
-}
-
-/**
- * check if global motion params is valid.
- * see spec 7.11.3.6
- */
-static uint8_t get_shear_params_valid(AV1DecContext *s, int idx)
-{
- int16_t alpha, beta, gamma, delta, divf, divs;
- int64_t v, w;
- int32_t *param = &s->cur_frame.gm_params[idx][0];
- if (param[2] < 0)
- return 0;
-
- alpha = av_clip_int16(param[2] - (1 << AV1_WARPEDMODEL_PREC_BITS));
- beta = av_clip_int16(param[3]);
- divf = resolve_divisor(abs(param[2]), &divs);
- v = (int64_t)param[4] * (1 << AV1_WARPEDMODEL_PREC_BITS);
- w = (int64_t)param[3] * param[4];
- gamma = av_clip_int16((int)round_two_signed((v * divf), divs));
- delta = av_clip_int16(param[5] - (int)round_two_signed((w * divf), divs) - (1 << AV1_WARPEDMODEL_PREC_BITS));
-
- alpha = round_two_signed(alpha, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
- beta = round_two_signed(beta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
- gamma = round_two_signed(gamma, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
- delta = round_two_signed(delta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS;
-
- if ((4 * abs(alpha) + 7 * abs(beta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS) ||
- (4 * abs(gamma) + 4 * abs(delta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS))
- return 0;
-
- return 1;
-}
-
/**
 * update gm type/params; since cbs already implemented part of this function,
 * we don't need to fully implement the spec.
@@ -236,9 +144,6 @@ static void global_motion_params(AV1DecContext *s)
read_global_param(s, type, ref, 0);
read_global_param(s, type, ref, 1);
}
- if (type <= AV1_WARP_MODEL_AFFINE) {
- s->cur_frame.gm_invalid[ref] = !get_shear_params_valid(s, ref);
- }
}
}
@@ -604,9 +509,6 @@ static int av1_frame_ref(AVCodecContext *avctx, AV1Frame *dst, const AV1Frame *s
dst->spatial_id = src->spatial_id;
dst->temporal_id = src->temporal_id;
- memcpy(dst->gm_invalid,
- src->gm_invalid,
- AV1_NUM_REF_FRAMES * sizeof(uint8_t));
memcpy(dst->gm_type,
src->gm_type,
AV1_NUM_REF_FRAMES * sizeof(uint8_t));
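To see the removed fixed-point scheme in action, take d = 3 in resolve_divisor(): shift = av_log2(3) = 1 and e = 3 - 2 = 1, so f = 1 << (8 - 1) = 128 and div_lut[128] = 10923 (about 2^22 / 384); shift then becomes 1 + 14 = 15, turning the division into a multiply-and-shift (worked example; the real callers additionally round via round_two_signed()):

#include <stdint.h>

static int64_t div3_sketch(int64_t x)
{
    /* resolve_divisor(3, &divs) -> divf = 10923, divs = 15 */
    return (x * 10923) >> 15;        /* 10923 / 32768 ~= 1/3 */
}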
diff --git a/libavcodec/av1dec.h b/libavcodec/av1dec.h
index 4e140588b9..248a68750f 100644
--- a/libavcodec/av1dec.h
+++ b/libavcodec/av1dec.h
@@ -42,7 +42,6 @@ typedef struct AV1Frame {
int temporal_id;
int spatial_id;
- uint8_t gm_invalid[AV1_NUM_REF_FRAMES];
uint8_t gm_type[AV1_NUM_REF_FRAMES];
int32_t gm_params[AV1_NUM_REF_FRAMES][6];
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 8a71c04230..53644506e5 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -2595,6 +2595,17 @@ typedef struct AVHWAccel {
* that avctx->hwaccel_priv_data is invalid.
*/
int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
+
+ /**
+     * Called if parsing fails.
+     *
+     * An error has occurred; end_frame will not be called, and
+     * start_frame & decode_slice may or may not have been called.
+     * Optional.
+ *
+ * @param avctx the codec context
+ */
+ void (*abort_frame)(AVCodecContext *avctx);
} AVHWAccel;
/**
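A hypothetical call site for the new hook (a sketch only; where the decoders actually dispatch it is not shown in this hunk):

#include "avcodec.h"   /* path assumes the libavcodec tree */

static int decode_frame_tail_sketch(AVCodecContext *avctx, int ret)
{
    /* error between start_frame and end_frame: let the hwaccel clean up */
    if (ret < 0 && avctx->hwaccel && avctx->hwaccel->abort_frame)
        avctx->hwaccel->abort_frame(avctx);
    return ret;
}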
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index f04017d4b4..5efd24e9c3 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -869,7 +869,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
binkb_init_bundles(c);
ref_start = frame->data[plane_idx];
- ref_end = frame->data[plane_idx] + ((bh - 1) * frame->linesize[plane_idx] + bw - 1) * 8;
+ ref_end = frame->data[plane_idx] + (bh * frame->linesize[plane_idx] + bw) * 8;
for (i = 0; i < 64; i++)
coordmap[i] = (i & 7) + (i >> 3) * stride;
@@ -925,7 +925,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
ref = dst + xoff + yoff * stride;
- if (ref < ref_start || ref > ref_end) {
+ if (ref < ref_start || ref + 8*stride > ref_end) {
av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
} else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
c->put_pixels_tab(dst, ref, stride, 8);
@@ -941,7 +941,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
ref = dst + xoff + yoff * stride;
- if (ref < ref_start || ref > ref_end) {
+ if (ref < ref_start || ref + 8 * stride > ref_end) {
av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
} else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
c->put_pixels_tab(dst, ref, stride, 8);
@@ -973,7 +973,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
xoff = binkb_get_value(c, BINKB_SRC_X_OFF);
yoff = binkb_get_value(c, BINKB_SRC_Y_OFF) + ybias;
ref = dst + xoff + yoff * stride;
- if (ref < ref_start || ref > ref_end) {
+ if (ref < ref_start || ref + 8 * stride > ref_end) {
av_log(c->avctx, AV_LOG_WARNING, "Reference block is out of bounds\n");
} else if (ref + 8*stride < dst || ref >= dst + 8*stride) {
c->put_pixels_tab(dst, ref, stride, 8);
@@ -1086,7 +1086,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
for (bx = 0; bx < bw; bx++, dst += 8, prev += 8) {
blk = get_value(c, BINK_SRC_BLOCK_TYPES);
// 16x16 block type on odd line means part of the already decoded block, so skip it
- if (((by & 1) || (bx & 1)) && blk == SCALED_BLOCK) {
+ if ((by & 1) && blk == SCALED_BLOCK) {
bx++;
dst += 8;
prev += 8;
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 38d06b2842..bbf5d70560 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -44,6 +44,10 @@ typedef struct CABACContext{
const uint8_t *bytestream_start;
const uint8_t *bytestream;
const uint8_t *bytestream_end;
+ struct {
+ uint16_t bits;
+ uint16_t range;
+ } by22;
}CABACContext;
int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
diff --git a/libavcodec/cbs_av1_syntax_template.c b/libavcodec/cbs_av1_syntax_template.c
index d98d3d42de..6fe6e9a4f3 100644
--- a/libavcodec/cbs_av1_syntax_template.c
+++ b/libavcodec/cbs_av1_syntax_template.c
@@ -355,7 +355,7 @@ static int FUNC(set_frame_refs)(CodedBitstreamContext *ctx, RWContext *rw,
AV1_REF_FRAME_ALTREF2, AV1_REF_FRAME_ALTREF
};
int8_t ref_frame_idx[AV1_REFS_PER_FRAME], used_frame[AV1_NUM_REF_FRAMES];
- int16_t shifted_order_hints[AV1_NUM_REF_FRAMES];
+ int8_t shifted_order_hints[AV1_NUM_REF_FRAMES];
int cur_frame_hint, latest_order_hint, earliest_order_hint, ref;
int i, j;
diff --git a/libavcodec/cdgraphics.c b/libavcodec/cdgraphics.c
index b452baa7d8..263459d0f2 100644
--- a/libavcodec/cdgraphics.c
+++ b/libavcodec/cdgraphics.c
@@ -239,7 +239,7 @@ static void cdg_scroll(CDGraphicsContext *cc, uint8_t *data,
for (y = FFMAX(0, vinc); y < FFMIN(CDG_FULL_HEIGHT + vinc, CDG_FULL_HEIGHT); y++)
memcpy(out + FFMAX(0, hinc) + stride * y,
in + FFMAX(0, hinc) - hinc + (y - vinc) * stride,
- FFABS(stride) - FFABS(hinc));
+ FFMIN(stride + hinc, stride));
if (vinc > 0)
cdg_fill_wrapper(0, 0, out,
diff --git a/libavcodec/cfhd.c b/libavcodec/cfhd.c
index b61d1e7222..6f13207cc1 100644
--- a/libavcodec/cfhd.c
+++ b/libavcodec/cfhd.c
@@ -838,7 +838,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
const uint16_t q = s->quantisation;
for (i = 0; i < run; i++) {
- *coeff_data |= coeff * 256U;
+ *coeff_data |= coeff * 256;
*coeff_data++ *= q;
}
} else {
@@ -869,7 +869,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
const uint16_t q = s->quantisation;
for (i = 0; i < run; i++) {
- *coeff_data |= coeff * 256U;
+ *coeff_data |= coeff * 256;
*coeff_data++ *= q;
}
} else {
diff --git a/libavcodec/codec.h b/libavcodec/codec.h
index 50a22f6e3c..5acf572ef4 100644
--- a/libavcodec/codec.h
+++ b/libavcodec/codec.h
@@ -367,6 +367,17 @@ const AVCodec *av_codec_iterate(void **opaque);
*/
AVCodec *avcodec_find_decoder(enum AVCodecID id);
+/**
+ * Find a registered decoder with a matching codec ID and pix_fmt.
+ * A decoder with pix_fmt set to NULL will match any fmt.
+ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
+ *
+ * @param id AVCodecID of the requested decoder
+ * @param fmt AVPixelForma that msut be supported by decoder
+ * @param fmt AVPixelFormat that must be supported by the decoder
+ */
+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
+
/**
* Find a registered decoder with the specified name.
*
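A hypothetical use of the new lookup (sketch; AV_PIX_FMT_DRM_PRIME is just an illustrative format):

#include "codec.h"              /* path assumes the libavcodec tree */
#include "libavutil/pixfmt.h"

static AVCodec *pick_hevc_decoder_sketch(void)
{
    AVCodec *c = avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC,
                                                    AV_PIX_FMT_DRM_PRIME);
    if (!c)
        c = avcodec_find_decoder(AV_CODEC_ID_HEVC);  /* generic fallback */
    return c;
}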
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
index cf7fc2c56c..b9999cde01 100644
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@@ -1432,8 +1432,8 @@ static void global_mv(DiracContext *s, DiracBlock *block, int x, int y, int ref)
int *c = s->globalmc[ref].perspective;
int64_t m = (1<<ep) - (c[0]*(int64_t)x + c[1]*(int64_t)y);
- int64_t mx = m * (uint64_t)((A[0][0] * (int64_t)x + A[0][1]*(int64_t)y) + (1LL<<ez) * b[0]);
- int64_t my = m * (uint64_t)((A[1][0] * (int64_t)x + A[1][1]*(int64_t)y) + (1LL<<ez) * b[1]);
+ int64_t mx = m * (int64_t)((A[0][0] * (int64_t)x + A[0][1]*(int64_t)y) + (1LL<<ez) * b[0]);
+ int64_t my = m * (int64_t)((A[1][0] * (int64_t)x + A[1][1]*(int64_t)y) + (1LL<<ez) * b[1]);
block->u.mv[ref][0] = (mx + (1<<(ez+ep))) >> (ez+ep);
block->u.mv[ref][1] = (my + (1<<(ez+ep))) >> (ez+ep);
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index 31ae147433..2461c51727 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -1353,7 +1353,7 @@ static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
av_freep(&ctx->qmatrix_c16);
av_freep(&ctx->qmatrix_l16);
- if (ctx->thread[1]) {
+ if (avctx->active_thread_type == FF_THREAD_SLICE) {
for (i = 1; i < avctx->thread_count; i++)
av_freep(&ctx->thread[i]);
}
diff --git a/libavcodec/dstdec.c b/libavcodec/dstdec.c
index 819a037c69..84d19b91aa 100644
--- a/libavcodec/dstdec.c
+++ b/libavcodec/dstdec.c
@@ -215,7 +215,7 @@ static uint8_t prob_dst_x_bit(int c)
return (ff_reverse[c & 127] >> 1) + 1;
}
-static int build_filter(int16_t table[DST_MAX_ELEMENTS][16][256], const Table *fsets)
+static void build_filter(int16_t table[DST_MAX_ELEMENTS][16][256], const Table *fsets)
{
int i, j, k, l;
@@ -226,17 +226,14 @@ static int build_filter(int16_t table[DST_MAX_ELEMENTS][16][256], const Table *f
int total = av_clip(length - j * 8, 0, 8);
for (k = 0; k < 256; k++) {
- int64_t v = 0;
+ int v = 0;
for (l = 0; l < total; l++)
v += (((k >> l) & 1) * 2 - 1) * fsets->coeff[i][j * 8 + l];
- if ((int16_t)v != v)
- return AVERROR_INVALIDDATA;
table[i][j][k] = v;
}
}
}
- return 0;
}
static int decode_frame(AVCodecContext *avctx, void *data,
@@ -332,9 +329,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
return AVERROR_INVALIDDATA;
ac_init(ac, gb);
- ret = build_filter(s->filter, &s->fsets);
- if (ret < 0)
- return ret;
+ build_filter(s->filter, &s->fsets);
memset(s->status, 0xAA, sizeof(s->status));
memset(dsd, 0, frame->nb_samples * 4 * channels);
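The table being rebuilt here maps every possible byte k of the bit-packed DSD input to a precomputed partial filter sum, with bit l of k choosing the sign of tap l: (((k >> l) & 1) * 2 - 1) is +1 for a set bit and -1 for a clear bit. A worked micro-example (illustrative):

#include <stdint.h>

/* total = 3 taps, coeff = {c0, c1, c2}: k = 0b101 -> +c0 - c1 + c2 */
static int16_t taps3_sketch(const int16_t c[3])
{
    return (int16_t)(c[0] - c[1] + c[2]);
}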
diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
index 8a912bf6c1..c30b57799c 100644
--- a/libavcodec/dxva2_av1.c
+++ b/libavcodec/dxva2_av1.c
@@ -139,7 +139,7 @@ static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *c
pp->frame_refs[i].Index = ref_frame->buf[0] ? ref_idx : 0xFF;
/* Global Motion */
- pp->frame_refs[i].wminvalid = h->cur_frame.gm_invalid[AV1_REF_FRAME_LAST + i];
+ pp->frame_refs[i].wminvalid = (h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i] == AV1_WARP_MODEL_IDENTITY);
pp->frame_refs[i].wmtype = h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i];
for (j = 0; j < 6; ++j) {
pp->frame_refs[i].wmmat[j] = h->cur_frame.gm_params[AV1_REF_FRAME_LAST + i][j];
diff --git a/libavcodec/eac3dec.c b/libavcodec/eac3dec.c
index 33b9c88bb2..3a5c7989b9 100644
--- a/libavcodec/eac3dec.c
+++ b/libavcodec/eac3dec.c
@@ -139,11 +139,9 @@ static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
// spx_noise_blend and spx_signal_blend are both FP.23
nscale *= 1.0 / (1<<23);
sscale *= 1.0 / (1<<23);
- if (nscale < -1.0)
- nscale = -1.0;
#endif
for (i = 0; i < s->spx_band_sizes[bnd]; i++) {
- UINTFLOAT noise = (INTFLOAT)(nscale * (int32_t)av_lfg_get(&s->dith_state));
+ float noise = nscale * (int32_t)av_lfg_get(&s->dith_state);
s->transform_coeffs[ch][bin] *= sscale;
s->transform_coeffs[ch][bin++] += noise;
}
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index 85f40a5c54..197ba6fc6e 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -61,7 +61,7 @@ static av_cold int tgq_decode_init(AVCodecContext *avctx)
return 0;
}
-static int tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
+static void tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
{
uint8_t *perm = s->scantable.permutated;
int i, j, value;
@@ -69,8 +69,6 @@ static int tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
for (i = 1; i < 64;) {
switch (show_bits(gb, 3)) {
case 4:
- if (i >= 63)
- return AVERROR_INVALIDDATA;
block[perm[i++]] = 0;
case 0:
block[perm[i++]] = 0;
@@ -80,8 +78,6 @@ static int tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
case 1:
skip_bits(gb, 2);
value = get_bits(gb, 6);
- if (value > 64 - i)
- return AVERROR_INVALIDDATA;
for (j = 0; j < value; j++)
block[perm[i++]] = 0;
break;
@@ -109,7 +105,6 @@ static int tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
}
}
block[0] += 128 << 4;
- return 0;
}
static void tgq_idct_put_mb(TgqContext *s, int16_t (*block)[64], AVFrame *frame,
@@ -169,11 +164,8 @@ static int tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
if (ret < 0)
return ret;
- for (i = 0; i < 6; i++) {
- int ret = tgq_decode_block(s, s->block[i], &gb);
- if (ret < 0)
- return ret;
- }
+ for (i = 0; i < 6; i++)
+ tgq_decode_block(s, s->block[i], &gb);
tgq_idct_put_mb(s, s->block, frame, mb_x, mb_y);
bytestream2_skip(&s->gb, mode);
} else {
diff --git a/libavcodec/escape124.c b/libavcodec/escape124.c
index 58278ecaa7..94c2a961e6 100644
--- a/libavcodec/escape124.c
+++ b/libavcodec/escape124.c
@@ -88,6 +88,11 @@ static CodeBook unpack_codebook(GetBitContext* gb, unsigned depth,
unsigned i, j;
CodeBook cb = { 0 };
+ if (size >= INT_MAX / 34 || get_bits_left(gb) < size * 34)
+ return cb;
+
+ if (size >= INT_MAX / sizeof(MacroBlock))
+ return cb;
cb.blocks = av_malloc(size ? size * sizeof(MacroBlock) : 1);
if (!cb.blocks)
return cb;
@@ -221,7 +226,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
// represent a lower bound of the space needed for skipped superblocks. Non
// skipped SBs need more space.
if (get_bits_left(&gb) < 64 + s->num_superblocks * 23LL / 4320)
- return AVERROR_INVALIDDATA;
+ return -1;
frame_flags = get_bits_long(&gb, 32);
frame_size = get_bits_long(&gb, 32);
@@ -272,14 +277,9 @@ static int escape124_decode_frame(AVCodecContext *avctx,
}
av_freep(&s->codebooks[i].blocks);
- if (cb_size >= INT_MAX / 34 || get_bits_left(&gb) < (int)cb_size * 34)
- return AVERROR_INVALIDDATA;
-
- if (cb_size >= INT_MAX / sizeof(MacroBlock))
- return AVERROR_INVALIDDATA;
s->codebooks[i] = unpack_codebook(&gb, cb_depth, cb_size);
if (!s->codebooks[i].blocks)
- return AVERROR(ENOMEM);
+ return -1;
}
}
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index e3effad2e7..49ba7fd6de 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -1240,8 +1240,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
td->ysize = FFMIN(s->tile_attr.ySize, s->ydelta - tile_y * s->tile_attr.ySize);
td->xsize = FFMIN(s->tile_attr.xSize, s->xdelta - tile_x * s->tile_attr.xSize);
- if (td->xsize * (uint64_t)s->current_channel_offset > INT_MAX ||
- av_image_check_size2(td->xsize, td->ysize, s->avctx->max_pixels, AV_PIX_FMT_NONE, 0, s->avctx) < 0)
+ if (td->xsize * (uint64_t)s->current_channel_offset > INT_MAX)
return AVERROR_INVALIDDATA;
td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompressed size of one line */
@@ -1265,8 +1264,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
td->ysize = FFMIN(s->scan_lines_per_block, s->ymax - line + 1); /* s->ydelta - line ?? */
td->xsize = s->xdelta;
- if (td->xsize * (uint64_t)s->current_channel_offset > INT_MAX ||
- av_image_check_size2(td->xsize, td->ysize, s->avctx->max_pixels, AV_PIX_FMT_NONE, 0, s->avctx) < 0)
+ if (td->xsize * (uint64_t)s->current_channel_offset > INT_MAX)
return AVERROR_INVALIDDATA;
td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompressed size of one line */
@@ -1831,8 +1829,8 @@ static int decode_header(EXRContext *s, AVFrame *frame)
dx = bytestream2_get_le32(gb);
dy = bytestream2_get_le32(gb);
- s->w = (unsigned)dx - sx + 1;
- s->h = (unsigned)dy - sy + 1;
+ s->w = dx - sx + 1;
+ s->h = dy - sy + 1;
continue;
} else if ((var_size = check_header_variable(s, "lineOrder",
@@ -1947,12 +1945,9 @@ static int decode_header(EXRContext *s, AVFrame *frame)
"preview", 16)) >= 0) {
uint32_t pw = bytestream2_get_le32(gb);
uint32_t ph = bytestream2_get_le32(gb);
- uint64_t psize = pw * ph;
- if (psize > INT64_MAX / 4)
- return AVERROR_INVALIDDATA;
- psize *= 4;
+ int64_t psize = 4LL * pw * ph;
- if ((int64_t)psize >= bytestream2_get_bytes_left(gb))
+ if (psize >= bytestream2_get_bytes_left(gb))
return AVERROR_INVALIDDATA;
bytestream2_skip(gb, psize);
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 82a9c20853..8516fef5d7 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -166,34 +166,24 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
RangeCoder *c = &fs->c;
uint8_t state[CONTEXT_SIZE];
unsigned ps, i, context_count;
- int sx, sy, sw, sh;
-
memset(state, 128, sizeof(state));
- sx = get_symbol(c, state, 0);
- sy = get_symbol(c, state, 0);
- sw = get_symbol(c, state, 0) + 1U;
- sh = get_symbol(c, state, 0) + 1U;
av_assert0(f->version > 2);
-
- if (sx < 0 || sy < 0 || sw <= 0 || sh <= 0)
- return AVERROR_INVALIDDATA;
- if (sx > f->num_h_slices - sw || sy > f->num_v_slices - sh)
- return AVERROR_INVALIDDATA;
-
- fs->slice_x = sx * (int64_t)f->width / f->num_h_slices;
- fs->slice_y = sy * (int64_t)f->height / f->num_v_slices;
- fs->slice_width = (sx + sw) * (int64_t)f->width / f->num_h_slices - fs->slice_x;
- fs->slice_height = (sy + sh) * (int64_t)f->height / f->num_v_slices - fs->slice_y;
-
- av_assert0((unsigned)fs->slice_width <= f->width &&
- (unsigned)fs->slice_height <= f->height);
- av_assert0 ( (unsigned)fs->slice_x + (uint64_t)fs->slice_width <= f->width
- && (unsigned)fs->slice_y + (uint64_t)fs->slice_height <= f->height);
-
- if (fs->ac == AC_GOLOMB_RICE && fs->slice_width >= (1<<23))
- return AVERROR_INVALIDDATA;
+ fs->slice_x = get_symbol(c, state, 0) * f->width ;
+ fs->slice_y = get_symbol(c, state, 0) * f->height;
+ fs->slice_width = (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
+ fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
+
+ fs->slice_x /= f->num_h_slices;
+ fs->slice_y /= f->num_v_slices;
+ fs->slice_width = fs->slice_width /f->num_h_slices - fs->slice_x;
+ fs->slice_height = fs->slice_height/f->num_v_slices - fs->slice_y;
+ if ((unsigned)fs->slice_width > f->width || (unsigned)fs->slice_height > f->height)
+ return -1;
+ if ( (unsigned)fs->slice_x + (uint64_t)fs->slice_width > f->width
+ || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
+ return -1;
for (i = 0; i < f->plane_count; i++) {
PlaneContext * const p = &fs->plane[i];
@@ -308,11 +298,8 @@ static int decode_slice(AVCodecContext *c, void *arg)
}
if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
return ret;
- if (f->cur->key_frame || fs->slice_reset_contexts) {
+ if (f->cur->key_frame || fs->slice_reset_contexts)
ff_ffv1_clear_slice_state(f, fs);
- } else if (fs->slice_damaged) {
- return AVERROR_INVALIDDATA;
- }
width = fs->slice_width;
height = fs->slice_height;
@@ -475,11 +462,6 @@ static int read_extra_header(FFV1Context *f)
return AVERROR_INVALIDDATA;
}
- if (f->num_h_slices > MAX_SLICES / f->num_v_slices) {
- av_log(f->avctx, AV_LOG_ERROR, "slice count unsupported\n");
- return AVERROR_PATCHWELCOME;
- }
-
f->quant_table_count = get_symbol(c, state, 0);
if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES || !f->quant_table_count) {
av_log(f->avctx, AV_LOG_ERROR, "quant table count %d is invalid\n", f->quant_table_count);
@@ -782,25 +764,21 @@ static int read_header(FFV1Context *f)
fs->slice_damaged = 0;
if (f->version == 2) {
- int sx = get_symbol(c, state, 0);
- int sy = get_symbol(c, state, 0);
- int sw = get_symbol(c, state, 0) + 1U;
- int sh = get_symbol(c, state, 0) + 1U;
-
- if (sx < 0 || sy < 0 || sw <= 0 || sh <= 0)
+ fs->slice_x = get_symbol(c, state, 0) * f->width ;
+ fs->slice_y = get_symbol(c, state, 0) * f->height;
+ fs->slice_width = (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
+ fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
+
+ fs->slice_x /= f->num_h_slices;
+ fs->slice_y /= f->num_v_slices;
+ fs->slice_width = fs->slice_width / f->num_h_slices - fs->slice_x;
+ fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
+ if ((unsigned)fs->slice_width > f->width ||
+ (unsigned)fs->slice_height > f->height)
return AVERROR_INVALIDDATA;
- if (sx > f->num_h_slices - sw || sy > f->num_v_slices - sh)
+ if ( (unsigned)fs->slice_x + (uint64_t)fs->slice_width > f->width
+ || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
return AVERROR_INVALIDDATA;
-
- fs->slice_x = sx * (int64_t)f->width / f->num_h_slices;
- fs->slice_y = sy * (int64_t)f->height / f->num_v_slices;
- fs->slice_width = (sx + sw) * (int64_t)f->width / f->num_h_slices - fs->slice_x;
- fs->slice_height = (sy + sh) * (int64_t)f->height / f->num_v_slices - fs->slice_y;
-
- av_assert0((unsigned)fs->slice_width <= f->width &&
- (unsigned)fs->slice_height <= f->height);
- av_assert0 ( (unsigned)fs->slice_x + (uint64_t)fs->slice_width <= f->width
- && (unsigned)fs->slice_y + (uint64_t)fs->slice_height <= f->height);
}
for (i = 0; i < f->plane_count; i++) {
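The restored slice geometry is easiest to check with numbers (worked example; the values are illustrative): with f->width = 1920, f->num_h_slices = 4 and coded symbols sx = 1, sw = 2 (the + 1 already applied),

int slice_x     = 1 * 1920 / 4;              /* 480                        */
int slice_width = (1 + 2) * 1920 / 4 - 480;  /* 960: slices tile the width */

so neighbouring slices tile [0, width) without gaps or overlap.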
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index 9b1d65e825..0b1d176ba1 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -93,11 +93,11 @@ static av_always_inline int RENAME(decode_line)(FFV1Context *s, int w,
run_count--;
}
} else {
- while (run_count > 1 && w-x > 1) {
- sample[1][x] = RENAME(predict)(sample[1] + x, sample[0] + x);
- x++;
- run_count--;
- }
+ while (run_count > 1 && w-x > 1) {
+ sample[1][x] = RENAME(predict)(sample[1] + x, sample[0] + x);
+ x++;
+ run_count--;
+ }
}
run_count--;
if (run_count < 0) {
diff --git a/libavcodec/fmvc.c b/libavcodec/fmvc.c
index 82a2822e07..3701b0849b 100644
--- a/libavcodec/fmvc.c
+++ b/libavcodec/fmvc.c
@@ -401,17 +401,20 @@ static int decode_frame(AVCodecContext *avctx, void *data,
PutByteContext *pb = &s->pb;
AVFrame *frame = data;
int ret, y, x;
- int key_frame;
if (avpkt->size < 8)
return AVERROR_INVALIDDATA;
+ if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+ return ret;
+
bytestream2_init(gb, avpkt->data, avpkt->size);
bytestream2_skip(gb, 2);
- key_frame = !!bytestream2_get_le16(gb);
+ frame->key_frame = !!bytestream2_get_le16(gb);
+ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
- if (key_frame) {
+ if (frame->key_frame) {
const uint8_t *src;
unsigned type, size;
uint8_t *dst;
@@ -431,12 +434,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
return AVERROR_PATCHWELCOME;
}
- if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
- return ret;
-
- frame->key_frame = 1;
- frame->pict_type = AV_PICTURE_TYPE_I;
-
src = s->buffer;
dst = frame->data[0] + (avctx->height - 1) * frame->linesize[0];
for (y = 0; y < avctx->height; y++) {
@@ -517,12 +514,6 @@ static int decode_frame(AVCodecContext *avctx, void *data,
dst = &rect[block_h * s->stride];
}
- if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
- return ret;
-
- frame->key_frame = 0;
- frame->pict_type = AV_PICTURE_TYPE_P;
-
ssrc = s->buffer;
ddst = frame->data[0] + (avctx->height - 1) * frame->linesize[0];
for (y = 0; y < avctx->height; y++) {
diff --git a/libavcodec/g729_parser.c b/libavcodec/g729_parser.c
index ef08b48bf3..010f688104 100644
--- a/libavcodec/g729_parser.c
+++ b/libavcodec/g729_parser.c
@@ -49,9 +49,6 @@ static int g729_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
s->block_size = (avctx->bit_rate < 8000) ? G729D_6K4_BLOCK_SIZE : G729_8K_BLOCK_SIZE;
if (avctx->codec_id == AV_CODEC_ID_ACELP_KELVIN)
s->block_size++;
- // channels > 2 is invalid, we pass the packet on unchanged
- if (avctx->channels > 2)
- s->block_size = 0;
s->block_size *= avctx->channels;
s->duration = avctx->frame_size;
}
diff --git a/libavcodec/h263.h b/libavcodec/h263.h
index f5355e7ced..491f2e0aac 100644
--- a/libavcodec/h263.h
+++ b/libavcodec/h263.h
@@ -100,16 +100,15 @@ void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code);
static inline int h263_get_motion_length(int val, int f_code){
- int bit_size, code, sign;
+ int l, bit_size, code;
if (val == 0) {
return ff_mvtab[0][1];
} else {
bit_size = f_code - 1;
/* modulo encoding */
- val = sign_extend(val, 6 + bit_size);
- sign = val >> 31;
- val = (val ^ sign) - sign; /* val = FFABS(val) */
+ l= INT_BIT - 6 - bit_size;
+ val = (val<<l)>>l;
val--;
code = (val >> bit_size) + 1;
diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index f6f7789cef..e8b4d83e6e 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -545,8 +545,6 @@ retry:
avctx->has_b_frames = !s->low_delay;
if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4) {
- if (s->pict_type != AV_PICTURE_TYPE_B && s->mb_num/2 > get_bits_left(&s->gb))
- return AVERROR_INVALIDDATA;
if (ff_mpeg4_workaround_bugs(avctx) == 1)
goto retry;
if (s->studio_profile != (s->idsp.idct == NULL))
diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c
index bf3ab88da4..485f47d36e 100644
--- a/libavcodec/h264dec.c
+++ b/libavcodec/h264dec.c
@@ -654,10 +654,6 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
avpriv_request_sample(avctx, "data partitioning");
break;
case H264_NAL_SEI:
- if (h->setup_finished) {
- avpriv_request_sample(avctx, "Late SEI");
- break;
- }
ret = ff_h264_sei_decode(&h->sei, &nal->gb, &h->ps, avctx);
h->has_recovery_point = h->has_recovery_point || h->sei.recovery_point.recovery_frame_cnt != -1;
if (avctx->debug & FF_DEBUG_GREEN_MD)
diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h
new file mode 100644
index 0000000000..72cbba0953
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v1.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 rps;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 num_active_dpb_entries;
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 num_rps_poc_st_curr_before;
+ __u8 num_rps_poc_st_curr_after;
+ __u8 num_rps_poc_lt_curr;
+
+ __u8 padding;
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+#endif
diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h
new file mode 100644
index 0000000000..7cbbbf055f
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v2.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 rps;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 padding[5];
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u8 num_active_dpb_entries;
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
+/*
+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
+ * the number of bits to skip in the
+ * slice segment header.
+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
+ * to before syntax element "slice_temporal_mvp_enabled_flag".
+ * If IDR, the skipped bits are just "pic_output_flag"
+ * (separate_colour_plane_flag is not supported).
+ */
+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+#endif
diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h
new file mode 100644
index 0000000000..4e35bd583d
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v3.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 flags;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 padding[5];
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u8 num_active_dpb_entries;
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
+/*
+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
+ * the number of bits to skip in the
+ * slice segment header.
+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
+ * to before syntax element "slice_temporal_mvp_enabled_flag".
+ * If IDR, the skipped bits are just "pic_output_flag"
+ * (separate_colour_plane_flag is not supported).
+ */
+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
+
+#endif
diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
new file mode 100644
index 0000000000..c02fdbe5a8
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v4.h
@@ -0,0 +1,524 @@
+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Video for Linux Two controls header file
+ *
+ * Copyright (C) 1999-2012 the contributors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Alternatively you can redistribute this file under the terms of the
+ * BSD license as stated below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The contents of this header were split off from videodev2.h. All control
+ * definitions should be added to this header, which is included by
+ * videodev2.h.
+ */
+
+#ifndef AVCODEC_HEVC_CTRLS_V4_H
+#define AVCODEC_HEVC_CTRLS_V4_H
+
+#include <linux/const.h>
+#include <linux/types.h>
+
+#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS
+#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */
+#endif
+#ifndef V4L2_CID_CODEC_STATELESS_BASE
+#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900)
+#endif
+
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
+#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403)
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404)
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405)
+#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406)
+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
+
+enum v4l2_stateless_hevc_decode_mode {
+ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_stateless_hevc_start_code {
+ V4L2_STATELESS_HEVC_START_CODE_NONE,
+ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+};
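+/*
+ * Illustrative usage, not part of the imported header (video_fd is a
+ * placeholder for an opened decoder device node): a client selects these
+ * with VIDIOC_S_EXT_CTRLS, e.g.
+ *
+ *	struct v4l2_ext_control ctrl = {
+ *		.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
+ *		.value = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+ *	};
+ *	struct v4l2_ext_controls ctrls = { .count = 1, .controls = &ctrl };
+ *	ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &ctrls);
+ */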
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/**
+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
+ *
+ * @video_parameter_set_id: specifies the value of the
+ * vps_video_parameter_set_id of the active VPS
+ * @seq_parameter_set_id: provides an identifier for the SPS for
+ * reference by other syntax elements
+ * @pic_width_in_luma_samples: specifies the width of each decoded picture
+ * in units of luma samples
+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
+ * in units of luma samples
+ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
+ * samples of the luma array
+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
+ * samples of the chroma arrays
+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
+ * the variable MaxPicOrderCntLsb
+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
+ * required size of the decoded picture
+ * buffer for the codec video sequence
+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
+ * that can precede any picture in decoding order and follow it in output order
+ * @sps_max_latency_increase_plus1: when not equal to 0, used to compute the
+ * value of the SpsMaxLatencyPictures array
+ * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the minimum
+ * luma coding block size
+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
+ * the maximum and minimum luma
+ * coding block size
+ * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the minimum luma
+ * transform block size
+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
+ * the maximum and minimum luma
+ * transform block size
+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
+ * depth for transform units of
+ * coding units coded in inter
+ * prediction mode
+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
+ * depth for transform units of
+ * coding units coded in intra
+ * prediction mode
+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
+ * bits used to represent each of PCM sample
+ * values of the luma component
+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
+ * of bits used to represent each of PCM
+ * sample values of the chroma components
+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
+ * minimum size of coding blocks
+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
+ * the maximum and minimum size of
+ * coding blocks
+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
+ * syntax structures included in the SPS
+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
+ * reference pictures that are specified in the SPS
+ * @chroma_format_idc: specifies the chroma sampling
+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
+ * of temporal sub-layers
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_sps {
+ __u8 video_parameter_set_id;
+ __u8 seq_parameter_set_id;
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u8 reserved[6];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+/**
+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
+ *
+ * @pic_parameter_set_id: identifies the PPS for reference by other
+ * syntax elements
+ * @num_extra_slice_header_bits: specifies the number of extra slice header
+ * bits that are present in the slice header RBSP
+ * for coded pictures referring to the PPS.
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
+ * inferred value of num_ref_idx_l0_active_minus1
+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
+ * inferred value of num_ref_idx_l1_active_minus1
+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQpY for
+ * each slice referring to the PPS
+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
+ * tree block size and the minimum luma coding block
+ * size of coding units that convey cu_qp_delta_abs
+ * and cu_qp_delta_sign_flag
+ * @pps_cb_qp_offset: specifies the offset to the luma quantization parameter for Cb
+ * @pps_cr_qp_offset: specifies the offset to the luma quantization parameter for Cr
+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
+ * partitioning the picture
+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
+ * the picture
+ * @column_width_minus1: this value plus 1 specifies the width of each tile column in
+ * units of coding tree blocks
+ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
+ * units of coding tree blocks
+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
+ * beta divided by 2
+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
+ * divided by 2
+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
+ * the variable Log2ParMrgLevel
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_pps {
+ __u8 pic_parameter_set_id;
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+ __u8 reserved;
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
+
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+/**
+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
+ *
+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
+ * @flags: long term flag for the reference frame
+ * @field_pic: whether the reference is a field picture or a frame.
+ * @reserved: padding field. Should be zeroed by applications.
+ * @pic_order_cnt_val: the picture order count of the reference picture.
+ */
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 flags;
+ __u8 field_pic;
+ __u16 reserved;
+ __s32 pic_order_cnt_val;
+};
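+/*
+ * Note: per the V4L2 stateless decoder interface, @timestamp is expected to
+ * match the struct v4l2_buffer timestamp of the capture buffer holding the
+ * reference picture, converted with v4l2_timeval_to_ns(); this is how DPB
+ * entries are tied back to previously decoded buffers.
+ */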
+
+/**
+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
+ *
+ * @delta_luma_weight_l0: the difference of the weighting factor applied
+ * to the luma prediction value for list 0
+ * @luma_offset_l0: the additive offset applied to the luma prediction value
+ * for list 0
+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
+ * to the chroma prediction values for list 0
+ * @chroma_offset_l0: the difference of the additive offset applied to
+ * the chroma prediction values for list 0
+ * @delta_luma_weight_l1: the difference of the weighting factor applied
+ * to the luma prediction value for list 1
+ * @luma_offset_l1: the additive offset applied to the luma prediction value
+ * for list 1
+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
+ * to the chroma prediction values for list 1
+ * @chroma_offset_l1: the difference of the additive offset applied to
+ * the chroma prediction values for list 1
+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
+ * all luma weighting factors
+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
+ * of the denominator for all chroma
+ * weighting factors
+ */
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+/**
+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
+ *
+ * This control is a dynamically sized 1-dimensional array; the
+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
+ *
+ * @bit_size: size (in bits) of the current slice data
+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
+ * elements in the slice header.
+ * @nal_unit_type: specifies the NAL unit type of the current slice
+ * @nuh_temporal_id_plus1: this value minus 1 specifies a temporal identifier for the NAL unit
+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
+ * @colour_plane_id: specifies the colour plane associated with the current slice
+ * @slice_pic_order_cnt: specifies the picture order count
+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
+ * reference index for reference picture list 0
+ * that may be used to decode the slice
+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
+ * reference index for reference picture list 1
+ * that may be used to decode the slice
+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
+ * for temporal motion vector prediction
+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
+ * motion vector prediction candidates supported in
+ * the slice subtracted from 5
+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
+ * blocks in the slice
+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
+ * @slice_act_y_qp_offset: screen content extension parameters
+ * @slice_act_cb_qp_offset: screen content extension parameters
+ * @slice_act_cr_qp_offset: screen content extension parameters
+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
+ * more fields
+ * @reserved0: padding field. Should be zeroed by applications.
+ * @slice_segment_addr: specifies the address of the first coding tree block in
+ * the slice segment
+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ * pictures set included in the SPS
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ * pictures set included in the SPS
+ * @pred_weight_table: the prediction weight coefficients for inter-picture
+ * prediction
+ * @reserved1: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_byte_offset;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __s32 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ __u8 reserved0[3];
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u16 short_term_ref_pic_set_size;
+ __u16 long_term_ref_pic_set_size;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u8 reserved1[2];
+ __u64 flags;
+};
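+/*
+ * Unlike the v2/v3 variants of this struct, the entry point offsets are no
+ * longer embedded here; they are carried by the separate
+ * V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS dynamic-array control.
+ */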
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+/**
+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
+ *
+ * @pic_order_cnt_val: picture order count
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ * pictures set included in the SPS of the first slice
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ * pictures set included in the SPS of the first slice
+ * @num_active_dpb_entries: the number of entries in dpb
+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
+ * set that come before the current frame
+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
+ * set that come after the current frame
+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
+ * @poc_st_curr_before: provides the indices in the DPB array of the short-term
+ * references that come before the current frame
+ * @poc_st_curr_after: provides the indices in the DPB array of the short-term
+ * references that come after the current frame
+ * @poc_lt_curr: provides the indices in the DPB array of the long-term references
+ * @reserved: padding field. Should be zeroed by applications.
+ * @dpb: the decoded picture buffer, for meta-data about reference frames
+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u16 short_term_ref_pic_set_size;
+ __u16 long_term_ref_pic_set_size;
+ __u8 num_active_dpb_entries;
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 reserved[4];
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+/**
+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
+ *
+ * @scaling_list_4x4: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_8x8: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_16x16: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_32x32: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
+ * for transform coefficients. The values on each
+ * scaling list are expected in raster scan order.
+ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process
+ * for transform coefficients. The values on each
+ * scaling list are expected in raster scan order.
+ */
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
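+/*
+ * The array bounds follow the H.265 scaling-list dimensions: six lists
+ * (matrixId) of 16 or 64 coefficients for the 4x4, 8x8 and 16x16 sizes, two
+ * (intra/inter) for 32x32, with the DC coefficients of the 16x16 and 32x32
+ * lists carried separately.
+ */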
+
+#endif
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index a45cb6f0fb..6b9824088c 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -145,22 +145,11 @@ int i, j;
if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
for (i = 0; i < height; i++) {
- for (j = 0; j < width - 7; j+=8)
+ for (j = 0; j < width; j+=8)
AV_COPY64U(dst+j, src+j);
dst += stride_dst;
src += stride_src;
}
- if (width&7) {
- dst += ((width>>3)<<3) - stride_dst * height;
- src += ((width>>3)<<3) - stride_src * height;
- width &= 7;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++)
- dst[j] = src[j];
- dst += stride_dst;
- src += stride_src;
- }
- }
} else {
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+=16)
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
index 463d352055..7feff43c28 100644
--- a/libavcodec/hevc_parser.c
+++ b/libavcodec/hevc_parser.c
@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
avctx->profile = ps->sps->ptl.general_ptl.profile_idc;
avctx->level = ps->sps->ptl.general_ptl.level_idc;
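+    /* Derive AVChromaLocation from the SPS: for 4:2:0 (chroma_format_idc 1)
+     * the VUI chroma_sample_loc_type maps to AVChromaLocation offset by one,
+     * since AVCHROMA_LOC_UNSPECIFIED occupies value 0; 4:2:2 and 4:4:4 are
+     * treated as top-left sited. */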
+ if (ps->sps->chroma_format_idc == 1) {
+ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
+ ps->sps->vui.chroma_sample_loc_type_top_field + 1 :
+ AVCHROMA_LOC_LEFT;
+ }
+ else if (ps->sps->chroma_format_idc == 2 ||
+ ps->sps->chroma_format_idc == 3) {
+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
+ }
+ else {
+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
+ }
+
if (ps->vps->vps_timing_info_present_flag) {
num = ps->vps->vps_num_units_in_tick;
den = ps->vps->vps_time_scale;
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index 4f6d985ae6..eefae71275 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
if (!frame->rpl_buf)
goto fail;
- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
- if (!frame->tab_mvf_buf)
- goto fail;
- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
+ if (s->tab_mvf_pool) {
+ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+ if (!frame->tab_mvf_buf)
+ goto fail;
+ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
+ }
- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
- if (!frame->rpl_tab_buf)
- goto fail;
- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
- for (j = 0; j < frame->ctb_count; j++)
- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
+ if (s->rpl_tab_pool) {
+ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+ if (!frame->rpl_tab_buf)
+ goto fail;
+ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
+ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+ for (j = 0; j < frame->ctb_count; j++)
+ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
+ }
frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s)
int ctb_count = frame->ctb_count;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
int i;
+ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
return AVERROR_INVALIDDATA;
- for (i = ctb_addr_ts; i < ctb_count; i++)
- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+ if (frame->rpl_tab) {
+ for (i = ctb_addr_ts; i < ctb_count; i++)
+ frame->rpl_tab[i] = tab;
+ }
- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
+ frame->refPicList = tab->refPicList;
return 0;
}
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index 19d6d517f3..7b05b41441 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps)
ff_set_sar(avctx, sps->vui.sar);
+ if (sps->chroma_format_idc == 1) {
+ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
+ sps->vui.chroma_sample_loc_type_top_field + 1 :
+ AVCHROMA_LOC_LEFT;
+ }
+ else if (sps->chroma_format_idc == 2 ||
+ sps->chroma_format_idc == 3) {
+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
+ }
+ else {
+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
+ }
+
if (sps->vui.video_signal_type_present_flag)
avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
: AVCOL_RANGE_MPEG;
@@ -392,14 +405,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
CONFIG_HEVC_NVDEC_HWACCEL + \
+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
CONFIG_HEVC_VAAPI_HWACCEL + \
CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
+ CONFIG_HEVC_RPI4_8_HWACCEL + \
+ CONFIG_HEVC_RPI4_10_HWACCEL + \
CONFIG_HEVC_VDPAU_HWACCEL)
enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
switch (sps->pix_fmt) {
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
+#if CONFIG_HEVC_RPI4_8_HWACCEL
+ *fmt++ = AV_PIX_FMT_RPI4_8;
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -418,9 +437,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#endif
#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
*fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ *fmt++ = AV_PIX_FMT_DRM_PRIME;
#endif
break;
case AV_PIX_FMT_YUV420P10:
+#if CONFIG_HEVC_RPI4_10_HWACCEL
+ *fmt++ = AV_PIX_FMT_RPI4_10;
+#endif
#if CONFIG_HEVC_DXVA2_HWACCEL
*fmt++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -439,6 +464,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#endif
#if CONFIG_HEVC_NVDEC_HWACCEL
*fmt++ = AV_PIX_FMT_CUDA;
+#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ *fmt++ = AV_PIX_FMT_DRM_PRIME;
#endif
break;
case AV_PIX_FMT_YUV444P:
@@ -485,6 +513,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
if (!sps)
return 0;
+ // If hwaccel then we don't need all the s/w decode helper arrays
+ if (s->avctx->hwaccel) {
+ export_stream_params(s, sps);
+
+ s->avctx->pix_fmt = pix_fmt;
+ s->ps.sps = sps;
+ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
+ return 0;
+ }
+
ret = pic_arrays_init(s, sps);
if (ret < 0)
goto fail;
@@ -2901,11 +2939,13 @@ static int hevc_frame_start(HEVCContext *s)
((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
int ret;
- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
- memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
+ if (s->horizontal_bs) {
+ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
+ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
+ }
s->is_decoded = 0;
s->first_nal_type = s->nal_unit_type;
@@ -3327,7 +3367,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
s->ref = NULL;
ret = decode_nal_units(s, avpkt->data, avpkt->size);
if (ret < 0)
+ {
+ // Ensure that hwaccel knows this frame is over
+ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
+ s->avctx->hwaccel->abort_frame(s->avctx);
+ }
+
return ret;
+ }
if (avctx->hwaccel) {
if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
@@ -3338,7 +3385,7 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
}
} else {
/* verify the SEI checksum */
- if (avctx->err_recognition & AV_EF_CRCCHECK && s->ref && s->is_decoded &&
+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
s->sei.picture_hash.is_md5) {
ret = verify_md5(s, s->ref->frame);
if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
@@ -3370,15 +3417,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
if (ret < 0)
return ret;
- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
- if (!dst->tab_mvf_buf)
- goto fail;
- dst->tab_mvf = src->tab_mvf;
+ if (src->tab_mvf_buf) {
+ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+ if (!dst->tab_mvf_buf)
+ goto fail;
+ dst->tab_mvf = src->tab_mvf;
+ }
- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
- if (!dst->rpl_tab_buf)
- goto fail;
- dst->rpl_tab = src->rpl_tab;
+ if (src->rpl_tab_buf) {
+ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+ if (!dst->rpl_tab_buf)
+ goto fail;
+ dst->rpl_tab = src->rpl_tab;
+ }
dst->rpl_buf = av_buffer_ref(src->rpl_buf);
if (!dst->rpl_buf)
@@ -3697,6 +3748,15 @@ AVCodec ff_hevc_decoder = {
#if CONFIG_HEVC_NVDEC_HWACCEL
HWACCEL_NVDEC(hevc),
#endif
+#if CONFIG_HEVC_RPI4_8_HWACCEL
+ HWACCEL_RPI4_8(hevc),
+#endif
+#if CONFIG_HEVC_RPI4_10_HWACCEL
+ HWACCEL_RPI4_10(hevc),
+#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ HWACCEL_V4L2REQUEST(hevc),
+#endif
#if CONFIG_HEVC_VAAPI_HWACCEL
HWACCEL_VAAPI(hevc),
#endif
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 61425975cd..56cd9e605d 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -313,7 +313,7 @@ static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(src[x] + offset_table[(src[x] >> shift) & 31]);
+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
dst += stride_dst;
src += stride_src;
}
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index 8e54cf73f9..2277aadf75 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -39,6 +39,9 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
extern const AVHWAccel ff_hevc_dxva2_hwaccel;
extern const AVHWAccel ff_hevc_nvdec_hwaccel;
+extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
+extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
+extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
extern const AVHWAccel ff_hevc_vaapi_hwaccel;
extern const AVHWAccel ff_hevc_vdpau_hwaccel;
extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
index f421dc909f..f93283b893 100644
--- a/libavcodec/hwconfig.h
+++ b/libavcodec/hwconfig.h
@@ -24,6 +24,7 @@
#define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
+#define HWACCEL_CAP_MT_SAFE (1 << 1)
typedef struct AVCodecHWConfigInternal {
@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
#define HWACCEL_NVDEC(codec) \
HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
+#define HWACCEL_RPI4_8(codec) \
+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel)
+#define HWACCEL_RPI4_10(codec) \
+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel)
+#define HWACCEL_V4L2REQUEST(codec) \
+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
#define HWACCEL_VAAPI(codec) \
HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
#define HWACCEL_VDPAU(codec) \
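Reading of the hwconfig.h hunk above (based on the surrounding macro's upstream semantics, not stated in the patch): the first three HW_CONFIG_HWACCEL arguments select which AVCodecHWConfig methods are advertised - hw_device_ctx, hw_frames_ctx and ad-hoc respectively. So HWACCEL_RPI4_8 and HWACCEL_RPI4_10 register ad-hoc-only configurations with no device type, while HWACCEL_V4L2REQUEST is a device-based DRM configuration producing AV_PIX_FMT_DRM_PRIME frames.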
diff --git a/libavcodec/jpeglsdec.c b/libavcodec/jpeglsdec.c
index fe0b3c3c40..c4ffa81f7d 100644
--- a/libavcodec/jpeglsdec.c
+++ b/libavcodec/jpeglsdec.c
@@ -67,7 +67,7 @@ int ff_jpegls_decode_lse(MJpegDecodeContext *s)
s->t3 = get_bits(&s->gb, 16);
s->reset = get_bits(&s->gb, 16);
- if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+ if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
av_log(s->avctx, AV_LOG_DEBUG, "Coding parameters maxval:%d T1:%d T2:%d T3:%d reset:%d\n",
s->maxval, s->t1, s->t2, s->t3, s->reset);
}
@@ -96,7 +96,7 @@ int ff_jpegls_decode_lse(MJpegDecodeContext *s)
else
maxtab = 65530/wt - 1;
- if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+ if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
av_log(s->avctx, AV_LOG_DEBUG, "LSE palette %d tid:%d wt:%d maxtab:%d\n", id, tid, wt, maxtab);
}
if (maxtab >= 256) {
@@ -186,7 +186,7 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
if (RItype)
temp += state->N[Q] >> 1;
- for (k = 0; ((unsigned)state->N[Q] << k) < temp; k++)
+ for (k = 0; (state->N[Q] << k) < temp; k++)
;
#ifdef JLS_BROKEN
@@ -195,8 +195,6 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
#endif
ret = get_ur_golomb_jpegls(gb, k, state->limit - limit_add - 1,
state->qbpp);
- if (ret < 0)
- return -0x10000;
/* decode mapped error */
map = 0;
@@ -211,7 +209,7 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
ret = ret >> 1;
}
- if (FFABS(ret) > 0xFFFF)
+ if(FFABS(ret) > 0xFFFF)
return -0x10000;
/* update state */
state->A[Q] += FFABS(ret) - RItype;
@@ -478,19 +476,19 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
for (i = 0; i < s->height; i++) {
switch(s->xfrm) {
case 1:
- for (x = off; x + 2 < w; x += 3) {
+ for (x = off; x < w; x += 3) {
src[x ] += src[x+1] + 128;
src[x+2] += src[x+1] + 128;
}
break;
case 2:
- for (x = off; x + 2 < w; x += 3) {
+ for (x = off; x < w; x += 3) {
src[x ] += src[x+1] + 128;
src[x+2] += ((src[x ] + src[x+1])>>1) + 128;
}
break;
case 3:
- for (x = off; x + 2 < w; x += 3) {
+ for (x = off; x < w; x += 3) {
int g = src[x+0] - ((src[x+2]+src[x+1])>>2) + 64;
src[x+0] = src[x+2] + g + 128;
src[x+2] = src[x+1] + g + 128;
@@ -498,7 +496,7 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
}
break;
case 4:
- for (x = off; x + 2 < w; x += 3) {
+ for (x = off; x < w; x += 3) {
int r = src[x+0] - (( 359 * (src[x+2]-128) + 490) >> 8);
int g = src[x+0] - (( 88 * (src[x+1]-128) - 183 * (src[x+2]-128) + 30) >> 8);
int b = src[x+0] + ((454 * (src[x+1]-128) + 574) >> 8);
diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 1b08e9308e..d81e55cf4c 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -408,9 +408,6 @@ output_zeros:
if (zero_run) {
zero_run = 0;
i += esc_count;
- if (i > end - dst ||
- i >= src_end - src)
- return AVERROR_INVALIDDATA;
memcpy(dst, src, i);
dst += i;
l->zeros_rem = lag_calc_zero_run(src[i]);
diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c
index a9b57f526d..a9c983eaca 100644
--- a/libavcodec/libdav1d.c
+++ b/libavcodec/libdav1d.c
@@ -127,11 +127,7 @@ static av_cold int libdav1d_init(AVCodecContext *c)
{
Libdav1dContext *dav1d = c->priv_data;
Dav1dSettings s;
-#if FF_DAV1D_VERSION_AT_LEAST(6,0)
- int threads = c->thread_count;
-#else
int threads = (c->thread_count ? c->thread_count : av_cpu_count()) * 3 / 2;
-#endif
int res;
av_log(c, AV_LOG_INFO, "libdav1d %s\n", dav1d_version());
@@ -157,7 +153,7 @@ static av_cold int libdav1d_init(AVCodecContext *c)
s.n_threads = FFMAX(dav1d->frame_threads, dav1d->tile_threads);
else
s.n_threads = FFMIN(threads, DAV1D_MAX_THREADS);
- s.max_frame_delay = (c->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : 0;
+ s.max_frame_delay = (c->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : s.n_threads;
av_log(c, AV_LOG_DEBUG, "Using %d threads, %d max_frame_delay\n",
s.n_threads, s.max_frame_delay);
#else
@@ -248,10 +244,8 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
if (res < 0) {
if (res == AVERROR(EINVAL))
res = AVERROR_INVALIDDATA;
- if (res != AVERROR(EAGAIN)) {
- dav1d_data_unref(data);
+ if (res != AVERROR(EAGAIN))
return res;
- }
}
res = dav1d_get_picture(dav1d->c, p);
diff --git a/libavcodec/libkvazaar.c b/libavcodec/libkvazaar.c
index 7389265415..4432649853 100644
--- a/libavcodec/libkvazaar.c
+++ b/libavcodec/libkvazaar.c
@@ -210,19 +210,13 @@ static int libkvazaar_encode(AVCodecContext *avctx,
// Copy pixels from frame to input_pic.
{
- uint8_t *dst[4] = {
- input_pic->data[0],
- input_pic->data[1],
- input_pic->data[2],
- NULL,
- };
int dst_linesizes[4] = {
frame->width,
frame->width / 2,
frame->width / 2,
0
};
- av_image_copy(dst, dst_linesizes,
+ av_image_copy(input_pic->data, dst_linesizes,
(const uint8_t **)frame->data, frame->linesize,
frame->format, frame->width, frame->height);
}
diff --git a/libavcodec/libopenh264dec.c b/libavcodec/libopenh264dec.c
index dcd781dd84..c7aa7fa19c 100644
--- a/libavcodec/libopenh264dec.c
+++ b/libavcodec/libopenh264dec.c
@@ -91,8 +91,8 @@ static int svc_decode_frame(AVCodecContext *avctx, void *data,
{
SVCContext *s = avctx->priv_data;
SBufferInfo info = { 0 };
- uint8_t *ptrs[4] = { NULL };
- int ret, linesize[4];
+ uint8_t* ptrs[3];
+ int ret, linesize[3];
AVFrame *avframe = data;
DECODING_STATE state;
#if OPENH264_VER_AT_LEAST(1, 7)
@@ -140,7 +140,6 @@ static int svc_decode_frame(AVCodecContext *avctx, void *data,
linesize[0] = info.UsrData.sSystemBuffer.iStride[0];
linesize[1] = linesize[2] = info.UsrData.sSystemBuffer.iStride[1];
- linesize[3] = 0;
av_image_copy(avframe->data, avframe->linesize, (const uint8_t **) ptrs, linesize, avctx->pix_fmt, avctx->width, avctx->height);
avframe->pts = info.uiOutYuvTimeStamp;
diff --git a/libavcodec/libuavs3d.c b/libavcodec/libuavs3d.c
index 59b50a2843..be03da39e2 100644
--- a/libavcodec/libuavs3d.c
+++ b/libavcodec/libuavs3d.c
@@ -208,9 +208,7 @@ static int libuavs3d_decode_frame(AVCodecContext *avctx, void *data, int *got_fr
}
avctx->has_b_frames = !seqh->low_delay;
avctx->pix_fmt = seqh->bit_depth_internal == 8 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUV420P10LE;
- ret = ff_set_dimensions(avctx, seqh->horizontal_size, seqh->vertical_size);
- if (ret < 0)
- return ret;
+ ff_set_dimensions(avctx, seqh->horizontal_size, seqh->vertical_size);
h->got_seqhdr = 1;
if (seqh->colour_description) {
diff --git a/libavcodec/libxavs2.c b/libavcodec/libxavs2.c
index f33240f300..2a4a3e36bd 100644
--- a/libavcodec/libxavs2.c
+++ b/libavcodec/libxavs2.c
@@ -205,7 +205,7 @@ static int xavs2_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
ret = cae->api->encoder_encode(cae->encoder, &pic, &cae->packet);
if (ret) {
- av_log(avctx, AV_LOG_ERROR, "Encoding error occurred.\n");
+ av_log(avctx, AV_LOG_ERROR, "Encoding error occured.\n");
return AVERROR_EXTERNAL;
}
diff --git a/libavcodec/midivid.c b/libavcodec/midivid.c
index 3e6a9ca3d9..2200440e2c 100644
--- a/libavcodec/midivid.c
+++ b/libavcodec/midivid.c
@@ -202,7 +202,12 @@ static int decode_frame(AVCodecContext *avctx, void *data,
bytestream2_skip(gb, 8);
uncompressed = bytestream2_get_le32(gb);
- if (!uncompressed) {
+ if ((ret = ff_reget_buffer(avctx, s->frame, 0)) < 0)
+ return ret;
+
+ if (uncompressed) {
+ ret = decode_mvdv(s, avctx, frame);
+ } else {
av_fast_padded_malloc(&s->uncompressed, &s->uncompressed_size, 16LL * (avpkt->size - 12));
if (!s->uncompressed)
return AVERROR(ENOMEM);
@@ -211,13 +216,9 @@ static int decode_frame(AVCodecContext *avctx, void *data,
if (ret < 0)
return ret;
bytestream2_init(gb, s->uncompressed, ret);
+ ret = decode_mvdv(s, avctx, frame);
}
- if ((ret = ff_reget_buffer(avctx, s->frame, 0)) < 0)
- return ret;
-
- ret = decode_mvdv(s, avctx, frame);
-
if (ret < 0)
return ret;
key = ret;
diff --git a/libavcodec/mjpegbdec.c b/libavcodec/mjpegbdec.c
index 3fab4a66bc..19875a2ddb 100644
--- a/libavcodec/mjpegbdec.c
+++ b/libavcodec/mjpegbdec.c
@@ -57,7 +57,6 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
buf_end = buf + buf_size;
s->got_picture = 0;
s->adobe_transform = -1;
- s->buf_size = buf_size;
read_header:
/* reset on every SOI */
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 7135c95bda..afb117cfc6 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -1082,10 +1082,6 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int nb_components, int p
return AVERROR_INVALIDDATA;
if (s->v_max != 1 || s->h_max != 1 || !s->lossless)
return AVERROR_INVALIDDATA;
- if (s->bayer) {
- if (s->rct || s->pegasus_rct)
- return AVERROR_INVALIDDATA;
- }
s->restart_count = s->restart_interval;
@@ -1202,8 +1198,6 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int nb_components, int p
ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
}
} else if (s->bayer) {
- if (s->bits <= 8)
- return AVERROR_PATCHWELCOME;
if (nb_components == 1) {
/* Leave decoding to the TIFF/DNG decoder (see comment in ff_mjpeg_decode_sof) */
for (mb_x = 0; mb_x < width; mb_x++)
@@ -1938,8 +1932,6 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
}
len -= 9;
- if (s->bayer)
- goto out;
if (s->got_picture)
if (rgb != s->rgb || pegasus_rct != s->pegasus_rct) {
av_log(s->avctx, AV_LOG_WARNING, "Mismatching LJIF tag\n");
diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c
index 0845814834..12dd7be2e8 100644
--- a/libavcodec/mjpegenc_common.c
+++ b/libavcodec/mjpegenc_common.c
@@ -247,7 +247,7 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
default: av_assert0(0);
}
- put_bits(pb, 16, 8 + 3 * components);
+ put_bits(pb, 16, 17);
if (lossless && ( avctx->pix_fmt == AV_PIX_FMT_BGR0
|| avctx->pix_fmt == AV_PIX_FMT_BGRA
|| avctx->pix_fmt == AV_PIX_FMT_BGR24))
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 7563fb0b12..0c30034dd4 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -521,7 +521,7 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
/* This should happen for TrueHD streams with >6 channels and MLP's noise
* type. It is not yet known if this is allowed. */
- if (max_matrix_channel > MAX_MATRIX_CHANNEL_MLP && !noise_type) {
+ if (max_channel > MAX_MATRIX_CHANNEL_MLP && !noise_type) {
avpriv_request_sample(m->avctx,
"%d channels (more than the "
"maximum supported by the decoder)",
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index cb15ac072a..f6261db962 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -24,6 +24,9 @@
* MMAL Video Decoder
*/
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
#include <bcm_host.h>
#include <interface/mmal/mmal.h>
#include <interface/mmal/mmal_parameters_video.h>
@@ -31,6 +34,7 @@
#include <interface/mmal/util/mmal_util_params.h>
#include <interface/mmal/util/mmal_default_components.h>
#include <interface/mmal/vc/mmal_vc_api.h>
+#pragma GCC diagnostic pop
#include <stdatomic.h>
#include "avcodec.h"
diff --git a/libavcodec/mobiclip.c b/libavcodec/mobiclip.c
index 4baf347446..bf47a5bc41 100644
--- a/libavcodec/mobiclip.c
+++ b/libavcodec/mobiclip.c
@@ -329,7 +329,7 @@ static av_cold int mobiclip_init(AVCodecContext *avctx)
return 0;
}
-static int setup_qtables(AVCodecContext *avctx, int64_t quantizer)
+static int setup_qtables(AVCodecContext *avctx, int quantizer)
{
MobiClipContext *s = avctx->priv_data;
int qx, qy;
@@ -1256,7 +1256,7 @@ static int mobiclip_decode(AVCodecContext *avctx, void *data,
frame->key_frame = 0;
s->dct_tab_idx = 0;
- ret = setup_qtables(avctx, s->quantizer + (int64_t)get_se_golomb(gb));
+ ret = setup_qtables(avctx, s->quantizer + get_se_golomb(gb));
if (ret < 0)
return ret;
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index b79e22c422..5b0958733c 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -1614,7 +1614,7 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
for(y=0; y<s->mb_height; y++){
int x;
int xy= y*s->mb_stride;
- for(x=0; x<s->mb_width; x++, xy++){
+ for(x=0; x<s->mb_width; x++){
if(s->mb_type[xy] & type){
int mx= mv_table[xy][0];
int my= mv_table[xy][1];
@@ -1622,15 +1622,16 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
fcode_tab[my + MAX_MV]);
int j;
- if (mx >= range || mx < -range ||
- my >= range || my < -range)
- continue;
+ if(mx >= range || mx < -range ||
+ my >= range || my < -range)
+ continue;
for(j=0; j<fcode && j<8; j++){
if(s->pict_type==AV_PICTURE_TYPE_B || s->current_picture.mc_mb_var[xy] < s->current_picture.mb_var[xy])
score[j]-= 170;
}
}
+ xy++;
}
}
diff --git a/libavcodec/motionpixels.c b/libavcodec/motionpixels.c
index 07febd3c22..b08a2f624b 100644
--- a/libavcodec/motionpixels.c
+++ b/libavcodec/motionpixels.c
@@ -184,7 +184,7 @@ static YuvPixel mp_get_yuv_from_rgb(MotionPixelsContext *mp, int x, int y)
int color;
color = *(uint16_t *)&mp->frame->data[0][y * mp->frame->linesize[0] + x * 2];
- return mp_rgb_yuv_table[color & 0x7FFF];
+ return mp_rgb_yuv_table[color];
}
static void mp_set_rgb_from_yuv(MotionPixelsContext *mp, int x, int y, const YuvPixel *p)
diff --git a/libavcodec/movtextenc.c b/libavcodec/movtextenc.c
index b36354b14e..cf30adbd0a 100644
--- a/libavcodec/movtextenc.c
+++ b/libavcodec/movtextenc.c
@@ -85,7 +85,7 @@ typedef struct {
uint8_t box_flags;
StyleBox d;
uint16_t text_pos;
- unsigned byte_count;
+ uint16_t byte_count;
char **fonts;
int font_count;
double font_scale_factor;
@@ -585,9 +585,9 @@ static void mov_text_cancel_overrides_cb(void *priv, const char *style_name)
mov_text_ass_style_set(s, style);
}
-static unsigned utf8_strlen(const char *text, int len)
+static uint16_t utf8_strlen(const char *text, int len)
{
- unsigned i = 0, ret = 0;
+ uint16_t i = 0, ret = 0;
while (i < len) {
char c = text[i];
if ((c & 0x80) == 0)
@@ -607,7 +607,7 @@ static unsigned utf8_strlen(const char *text, int len)
static void mov_text_text_cb(void *priv, const char *text, int len)
{
- unsigned utf8_len = utf8_strlen(text, len);
+ uint16_t utf8_len = utf8_strlen(text, len);
MovTextContext *s = priv;
av_bprint_append_data(&s->buffer, text, len);
// If it's not utf-8, just use the byte length
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index e8f99dc5cf..09bf01247d 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -2999,10 +2999,6 @@ static int ipu_decode_frame(AVCodecContext *avctx, void *data,
AVFrame * const frame = data;
int ret;
- // Check for minimal intra MB size (considering mb header, luma & chroma dc VLC, ac EOB VLC)
- if (avpkt->size*8LL < (avctx->width+15)/16 * ((avctx->height+15)/16) * (2 + 3*4 + 2*2 + 2*6))
- return AVERROR_INVALIDDATA;
-
ret = ff_get_buffer(avctx, frame, 0);
if (ret < 0)
return ret;
diff --git a/libavcodec/mpegaudiodec_template.c b/libavcodec/mpegaudiodec_template.c
index 642fa5ac79..4fd9e3a690 100644
--- a/libavcodec/mpegaudiodec_template.c
+++ b/libavcodec/mpegaudiodec_template.c
@@ -372,7 +372,7 @@ static int handle_crc(MPADecodeContext *s, int sec_len)
crc_val = av_crc(crc_tab, crc_val, &buf[6], sec_byte_len);
AV_WB32(tmp_buf,
- ((buf[6 + sec_byte_len] & (0xFF00U >> sec_rem_bits)) << 24) +
+ ((buf[6 + sec_byte_len] & (0xFF00 >> sec_rem_bits)) << 24) +
((s->crc << 16) >> sec_rem_bits));
crc_val = av_crc(crc_tab, crc_val, tmp_buf, 3);
diff --git a/libavcodec/mss4.c b/libavcodec/mss4.c
index 4ad653c443..7f11f30dc8 100644
--- a/libavcodec/mss4.c
+++ b/libavcodec/mss4.c
@@ -26,7 +26,6 @@
*/
#include "libavutil/thread.h"
-#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "bytestream.h"
@@ -477,9 +476,6 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
width, height);
return AVERROR_INVALIDDATA;
}
- if (av_image_check_size2(width, height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx) < 0)
- return AVERROR_INVALIDDATA;
-
if (quality < 1 || quality > 100) {
av_log(avctx, AV_LOG_ERROR, "Invalid quality setting %d\n", quality);
return AVERROR_INVALIDDATA;
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index c6498864c8..b09ddbe0fa 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -1025,9 +1025,8 @@ static av_cold void nvenc_setup_rate_control(AVCodecContext *avctx)
av_log(avctx, AV_LOG_VERBOSE, "CQ(%d) mode enabled.\n", tmp_quality);
- // CQ mode shall discard avg bitrate/vbv buffer size and honor only max bitrate
+ //CQ mode shall discard avg bitrate & honor max bitrate;
ctx->encode_config.rcParams.averageBitRate = avctx->bit_rate = 0;
- ctx->encode_config.rcParams.vbvBufferSize = avctx->rc_buffer_size = 0;
ctx->encode_config.rcParams.maxBitRate = avctx->rc_max_rate;
}
}
@@ -1761,7 +1760,7 @@ static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
AVHWFramesContext *frames_ctx = (AVHWFramesContext*)frame->hw_frames_ctx->data;
- NV_ENC_REGISTER_RESOURCE reg = { 0 };
+ NV_ENC_REGISTER_RESOURCE reg;
int i, idx, ret;
for (i = 0; i < ctx->nb_registered_frames; i++) {
@@ -1926,7 +1925,7 @@ static int nvenc_set_timestamp(AVCodecContext *avctx,
pkt->pts = params->outputTimeStamp;
pkt->dts = timestamp_queue_dequeue(ctx->timestamp_list);
- pkt->dts -= FFMAX(ctx->encode_config.frameIntervalP - 1, 0) * FFMAX(avctx->ticks_per_frame, 1) * FFMAX(avctx->time_base.num, 1);
+ pkt->dts -= FFMAX(ctx->encode_config.frameIntervalP - 1, 0) * FFMAX(avctx->ticks_per_frame, 1);
return 0;
}
diff --git a/libavcodec/opus_silk.c b/libavcodec/opus_silk.c
index 8523b55ada..913053c5e2 100644
--- a/libavcodec/opus_silk.c
+++ b/libavcodec/opus_silk.c
@@ -198,8 +198,7 @@ static inline int silk_is_lpc_stable(const int16_t lpc[16], int order)
}
}
-static void silk_lsp2poly(const int32_t lsp[/* 2 * half_order - 1 */],
- int32_t pol[/* half_order + 1 */], int half_order)
+static void silk_lsp2poly(const int32_t lsp[16], int32_t pol[16], int half_order)
{
int i, j;
diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index 4d81c311c3..6340902526 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -245,6 +245,8 @@ static int decode_frame(AVCodecContext *avctx,
run = bytestream2_get_le16(&s->g);
val = bytestream2_get_byte(&s->g);
}
+ if (!bytestream2_get_bytes_left(&s->g))
+ break;
if (bits_per_plane == 8) {
picmemset_8bpp(s, frame, val, run, &x, &y);
diff --git a/libavcodec/pixlet.c b/libavcodec/pixlet.c
index febee5c31d..ad9d830af7 100644
--- a/libavcodec/pixlet.c
+++ b/libavcodec/pixlet.c
@@ -405,7 +405,7 @@ static void filterfn(int16_t *dest, int16_t *tmp, unsigned size, int64_t scale)
(int64_t) low [i - 1] * -INT64_C(325392907) +
(int64_t) high[i + 0] * INT64_C(1518500249) +
(int64_t) high[i - 1] * INT64_C(1518500249);
- dest[i * 2] = av_clip_int16(((value >> 32) * (uint64_t)scale) >> 32);
+ dest[i * 2] = av_clip_int16(((value >> 32) * scale) >> 32);
}
for (i = 0; i < hsize; i++) {
@@ -416,7 +416,7 @@ static void filterfn(int16_t *dest, int16_t *tmp, unsigned size, int64_t scale)
(int64_t) high[i + 1] * INT64_C(303700064) +
(int64_t) high[i + 0] * -INT64_C(3644400640) +
(int64_t) high[i - 1] * INT64_C(303700064);
- dest[i * 2 + 1] = av_clip_int16(((value >> 32) * (uint64_t)scale) >> 32);
+ dest[i * 2 + 1] = av_clip_int16(((value >> 32) * scale) >> 32);
}
}
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index 6301080832..6aa3c1b436 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -322,7 +322,7 @@ void ff_png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
static void deloco_ ## NAME(TYPE *dst, int size, int alpha) \
{ \
int i; \
- for (i = 0; i < size - 2; i += 3 + alpha) { \
+ for (i = 0; i < size; i += 3 + alpha) { \
int g = dst [i + 1]; \
dst[i + 0] += g; \
dst[i + 2] += g; \
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index 6f48d2c208..0b0ff03c18 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -145,12 +145,6 @@ typedef struct FrameThreadContext {
* Set for the first N packets, where N is the number of threads.
* While it is set, ff_thread_en/decode_frame won't return any results.
*/
-
- /* hwaccel state is temporarily stored here in order to transfer its ownership
- * to the next decoding thread without the need for extra synchronization */
- const AVHWAccel *stash_hwaccel;
- void *stash_hwaccel_context;
- void *stash_hwaccel_priv;
} FrameThreadContext;
#if FF_API_THREAD_SAFE_CALLBACKS
@@ -215,7 +209,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
/* if the previous thread uses hwaccel then we take the lock to ensure
* the threads don't run concurrently */
- if (avctx->hwaccel) {
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
pthread_mutex_lock(&p->parent->hwaccel_mutex);
p->hwaccel_serializing = 1;
}
@@ -235,17 +230,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
ff_thread_finish_setup(avctx);
if (p->hwaccel_serializing) {
- /* wipe hwaccel state to avoid stale pointers lying around;
- * the state was transferred to FrameThreadContext in
- * ff_thread_finish_setup(), so nothing is leaked */
- avctx->hwaccel = NULL;
- avctx->hwaccel_context = NULL;
- avctx->internal->hwaccel_priv_data = NULL;
-
p->hwaccel_serializing = 0;
pthread_mutex_unlock(&p->parent->hwaccel_mutex);
}
- av_assert0(!avctx->hwaccel);
if (p->async_serializing) {
p->async_serializing = 0;
@@ -307,10 +294,14 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
dst->color_range = src->color_range;
dst->chroma_sample_location = src->chroma_sample_location;
+ dst->hwaccel = src->hwaccel;
+ dst->hwaccel_context = src->hwaccel_context;
+
dst->channels = src->channels;
dst->sample_rate = src->sample_rate;
dst->sample_fmt = src->sample_fmt;
dst->channel_layout = src->channel_layout;
+ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx ||
(dst->hw_frames_ctx && dst->hw_frames_ctx->data != src->hw_frames_ctx->data)) {
@@ -456,12 +447,6 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx,
}
}
- /* transfer the stashed hwaccel state, if any */
- av_assert0(!p->avctx->hwaccel);
- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel);
- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context);
- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-
av_packet_unref(p->avpkt);
ret = av_packet_ref(p->avpkt, avpkt);
if (ret < 0) {
@@ -652,7 +637,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
- if (avctx->hwaccel && !p->hwaccel_serializing) {
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
+ !p->hwaccel_serializing) {
pthread_mutex_lock(&p->parent->hwaccel_mutex);
p->hwaccel_serializing = 1;
}
@@ -665,14 +652,6 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
async_lock(p->parent);
}
- /* save hwaccel state for passing to the next thread;
- * this is done here so that this worker thread can wipe its own hwaccel
- * state after decoding, without requiring synchronization */
- av_assert0(!p->parent->stash_hwaccel);
- p->parent->stash_hwaccel = avctx->hwaccel;
- p->parent->stash_hwaccel_context = avctx->hwaccel_context;
- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data;
-
pthread_mutex_lock(&p->progress_mutex);
if(atomic_load(&p->state) == STATE_SETUP_FINISHED){
av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n");
@@ -767,6 +746,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
park_frame_worker_threads(fctx, thread_count);
+ if (fctx->prev_thread && avctx->internal->hwaccel_priv_data !=
+ fctx->prev_thread->avctx->internal->hwaccel_priv_data) {
+ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n");
+ }
+ }
+
if (fctx->prev_thread && fctx->prev_thread != fctx->threads)
if (update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0) < 0) {
av_log(avctx, AV_LOG_ERROR, "Final thread update failed\n");
@@ -820,13 +806,6 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
av_freep(&fctx->threads);
free_pthread(fctx, thread_ctx_offsets);
- /* if we have stashed hwaccel state, move it to the user-facing context,
- * so it will be freed in avcodec_close() */
- av_assert0(!avctx->hwaccel);
- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel);
- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context);
- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-
av_freep(&avctx->internal->thread_ctx);
if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
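The pthread_frame.c hunks above replace upstream's hwaccel-state stashing with direct propagation plus a lock that is skipped for MT-safe hwaccels. The gate they introduce reduces to this predicate (sketch only; HWACCEL_CAP_MT_SAFE is the new cap from the hwconfig.h hunk):

    /* Sketch: only hwaccels that are not marked MT-safe need the
     * per-frame-thread serialization mutex. */
    static int hwaccel_needs_serializing(const AVCodecContext *avctx)
    {
        return avctx->hwaccel &&
               !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE);
    }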
diff --git a/libavcodec/qdrw.c b/libavcodec/qdrw.c
index c04c756d71..65279c9805 100644
--- a/libavcodec/qdrw.c
+++ b/libavcodec/qdrw.c
@@ -369,7 +369,7 @@ static int decode_frame(AVCodecContext *avctx,
bytestream2_skip(&gbc, 18);
colors = bytestream2_get_be16(&gbc);
- if (colors < 0 || colors > 255) {
+ if (colors < 0 || colors > 256) {
av_log(avctx, AV_LOG_ERROR,
"Error color count - %i(0x%X)\n", colors, colors);
return AVERROR_INVALIDDATA;
diff --git a/libavcodec/qpeldsp.c b/libavcodec/qpeldsp.c
index d99b8fd0ba..6e52b33657 100644
--- a/libavcodec/qpeldsp.c
+++ b/libavcodec/qpeldsp.c
@@ -198,7 +198,7 @@ static void OPNAME ## qpel8_mc01_c(uint8_t *dst, const uint8_t *src, \
uint8_t full[16 * 9]; \
uint8_t half[64]; \
\
- copy_block8(full, src, 16, stride, 9); \
+ copy_block9(full, src, 16, stride, 9); \
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
} \
@@ -208,7 +208,7 @@ static void OPNAME ## qpel8_mc02_c(uint8_t *dst, const uint8_t *src, \
{ \
uint8_t full[16 * 9]; \
\
- copy_block8(full, src, 16, stride, 9); \
+ copy_block9(full, src, 16, stride, 9); \
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
} \
\
@@ -218,7 +218,7 @@ static void OPNAME ## qpel8_mc03_c(uint8_t *dst, const uint8_t *src, \
uint8_t full[16 * 9]; \
uint8_t half[64]; \
\
- copy_block8(full, src, 16, stride, 9); \
+ copy_block9(full, src, 16, stride, 9); \
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
} \
@@ -458,7 +458,7 @@ static void OPNAME ## qpel16_mc01_c(uint8_t *dst, const uint8_t *src, \
uint8_t full[24 * 17]; \
uint8_t half[256]; \
\
- copy_block16(full, src, 24, stride, 17); \
+ copy_block17(full, src, 24, stride, 17); \
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
} \
@@ -468,7 +468,7 @@ static void OPNAME ## qpel16_mc02_c(uint8_t *dst, const uint8_t *src, \
{ \
uint8_t full[24 * 17]; \
\
- copy_block16(full, src, 24, stride, 17); \
+ copy_block17(full, src, 24, stride, 17); \
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
} \
\
@@ -478,7 +478,7 @@ static void OPNAME ## qpel16_mc03_c(uint8_t *dst, const uint8_t *src, \
uint8_t full[24 * 17]; \
uint8_t half[256]; \
\
- copy_block16(full, src, 24, stride, 17); \
+ copy_block17(full, src, 24, stride, 17); \
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
} \
diff --git a/libavcodec/rasc.c b/libavcodec/rasc.c
index 5ed1333886..207d50c452 100644
--- a/libavcodec/rasc.c
+++ b/libavcodec/rasc.c
@@ -722,7 +722,6 @@ static int decode_frame(AVCodecContext *avctx,
break;
default:
bytestream2_skip(gb, size);
- ret = 0;
}
if (ret < 0)
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index 079d5c5d10..0781f28615 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -294,6 +294,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
{ AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
{ AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+ /* RPI (Might as well define for everything) */
+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
+ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') },
+
{ AV_PIX_FMT_NONE, 0 },
};
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index d181b74570..b943dd0379 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -24,6 +24,7 @@
* Raw Video Encoder
*/
+#include "config.h"
#include "avcodec.h"
#include "raw.h"
#include "internal.h"
@@ -31,6 +32,10 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
+#include "libavutil/avassert.h"
+#if CONFIG_SAND
+#include "libavutil/rpi_sand_fns.h"
+#endif
static av_cold int raw_encode_init(AVCodecContext *avctx)
{
@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
return 0;
}
+#if CONFIG_SAND
+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3 / 2;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height;
+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
+ return 0;
+}
+
+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
+ dst += width * height * 2;
+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
+ return 0;
+}
+
+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height * 2;
+ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
+ return 0;
+}
+#endif
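+// SAND layout note (illustrative; assuming the stripe layout the helpers
+// above imply): the image is stored as vertical stripes stride1 bytes
+// wide, each stripe stride2 lines tall, stripes laid out back to back,
+// so the byte for luma pixel (x, y) lives at
+//   src[(x / stride1) * stride1 * stride2 + y * stride1 + (x % stride1)]
+// The av_rpi_sand_to_planar_* calls perform this gather with cropping
+// applied, leaving pkt->data as an ordinary planar YUV buffer.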
+
+
static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- const AVFrame *frame, int *got_packet)
+ const AVFrame *src_frame, int *got_packet)
{
- int ret = av_image_get_buffer_size(frame->format,
- frame->width, frame->height, 1);
+ int ret;
+ AVFrame * frame = NULL;
- if (ret < 0)
+#if CONFIG_SAND
+ if (av_rpi_is_sand_frame(src_frame)) {
+ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
+ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
+ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
+ *got_packet = (ret == 0);
return ret;
+ }
+#endif
+
+ if ((frame = av_frame_clone(src_frame)) == NULL) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
+ goto fail;
+
+ ret = av_image_get_buffer_size(frame->format,
+ frame->width, frame->height, 1);
+ if (ret < 0)
+ goto fail;
if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
- return ret;
+ goto fail;
if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
(const uint8_t **)frame->data, frame->linesize,
frame->format,
frame->width, frame->height, 1)) < 0)
- return ret;
+ goto fail;
if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
frame->format == AV_PIX_FMT_YUYV422) {
@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
}
}
pkt->flags |= AV_PKT_FLAG_KEY;
+ av_frame_free(&frame);
*got_packet = 1;
return 0;
+
+fail:
+ av_frame_free(&frame);
+ *got_packet = 0;
+ return ret;
}
AVCodec ff_rawvideo_encoder = {
diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
new file mode 100644
index 0000000000..58c094c5f8
--- /dev/null
+++ b/libavcodec/rpi_hevc_cabac.c
@@ -0,0 +1,2257 @@
+/*
+ * HEVC CABAC decoding
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+
+#include "cabac_functions.h"
+#include "rpi_hevc_data.h"
+#include "hevc.h"
+#include "rpi_hevcdec.h"
+#include "rpi_hevc_cabac_fns.h"
+
+#include "libavutil/rpi_sand_fns.h"
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS 22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55) but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS 23
+#endif
+
+#define CABAC_MAX_BIN 31
+
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+
+static const uint32_t cabac_by22_inv_range[256] = {
+ 0, I(257), I(258), I(259),
+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+ I(510), I(511)
+};
+#undef I
+#endif // USE_BY22
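+// Why the table works (worked example, for reference): with
+// inv = I(r) = 2^40 / r + 1,
+//   ((uint64_t)x * inv) >> 32   ~=   (x / r) << 8
+// accurate to about 22 bits, so the multiply path's
+//   ((((uint64_t)(low & ~1U) * inv) >> 32) << 1)
+// matches the native divide path's ((low / r) << 9) in the bits that
+// get_cabac_by22_peek is specified to return.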
+
+#if ARCH_ARM
+#include "arm/rpi_hevc_cabac.h"
+#endif
+
+/**
+ * number of bin by SyntaxElement.
+ */
+static const int8_t num_bins_in_se[] = {
+ 1, // sao_merge_flag
+ 1, // sao_type_idx
+ 0, // sao_eo_class
+ 0, // sao_band_position
+ 0, // sao_offset_abs
+ 0, // sao_offset_sign
+ 0, // end_of_slice_flag
+ 3, // split_coding_unit_flag
+ 1, // cu_transquant_bypass_flag
+ 3, // skip_flag
+ 3, // cu_qp_delta
+ 1, // pred_mode
+ 4, // part_mode
+ 0, // pcm_flag
+ 1, // prev_intra_luma_pred_mode
+ 0, // mpm_idx
+ 0, // rem_intra_luma_pred_mode
+ 2, // intra_chroma_pred_mode
+ 1, // merge_flag
+ 1, // merge_idx
+ 5, // inter_pred_idc
+ 2, // ref_idx_l0
+ 2, // ref_idx_l1
+ 2, // abs_mvd_greater0_flag
+ 2, // abs_mvd_greater1_flag
+ 0, // abs_mvd_minus2
+ 0, // mvd_sign_flag
+ 1, // mvp_lx_flag
+ 1, // no_residual_data_flag
+ 3, // split_transform_flag
+ 2, // cbf_luma
+ 4, // cbf_cb, cbf_cr
+ 2, // transform_skip_flag[][]
+ 2, // explicit_rdpcm_flag[][]
+ 2, // explicit_rdpcm_dir_flag[][]
+ 18, // last_significant_coeff_x_prefix
+ 18, // last_significant_coeff_y_prefix
+ 0, // last_significant_coeff_x_suffix
+ 0, // last_significant_coeff_y_suffix
+ 4, // significant_coeff_group_flag
+ 44, // significant_coeff_flag
+ 24, // coeff_abs_level_greater1_flag
+ 6, // coeff_abs_level_greater2_flag
+ 0, // coeff_abs_level_remaining
+ 0, // coeff_sign_flag
+ 8, // log2_res_scale_abs
+ 2, // res_scale_sign_flag
+ 1, // cu_chroma_qp_offset_flag
+ 1, // cu_chroma_qp_offset_idx
+};
+
+/**
+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
+ */
+static const int elem_offset[sizeof(num_bins_in_se)] = {
+ 0, // sao_merge_flag
+ 1, // sao_type_idx
+ 2, // sao_eo_class
+ 2, // sao_band_position
+ 2, // sao_offset_abs
+ 2, // sao_offset_sign
+ 2, // end_of_slice_flag
+ 2, // split_coding_unit_flag
+ 5, // cu_transquant_bypass_flag
+ 6, // skip_flag
+ 9, // cu_qp_delta
+ 12, // pred_mode
+ 13, // part_mode
+ 17, // pcm_flag
+ 17, // prev_intra_luma_pred_mode
+ 18, // mpm_idx
+ 18, // rem_intra_luma_pred_mode
+ 18, // intra_chroma_pred_mode
+ 20, // merge_flag
+ 21, // merge_idx
+ 22, // inter_pred_idc
+ 27, // ref_idx_l0
+ 29, // ref_idx_l1
+ 31, // abs_mvd_greater0_flag
+ 33, // abs_mvd_greater1_flag
+ 35, // abs_mvd_minus2
+ 35, // mvd_sign_flag
+ 35, // mvp_lx_flag
+ 36, // no_residual_data_flag
+ 37, // split_transform_flag
+ 40, // cbf_luma
+ 42, // cbf_cb, cbf_cr
+ 46, // transform_skip_flag[][]
+ 48, // explicit_rdpcm_flag[][]
+ 50, // explicit_rdpcm_dir_flag[][]
+ 52, // last_significant_coeff_x_prefix
+ 70, // last_significant_coeff_y_prefix
+ 88, // last_significant_coeff_x_suffix
+ 88, // last_significant_coeff_y_suffix
+ 88, // significant_coeff_group_flag
+ 92, // significant_coeff_flag
+ 136, // coeff_abs_level_greater1_flag
+ 160, // coeff_abs_level_greater2_flag
+ 166, // coeff_abs_level_remaining
+ 166, // coeff_sign_flag
+ 166, // log2_res_scale_abs
+ 174, // res_scale_sign_flag
+ 176, // cu_chroma_qp_offset_flag
+ 177, // cu_chroma_qp_offset_idx
+};
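+// The offsets above are the running sum of num_bins_in_se: for every
+// element, elem_offset[i + 1] == elem_offset[i] + num_bins_in_se[i].
+// e.g. significant_coeff_group_flag starts at 88 and has 4 contexts,
+// so significant_coeff_flag starts at 92.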
+
+#define CNU 154
+/**
+ * Indexed by init_type
+ */
+static const uint8_t init_values[3][HEVC_CONTEXTS] = {
+ { // sao_merge_flag
+ 153,
+ // sao_type_idx
+ 200,
+ // split_coding_unit_flag
+ 139, 141, 157,
+ // cu_transquant_bypass_flag
+ 154,
+ // skip_flag
+ CNU, CNU, CNU,
+ // cu_qp_delta
+ 154, 154, 154,
+ // pred_mode
+ CNU,
+ // part_mode
+ 184, CNU, CNU, CNU,
+ // prev_intra_luma_pred_mode
+ 184,
+ // intra_chroma_pred_mode
+ 63, 139,
+ // merge_flag
+ CNU,
+ // merge_idx
+ CNU,
+ // inter_pred_idc
+ CNU, CNU, CNU, CNU, CNU,
+ // ref_idx_l0
+ CNU, CNU,
+ // ref_idx_l1
+ CNU, CNU,
+ // abs_mvd_greater0_flag
+ CNU, CNU,
+ // abs_mvd_greater1_flag
+ CNU, CNU,
+ // mvp_lx_flag
+ CNU,
+ // no_residual_data_flag
+ CNU,
+ // split_transform_flag
+ 153, 138, 138,
+ // cbf_luma
+ 111, 141,
+ // cbf_cb, cbf_cr
+ 94, 138, 182, 154,
+ // transform_skip_flag
+ 139, 139,
+ // explicit_rdpcm_flag
+ 139, 139,
+ // explicit_rdpcm_dir_flag
+ 139, 139,
+ // last_significant_coeff_x_prefix
+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
+ 79, 108, 123, 63,
+ // last_significant_coeff_y_prefix
+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
+ 79, 108, 123, 63,
+ // significant_coeff_group_flag
+ 91, 171, 134, 141,
+ // significant_coeff_flag
+ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153,
+ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
+ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
+ 141, 111,
+ // coeff_abs_level_greater1_flag
+ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107,
+ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
+ // coeff_abs_level_greater2_flag
+ 138, 153, 136, 167, 152, 152,
+ // log2_res_scale_abs
+ 154, 154, 154, 154, 154, 154, 154, 154,
+ // res_scale_sign_flag
+ 154, 154,
+ // cu_chroma_qp_offset_flag
+ 154,
+ // cu_chroma_qp_offset_idx
+ 154,
+ },
+ { // sao_merge_flag
+ 153,
+ // sao_type_idx
+ 185,
+ // split_coding_unit_flag
+ 107, 139, 126,
+ // cu_transquant_bypass_flag
+ 154,
+ // skip_flag
+ 197, 185, 201,
+ // cu_qp_delta
+ 154, 154, 154,
+ // pred_mode
+ 149,
+ // part_mode
+ 154, 139, 154, 154,
+ // prev_intra_luma_pred_mode
+ 154,
+ // intra_chroma_pred_mode
+ 152, 139,
+ // merge_flag
+ 110,
+ // merge_idx
+ 122,
+ // inter_pred_idc
+ 95, 79, 63, 31, 31,
+ // ref_idx_l0
+ 153, 153,
+ // ref_idx_l1
+ 153, 153,
+ // abs_mvd_greater0_flag
+ 140, 198,
+ // abs_mvd_greater1_flag
+ 140, 198,
+ // mvp_lx_flag
+ 168,
+ // no_residual_data_flag
+ 79,
+ // split_transform_flag
+ 124, 138, 94,
+ // cbf_luma
+ 153, 111,
+ // cbf_cb, cbf_cr
+ 149, 107, 167, 154,
+ // transform_skip_flag
+ 139, 139,
+ // explicit_rdpcm_flag
+ 139, 139,
+ // explicit_rdpcm_dir_flag
+ 139, 139,
+ // last_significant_coeff_x_prefix
+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
+ 94, 108, 123, 108,
+ // last_significant_coeff_y_prefix
+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
+ 94, 108, 123, 108,
+ // significant_coeff_group_flag
+ 121, 140, 61, 154,
+ // significant_coeff_flag
+ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153,
+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
+ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
+ 140, 140,
+ // coeff_abs_level_greater1_flag
+ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
+ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
+ // coeff_abs_level_greater2_flag
+ 107, 167, 91, 122, 107, 167,
+ // log2_res_scale_abs
+ 154, 154, 154, 154, 154, 154, 154, 154,
+ // res_scale_sign_flag
+ 154, 154,
+ // cu_chroma_qp_offset_flag
+ 154,
+ // cu_chroma_qp_offset_idx
+ 154,
+ },
+ { // sao_merge_flag
+ 153,
+ // sao_type_idx
+ 160,
+ // split_coding_unit_flag
+ 107, 139, 126,
+ // cu_transquant_bypass_flag
+ 154,
+ // skip_flag
+ 197, 185, 201,
+ // cu_qp_delta
+ 154, 154, 154,
+ // pred_mode
+ 134,
+ // part_mode
+ 154, 139, 154, 154,
+ // prev_intra_luma_pred_mode
+ 183,
+ // intra_chroma_pred_mode
+ 152, 139,
+ // merge_flag
+ 154,
+ // merge_idx
+ 137,
+ // inter_pred_idc
+ 95, 79, 63, 31, 31,
+ // ref_idx_l0
+ 153, 153,
+ // ref_idx_l1
+ 153, 153,
+ // abs_mvd_greater0_flag
+ 169, 198,
+ // abs_mvd_greater1_flag
+ 169, 198,
+ // mvp_lx_flag
+ 168,
+ // no_residual_data_flag
+ 79,
+ // split_transform_flag
+ 224, 167, 122,
+ // cbf_luma
+ 153, 111,
+ // cbf_cb, cbf_cr
+ 149, 92, 167, 154,
+ // transform_skip_flag
+ 139, 139,
+ // explicit_rdpcm_flag
+ 139, 139,
+ // explicit_rdpcm_dir_flag
+ 139, 139,
+ // last_significant_coeff_x_prefix
+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
+ 79, 108, 123, 93,
+ // last_significant_coeff_y_prefix
+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
+ 79, 108, 123, 93,
+ // significant_coeff_group_flag
+ 121, 140, 61, 154,
+ // significant_coeff_flag
+ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153,
+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
+ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
+ 140, 140,
+ // coeff_abs_level_greater1_flag
+ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
+ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
+ // coeff_abs_level_greater2_flag
+ 107, 167, 91, 107, 107, 167,
+ // log2_res_scale_abs
+ 154, 154, 154, 154, 154, 154, 154, 154,
+ // res_scale_sign_flag
+ 154, 154,
+ // cu_chroma_qp_offset_flag
+ 154,
+ // cu_chroma_qp_offset_idx
+ 154,
+ },
+};
+
+static const uint8_t scan_1x1[1] = {
+ 0,
+};
+
+static const uint8_t horiz_scan2x2_x[4] = {
+ 0, 1, 0, 1,
+};
+
+static const uint8_t horiz_scan2x2_y[4] = {
+ 0, 0, 1, 1
+};
+
+static const uint8_t horiz_scan4x4_x[16] = {
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+};
+
+static const uint8_t horiz_scan4x4_y[16] = {
+ 0, 0, 0, 0,
+ 1, 1, 1, 1,
+ 2, 2, 2, 2,
+ 3, 3, 3, 3,
+};
+
+static const uint8_t horiz_scan8x8_inv[8][8] = {
+ { 0, 1, 2, 3, 16, 17, 18, 19, },
+ { 4, 5, 6, 7, 20, 21, 22, 23, },
+ { 8, 9, 10, 11, 24, 25, 26, 27, },
+ { 12, 13, 14, 15, 28, 29, 30, 31, },
+ { 32, 33, 34, 35, 48, 49, 50, 51, },
+ { 36, 37, 38, 39, 52, 53, 54, 55, },
+ { 40, 41, 42, 43, 56, 57, 58, 59, },
+ { 44, 45, 46, 47, 60, 61, 62, 63, },
+};
+
+static const uint8_t diag_scan2x2_x[4] = {
+ 0, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x2_y[4] = {
+ 0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan2x2_inv[2][2] = {
+ { 0, 2, },
+ { 1, 3, },
+};
+
+static const uint8_t diag_scan4x4_inv[4][4] = {
+ { 0, 2, 5, 9, },
+ { 1, 4, 8, 12, },
+ { 3, 7, 11, 14, },
+ { 6, 10, 13, 15, },
+};
+
+static const uint8_t diag_scan8x8_inv[8][8] = {
+ { 0, 2, 5, 9, 14, 20, 27, 35, },
+ { 1, 4, 8, 13, 19, 26, 34, 42, },
+ { 3, 7, 12, 18, 25, 33, 41, 48, },
+ { 6, 11, 17, 24, 32, 40, 47, 53, },
+ { 10, 16, 23, 31, 39, 46, 52, 57, },
+ { 15, 22, 30, 38, 45, 51, 56, 60, },
+ { 21, 29, 37, 44, 50, 55, 59, 62, },
+ { 28, 36, 43, 49, 54, 58, 61, 63, },
+};
+
+
+typedef struct
+{
+ uint16_t coeff;
+ uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+
+#define OFF_DIAG(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
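+// Worked example: for a 16x16 transform (t = 4), XYT(3,2,4) packs
+// coeff = 3 + (2 << 4) = 35, while the scale index drops to the 8x8
+// scaling grid: SCALE_TRAFO(4) = 3, SCALE_SHR(4) = 1, so
+// scale = (3 >> 1) + ((2 >> 1) << 3) = 9.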
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this, but it is included for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+ unsigned int n = 1;
+ if ((x & 0xffff0000) == 0) {
+ n += 16;
+ x <<= 16;
+ }
+ if ((x & 0xff000000) == 0) {
+ n += 8;
+ x <<= 8;
+ }
+ if ((x & 0xf0000000) == 0) {
+ n += 4;
+ x <<= 4;
+ }
+ if ((x & 0xc0000000) == 0) {
+ n += 2;
+ x <<= 2;
+ }
+ return n - ((x >> 31) & 1);
+}
+#endif
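+// Sanity examples: hevc_clz32(0x80000000) == 0 and hevc_clz32(1) == 31.
+// As with __builtin_clz, callers must pass a non-zero value.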
+
+static inline int cabac_overflow(const CABACContext * const cc)
+{
+ av_assert0(cc->bytestream >= cc->bytestream_start);
+ return cc->bytestream >= cc->bytestream_end + 4;
+}
+
+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
+{
+ return cabac_overflow(&lc->cc);
+}
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(cc) get_cabac_by22_start(cc)
+#define bypass_finish(cc) get_cabac_by22_finish(cc)
+
+// BY22 exploits the fact that bypass decoding is simply a divide of the
+// coded value by the range, so we can peek out large quantities of bits at
+// once and treat the result as if it were VLC. In many cases this leads to
+// O(1) processing rather than O(n), though the setup and teardown are
+// sufficiently expensive that it is only worth using if we expect to be
+// dealing with more than a few bits. The definition of "a few bits" will
+// vary from platform to platform, but tests on ARM show that it probably
+// isn't worth it for a single coded residual, but is for >1 - it also seems
+// likely that if there are more residuals then they are likely to be bigger,
+// which makes the O(1) nature of the code more worthwhile.
+
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state. _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+#ifndef get_cabac_by22_start
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+ const unsigned int bits = __builtin_ctz(c->low);
+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+ c->bytestream -= (CABAC_BITS / 8);
+ c->by22.bits = bits;
+#if !USE_BY22_DIV
+ c->by22.range = c->range;
+ c->range = inv;
+#endif
+ c->low = x;
+}
+#endif
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+ unsigned int used = c->by22.bits;
+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+ c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+ uint32_t x = c->low & ~1U;
+ const uint32_t inv = c->range;
+
+ if (inv != 0)
+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+ return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+ // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+ // and refill lower bits
+ // We will probably OR over some existing bits but that doesn't matter
+ c->by22.bits += n;
+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif // USE_BY22
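+// Typical calling sequence (sketch only): to read an n-bit bypass field
+// with 1 <= n <= CABAC_BY22_PEEK_BITS:
+//   bypass_start(&lc->cc);
+//   uint32_t v = get_cabac_by22_peek(&lc->cc); // bitstream in top bits
+//   uint32_t bits = v >> (32 - n);
+//   get_cabac_by22_flush(&lc->cc, n, v);       // consume exactly n bits
+//   bypass_finish(&lc->cc);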
+
+
+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
+{
+ memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
+ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
+}
+
+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
+ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
+}
+
+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
+{
+ GetBitContext * const gb = &lc->gb;
+ skip_bits(gb, 1);
+ align_get_bits(gb);
+ return ff_init_cabac_decoder(&lc->cc,
+ gb->buffer + get_bits_count(gb) / 8,
+ (get_bits_left(gb) + 7) / 8);
+}
+
+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ int init_type = 2 - s->sh.slice_type;
+ int i;
+
+ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I)
+ init_type ^= 3;
+
+ for (i = 0; i < HEVC_CONTEXTS; i++) {
+ int init_value = init_values[init_type][i];
+ int m = (init_value >> 4) * 5 - 45;
+ int n = ((init_value & 15) << 3) - 16;
+ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127;
+
+ pre ^= pre >> 31;
+ if (pre > 124)
+ pre = 124 + (pre & 1);
+ lc->cabac_state[i] = pre;
+ }
+
+ for (i = 0; i < 4; i++)
+ lc->stat_coeff[i] = 0;
+}
+
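+// Initialise CABAC state for this CTB as requested by the flags: CIREQ forces
+// a full context re-init from the slice QP, while CLOAD reloads the state
+// previously stashed by ff_hevc_rpi_save_states (wavefront-style row starts).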
+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
+{
+ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0)
+ {
+ lc->qPy_pred = s->sh.slice_qp;
+ cabac_init_state(s, lc);
+ }
+ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0)
+ {
+ lc->qPy_pred = s->sh.slice_qp;
+ load_states(s, lc);
+ }
+ lc->cabac_init_req = 0;
+}
+
+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
+
+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
+{
+ return get_cabac_inline(c, state);
+}
+
+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
+{
+ return get_cabac_terminate(c);
+}
+
+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
+{
+ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]))
+ return 0;
+
+ if (!get_cabac_bypass(&lc->cc))
+ return SAO_BAND;
+ return SAO_EDGE;
+}
+
+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
+{
+ int i;
+ int value = get_cabac_bypass(&lc->cc);
+
+ for (i = 0; i < 4; i++)
+ value = (value << 1) | get_cabac_bypass(&lc->cc);
+ return value;
+}
+
+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ int i = 0;
+ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
+
+ while (i < length && get_cabac_bypass(&lc->cc))
+ i++;
+ return i;
+}
+
+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
+{
+ return get_cabac_bypass(&lc->cc);
+}
+
+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
+{
+ int ret = get_cabac_bypass(&lc->cc) << 1;
+ ret |= get_cabac_bypass(&lc->cc);
+ return ret;
+}
+
+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
+{
+ int val = 1;
+
+ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)
+ return 0;
+
+ while (val < 5 &&
+ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)
+ val++;
+
+ if (val >= 5) {
+ unsigned int k = 0;
+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
+ val += 1 << k;
+ k++;
+ }
+// if (k == CABAC_MAX_BIN)
+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+
+ while (k--)
+ val += get_cabac_bypass(&lc->cc) << k;
+ }
+ return get_cabac_bypass(&lc->cc) ? -val : val;
+}
+
+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+    int c_max = FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
+ int i = 0;
+
+ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
+ i++;
+
+ return i;
+}
+
+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
+{
+ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
+ return PART_2Nx2N;
+ if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
+ if (lc->cu.pred_mode == MODE_INTRA) // 0
+ return PART_NxN;
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
+ return PART_2NxN;
+ if (log2_cb_size == 3) // 00
+ return PART_Nx2N;
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
+ return PART_Nx2N;
+ return PART_NxN; // 000
+ }
+
+ if (!s->ps.sps->amp_enabled_flag) {
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
+ return PART_2NxN;
+ return PART_Nx2N;
+ }
+
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
+ return PART_2NxN;
+ if (get_cabac_bypass(&lc->cc)) // 0101
+ return PART_2NxnD;
+ return PART_2NxnU; // 0100
+ }
+
+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
+ return PART_Nx2N;
+ if (get_cabac_bypass(&lc->cc)) // 0001
+ return PART_nRx2N;
+ return PART_nLx2N; // 0000
+}
+
+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
+{
+ int i = 0;
+ while (i < 2 && get_cabac_bypass(&lc->cc))
+ i++;
+ return i;
+}
+
+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
+{
+ int i;
+ int value = get_cabac_bypass(&lc->cc);
+
+ for (i = 0; i < 4; i++)
+ value = (value << 1) | get_cabac_bypass(&lc->cc);
+ return value;
+}
+
+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
+{
+ int ret;
+ if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]))
+ return 4;
+
+ ret = get_cabac_bypass(&lc->cc) << 1;
+ ret |= get_cabac_bypass(&lc->cc);
+ return ret;
+}
+
+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]);
+
+ if (i != 0) {
+ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))
+ i++;
+ }
+ return i;
+}
+
+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
+{
+ if (nPbW + nPbH == 12)
+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
+ if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth))
+ return PRED_BI;
+
+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
+}
+
+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
+{
+ int i = 0;
+ int max = num_ref_idx_lx - 1;
+ int max_ctx = FFMIN(max, 2);
+
+ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i))
+ i++;
+ if (i == 2) {
+ while (i < max && get_cabac_bypass(&lc->cc))
+ i++;
+ }
+
+ return i;
+}
+
+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
+}
+
+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
+}
+
+#if !USE_BY22
+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
+{
+ int ret = 2;
+ int k = 1;
+
+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
+ ret += 1U << k;
+ k++;
+ }
+ if (k == CABAC_MAX_BIN) {
+ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+ return 0;
+ }
+
+ while (k--)
+ ret += get_cabac_bypass(&lc->cc) << k;
+ return get_cabac_bypass_sign(&lc->cc, -ret);
+}
+#endif
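+
+// Worked example of the EG1 decode above (illustrative): prefix "0" leaves
+// ret = 2, k = 1, so one suffix bit gives ret = 2..3; prefix "10" leaves
+// ret = 4, k = 2, so two suffix bits give ret = 4..7. The final bypass bit
+// selects the sign via get_cabac_bypass_sign.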
+
+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return get_cabac_bypass_sign(&lc->cc, -1);
+}
+
+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
+{
+ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
+}
+
+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
+{
+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
+}
+
+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
+{
+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
+}
+
+
+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx)
+{
+    int i = 0;
+
+ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i))
+ i++;
+
+ return i;
+}
+
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
+ int log2_size, int *last_scx_prefix, int *last_scy_prefix)
+{
+ int i = 0;
+ int max = (log2_size << 1) - 1;
+ int ctx_offset, ctx_shift;
+
+ if (!c_idx_nz) {
+ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
+ ctx_shift = (log2_size + 1) >> 2;
+ } else {
+ ctx_offset = 15;
+ ctx_shift = log2_size - 2;
+ }
+ while (i < max &&
+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
+ i++;
+ *last_scx_prefix = i;
+
+ i = 0;
+ while (i < max &&
+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
+ i++;
+ *last_scy_prefix = i;
+}
+
+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
+ int last_significant_coeff_prefix)
+{
+ int i;
+ int length = (last_significant_coeff_prefix >> 1) - 1;
+ int value = get_cabac_bypass(&lc->cc);
+
+ for (i = 1; i < length; i++)
+ value = (value << 1) | get_cabac_bypass(&lc->cc);
+ return value;
+}
+
+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
+{
+ int inc;
+
+ inc = (ctx_cg != 0) + (c_idx_nz << 1);
+
+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
+}
+
+static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
+{
+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+}
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
+{
+ uint32_t y;
+ unsigned int prefix;
+ unsigned int last_coeff_abs_level_remaining;
+ unsigned int n;
+
+ y = get_cabac_by22_peek(c);
+ prefix = hevc_clz32(~y);
+ // y << prefix will always have top bit 0
+
+ if (prefix < 3) {
+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+ n = prefix + 1 + rice_param;
+ }
+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+ {
+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix * 2 + rice_param - 2;
+ }
+ else {
+ unsigned int suffix;
+
+ get_cabac_by22_flush(c, prefix, y);
+ y = get_cabac_by22_peek(c);
+
+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+ n = prefix + rice_param - 2;
+ }
+
+ get_cabac_by22_flush(c, n, y);
+
+ return last_coeff_abs_level_remaining;
+}
+#endif
+
+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
+{
+ int prefix = 0;
+ int suffix = 0;
+ int last_coeff_abs_level_remaining;
+ int i;
+
+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
+ prefix++;
+ if (prefix == CABAC_MAX_BIN) {
+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+ return 0;
+ }
+
+ if (prefix < 3) {
+ for (i = 0; i < rc_rice_param; i++)
+ suffix = (suffix << 1) | get_cabac_bypass(c);
+ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
+ } else {
+ int prefix_minus3 = prefix - 3;
+ for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
+ suffix = (suffix << 1) | get_cabac_bypass(c);
+ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
+ << rc_rice_param) + suffix;
+ }
+
+ return last_coeff_abs_level_remaining;
+}
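+
+// Worked examples of the TR + EGk binarization above (illustrative):
+// with rc_rice_param == 0, prefix "1110" (prefix == 3) decodes to
+// ((1 << 0) + 2) << 0 == 3 with an empty suffix, and prefix "11110"
+// (prefix == 4) takes one suffix bit s to give 4 + s; with
+// rc_rice_param == 1, prefix "10" (prefix == 1) takes one suffix bit s
+// to give (1 << 1) + s.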
+
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
+{
+ unsigned int i;
+ uint32_t ret = 0;
+
+ for (i = 0; i < nb; i++)
+ ret = (ret << 1) | get_cabac_bypass(c);
+
+ return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
+{
+ uint32_t y;
+ y = get_cabac_by22_peek(c);
+ get_cabac_by22_flush(c, nb, y);
+ return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+ uint8_t * const state0)
+{
+ unsigned int i;
+ unsigned int rv = 0;
+ for (i = 0; i != n; ++i) {
+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+ const unsigned int b = get_cabac(c, state0 + idx);
+ rv = (rv << 1) | b;
+ }
+ return rv;
+}
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
+// this version of events.
+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
+ int * const pprev_subset_coded, int * const psum,
+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+ CABACContext * const c = &lc->cc;
+ uint8_t * const state0 = lc->cabac_state + idx0_gt1;
+ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
+ unsigned int rv;
+ unsigned int i;
+ const unsigned int n = FFMIN(n_end, 8);
+
+ // Really this is i != n but the simple unconditional loop is cheaper
+ // and faster
+ for (i = 0; i != 8; ++i)
+ levels[i] = 1;
+
+ rv = get_cabac_greater1_bits(c, n, state0);
+
+ *pprev_subset_coded = 0;
+ *psum = n;
+
+ rv <<= (32 - n);
+ if (rv != 0)
+ {
+ *pprev_subset_coded = 1;
+ *psum = n + 1;
+ i = hevc_clz32(rv);
+ levels[i] = 2;
+ if (get_cabac(c, state_gt2) == 0)
+ {
+ // Unset first coded bit
+ rv &= ~(0x80000000U >> i);
+ }
+ }
+
+ if (n_end > 8) {
+ const unsigned int g8 = n_end - 8;
+ rv |= ((1 << g8) - 1) << (24 - g8);
+ for (i = 0; i != g8; ++i) {
+ levels[i + 8] = 0;
+ }
+ }
+
+ return rv;
+}
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+// or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow, but the coding would have to be very odd
+// (& inefficient) to achieve it
+
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+}
+#endif
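+
+// e.g. (illustrative numbers) level == 10, scale == 45, scale_m == 16,
+// shift == 5: ((10 * 720) >> 5) == 225, then the round-and-halve gives
+// (225 + 1) >> 1 == 113, finally saturated to the int16 range.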
+
+
+#ifndef update_rice
+static inline void update_rice(uint8_t * const stat_coeff,
+ const unsigned int last_coeff_abs_level_remaining,
+ const unsigned int c_rice_param)
+{
+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+ if (x >= 6)
+ (*stat_coeff)++;
+ else if (x == 0 && *stat_coeff > 0)
+ (*stat_coeff)--;
+}
+#endif
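+
+// e.g. with c_rice_param == 0, last_coeff_abs_level_remaining == 3 gives
+// x == 6 so the counter is bumped (raising future c_rice_param, which the
+// callers derive as *stat_coeff >> 2), while a remaining level of 0 decays it.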
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+                                                      unsigned int n,
+                                                      const uint8_t * const ctx_map,
+ uint8_t * p)
+{
+ do {
+ if (get_cabac(c, state0 + ctx_map[n]))
+ *p++ = n;
+ } while (--n != 0);
+ return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+ unsigned int n,
+ const uint8_t * ctx_map, // const ptr here but not in asm
+ uint8_t * const flag_idx)
+{
+ int rv;
+
+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+ return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x1, x2, x3,\
+ x4, x5, x6, x7,\
+ x8, x9, x10, x11,\
+ x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x8, x12,\
+ x1, x5, x9, x13,\
+ x2, x6, x10, x14,\
+ x3, x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+ x0, x4, x1, x8,\
+ x5, x2, x12, x9,\
+ x6, x3, x13, x10,\
+ x7, x14, x11, x15}
+
+
+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
+ uint8_t * const significant_coeff_group_flag,
+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+ int * const pPrev_sig)
+{
+ while (--i >= 0) {
+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
+ const unsigned int x_cg = scan_x_cg[i];
+
+ // For the flag decode we only care about Z/NZ but
+ // we use the full Right * 2 + Down when calculating
+ // significant coeff flags so we obtain it here.
+ //
+ // The group flag array is one longer than it needs to
+ // be so we don't need to check for y_cg limits
+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
+
+ if (i == 0 ||
+ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))
+ {
+ gf_y[0] |= (1 << x_cg);
+ *pPrev_sig = prev_sig;
+ break;
+ }
+ }
+
+ return i;
+}
+
+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ const unsigned int stride = frame_stride1(s->frame, c_idx);
+ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
+ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
+ const int is_sliced = 1; // av_rpi_is_sand_frame(frame);
+ uint8_t * const dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ const unsigned int i = jb->intra.n;
+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
+
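+    // If the previous queued intra command was the matching U-plane op on the
+    // same dst, fold this V-plane (c_idx == 2) residual into a combined C op.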
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->type = RPI_PRED_ADD_RESIDUAL_C;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride);
+
+ // Rewrite as add residual - must rewrite all fields as different union member
+ pc->type = RPI_PRED_ADD_RESIDUAL_V;
+ pc->ta.buf = coeffs;
+ pc->ta.dst = dst;
+ pc->ta.stride = stride;
+ pc->ta.dc = dc;
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ jb->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+ cmd->size = log2_trafo_size;
+ cmd->ta.buf = coeffs;
+ cmd->ta.dst = dst;
+ cmd->ta.stride = stride;
+ cmd->ta.dc = 0;
+ }
+}
+
+
+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const unsigned int log2_trafo_size, const unsigned int c_idx,
+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+{
+ const AVFrame * const frame = s->frame;
+ const unsigned int stride = frame_stride1(s->frame, c_idx);
+ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
+ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
+ const int is_sliced = 1;
+ uint8_t * const dst = !is_sliced ?
+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(frame, x, y) :
+ av_rpi_sand_frame_pos_c(frame, x, y);
+
+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
+
+ const unsigned int i = jb->intra.n;
+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
+
+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
+ pc->ta.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->ta.stride == stride);
+
+ pc->ta.dc = (int16_t)coeff;
+ }
+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
+ pc->dc.dst == dst)
+ {
+ av_assert1(pc->size == log2_trafo_size &&
+ pc->c_idx == 1 &&
+ pc->dc.stride == stride &&
+ (pc->dc.dc & ~0xffff) == 0);
+
+ pc->dc.dc |= (coeff << 16);
+ }
+ else
+ {
+ HEVCPredCmd * const cmd = pc + 1;
+ jb->intra.n = i + 1;
+
+ cmd->type = RPI_PRED_ADD_DC + c_idx;
+ cmd->size = log2_trafo_size;
+ cmd->dc.dst = dst;
+ cmd->dc.stride = stride;
+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
+ }
+}
+
+
+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const int x0, const int y0,
+ const int log2_trafo_size, const enum ScanType scan_idx,
+ const int c_idx)
+{
+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
+
+ int last_significant_coeff_x, last_significant_coeff_y;
+ int num_coeff = 0;
+ int prev_subset_coded = 0;
+
+ int num_last_subset;
+ int x_cg_last_sig, y_cg_last_sig;
+
+ const uint8_t *scan_x_cg, *scan_y_cg;
+ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
+
+ int use_vpu;
+#if RPI_COMPRESS_COEFFS
+ int num_nonzero = 0;
+ int use_compress = 0;
+ int *coeffs32;
+#endif
+ int use_dc = 0;
+ int16_t *coeffs;
+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
+ int explicit_rdpcm_flag = 0;
+ int explicit_rdpcm_dir_flag;
+
+ int i;
+ int shift,scale;
+ const uint8_t *scale_matrix = NULL;
+ uint8_t dc_scale;
+ const int c_idx_nz = (c_idx != 0);
+ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+ int prev_sig = 0;
+ int may_hide_sign;
+
+ int16_t dummy_coeffs[16];
+
+ // Derive QP for dequant
+ if (!lc->cu.cu_transquant_bypass_flag) {
+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
+ if (s->ps.pps->transform_skip_enabled_flag &&
+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
+ if (transform_skip_flag) {
+ trans_skip_or_bypass = 1;
+ if (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+ may_hide_sign = 0;
+ }
+ }
+ }
+
+ {
+ static const uint8_t level_scale[8] = {
+ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8
+ };
+ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
+
+ // Shift is set to one less than will actually occur as the scale
+ // and saturate step adds 1 and then shifts right again
+ scale = level_scale[qp6 & 7];
+// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
+ shift = log2_trafo_size - (qp6 >> 3);
+
+ if (shift < 0) {
+ scale <<= -shift;
+ shift = 0;
+ }
+ }
+
+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
+ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ const unsigned int matrix_id =
+ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
+
+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ dc_scale = scale_matrix[0];
+ if (log2_trafo_size >= 4)
+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+ }
+ else
+ {
+ static const uint8_t sixteen_scale[64] = {
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ };
+ scale_matrix = sixteen_scale;
+ dc_scale = 16;
+ }
+ } else {
+ static const uint8_t unit_scale[64] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ };
+ scale_matrix = unit_scale;
+ shift = 0;
+ scale = 2; // We will shift right to kill this
+ dc_scale = 1;
+
+ may_hide_sign = 0;
+ }
+
+
+
+
+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+ trans_skip_or_bypass) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
+ if (explicit_rdpcm_flag) {
+ may_hide_sign = 0;
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
+ }
+ }
+
+ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
+ &last_significant_coeff_x, &last_significant_coeff_y);
+
+ if (last_significant_coeff_x > 3) {
+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+ (2 + (last_significant_coeff_x & 1)) +
+ suffix;
+ }
+
+ if (last_significant_coeff_y > 3) {
+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+ (2 + (last_significant_coeff_y & 1)) +
+ suffix;
+ }
+
+ if (scan_idx == SCAN_VERT)
+ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+ x_cg_last_sig = last_significant_coeff_x >> 2;
+ y_cg_last_sig = last_significant_coeff_y >> 2;
+
+ switch (scan_idx) {
+ case SCAN_DIAG: {
+ int last_x_c = last_significant_coeff_x & 3;
+ int last_y_c = last_significant_coeff_y & 3;
+
+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+
+ switch (log2_trafo_size) {
+ case 2:
+ scan_x_cg = scan_1x1;
+ scan_y_cg = scan_1x1;
+ break;
+ case 3:
+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = diag_scan2x2_x;
+ scan_y_cg = diag_scan2x2_y;
+ break;
+ case 4:
+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
+ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
+ break;
+ case 5:
+ default:
+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
+ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
+ break;
+ }
+ break;
+ }
+ case SCAN_HORIZ:
+ scan_x_cg = horiz_scan2x2_x;
+ scan_y_cg = horiz_scan2x2_y;
+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+ break;
+ default: //SCAN_VERT
+ scan_x_cg = horiz_scan2x2_y;
+ scan_y_cg = horiz_scan2x2_x;
+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+ break;
+ }
+ num_coeff++;
+ num_last_subset = (num_coeff - 1) >> 4;
+
+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
+
+ {
+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing
+ use_vpu = 0;
+ use_dc = (num_coeff == 1) && !special &&
+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
+
+ if (use_dc) {
+ // Just need a little empty space
+ coeffs = dummy_coeffs;
+ // No need to clear
+ }
+ else
+ {
+ use_vpu = !special && log2_trafo_size >= 4;
+#if RPI_COMPRESS_COEFFS
+ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
+#endif
+ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+#if RPI_COMPRESS_COEFFS
+ coeffs32 = (int*)coeffs;
+ if (!use_compress)
+#endif
+#if HAVE_NEON
+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+#else
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+#endif
+ }
+ }
+
+ i = num_last_subset;
+ do {
+ int implicit_non_zero_coeff = 0;
+ int n_end;
+
+ uint8_t significant_coeff_flag_idx[16];
+ unsigned int nb_significant_coeff_flag = 0;
+
+ if (i == num_last_subset) {
+ // First time through
+ int last_scan_pos = num_coeff - (i << 4) - 1;
+ n_end = last_scan_pos - 1;
+ significant_coeff_flag_idx[0] = last_scan_pos;
+ nb_significant_coeff_flag = 1;
+ } else {
+ n_end = 15;
+ implicit_non_zero_coeff = (i != 0);
+ }
+
+ if (n_end >= 0) {
+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+ };
+ // N.B. prev_sig = Right * 2 + Down
+ static const uint8_t ctx_idx_maps[3][4][16] = {
+ {
+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ },
+ {
+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
+ }
+ };
+ const uint8_t *ctx_idx_map_p;
+ int scf_offset = 0;
+
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ ctx_idx_map_p = ctx_idx_maps[0][3];
+ scf_offset = 40 + c_idx_nz;
+ } else {
+ if (c_idx_nz != 0)
+ scf_offset = 27;
+
+ if (log2_trafo_size == 2) {
+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
+ } else {
+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+ if (!c_idx_nz) {
+ if (i != 0)
+ scf_offset += 3;
+
+ if (log2_trafo_size == 3) {
+ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+ } else {
+ scf_offset += 21;
+ }
+ } else {
+ if (log2_trafo_size == 3)
+ scf_offset += 9;
+ else
+ scf_offset += 12;
+ }
+ }
+ }
+
+ if (n_end > 0) {
+ int cnt = get_sig_coeff_flag_idxs(&lc->cc,
+ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+ n_end, ctx_idx_map_p,
+ significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+ nb_significant_coeff_flag += cnt;
+ if (cnt != 0) {
+ implicit_non_zero_coeff = 0;
+ }
+ }
+
+ if (implicit_non_zero_coeff == 0) {
+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+ scf_offset = 42 + c_idx_nz;
+ } else {
+ if (i == 0) {
+ scf_offset = c_idx_nz ? 27 : 0;
+ } else {
+ scf_offset = 2 + scf_offset;
+ }
+ }
+ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+ nb_significant_coeff_flag++;
+ }
+ } else {
+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+ nb_significant_coeff_flag++;
+ }
+ }
+#if RPI_COMPRESS_COEFFS
+ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
+ int16_t temp[32*32];
+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
+ memcpy(temp, coeffs, sizeof(int)*num_nonzero);
+ coeffs32 = (int *)temp;
+ memset(coeffs, 0, ccount * sizeof(int16_t));
+ num_nonzero--;
+ while (num_nonzero >= 0) {
+ const unsigned int res = coeffs32[num_nonzero];
+ const unsigned int offset = res & 0xffff;
+ coeffs[ offset ] = res >> 16;
+ num_nonzero--;
+ }
+ use_compress = 0;
+ }
+#endif
+
+ if (nb_significant_coeff_flag != 0) {
+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
+ prev_subset_coded;
+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+ (gt1_idx_delta << 2);
+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+ gt1_idx_delta;
+
+ const unsigned int x_cg = scan_x_cg[i];
+ const unsigned int y_cg = scan_y_cg[i];
+ int16_t * const blk_coeffs = coeffs +
+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+            // This calculation is 'wrong' for log2_trafo_size == 2
+ // but that doesn't matter as in this case x_cg & y_cg
+ // are always 0 so result is correct (0) anyway
+ const uint8_t * const blk_scale = scale_matrix +
+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+ // * The following code block doesn't deal with these flags:
+ // (nor did the one it replaces)
+ //
+ // cabac_bypass_alignment_enabled_flag
+ // This should be easy but I can't find a test case
+ // extended_precision_processing_flag
+            //     This can extend the required precision past 16 bits
+ // so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+ if (nb_significant_coeff_flag == 1) {
+ // There is a small gain to be had from special casing the single
+ // transform coefficient case. The reduction in complexity
+                    // makes up for the code duplication.
+
+ int trans_coeff_level = 1;
+ int coeff_sign_flag;
+ int coded_val = 0;
+
+                    // initialize first elem of coeff_abs_level_greater1_flag
+ prev_subset_coded = 0;
+
+ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
+ trans_coeff_level = 2;
+ prev_subset_coded = 1;
+ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
+ }
+
+ // Probably not worth the overhead of starting by22 for just one value
+ coeff_sign_flag = get_cabac_bypass(&lc->cc);
+
+ if (coded_val)
+ {
+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
+ } else {
+ uint8_t * const stat_coeff =
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ const unsigned int c_rice_param = *stat_coeff >> 2;
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
+
+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ }
+ }
+
+ {
+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
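+                    // k is 0 or -1 (all ones) depending on the sign flag;
+                    // (x ^ k) - k below negates trans_coeff_level when set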
+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+ const unsigned int scale_m = blk_scale[xy_off->scale];
+ const int res = trans_scale_sat(
+ (trans_coeff_level ^ k) - k, // Apply sign
+ scale,
+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+ shift);
+#if RPI_COMPRESS_COEFFS
+ if (use_compress)
+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
+ else
+#endif
+ blk_coeffs[xy_off->coeff] = res;
+ }
+ }
+ else
+#endif
+ {
+ int sign_hidden = may_hide_sign;
+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
+ uint32_t coeff_sign_flags;
+ uint32_t coded_vals = 0;
+ // Sum(abs(level[]))
+ // In fact we only need the bottom bit and in some future
+ // version that may be all we calculate
+ unsigned int sum_abs;
+
+ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+
+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+ sign_hidden = 0;
+
+ // -- Start bypass block
+
+ bypass_start(&lc->cc);
+
+ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
+
+ if (coded_vals != 0)
+ {
+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+ int * level = levels - 1;
+
+ do {
+ {
+ const unsigned int z = hevc_clz32(coded_vals) + 1;
+ level += z;
+ coded_vals <<= z;
+ }
+
+ {
+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+
+ sum_abs += last_coeff_abs_level_remaining + 1;
+ *level = trans_coeff_level;
+
+ if (stat_coeff != NULL)
+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ stat_coeff = NULL;
+
+ if (trans_coeff_level > (3 << c_rice_param) &&
+ (c_rice_param < 4 || rice_adaptation_enabled))
+ ++c_rice_param;
+ }
+ } while (coded_vals != 0);
+ }
+
+ // sign_hidden = 0 or 1 so we can combine the tests
+ if ((sign_hidden & sum_abs) != 0) {
+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
+ }
+
+ bypass_finish(&lc->cc);
+
+ // -- Finish bypass block
+
+ // Scale loop
+ {
+ int m = nb_significant_coeff_flag - 1;
+
+ // Deal with DC component (if any) first
+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
+ {
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ const int res = trans_scale_sat(
+ (levels[m] ^ k) - k, scale, dc_scale, shift);
+#if RPI_COMPRESS_COEFFS
+ if (use_compress)
+ {
+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
+ }
+ else
+#endif
+ {
+ blk_coeffs[0] = res;
+ }
+ --m;
+ }
+
+#if !USE_N_END_1
+ // If N_END_1 set then m was at least 1 initially
+ if (m >= 0)
+#endif
+ {
+ do {
+ const xy_off_t * const xy_off = scan_xy_off +
+ significant_coeff_flag_idx[m];
+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+ const int res = trans_scale_sat(
+ (levels[m] ^ k) - k,
+ scale,
+ blk_scale[xy_off->scale],
+ shift);
+#if RPI_COMPRESS_COEFFS
+ if (use_compress) {
+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
+ } else
+#endif
+ blk_coeffs[xy_off->coeff] = res;
+ } while (--m >= 0);
+ }
+ }
+
+ }
+ }
+ } while ((i = next_subset(lc, i, c_idx_nz,
+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
+ !cabac_overflow(&lc->cc));
+
+ if (lc->cu.cu_transquant_bypass_flag) {
+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
+
+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+ }
+ } else {
+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+ log2_trafo_size == 2 &&
+ lc->cu.pred_mode == MODE_INTRA;
+ if (rot) {
+ for (i = 0; i < 8; i++)
+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+ }
+
+ s->hevcdsp.dequant(coeffs, log2_trafo_size);
+
+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+ lc->cu.pred_mode == MODE_INTRA &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
+
+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+ }
+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+ s->hevcdsp.transform_4x4_luma(coeffs);
+ }
+ else if (!use_vpu)
+ {
+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+ if (max_xy == 0)
+ {
+ if (use_dc)
+ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
+ else
+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
+ }
+ else {
+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+ if (max_xy < 4)
+ col_limit = FFMIN(4, col_limit);
+ else if (max_xy < 8)
+ col_limit = FFMIN(8, col_limit);
+ else if (max_xy < 12)
+ col_limit = FFMIN(24, col_limit);
+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
+ }
+ }
+ }
+
+#if 0
+ // Mildly rotted - we support no mode where cross is valid
+ if (lc->tu.cross_pf) {
+ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
+ const int ccount = 1 << (log2_trafo_size * 2);
+
+ for (i = 0; i < ccount; i++) {
+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+ }
+ }
+#endif
+
+ if (!use_dc) {
+#if RPI_COMPRESS_COEFFS
+ if (use_compress) {
+ coeffs32[num_nonzero] = 0;
+ }
+#endif
+ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
+ }
+}
+
+#if !USE_BY22
+// Stores results to lc
+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
+{
+ int x = abs_mvd_greater0_flag_decode(lc);
+ int y = abs_mvd_greater0_flag_decode(lc);
+
+ if (x)
+ x += abs_mvd_greater1_flag_decode(lc);
+ if (y)
+ y += abs_mvd_greater1_flag_decode(lc);
+
+ switch (x) {
+ case 2: x = mvd_decode(lc); break;
+ case 1: x = mvd_sign_flag_decode(lc); break;
+ case 0: x = 0; break;
+ }
+
+ switch (y) {
+ case 2: y = mvd_decode(lc); break;
+ case 1: y = mvd_sign_flag_decode(lc); break;
+ case 0: y = 0; break;
+ }
+ return MV_XY(x,y);
+}
+#else
+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
+{
+ int x = abs_mvd_greater0_flag_decode(lc);
+ int y = abs_mvd_greater0_flag_decode(lc);
+
+ if ((x | y) == 0)
+ return 0;
+
+ if (x != 0)
+ x += abs_mvd_greater1_flag_decode(lc);
+ if (y != 0)
+ y += abs_mvd_greater1_flag_decode(lc);
+
+ if ((x | y) == 1)
+ {
+ // Not worth starting BY22
+ if (x != 0)
+ x = mvd_sign_flag_decode(lc);
+ if (y != 0)
+ y = mvd_sign_flag_decode(lc);
+ }
+ else
+ {
+ CABACContext * const cc = &lc->cc;
+ uint32_t val;
+ uint32_t b;
+ unsigned int n = 0;
+
+ bypass_start(cc);
+ b = val = get_cabac_by22_peek(cc);
+
+ if (x == 1) {
+ x = ((int32_t)b >> 31) | 1;
+ n = 1;
+ b <<= 1;
+ }
+ else if (x == 2) {
+ // EG1 so we have (leading one bits + 1) of suffix
+ // This makes prefix & suffix lengths the same
+ const unsigned int k = hevc_clz32(~b) + 1;
+ int s;
+
+ av_assert2(k <= 15);
+
+ b <<= k;
+ n = 2 * k + 1; // Includes suffix & sign
+
+ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
+ // if we are going to do this without a flush
+ if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
+ {
+ // Need too many bits - flush
+ // n = k
+ get_cabac_by22_flush(cc, k, val);
+ b = val = get_cabac_by22_peek(cc);
+ n = k + 1;
+ }
+
+ x = (b >> (32 - k)) + (1 << k);
+ b <<= k;
+ s = (int32_t)b >> 31;
+ x = (x ^ s) - s;
+ b <<= 1;
+
+ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
+ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
+ {
+ get_cabac_by22_flush(cc, n, val);
+ b = val = get_cabac_by22_peek(cc);
+ n = 0;
+ }
+ }
+
+ if (y == 1) {
+ y = ((int32_t)b >> 31) | 1;
+ ++n;
+ // don't care about b anymore
+ }
+ else if (y == 2) {
+ const unsigned int k = hevc_clz32(~b) + 1;
+ int s;
+
+ av_assert2(k <= 15);
+
+ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
+ // if we are going to do this without a flush
+ b <<= k;
+ n += 2 * k + 1;
+
+ if (n > CABAC_BY22_PEEK_BITS)
+ {
+ // Need too many bits - flush
+ get_cabac_by22_flush(cc, n - (k + 1), val);
+ b = val = get_cabac_by22_peek(cc);
+ n = k + 1;
+ }
+
+ y = (b >> (32 - k)) + (1 << k);
+ s = (int32_t)(b << k) >> 31;
+ y = (y ^ s) - s;
+ // don't care about b anymore
+ }
+
+ get_cabac_by22_flush(cc, n, val);
+ bypass_finish(cc);
+ }
+
+ return MV_XY(x, y);
+}
+#endif
diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h
new file mode 100644
index 0000000000..ca191f00d9
--- /dev/null
+++ b/libavcodec/rpi_hevc_cabac_fns.h
@@ -0,0 +1,217 @@
+/*
+ * HEVC CABAC decoding
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2018 John Cox
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
+#define AVCODEC_RPI_HEVC_CABAC_FNS_H
+
+#include "config.h"
+#include "rpi_hevcdec.h"
+
+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
+
+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const int x0, const int y0,
+ const int log2_trafo_size, const enum ScanType scan_idx,
+ const int c_idx);
+
+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
+
+#define HEVC_BIN_SAO_MERGE_FLAG 0
+#define HEVC_BIN_SAO_TYPE_IDX 1
+#define HEVC_BIN_SAO_EO_CLASS 2
+#define HEVC_BIN_SAO_BAND_POSITION 2
+#define HEVC_BIN_SAO_OFFSET_ABS 2
+#define HEVC_BIN_SAO_OFFSET_SIGN 2
+#define HEVC_BIN_END_OF_SLICE_FLAG 2
+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2
+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5
+#define HEVC_BIN_SKIP_FLAG 6
+#define HEVC_BIN_CU_QP_DELTA 9
+#define HEVC_BIN_PRED_MODE 12
+#define HEVC_BIN_PART_MODE 13
+#define HEVC_BIN_PCM_FLAG 17
+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17
+#define HEVC_BIN_MPM_IDX 18
+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18
+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18
+#define HEVC_BIN_MERGE_FLAG 20
+#define HEVC_BIN_MERGE_IDX 21
+#define HEVC_BIN_INTER_PRED_IDC 22
+#define HEVC_BIN_REF_IDX_L0 27
+#define HEVC_BIN_REF_IDX_L1 29
+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31
+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33
+#define HEVC_BIN_ABS_MVD_MINUS2 35
+#define HEVC_BIN_MVD_SIGN_FLAG 35
+#define HEVC_BIN_MVP_LX_FLAG 35
+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36
+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37
+#define HEVC_BIN_CBF_LUMA 40
+#define HEVC_BIN_CBF_CB_CR 42
+#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46
+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48
+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50
+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52
+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70
+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88
+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88
+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88
+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92
+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136
+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160
+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166
+#define HEVC_BIN_COEFF_SIGN_FLAG 166
+#define HEVC_BIN_LOG2_RES_SCALE_ABS 166
+#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174
+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176
+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177
+
+
+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
+
+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {
+ const uint8_t *ptr = c->bytestream;
+
+ if (c->low & 0x1)
+ ptr--;
+#if CABAC_BITS == 16
+ if (c->low & 0x1FF)
+ ptr--;
+#endif
+ if ((int) (c->bytestream_end - ptr) < n)
+ return NULL;
+ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
+ return NULL;
+
+ return ptr;
+}
+
+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG);
+}
+
+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG);
+}
+
+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG);
+}
+
+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int ct_depth,
+ const unsigned int x0, const unsigned int y0)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG +
+ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) +
+ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth));
+}
+
+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const int x0, const int y0, const int x_cb, const int y_cb)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
+ (s->cabac_stash_left[y0 >> 3] & 1) +
+ (s->cabac_stash_up[x0 >> 3] & 1));
+}
+
+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
+}
+
+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
+}
+
+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
+}
+
+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
+}
+
+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
+}
+
+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
+}
+
+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
+}
+
+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
+}
+
+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
+}
+
+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
+{
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
+}
+
+
+
+#endif
+
diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c
new file mode 100644
index 0000000000..341bb77d9d
--- /dev/null
+++ b/libavcodec/rpi_hevc_data.c
@@ -0,0 +1,75 @@
+/*
+ * HEVC shared tables
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "rpi_hevc_data.h"
+
+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
+ 0, 0, 1, 0,
+ 1, 2, 0, 1,
+ 2, 3, 1, 2,
+ 3, 2, 3, 3,
+};
+
+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
+ 0, 1, 0, 2,
+ 1, 0, 3, 2,
+ 1, 0, 3, 2,
+ 1, 3, 2, 3,
+};
+
+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
+ 0, 0, 1, 0,
+ 1, 2, 0, 1,
+ 2, 3, 0, 1,
+ 2, 3, 4, 0,
+ 1, 2, 3, 4,
+ 5, 0, 1, 2,
+ 3, 4, 5, 6,
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 1, 2, 3, 4,
+ 5, 6, 7, 2,
+ 3, 4, 5, 6,
+ 7, 3, 4, 5,
+ 6, 7, 4, 5,
+ 6, 7, 5, 6,
+ 7, 6, 7, 7,
+};
+
+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
+ 0, 1, 0, 2,
+ 1, 0, 3, 2,
+ 1, 0, 4, 3,
+ 2, 1, 0, 5,
+ 4, 3, 2, 1,
+ 0, 6, 5, 4,
+ 3, 2, 1, 0,
+ 7, 6, 5, 4,
+ 3, 2, 1, 0,
+ 7, 6, 5, 4,
+ 3, 2, 1, 7,
+ 6, 5, 4, 3,
+ 2, 7, 6, 5,
+ 4, 3, 7, 6,
+ 5, 4, 7, 6,
+ 5, 7, 6, 7,
+};
diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h
new file mode 100644
index 0000000000..0aee673d8b
--- /dev/null
+++ b/libavcodec/rpi_hevc_data.h
@@ -0,0 +1,31 @@
+/*
+ * HEVC shared data tables
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVC_DATA_H
+#define AVCODEC_RPI_HEVC_DATA_H
+
+#include <stdint.h>
+
+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
+
+#endif /* AVCODEC_RPI_HEVC_DATA_H */
diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
new file mode 100644
index 0000000000..5125d1eb6b
--- /dev/null
+++ b/libavcodec/rpi_hevc_filter.c
@@ -0,0 +1,1210 @@
+/*
+ * HEVC video decoder
+ *
+ * Originally by:
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 Seppo Tomperi
+ * Copyright (C) 2013 Wassim Hamidouche
+ *
+ * Substantially rewritten:
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//#define DISABLE_SAO
+//#define DISABLE_DEBLOCK
+//#define DISABLE_STRENGTHS
+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+//#define DISABLE_DEBLOCK_NONREF
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+
+#include "rpi_hevcdec.h"
+
+#include "bit_depth_template.c"
+
+#include "rpi_qpu.h"
+#include "rpi_zc.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#define LUMA 0
+#define CB 1
+#define CR 2
+
+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
+// so -12,75 overall
+static const uint8_t tctablex[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37
+ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75
+};
+#define tctable (tctablex + 12 + 6*8)
+
+static const uint8_t betatablex[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18
+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73
+};
+#define betatable (betatablex + 12 + 6*8)
+
+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
+ const int c_idx, const int tc_offset)
+{
+ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2];
+}
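
Both lookup tables above use a biased-pointer idiom: the macro points 60 entries (12 + 6*8) into the backing array, so offset-adjusted QP indices as low as -60 still land inside the array and the zero padding rows absorb anything below the real range. A minimal sketch of the same idiom with illustrative values, not the HEVC tc/beta data:

/* Biased-pointer table lookup: indices -PAD..N-1 are all valid. */
#include <assert.h>

#define PAD 4
static const int backing[PAD + 8] = {
    0, 0, 0, 0,                      /* padding absorbs negative indices */
    10, 11, 12, 13, 14, 15, 16, 17,  /* the "real" table */
};
#define table (backing + PAD)        /* index 0 is the first real entry */

static int lookup(int idx)           /* idx may be as low as -PAD */
{
    assert(idx >= -PAD && idx < 8);
    return table[idx];
}
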
+
+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int xBase, const unsigned int yBase)
+{
+ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1;
+ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
+ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask;
+ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask;
+ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
+ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size;
+ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size;
+ const int qPy_pred = lc->qPy_pred;
+
+ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
+ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
+ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
+ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
+}
+
+// * Only called from bitstream decode in foreground
+// so should be safe
+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
+{
+ const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
+
+ if (lc->tu.cu_qp_delta != 0) {
+ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
+ int off = s->ps.sps->qp_bd_offset;
+ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
+ 52 + off) - off;
+ } else
+ lc->qp_y = qp_y;
+}
+
+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
+{
+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
+}
+
+// "DSP" these?
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
+ switch (pixel_shift)
+ {
+ case 2:
+ *(uint32_t *)dst = *(uint32_t *)src;
+ break;
+ case 1:
+ *(uint16_t *)dst = *(uint16_t *)src;
+ break;
+ default:
+ *dst = *src;
+ break;
+ }
+}
+
+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
+ ptrdiff_t stride_src, int x, int y, int width, int height,
+ int c_idx, int x_ctb, int y_ctb)
+{
+ const unsigned int sh = pixel_shift(s, c_idx);
+ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
+ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
+
+ /* copy horizontal edges */
+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+ src, width << sh);
+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+ src + stride_src * (height - 1), width << sh);
+
+ /* copy vertical edges */
+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+
+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+
+// N.B. Src & dst are swapped as this is a restore!
+// x0 & y0 are in luma coords
+// Width & height are in Y/C pels as appropriate
+// * Clear scope for optimisation here but not used enough to be worth it
+static void restore_tqb_pixels(const HEVCRpiContext * const s,
+ uint8_t *src1, const uint8_t *dst1,
+ const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int width, const int height,
+ const int c_idx)
+{
+ if (s->ps.pps->transquant_bypass_enable_flag ||
+ s->ps.sps->pcm.loop_filter_disable_flag)
+ {
+ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;
+ int blks_y = height >> (c_idx == 0 ? 3 : 2);
+ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand
+ const unsigned int bheight = (c_idx == 0) ? 8 : 4;
+ const unsigned int sh = ((x0 >> 3) & 7);
+ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;
+
+ do {
+ unsigned int m = (*pcm >> sh) & mask;
+ uint8_t * bd = src1;
+ const uint8_t * bs = dst1;
+ while (m != 0) {
+ if ((m & 1) != 0) {
+ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
+ }
+ m >>= 1;
+ bs += bwidth;
+ bd += bwidth;
+ }
+ src1 += stride_src * bheight;
+ dst1 += stride_dst * bheight;
+ pcm += s->ps.sps->pcm_width;
+ } while (--blks_y > 0);
+ }
+}
+
+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
+
+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
+{
+#if SAO_FILTER_N == 5
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#elif SAO_FILTER_N == 6
+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+#else
+#error Confused by size of sao fn array
+#endif
+ int c_idx;
+ int edges[4]; // 0 left 1 top 2 right 3 bottom
+ int x_ctb = x >> s->ps.sps->log2_ctb_size;
+ int y_ctb = y >> s->ps.sps->log2_ctb_size;
+ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb);
+ // flags indicating unfilterable edges
+ uint8_t vert_edge[] = { 0, 0 };
+ uint8_t horiz_edge[] = { 0, 0 };
+ uint8_t diag_edge[] = { 0, 0, 0, 0 };
+ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb);
+ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
+ !s->ps.pps->loop_filter_across_tiles_enabled_flag;
+ uint8_t restore = no_tile_filter || !lfase;
+ uint8_t left_tile_edge = 0;
+ uint8_t right_tile_edge = 0;
+ uint8_t up_tile_edge = 0;
+ uint8_t bottom_tile_edge = 0;
+ const int sliced = 1;
+ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
+
+ edges[0] = x_ctb == 0;
+ edges[1] = y_ctb == 0;
+ edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
+ edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
+
+#ifdef DISABLE_SAO
+ return;
+#endif
+
+ if (restore) {
+ if (!edges[0]) {
+ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
+ }
+ if (!edges[2]) {
+ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
+ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
+ }
+ if (!edges[1]) {
+ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
+ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
+ }
+ if (!edges[3]) {
+ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
+ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
+ }
+ if (!edges[0] && !edges[1]) {
+ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+ }
+ if (!edges[1] && !edges[2]) {
+ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
+ }
+ if (!edges[2] && !edges[3]) {
+ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
+ }
+ if (!edges[0] && !edges[3]) {
+ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
+ }
+ }
+
+ for (c_idx = 0; c_idx < plane_count; c_idx++) {
+ const unsigned int vshift = ctx_vshift(s, c_idx);
+ const unsigned int hshift = ctx_hshift(s, c_idx);
+ const int x0 = x >> hshift;
+ const int y0 = y >> vshift;
+ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
+ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
+ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
+ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0);
+ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
+ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+ ptrdiff_t stride_dst;
+ uint8_t *dst;
+
+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+ uint8_t * const src = !sliced ?
+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
+ !sliced ? src - (1 << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
+ !sliced ? src + (width << sh) :
+ c_idx == 0 ?
+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
+
+ if (sliced && c_idx > 1) {
+ break;
+ }
+
+// if (c_idx == 1)
+// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
+
+ switch (sao->type_idx[c_idx]) {
+ case SAO_BAND:
+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+ x_ctb, y_ctb);
+ if (s->ps.pps->transquant_bypass_enable_flag ||
+ s->ps.sps->pcm.loop_filter_disable_flag)
+ {
+ // Can't use the edge buffer here as it may be in use by the foreground
+ DECLARE_ALIGNED(64, uint8_t, dstbuf)
+ [2*MAX_PB_SIZE*MAX_PB_SIZE];
+ dst = dstbuf;
+ stride_dst = 2*MAX_PB_SIZE;
+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
+ if (sliced && c_idx != 0)
+ {
+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+ {
+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+ x, y, width, height, c_idx);
+ } else {
+ if (sliced && c_idx != 0)
+ {
+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
+ sao->offset_val[1], sao->band_position[1],
+ sao->offset_val[2], sao->band_position[2],
+ width, height);
+ }
+ else
+ {
+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+ sao->offset_val[c_idx], sao->band_position[c_idx],
+ width, height);
+ }
+ }
+ sao->type_idx[c_idx] = SAO_APPLIED;
+ break;
+ case SAO_EDGE:
+ {
+ const int w = s->ps.sps->width >> hshift;
+ const int h = s->ps.sps->height >> vshift;
+ int top_edge = edges[1];
+ int bottom_edge = edges[3];
+ // Can't use the edge buffer here as it may be in use by the foreground
+ DECLARE_ALIGNED(64, uint8_t, dstbuf)
+ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
+
+ stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
+ dst = dstbuf + stride_dst + 32;
+
+ if (!top_edge) {
+ uint8_t *dst1;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
+
+ dst1 = dst - stride_dst;
+
+ if (src_l != NULL) {
+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
+ }
+
+ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
+
+ if (src_r != NULL) {
+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
+ }
+ }
+ if (!bottom_edge) {
+ uint8_t * const dst1 = dst + height * stride_dst;
+ int src_idx;
+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+ const unsigned int hoff = height * stride_src;
+
+ if (src_l != NULL) {
+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
+ }
+
+ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+
+ if (src_r != NULL) {
+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+ SAO_APPLIED);
+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
+ }
+ }
+ if (src_l != NULL) {
+ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+ ff_hevc_rpi_copy_vert(dst - (1 << sh),
+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+ sh, height, stride_dst, 1 << sh);
+ } else {
+ ff_hevc_rpi_copy_vert(dst - (1 << sh),
+ src_l,
+ sh, height, stride_dst, stride_src);
+ }
+ }
+ if (src_r != NULL) {
+ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+ ff_hevc_rpi_copy_vert(dst + (width << sh),
+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+ sh, height, stride_dst, 1 << sh);
+ } else {
+ ff_hevc_rpi_copy_vert(dst + (width << sh),
+ src_r,
+ sh, height, stride_dst, stride_src);
+ }
+ }
+
+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
+
+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+ x_ctb, y_ctb);
+ if (sliced && c_idx != 0)
+ {
+ // Class always the same for both U & V (which is just as well :-))
+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
+ width, height);
+ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ else
+ {
+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+ sao->eo_class[c_idx], width, height);
+ s->hevcdsp.sao_edge_restore[restore](src, dst,
+ stride_src, stride_dst,
+ sao,
+ edges, width,
+ height, c_idx,
+ vert_edge,
+ horiz_edge,
+ diag_edge);
+ }
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+ x, y, width, height, c_idx);
+ sao->type_idx[c_idx] = SAO_APPLIED;
+ break;
+ }
+ }
+ }
+
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
+ {
+ const unsigned int stride1 = frame_stride1(s->frame, 1);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
+ const unsigned int xoff = (x >> 8) * stride2 * stride1;
+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
+ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
+
+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
+ }
+#endif
+}
+
+// When bits are delivered to deblock we want them
+//#define TL 1
+//#define TR 2
+//#define BL 4
+//#define BR 8
+
+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
+// so we need to rearrange before passing on
+
+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
+ return (pcm[0] |
+ (pcm[1] << 8) |
+ (pcm[s->ps.sps->pcm_width] << 16) |
+ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
+}
+
+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
+}
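
pcm2() and pcm4() read the per-8x8 "no loop filter" bitmap: is_pcm holds one bit per 8-pel column (so one byte spans 64 pels, hence the x >> 6 byte index and (x >> 3) & 7 bit offset) and one byte row per 8 pel rows. pcm4() fetches two rows at once, leaving the lower row's flags in bits 16 and up after the shift. A model of the packing, assuming pcm_width is the byte stride of the bitmap:

/* After the shift: bit 0 = flag at (x, y), bit 1 = (x + 8, y),
 * bit 16 = (x, y + 8), bit 17 = (x + 8, y + 8). */
#include <stdint.h>

static uint32_t pcm4_model(const uint8_t *is_pcm, unsigned int stride,
                           unsigned int x, unsigned int y)
{
    const uint8_t *pcm = is_pcm + (x >> 6) + (y >> 3) * stride;
    uint32_t v = pcm[0] |                  /* row containing y     */
                 (pcm[1] << 8) |
                 (pcm[stride] << 16) |     /* row containing y + 8 */
                 (pcm[stride + 1] << 24);
    return v >> ((x >> 3) & 7);            /* align bit 0 on x     */
}
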
+
+// We cast away const here as we want this to work for both get and set
+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
+ return (uint32_t *)(bs +
+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
+#warning Unexpected masks
+ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
+ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
+#error Stride1 < return size
+#endif
+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
+}
+
+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
+ return (uint8_t *)(bs +
+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
+ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
+}
+
+
+// Get block strength
+// Given how we call this we will always stay within 32-bit boundaries
+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
+ unsigned int xl, unsigned int xr, const unsigned int y)
+{
+ if (xr <= xl) {
+ return 0;
+ }
+ else
+ {
+#if HAVE_ARMV6T2_INLINE
+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
+#error This case not yet handled in bs_get32
+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
+#error Stride1 < return size
+#endif
+ uint32_t tmp;
+ __asm__ (
+ "lsr %[tmp], %[xl], %[xl_shift] \n\t"
+ "rsb %[xr], %[xl], %[xr] \n\t"
+ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t"
+ "add %[xr], %[xr], #7 \n\t"
+ "lsr %[bs], %[y], %[y_shift1] \n\t"
+ "bic %[xr], %[xr], #7 \n\t"
+ "ubfx %[xl], %[xl], #1, #5 \n\t"
+ "lsr %[xr], %[xr], #1 \n\t"
+ "cmp %[xr], #32 \n\t"
+ "mvn %[tmp], #0 \n\t"
+ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
+ "lsl %[tmp], %[tmp], %[xr] \n\t"
+ "lsr %[xl], %[bs], %[xl] \n\t"
+ "it ne \n\t"
+ "bicne %[bs], %[xl], %[tmp] \n\t"
+ : // Outputs
+ [bs]"+r"(bs),
+ [stride2]"+r"(stride2),
+ [xl]"+r"(xl),
+ [xr]"+r"(xr),
+ [tmp]"=&r"(tmp)
+ : // Inputs
+ [y]"r"(y),
+ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
+ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
+ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
+ : // Clobbers
+ "cc"
+ );
+ return (uint32_t) bs;
+#else
+ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
+ const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
+
+ return n == 32 ? a :
+ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
+#endif
+ }
+}
+
+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
+{
+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
+ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
+}
+
+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
+{
+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
+ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
+}
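
The portable branch of bs_get32() is a bit-span extraction: take n bits of the 32-bit strength word starting at a given offset, with an explicit n == 32 case because shifting a 32-bit value by 32 is undefined behaviour in C (when n is 32 the offset is known to be 0). The same guard in isolation:

/* Extract n bits of v starting at bit lo; lo must be 0 when n == 32. */
#include <stdint.h>

static uint32_t bits(uint32_t v, unsigned int lo, unsigned int n)
{
    return n == 32 ? v : (v >> lo) & ~(~0U << n);
}
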
+
+
+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
+{
+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
+ const unsigned int ctb_size = (1 << log2_ctb_size);
+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1);
+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
+ const DBParams * cb_dbp = s->deblock + ctb_n;
+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
+
+ unsigned int cb_x;
+
+ // Do in CTB-shaped blocks
+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
+ {
+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
+ const unsigned int bv_l = FFMAX(cb_x, 8);
+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
+ const unsigned int bh_l = bv_l - 8;
+ unsigned int y;
+
+ // Main body
+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
+ {
+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
+
+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
+ if (vbs != 0)
+ {
+ const uint8_t * const tcv = tctable + dbp->tc_offset;
+ const uint8_t * const betav = betatable + dbp->beta_offset;
+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
+ unsigned int x;
+
+ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
+ {
+ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
+ {
+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ frame_stride1(s->frame, LUMA),
+ betav[qp],
+ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
+ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
+ pcmfa & 3,
+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+ }
+ }
+ }
+
+ if (y != 0)
+ {
+ uint32_t hbs;
+
+ // H left - mostly separated out so we only need a uint32_t hbs
+ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
+ {
+ const unsigned int x = bh_l;
+ const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const DBParams * const dbph = dbp - 1;
+ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
+
+ av_assert2(cb_x - bh_l == 8);
+
+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ frame_stride1(s->frame, LUMA),
+ betatable[qp + dbph->beta_offset],
+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
+ }
+
+ // H
+ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
+ {
+ unsigned int x;
+ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
+
+ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
+ {
+ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
+ {
+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ frame_stride1(s->frame, LUMA),
+ betatable[qp + dbp->beta_offset],
+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
+ }
+ }
+ }
+ }
+
+ }
+ }
+}
+
+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
+ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1;
+}
+
+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
+{
+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
+ const unsigned int ctb_size = (1 << log2_ctb_size);
+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8);
+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
+ const DBParams * dbp = s->deblock + ctb_n;
+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
+ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];
+ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];
+
+ unsigned int cb_x;
+
+ av_assert1((bounds.x & (ctb_size - 1)) == 0);
+ av_assert1((bounds.y & (ctb_size - 1)) == 0);
+ av_assert1(bounds.h <= ctb_size);
+
+ // Do in CTB-shaped blocks
+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
+ const unsigned int bv_l = FFMAX(cb_x, 16);
+ unsigned int y;
+
+ // V above
+ if (bounds.y != 0) {
+ // Deblock V up 8
+ // CTB above current
+            // Top-half only ((tc4 & ~0xffff) == 0) is special-cased in asm
+ const unsigned int y = bounds.y - 8;
+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
+
+ if (vbs != 0)
+ {
+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
+ unsigned int x;
+
+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
+ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
+ {
+ const int qp0 = q2h(s, x, y);
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+ pcmfa & 3);
+ }
+ }
+ }
+ }
+
+ for (y = bounds.y; y < b_b; y += 16)
+ {
+ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
+ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
+
+ // V
+ if (vbs != 0)
+ {
+ unsigned int x;
+ unsigned int pcmfa =
+ (y + 16 > b_b ?
+ pcm2(s, bv_l - 1, y) | 0xffff0000 :
+ pcm4(s, bv_l - 1, y));
+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
+
+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
+ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
+ {
+ const int qp0 = q2h(s, x, y);
+ const int qp1 = q2h(s, x, y + 8);
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
+ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
+ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
+ }
+ }
+ }
+
+ // H
+ if (y != 0)
+ {
+ uint32_t hbs;
+ const unsigned int bh_l = bv_l - 16;
+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
+ // H left - mostly separated out so we only need a uint32_t hbs
+ // Stub is width 8 to the left of bounds, but width 16 internally
+ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
+ {
+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
+
+ // Chop off bits we don't want...
+ if (bh_l < bounds.x) {
+ pcmfa |= 0x10001; // TL|BL pre rearrangement
+ hbs &= ~3; // Make BS 0
+ }
+
+ // Double check we still want this
+ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
+ {
+ const unsigned int x = bh_l;
+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
+ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
+ }
+ }
+
+ // H main
+ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
+ {
+ unsigned int x;
+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
+
+ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
+ {
+ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
+ {
+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
+{
+ return x & ~(~0U << log2_n);
+}
+
+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ av_assert2((y & 7) == 0);
+
+    // This doesn't have the same simultaneous-update issues that bsf_stash
+    // does (other threads will have a different y) so we can do it the easy way
+ if ((bsf &= mask) != 0)
+ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
+}
+
+
+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ // We arrange this in a slightly odd fashion but it lines up with
+ // how we are going to use it in the actual deblock code & it is easier
+ // to do the contortions here than there
+ //
+    // Arrange (LE) {x0y0, x0y4, x8y0, x8y4}, {x16y0, x16y4, x24y0, x24y4},...
+
+ av_assert2((x & 7) == 0);
+
+ if ((bsf &= mask) != 0)
+ {
+ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
+
+ if (mask <= 0xf)
+ {
+ *p |= (bsf << sh);
+ }
+ else
+ {
+ do {
+ *p |= (bsf & 0xf) << sh;
+ p += HEVC_RPI_BS_STRIDE1_BYTES;
+ } while ((bsf >>= 4) != 0);
+ }
+ }
+}
+
+static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
+ const unsigned int rep, const unsigned int dup,
+ const unsigned int mvf_stride0,
+ const unsigned int mvf_stride1,
+ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
+ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
+{
+ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
+ mvf_p, mvf_q,
+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
+ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
+}
+
+
+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
+ const HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_trafo_size,
+ const int is_coded_block)
+{
+ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
+ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
+ const RefPicList * const rpl = s->refPicList;
+    // Rep count for bsf_mv when running with min_pu chunks
+ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
+ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
+ const unsigned int trafo_size = (1U << log2_trafo_size);
+ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
+ const uint32_t bsf_cbf = (bsf_mask & 0x55555555);
+
+ // Do we cover a pred split line?
+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
+ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
+
+ uint32_t bsf_h;
+ uint32_t bsf_v;
+
+#ifdef DISABLE_STRENGTHS
+ return;
+#endif
+
+ // We are always on a size boundary
+ av_assert2((x0 & (trafo_size - 1)) == 0);
+ av_assert2((y0 & (trafo_size - 1)) == 0);
+    // log2_trafo_size is not really a transform size; we may have to deal
+    // with size 2^6 blocks
+ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
+
+ // Retrieve and update coded (b0), intra (b1) bs flags
+ //
+ // Store on min width (rather than uint32_t) to avoid possible issues
+ // with another thread on another core running wpp using the same
+ // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
+ //
+ // In bsf BS=2 is represented by 3 as it is much easier to test & set
+ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
+ // 3 will work the same
+ {
+        // Given where we are called from, is_cbf_luma & is_intra will be constant over the block
+ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
+ uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
+ uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
+
+ switch (log2_trafo_size)
+ {
+ case 2:
+ case 3:
+ {
+ const unsigned int sh_h = (x0 >> 1) & 7;
+ const unsigned int sh_v = (y0 >> 1) & 7;
+ bsf_h = *p;
+ bsf_v = *q;
+ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
+ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
+ bsf_h >>= sh_h;
+ bsf_v >>= sh_v;
+ break;
+ }
+ case 4:
+ bsf_h = *p;
+ bsf_v = *q;
+ *p = bsf0;
+ *q = bsf0;
+ break;
+ case 5:
+ bsf_h = *(uint16_t *)p;
+ bsf_v = *(uint16_t *)q;
+ *(uint16_t *)p = bsf0;
+ *(uint16_t *)q = bsf0;
+ break;
+ case 6:
+ default:
+ bsf_h = *(uint32_t *)p;
+ bsf_v = *(uint32_t *)q;
+ *(uint32_t *)p = bsf0;
+ *(uint32_t *)q = bsf0;
+ break;
+ }
+
+ bsf_h |= bsf0;
+ bsf_v |= bsf0;
+ }
+
+ // Do Horizontal
+ if ((y0 & 7) == 0)
+ {
+ // Boundary upper
+ if (y0 != 0 &&
+ (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
+ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
+ {
+            // Look at MVs (BS=1) if we don't already have a full set of bs bits
+ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
+ {
+ // If we aren't on the top boundary we must be in the middle
+ // and in that case we know where mvf can change
+ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
+ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
+ rpl;
+
+ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ rpl, rpl_top,
+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
+ }
+
+ // Finally put the results into bs
+ hbs_set(s, x0, y0, bsf_mask, bsf_h);
+ }
+
+ // Max of 1 pu internal split - ignore if not on 8pel boundary
+ if (has_y_split && !off_boundary(lc->cu.y_split, 3))
+ {
+ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
+ // If we have the x split as well then it must be in the middle
+ const unsigned int log2_rep = has_x_split ? 1 : 0;
+
+ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
+ mvf, mvf - MVF_STASH_WIDTH_PU));
+ }
+ }
+
+ // And again for vertical - same logic as horizontal just in the other direction
+ if ((x0 & 7) == 0)
+ {
+ // Boundary left
+ if (x0 != 0 &&
+ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
+ {
+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
+ {
+ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
+ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
+ rpl;
+
+ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ rpl, rpl_left,
+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
+ }
+
+ vbs_set(s, x0, y0, bsf_mask, bsf_v);
+ }
+
+ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
+ {
+ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
+ const unsigned int log2_rep = has_y_split ? 1 : 0;
+
+ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
+ mvf, mvf - 1));
+ }
+ }
+}
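
As the comment block near the top of this function explains, each 4-pel edge segment carries two flag bits ("coded" in bit 0, "intra" in bit 1) and BS 2 is stored as 3, which makes accumulating strengths a plain OR and testing a plain AND. A standalone sketch of that encoding, with hypothetical helper names:

/* Two bits per 4-pel segment; BS 2 stored as 3 so set/test are OR/AND. */
#include <stdint.h>

enum { BSF_CODED = 1, BSF_INTRA = 3 };

static uint32_t bsf_set(uint32_t bsf, unsigned int seg, unsigned int v)
{
    return bsf | (v << (2 * seg));
}

static unsigned int bsf_strength(uint32_t bsf, unsigned int seg)
{
    const unsigned int v = (bsf >> (2 * seg)) & 3;
    return v == 0 ? 0 : (v & 2) != 0 ? 2 : 1;   /* BS 0, 1 or 2 */
}
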
+
+#undef LUMA
+#undef CB
+#undef CR
+
+static inline unsigned int ussub(const unsigned int a, const unsigned int b)
+{
+ return a < b ? 0 : a - b;
+}
+
+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
+{
+ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0;
+}
+
+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
+{
+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
+ int x, y;
+
+ const unsigned int br = bounds.x + bounds.w;
+ const unsigned int bb = bounds.y + bounds.h;
+
+ const int x_end = (br >= s->ps.sps->width);
+ const int y_end = (bb >= s->ps.sps->height);
+
+ // Deblock may not touch the edges of the bound as they are still needed
+ // for Intra pred
+ //
+ // Deblock is disabled with a per-slice flag
+    // Given that bounds may cover multiple slices & we deblock outside bounds
+ // anyway we can't avoid deblock using that flag - about the only thing we
+ // could do is have a "no deblock seen yet" flag but it doesn't really
+ // seem worth the effort
+
+ deblock_y_blk(s, bounds, x_end, y_end);
+ deblock_uv_blk(s, bounds, x_end, y_end);
+
+ // SAO needs
+ // (a) CTB alignment
+    // (b) Valid pixels all the way around the CTB; in particular it needs the DR (down-right) pixel
+ {
+ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
+ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
+ const unsigned int yt = ussub(bounds.y, yo);
+ const unsigned int yb = y_end ? bb : ussub(bb, yo);
+ const unsigned int xl = ussub(bounds.x, xo);
+ const unsigned int xr = x_end ? br : ussub(br, xo);
+
+ if (s->ps.sps->sao_enabled)
+ {
+ for (y = yt; y < yb; y += ctb_size) {
+ for (x = xl; x < xr; x += ctb_size) {
+ sao_filter_CTB(s, x, y);
+ }
+ }
+ }
+
+ // Cache invalidate
+ y = 0;
+ if (xr != 0 && yb != 0)
+ {
+ const unsigned int llen =
+ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
+ const unsigned int mask = ~(llen - 1);
+ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
+ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
+ const unsigned int it = ussub(yt, 1);
+ const unsigned int ib = y_end ? bb : yb - 1;
+
+ if (il < ir) {
+ rpi_cache_buf_t cbuf;
+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ il, it, ir - il, ib - it,
+ ctx_vshift(s, 1), 1, 1);
+
+            // If we have to commit the right-hand tile boundary due to
+            // cache boundary considerations then at EoTile we must commit
+            // that boundary to the bottom of the tile (bounds)
+ if (ib != bb && ir == br && eot) {
+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+ br - 1, ib, 1, bb - ib,
+ ctx_vshift(s, 1), 1, 1);
+ }
+
+ rpi_cache_flush_finish(rfe);
+
+ if (x_end)
+ y = y_end ? INT_MAX : ib;
+
+// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
+ }
+ }
+ }
+
+ return y;
+}
+
diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
new file mode 100644
index 0000000000..6b36f5e737
--- /dev/null
+++ b/libavcodec/rpi_hevc_mv.h
@@ -0,0 +1,71 @@
+#ifndef AVCODEC_RPI_HEVC_MV_H
+#define AVCODEC_RPI_HEVC_MV_H
+
+#include "config.h"
+
+typedef int32_t MvXY;
+
+typedef struct HEVCRpiMvField {
+ MvXY xy[2];
+ int8_t ref_idx[2];
+ int8_t pred_flag;
+    int8_t dummy; // Pad to 12 bytes
+} HEVCRpiMvField;
+
+
+#define MV_X(xy) (((xy) << 16) >> 16)
+#define MV_Y(xy) ((xy) >> 16)
+#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
+
+#if ARCH_ARM
+#include "arm/rpi_hevc_mv_arm.h"
+#endif
+
+#ifndef mvxy_add
+static inline MvXY mvxy_add(const MvXY a, const MvXY b)
+{
+ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b));
+}
+#endif
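
MvXY packs the two signed 16-bit MV components into one 32-bit word; MV_X recovers the low half with the shift-up-then-arithmetic-shift-down idiom (right-shifting a negative value is implementation-defined in C, but arithmetic on the compilers this code targets). A quick standalone round-trip check, repeating the macros above so it compiles on its own:

/* Round-trip check of the 16+16 MvXY packing. */
#include <assert.h>
#include <stdint.h>

#define MV_X(xy)    (((xy) << 16) >> 16)
#define MV_Y(xy)    ((xy) >> 16)
#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))

int main(void)
{
    const int32_t v = MV_XY(-3, 7);  /* low half 0xfffd, high half 7 */
    assert(MV_X(v) == -3 && MV_Y(v) == 7);
    return 0;
}
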
+
+
+#ifndef mv_scale_xy
+static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
+{
+ int tx, scale_factor;
+
+ td = td == 0 ? 1 : av_clip_int8(td);
+ tb = av_clip_int8(tb);
+ tx = (0x4000 + (abs(td) >> 1)) / td;
+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
+ return MV_XY(
+ av_clip_int16((scale_factor * MV_X(src) + 127 +
+ (scale_factor * MV_X(src) < 0)) >> 8),
+ av_clip_int16((scale_factor * MV_Y(src) + 127 +
+ (scale_factor * MV_Y(src) < 0)) >> 8));
+}
+#endif
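
mv_scale_xy() scales a collocated MV by the ratio of POC distances: tx approximates 16384/td, the scale factor is (tb * tx + 32) >> 6 clamped to [-4096, 4095], and each component is then multiplied and rounded back down by 8 bits. A worked check with the collocated distance twice the current one, where the MV should halve (the clamps are written out in place of av_clip_intp2; sketch only, not decoder code):

/* Distance scaling per the formula above, one component at a time. */
#include <assert.h>
#include <stdlib.h>

static int scale_component(int mv, int td, int tb)
{
    const int tx = (0x4000 + (abs(td) >> 1)) / td;
    int scale = (tb * tx + 32) >> 6;
    if (scale < -4096) scale = -4096;   /* av_clip_intp2(scale, 12) */
    if (scale > 4095)  scale = 4095;
    return (scale * mv + 127 + (scale * mv < 0)) >> 8;
}

int main(void)
{
    assert(scale_component(100, 2, 1) == 50);    /* td = 2 * tb => mv / 2 */
    assert(scale_component(-100, 2, 1) == -50);  /* rounding is symmetric */
    return 0;
}
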
+
+// 8.3.1 states that the bitstream may not contain poc diffs that do not
+// fit in 16 bits, so given that we don't care about the high bits we only
+// store the low 16 + LT & Inter flags
+
+#define COL_POC_INTRA 0
+#define COL_POC_INTER (1 << 16)
+#define COL_POC_LT (1 << 17)
+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
+
+typedef struct ColMv_s {
+ int32_t poc;
+ int32_t xy;
+} ColMv;
+
+typedef struct ColMvField_s {
+ ColMv L[2];
+} ColMvField;
+
+
+
+#endif // AVCODEC_RPI_HEVC_MV_H
diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
new file mode 100644
index 0000000000..27a9f69525
--- /dev/null
+++ b/libavcodec/rpi_hevc_mvs.c
@@ -0,0 +1,487 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 Anand Meher Kotra
+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hevc.h"
+#include "rpi_hevcdec.h"
+
+static av_always_inline int
+is_eq_mer(const unsigned int plevel,
+ const unsigned int xN, const unsigned int yN,
+ const unsigned int xP, const unsigned int yP)
+{
+ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0;
+}
+
+// Check if the MVs and ref_idx are the same between A and B
+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
+{
+ return a->pred_flag == b->pred_flag &&
+ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
+ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
+}
+
+/*
+ * 8.5.3.1.7 temporal luma motion vector prediction
+ */
+static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
+ const HEVCRpiLocalContext * const lc, const int x0, const int y0,
+ const int nPbW, const int nPbH, const int refIdxLx,
+ MvXY * const mvLXCol, const int X)
+{
+ int x, y;
+ const ColMv * cmv = NULL;
+
+ HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
+ const RefPicList * const refPicList = s->refPicList + X;
+ const int cur_lt = refPicList->isLongTerm[refIdxLx];
+
+ *mvLXCol = 0;
+ // Unlikely but we might have a col_ref IDR frame!
+ if (col_ref->col_mvf == NULL)
+ return 0;
+
+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
+
+ //bottom right collocated motion vector
+ x = x0 + nPbW;
+ y = y0 + nPbH;
+
+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
+ y < s->ps.sps->height &&
+ x < s->ps.sps->width)
+ {
+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
+ (y >> 4) * s->col_mvf_stride;
+
+ if (col->L[0].poc != COL_POC_INTRA &&
+ (col->L[1].poc == COL_POC_INTRA ||
+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
+ {
+ cmv = col->L + 0;
+ }
+ else if (col->L[1].poc != COL_POC_INTRA)
+ {
+ cmv = col->L + 1;
+ }
+ }
+
+ // derive center collocated motion vector
+ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
+ {
+ cmv = NULL;
+ x = x0 + (nPbW >> 1);
+ y = y0 + (nPbH >> 1);
+
+ {
+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
+ (y >> 4) * s->col_mvf_stride;
+
+ if (col->L[0].poc != COL_POC_INTRA &&
+ (col->L[1].poc == COL_POC_INTRA ||
+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
+ {
+ cmv = col->L + 0;
+ }
+ else if (col->L[1].poc != COL_POC_INTRA)
+ {
+ cmv = col->L + 1;
+ }
+ }
+ }
+
+ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
+ return 0;
+
+ {
+ const int col_poc = col_ref->poc;
+ const int ref_poc = refPicList->list[refIdxLx];
+
+ *mvLXCol = (cur_lt ||
+ cmv->poc == col_poc ||
+ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
+ cmv->xy :
+ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
+ }
+
+ return cmv != NULL;
+}
+
+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
+{
+ return b != NULL && compare_mv_ref_idx(a, b);
+}
+
+
+
+/*
+ * 8.5.3.1.2 Derivation process for spatial merging candidates
+ */
+static inline const HEVCRpiMvField *
+derive_spatial_merge_candidates(
+ const HEVCRpiContext * const s,
+ const HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int nPbW, const unsigned int nPbH,
+ const unsigned int avail,
+ const unsigned int part_idx,
+ const unsigned int merge_idx,
+ HEVCRpiMvField * const mvf_t)
+{
+ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
+    const unsigned int parts_b1 = (1 << PART_2NxN) | (1 << PART_2NxnU) | (1 << PART_2NxnD);
+
+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
+ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
+ const unsigned int part_mode = lc->cu.part_mode;
+
+ const HEVCRpiMvField * perm[4];
+ unsigned int nb_merge_cand = 0;
+
+ // singleMCLFlag => part_idx == 0 so no need to test for it
+ if ((avail & AVAIL_L) == 0 ||
+        (part_idx == 1 && ((parts_a1 >> part_mode) & 1) != 0) ||
+        is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0) ||
+ mvf_a1->pred_flag == PF_INTRA)
+ {
+ mvf_a1 = NULL;
+ }
+ else
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_a1;
+ perm[nb_merge_cand++] = mvf_a1;
+ }
+
+ if ((avail & AVAIL_U) == 0 ||
+        (part_idx == 1 && ((parts_b1 >> part_mode) & 1) != 0) ||
+        is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0) ||
+ mvf_b1->pred_flag == PF_INTRA)
+ {
+ mvf_b1 = NULL;
+ }
+ else if (!mvf_eq(mvf_b1, mvf_a1))
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_b1;
+ perm[nb_merge_cand++] = mvf_b1;
+ }
+
+ // above right spatial merge candidate
+    // Never need mvf_b0 again so don't bother zeroing if not available
+ if ((avail & AVAIL_UR) != 0 &&
+ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
+ mvf_b0->pred_flag != PF_INTRA &&
+ !mvf_eq(mvf_b0, mvf_b1))
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_b0;
+ perm[nb_merge_cand++] = mvf_b0;
+ }
+
+ // left bottom spatial merge candidate
+    // Never need mvf_a0 again so don't bother zeroing if not available
+ if ((avail & AVAIL_DL) != 0 &&
+ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
+ mvf_a0->pred_flag != PF_INTRA &&
+ !mvf_eq(mvf_a0, mvf_a1))
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_a0;
+ perm[nb_merge_cand++] = mvf_a0;
+ }
+
+ // above left spatial merge candidate
+ if (nb_merge_cand != 4 &&
+ (avail & AVAIL_UL) != 0 &&
+ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
+ {
+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
+
+ if (mvf_b2->pred_flag != PF_INTRA &&
+ !mvf_eq(mvf_b2, mvf_a1) &&
+ !mvf_eq(mvf_b2, mvf_b1))
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_b2;
+ perm[nb_merge_cand++] = mvf_b2;
+ }
+ }
+
+ // temporal motion vector candidate
+ if (s->sh.slice_temporal_mvp_enabled_flag)
+ {
+ static const HEVCRpiMvField mvf_z = {{0}};
+
+ *mvf_t = mvf_z;
+
+ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
+ 0, mvf_t->xy + 0, 0))
+ mvf_t->pred_flag = PF_L0;
+
+ if (s->sh.slice_type == HEVC_SLICE_B &&
+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
+ 0, mvf_t->xy + 1, 1))
+ mvf_t->pred_flag |= PF_L1;
+
+ if (mvf_t->pred_flag != 0)
+ {
+ if (merge_idx == nb_merge_cand)
+ return mvf_t;
+ perm[nb_merge_cand++] = mvf_t;
+ }
+ }
+
+ // combined bi-predictive merge candidates (applies for B slices)
+ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
+ {
+ unsigned int comb_idx = 0;
+ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
+ const RefPicList * const refPicList = s->refPicList;
+
+ for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
+ {
+ static const uint8_t l0_l1_cand_idx[12][2] = {
+ { 0, 1, },
+ { 1, 0, },
+ { 0, 2, },
+ { 2, 0, },
+ { 1, 2, },
+ { 2, 1, },
+ { 0, 3, },
+ { 3, 0, },
+ { 1, 3, },
+ { 3, 1, },
+ { 2, 3, },
+ { 3, 2, },
+ };
+
+ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
+ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
+ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
+ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
+
+ if ((mvf_c0->pred_flag & PF_L0) != 0 &&
+ (mvf_c1->pred_flag & PF_L1) != 0 &&
+ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
+ mvf_c0->xy[0] != mvf_c1->xy[1]))
+ {
+ if (merge_idx == nb_merge_cand++)
+ {
+ // Need to be a bit careful as we will construct mvf_t and we
+                    // may already be using that as one of our candidates
+ // so build & copy rather than build in place
+ const HEVCRpiMvField mvf_m = {
+ .xy = {
+ mvf_c0->xy[0],
+ mvf_c1->xy[1]},
+ .ref_idx = {
+ mvf_c0->ref_idx[0],
+ mvf_c1->ref_idx[1]},
+ .pred_flag = PF_BI
+ };
+ *mvf_t = mvf_m;
+ return mvf_t;
+ }
+ }
+ }
+ }
+
+ // "append" Zero motion vector candidates
+ {
+ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
+ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
+ const unsigned int zero_idx = merge_idx - nb_merge_cand;
+
+ const HEVCRpiMvField mvf_m = {
+ .xy = {0, 0},
+ .ref_idx = {
+ zero_idx < nb_refs ? zero_idx : 0,
+ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
+ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
+ };
+
+ *mvf_t = mvf_m;
+ return mvf_t;
+ }
+}
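
The function above enumerates candidates in spec order (A1, B1, B0, A0, B2, then temporal, combined bi-predictive and zero candidates) and returns as soon as the candidate at merge_idx is reached, so the more expensive later derivations only run when actually needed. The control pattern in isolation, with hypothetical types and names:

/* Early-exit candidate selection: stop at the requested index. */
#include <stddef.h>

typedef struct Cand Cand;

static const Cand *pick(const Cand *const cands[], unsigned int n,
                        unsigned int merge_idx)
{
    unsigned int nb = 0;
    for (unsigned int i = 0; i < n; i++) {
        if (cands[i] == NULL)       /* unavailable or duplicate */
            continue;
        if (merge_idx == nb)
            return cands[i];        /* early exit */
        nb++;
    }
    return NULL;                    /* fall through to zero-MV candidate */
}
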
+
+
+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
+ int nPbH, int log2_cb_size, int part_idx,
+ int merge_idx, HEVCRpiMvField * const mv)
+{
+ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
+ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
+ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
+ 0, merge_idx, mv) :
+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
+ part_idx, merge_idx, mv);
+
+ if (mvf_m != mv)
+ *mv = *mvf_m;
+
+ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12)
+ mv->pred_flag = PF_L0;
+}
+
+
+static av_always_inline const MvXY *
+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
+{
+ if (mvf != NULL)
+ {
+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
+ return mvf->xy + pfi0;
+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
+ return mvf->xy + pfi1;
+ }
+ return NULL;
+}
+
+static av_always_inline const MvXY *
+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
+ const int islt0, const int poc0, const int poc_cur,
+ MvXY * const mv_t, const HEVCRpiMvField * const mvf)
+{
+ if (mvf != NULL)
+ {
+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
+ {
+ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
+ if (islt0 || poc1 == poc0) {
+ return mvf->xy + pfi0;
+ }
+ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
+ return mv_t;
+ }
+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
+ {
+ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
+ if (islt0 || poc1 == poc0) {
+ return mvf->xy + pfi1;
+ }
+ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
+ return mv_t;
+ }
+ }
+ return NULL;
+}
+
+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int nPbW, const unsigned int nPbH,
+ const unsigned int avail,
+ HEVCRpiMvField * const mv,
+ const unsigned int mvp_lx_flag, const unsigned int LX)
+{
+ const unsigned int pfi0 = LX;
+ const unsigned int pfi1 = LX == 0 ? 1 : 0;
+ const RefPicList * const rpl = s->refPicList;
+ const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
+ const int poc_cur = s->poc;
+ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
+
+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
+ const MvXY * mva = NULL;
+ const MvXY * mvb;
+ MvXY * const mv_rv = mv->xy + LX;
+ MvXY mvt_a, mvt_b;
+
+ *mv_rv = 0;
+
+ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
+ mvf_a0 = NULL;
+ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
+ goto use_mva;
+
+ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
+ mvf_a1 = NULL;
+
+ if (mva == NULL &&
+ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
+ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
+ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
+
+ if (mvp_lx_flag == 0 && mva != NULL)
+ goto use_mva;
+
+ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
+ mvf_b0 = NULL;
+ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
+ mvf_b1 = NULL;
+ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
+ mvf_b2 = NULL;
+
+ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
+ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
+ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
+
+ if (mvf_a0 == NULL && mvf_a1 == NULL) {
+ mva = mvb;
+ if (mvp_lx_flag == 0 && mva != NULL)
+ goto use_mva;
+
+ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
+ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
+ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
+ }
+
+ if (mva == NULL) {
+ mva = mvb;
+ mvb = NULL;
+ }
+
+ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B
+ mvb = NULL;
+
+ if (mvp_lx_flag == 0 && mva != NULL) {
+ goto use_mva;
+ }
+ else if (mvp_lx_flag != 0 && mvb != NULL) {
+ *mv_rv = *mvb;
+ }
+ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
+ nPbH, mv->ref_idx[LX],
+ mv_rv, LX);
+ }
+ return;
+
+use_mva:
+ *mv_rv = *mva;
+ return;
+}
+
diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c
new file mode 100644
index 0000000000..e58a59ce5e
--- /dev/null
+++ b/libavcodec/rpi_hevc_parse.c
@@ -0,0 +1,143 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "bytestream.h"
+#include "h2645_parse.h"
+#include "hevc.h"
+#include "rpi_hevc_parse.h"
+
+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
+ HEVCSEIContext *sei, int is_nalff, int nal_length_size,
+ int err_recognition, int apply_defdispwin, void *logctx)
+{
+ int i;
+ int ret = 0;
+ H2645Packet pkt = { 0 };
+
+ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
+ nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
+ if (ret < 0) {
+ goto done;
+ }
+
+ for (i = 0; i < pkt.nb_nals; i++) {
+ H2645NAL *nal = &pkt.nals[i];
+
+ /* ignore everything except parameter sets and VCL NALUs */
+ switch (nal->type) {
+ case HEVC_NAL_VPS:
+ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
+ if (ret < 0)
+ goto done;
+ break;
+ case HEVC_NAL_SPS:
+ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
+ if (ret < 0)
+ goto done;
+ break;
+ case HEVC_NAL_PPS:
+ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
+ if (ret < 0)
+ goto done;
+ break;
+ case HEVC_NAL_SEI_PREFIX:
+ case HEVC_NAL_SEI_SUFFIX:
+ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
+ if (ret < 0)
+ goto done;
+ break;
+ default:
+ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
+ break;
+ }
+ }
+
+done:
+ ff_h2645_packet_uninit(&pkt);
+ if (err_recognition & AV_EF_EXPLODE)
+ return ret;
+
+ return 0;
+}
+
+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
+ int err_recognition, int apply_defdispwin, void *logctx)
+{
+ int ret = 0;
+ GetByteContext gb;
+
+ bytestream2_init(&gb, data, size);
+
+ if (size > 3 && (data[0] || data[1] || data[2] > 1)) {
+        /* It seems the extradata is encoded in hvcC format.
+         * Temporarily, we support configurationVersion==0 until the 14496-15 3rd
+         * edition is finalized. Once it is, configurationVersion will be 1 and we
+         * can recognize hvcC by checking whether avctx->extradata[0]==1. */
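+        /* For reference, the HEVCDecoderConfigurationRecord layout per
+         * ISO/IEC 14496-15 (hence the 21-byte skip below):
+         *   byte  0      configurationVersion
+         *   bytes 1..20  profile/tier/level fields and misc flags
+         *   byte 21      low 2 bits = lengthSizeMinusOne
+         *   byte 22      numOfArrays, then per array: NAL type byte,
+         *                16-bit NAL count, and per NAL a 16-bit size
+         *                followed by the payload. */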
+ int i, j, num_arrays, nal_len_size;
+
+ *is_nalff = 1;
+
+ bytestream2_skip(&gb, 21);
+ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
+ num_arrays = bytestream2_get_byte(&gb);
+
+ /* nal units in the hvcC always have length coded with 2 bytes,
+ * so put a fake nal_length_size = 2 while parsing them */
+ *nal_length_size = 2;
+
+ /* Decode nal units from hvcC. */
+ for (i = 0; i < num_arrays; i++) {
+ int type = bytestream2_get_byte(&gb) & 0x3f;
+ int cnt = bytestream2_get_be16(&gb);
+
+ for (j = 0; j < cnt; j++) {
+ // +2 for the nal size field
+ int nalsize = bytestream2_peek_be16(&gb) + 2;
+ if (bytestream2_get_bytes_left(&gb) < nalsize) {
+ av_log(logctx, AV_LOG_ERROR,
+ "Invalid NAL unit size in extradata.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
+ *nal_length_size, err_recognition, apply_defdispwin,
+ logctx);
+ if (ret < 0) {
+ av_log(logctx, AV_LOG_ERROR,
+ "Decoding nal unit %d %d from hvcC failed\n",
+ type, i);
+ return ret;
+ }
+ bytestream2_skip(&gb, nalsize);
+ }
+ }
+
+ /* Now store right nal length size, that will be used to parse
+ * all other nals */
+ *nal_length_size = nal_len_size;
+ } else {
+ *is_nalff = 0;
+ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
+ err_recognition, apply_defdispwin, logctx);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+}
diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
new file mode 100644
index 0000000000..4b4d032a16
--- /dev/null
+++ b/libavcodec/rpi_hevc_parse.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.265 parser code
+ */
+
+#ifndef AVCODEC_RPI_HEVC_PARSE_H
+#define AVCODEC_RPI_HEVC_PARSE_H
+
+#include <stdint.h>
+
+#include "rpi_hevc_ps.h"
+#include "rpi_hevc_sei.h"
+
+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
+ int err_recognition, int apply_defdispwin, void *logctx);
+
+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
new file mode 100644
index 0000000000..f4e31f7d1d
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.c
@@ -0,0 +1,1938 @@
+/*
+ * HEVC Parameter Set decoding
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Mickael Raulet
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2013 Vittorio Giovara
+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/imgutils.h"
+#include "golomb.h"
+#include "rpi_hevc_data.h"
+#include "rpi_hevc_ps.h"
+#include "rpi_hevcdec.h"
+
+static const uint8_t default_scaling_list_intra[] = {
+ 16, 16, 16, 16, 17, 18, 21, 24,
+ 16, 16, 16, 16, 17, 19, 22, 25,
+ 16, 16, 17, 18, 20, 22, 25, 29,
+ 16, 16, 18, 21, 24, 27, 31, 36,
+ 17, 17, 20, 24, 30, 35, 41, 47,
+ 18, 19, 22, 27, 35, 44, 54, 65,
+ 21, 22, 25, 31, 41, 54, 70, 88,
+ 24, 25, 29, 36, 47, 65, 88, 115
+};
+
+static const uint8_t default_scaling_list_inter[] = {
+ 16, 16, 16, 16, 17, 18, 20, 24,
+ 16, 16, 16, 17, 18, 20, 24, 25,
+ 16, 16, 17, 18, 20, 24, 25, 28,
+ 16, 17, 18, 20, 24, 25, 28, 33,
+ 17, 18, 20, 24, 25, 28, 33, 41,
+ 18, 20, 24, 25, 28, 33, 41, 54,
+ 20, 24, 25, 28, 33, 41, 54, 71,
+ 24, 25, 28, 33, 41, 54, 71, 91
+};
+
+static const AVRational vui_sar[] = {
+ { 0, 1 },
+ { 1, 1 },
+ { 12, 11 },
+ { 10, 11 },
+ { 16, 11 },
+ { 40, 33 },
+ { 24, 11 },
+ { 20, 11 },
+ { 32, 11 },
+ { 80, 33 },
+ { 18, 11 },
+ { 15, 11 },
+ { 64, 33 },
+ { 160, 99 },
+ { 4, 3 },
+ { 3, 2 },
+ { 2, 1 },
+};
+
+
+// pps_cb_qp_offset: -12,+12
+// slice_cb_qp_offset: -12,+12 also
+// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
+// cr_qp_offset_list[n]: -12,+12
+// So worst case total offset: -24,+24
+
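+// Illustrative values (derived from the macros below, nothing beyond them):
+// T(n) packs QP n as (((n+48)/6 - 10) << 3) | ((n+48) % 6), i.e. a
+// (div-6, mod-6) code that advances by 8 per 6 QP steps with T(12) == 0;
+// so T(0) == -16, T(13) == 1, T(18) == 8. C(B,n) first clamps n to
+// [-6*B, 51], the legal QP range at bit depth 8+B, so e.g.
+// C(2,-20) == T(-12) == -32.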
+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
+#define M(B,n) C(B,(-n))
+
+// Size of the QP_START block
+#define QP_OFFSET_0 (8*6 + 12*2)
+#define QP_START(B) \
+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
+\
+ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
+ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
+ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
+ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
+ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
+ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
+ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
+ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
+#define QP_END(B) \
+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
+
+#define T1(B)\
+{\
+ QP_START(B),\
+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
+ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
+ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
+ C(B,44), C(B,45),\
+ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
+ QP_END(B)\
+}
+#define T0(B)\
+{\
+ QP_START(B),\
+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
+ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
+ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
+ C(B,50), C(B,51),\
+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
+ QP_END(B)\
+}
+
+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
+
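+// qp_c_bd_1 embeds the 4:2:0 qPi -> QpC mapping of H.265 Table 8-10
+// (e.g. qPi 35 -> QpC 33, qPi 44 -> 38); qp_c_bd_0 is the identity clamp
+// used for 4:2:2/4:4:4. Both are replicated for bit depths 8..15 so
+// callers can index by (bit_depth - 8).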
+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
+
+#undef T
+#undef C
+#undef QP_END
+
+#define C(B,n) ((n)<0?0:(n)>51?51:(n))
+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
+#define QP_DBLK_OFFSET_0 QP_OFFSET_0
+#define QP_END(B)\
+ 51, 51, 51, 51, 51, 51
+
+// These don't need all the padding we have here (12 top/bottom would be enough)
+static const uint8_t qp_c_dblk_0[] = T0(0);
+static const uint8_t qp_c_dblk_1[] = T1(0);
+
+#undef T
+#undef M
+#undef C
+#undef QP_END
+#undef QP_START
+
+
+static void remove_pps(HEVCRpiParamSets * const s, const int id)
+{
+ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data)
+ s->pps = NULL;
+ av_buffer_unref(&s->pps_list[id]);
+}
+
+static void remove_sps(HEVCRpiParamSets * const s, const int id)
+{
+ int i;
+ if (s->sps_list[id]) {
+ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data)
+ s->sps = NULL;
+
+ /* drop all PPS that depend on this SPS */
+ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
+ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id)
+ remove_pps(s, i);
+
+ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
+ }
+ av_buffer_unref(&s->sps_list[id]);
+}
+
+static void remove_vps(HEVCRpiParamSets * const s, const int id)
+{
+ int i;
+ if (s->vps_list[id]) {
+ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data)
+ s->vps = NULL;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++)
+ if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id)
+ remove_sps(s, i);
+ }
+ av_buffer_unref(&s->vps_list[id]);
+}
+
+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
+ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
+{
+ uint8_t rps_predict = 0;
+ int delta_poc;
+ int k0 = 0;
+ int k1 = 0;
+ int k = 0;
+ int i;
+
+ if (rps != sps->st_rps && sps->nb_st_rps)
+ rps_predict = get_bits1(gb);
+
+ if (rps_predict) {
+ const ShortTermRPS *rps_ridx;
+ int delta_rps;
+ unsigned abs_delta_rps;
+ uint8_t use_delta_flag = 0;
+ uint8_t delta_rps_sign;
+
+ if (is_slice_header) {
+ unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
+ if (delta_idx > sps->nb_st_rps) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid value of delta_idx in slice header RPS: %d > %d.\n",
+ delta_idx, sps->nb_st_rps);
+ return AVERROR_INVALIDDATA;
+ }
+ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
+ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
+ } else
+ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
+
+ delta_rps_sign = get_bits1(gb);
+ abs_delta_rps = get_ue_golomb_long(gb) + 1;
+ if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid value of abs_delta_rps: %d\n",
+ abs_delta_rps);
+ return AVERROR_INVALIDDATA;
+ }
+ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
+ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
+ int used = rps->used[k] = get_bits1(gb);
+
+ if (!used)
+ use_delta_flag = get_bits1(gb);
+
+ if (used || use_delta_flag) {
+ if (i < rps_ridx->num_delta_pocs)
+ delta_poc = delta_rps + rps_ridx->delta_poc[i];
+ else
+ delta_poc = delta_rps;
+ rps->delta_poc[k] = delta_poc;
+ if (delta_poc < 0)
+ k0++;
+ else
+ k1++;
+ k++;
+ }
+ }
+
+ if (k >= FF_ARRAY_ELEMS(rps->used)) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid num_delta_pocs: %d\n", k);
+ return AVERROR_INVALIDDATA;
+ }
+
+ rps->num_delta_pocs = k;
+ rps->num_negative_pics = k0;
+ // sort in increasing order (smallest first)
+ if (rps->num_delta_pocs != 0) {
+ int used, tmp;
+ for (i = 1; i < rps->num_delta_pocs; i++) {
+ delta_poc = rps->delta_poc[i];
+ used = rps->used[i];
+ for (k = i - 1; k >= 0; k--) {
+ tmp = rps->delta_poc[k];
+ if (delta_poc < tmp) {
+ rps->delta_poc[k + 1] = tmp;
+ rps->used[k + 1] = rps->used[k];
+ rps->delta_poc[k] = delta_poc;
+ rps->used[k] = used;
+ }
+ }
+ }
+ }
+ if ((rps->num_negative_pics >> 1) != 0) {
+ int used;
+ k = rps->num_negative_pics - 1;
+ // flip the negative values to largest first
+ for (i = 0; i < rps->num_negative_pics >> 1; i++) {
+ delta_poc = rps->delta_poc[i];
+ used = rps->used[i];
+ rps->delta_poc[i] = rps->delta_poc[k];
+ rps->used[i] = rps->used[k];
+ rps->delta_poc[k] = delta_poc;
+ rps->used[k] = used;
+ k--;
+ }
+ }
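+        /* Worked example: raw deltas {-3, -1, +2} sort ascending to
+         * {-3, -1, +2}; reversing the negative half then gives
+         * {-1, -3, +2}, i.e. negative pics ordered closest-first. */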
+ } else {
+ unsigned int prev, nb_positive_pics;
+ rps->num_negative_pics = get_ue_golomb_long(gb);
+ nb_positive_pics = get_ue_golomb_long(gb);
+
+ if (rps->num_negative_pics >= HEVC_MAX_REFS ||
+ nb_positive_pics >= HEVC_MAX_REFS) {
+ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
+ if (rps->num_delta_pocs) {
+ prev = 0;
+ for (i = 0; i < rps->num_negative_pics; i++) {
+ delta_poc = get_ue_golomb_long(gb) + 1;
+ if (delta_poc < 1 || delta_poc > 32768) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid value of delta_poc: %d\n",
+ delta_poc);
+ return AVERROR_INVALIDDATA;
+ }
+ prev -= delta_poc;
+ rps->delta_poc[i] = prev;
+ rps->used[i] = get_bits1(gb);
+ }
+ prev = 0;
+ for (i = 0; i < nb_positive_pics; i++) {
+ delta_poc = get_ue_golomb_long(gb) + 1;
+ if (delta_poc < 1 || delta_poc > 32768) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid value of delta_poc: %d\n",
+ delta_poc);
+ return AVERROR_INVALIDDATA;
+ }
+ prev += delta_poc;
+ rps->delta_poc[rps->num_negative_pics + i] = prev;
+ rps->used[rps->num_negative_pics + i] = get_bits1(gb);
+ }
+ }
+ }
+ return 0;
+}
+
+
+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
+ PTLCommon * const ptl)
+{
+ int i;
+
+ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
+ return -1;
+
+ ptl->profile_space = get_bits(gb, 2);
+ ptl->tier_flag = get_bits1(gb);
+ ptl->profile_idc = get_bits(gb, 5);
+ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN)
+ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10)
+ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
+ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
+ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
+ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
+ else
+ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
+
+ for (i = 0; i < 32; i++) {
+ ptl->profile_compatibility_flag[i] = get_bits1(gb);
+
+ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i])
+ ptl->profile_idc = i;
+ }
+ ptl->progressive_source_flag = get_bits1(gb);
+ ptl->interlaced_source_flag = get_bits1(gb);
+ ptl->non_packed_constraint_flag = get_bits1(gb);
+ ptl->frame_only_constraint_flag = get_bits1(gb);
+
+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
+ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
+
+ return 0;
+}
+
+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
+ PTL * const ptl, const int max_num_sub_layers)
+{
+ int i;
+ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
+ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
+ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
+ return -1;
+ }
+
+ ptl->general_ptl.level_idc = get_bits(gb, 8);
+
+ for (i = 0; i < max_num_sub_layers - 1; i++) {
+ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
+ ptl->sub_layer_level_present_flag[i] = get_bits1(gb);
+ }
+
+    if (max_num_sub_layers - 1 > 0)
+ for (i = max_num_sub_layers - 1; i < 8; i++)
+ skip_bits(gb, 2); // reserved_zero_2bits[i]
+ for (i = 0; i < max_num_sub_layers - 1; i++) {
+ if (ptl->sub_layer_profile_present_flag[i] &&
+ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
+ av_log(avctx, AV_LOG_ERROR,
+ "PTL information for sublayer %i too short\n", i);
+ return -1;
+ }
+ if (ptl->sub_layer_level_present_flag[i]) {
+ if (get_bits_left(gb) < 8) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Not enough data for sublayer %i level_idc\n", i);
+ return -1;
+ } else
+ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+ }
+ }
+
+ return 0;
+}
+
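+/* The HRD parameters below are parsed only to keep the bitstream position
+ * in step; apart from validating nb_cpb, every value is read and discarded. */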
+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
+ const int subpic_params_present)
+{
+ int i;
+
+ for (i = 0; i < nb_cpb; i++) {
+ get_ue_golomb_long(gb); // bit_rate_value_minus1
+ get_ue_golomb_long(gb); // cpb_size_value_minus1
+
+ if (subpic_params_present) {
+ get_ue_golomb_long(gb); // cpb_size_du_value_minus1
+ get_ue_golomb_long(gb); // bit_rate_du_value_minus1
+ }
+ skip_bits1(gb); // cbr_flag
+ }
+}
+
+static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
+ const int max_sublayers)
+{
+ int nal_params_present = 0, vcl_params_present = 0;
+ int subpic_params_present = 0;
+ int i;
+
+ if (common_inf_present) {
+ nal_params_present = get_bits1(gb);
+ vcl_params_present = get_bits1(gb);
+
+ if (nal_params_present || vcl_params_present) {
+ subpic_params_present = get_bits1(gb);
+
+ if (subpic_params_present) {
+ skip_bits(gb, 8); // tick_divisor_minus2
+ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
+ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
+ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
+ }
+
+ skip_bits(gb, 4); // bit_rate_scale
+ skip_bits(gb, 4); // cpb_size_scale
+
+ if (subpic_params_present)
+ skip_bits(gb, 4); // cpb_size_du_scale
+
+ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
+ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
+ skip_bits(gb, 5); // dpb_output_delay_length_minus1
+ }
+ }
+
+ for (i = 0; i < max_sublayers; i++) {
+ int low_delay = 0;
+ unsigned int nb_cpb = 1;
+ int fixed_rate = get_bits1(gb);
+
+ if (!fixed_rate)
+ fixed_rate = get_bits1(gb);
+
+ if (fixed_rate)
+ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1
+ else
+ low_delay = get_bits1(gb);
+
+ if (!low_delay) {
+ nb_cpb = get_ue_golomb_long(gb) + 1;
+ if (nb_cpb < 1 || nb_cpb > 32) {
+ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+ if (nal_params_present)
+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
+ if (vcl_params_present)
+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
+ }
+ return 0;
+}
+
+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
+ HEVCRpiParamSets * const ps)
+{
+ int i,j;
+ int vps_id = 0;
+ ptrdiff_t nal_size;
+ HEVCRpiVPS *vps;
+ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
+
+ if (!vps_buf)
+ return AVERROR(ENOMEM);
+ vps = (HEVCRpiVPS*)vps_buf->data;
+
+ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
+
+ nal_size = gb->buffer_end - gb->buffer;
+ if (nal_size > sizeof(vps->data)) {
+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+ nal_size, sizeof(vps->data));
+ vps->data_size = sizeof(vps->data);
+ } else {
+ vps->data_size = nal_size;
+ }
+ memcpy(vps->data, gb->buffer, vps->data_size);
+
+ vps_id = get_bits(gb, 4);
+ if (vps_id >= HEVC_MAX_VPS_COUNT) {
+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
+ goto err;
+ }
+
+ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
+ goto err;
+ }
+
+ vps->vps_max_layers = get_bits(gb, 6) + 1;
+ vps->vps_max_sub_layers = get_bits(gb, 3) + 1;
+ vps->vps_temporal_id_nesting_flag = get_bits1(gb);
+
+ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
+ goto err;
+ }
+
+ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
+ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
+ vps->vps_max_sub_layers);
+ goto err;
+ }
+
+ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
+ goto err;
+
+ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
+
+ i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
+ for (; i < vps->vps_max_sub_layers; i++) {
+ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
+ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb);
+ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1;
+
+ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
+ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
+ vps->vps_max_dec_pic_buffering[i] - 1);
+ goto err;
+ }
+ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
+ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
+ vps->vps_num_reorder_pics[i]);
+ if (avctx->err_recognition & AV_EF_EXPLODE)
+ goto err;
+ }
+ }
+
+ vps->vps_max_layer_id = get_bits(gb, 6);
+ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
+ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
+ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
+ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
+ goto err;
+ }
+
+ for (i = 1; i < vps->vps_num_layer_sets; i++)
+ for (j = 0; j <= vps->vps_max_layer_id; j++)
+ skip_bits(gb, 1); // layer_id_included_flag[i][j]
+
+ vps->vps_timing_info_present_flag = get_bits1(gb);
+ if (vps->vps_timing_info_present_flag) {
+ vps->vps_num_units_in_tick = get_bits_long(gb, 32);
+ vps->vps_time_scale = get_bits_long(gb, 32);
+ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
+ if (vps->vps_poc_proportional_to_timing_flag)
+ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
+ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
+ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
+ av_log(avctx, AV_LOG_ERROR,
+ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
+ goto err;
+ }
+ for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
+ int common_inf_present = 1;
+
+ get_ue_golomb_long(gb); // hrd_layer_set_idx
+ if (i)
+ common_inf_present = get_bits1(gb);
+ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers);
+ }
+ }
+ get_bits1(gb); /* vps_extension_flag */
+
+ if (get_bits_left(gb) < 0) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Overread VPS by %d bits\n", -get_bits_left(gb));
+ if (ps->vps_list[vps_id])
+ goto err;
+ }
+
+ if (ps->vps_list[vps_id] &&
+ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
+ av_buffer_unref(&vps_buf);
+ } else {
+ remove_vps(ps, vps_id);
+ ps->vps_list[vps_id] = vps_buf;
+ }
+
+ return 0;
+
+err:
+ av_buffer_unref(&vps_buf);
+ return AVERROR_INVALIDDATA;
+}
+
+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
+ const int apply_defdispwin, HEVCRpiSPS * const sps)
+{
+ VUI backup_vui, * const vui = &sps->vui;
+ GetBitContext backup;
+ int sar_present, alt = 0;
+
+ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
+
+ sar_present = get_bits1(gb);
+ if (sar_present) {
+ uint8_t sar_idx = get_bits(gb, 8);
+ if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
+ vui->sar = vui_sar[sar_idx];
+ else if (sar_idx == 255) {
+ vui->sar.num = get_bits(gb, 16);
+ vui->sar.den = get_bits(gb, 16);
+ } else
+ av_log(avctx, AV_LOG_WARNING,
+ "Unknown SAR index: %u.\n", sar_idx);
+ }
+
+ vui->overscan_info_present_flag = get_bits1(gb);
+ if (vui->overscan_info_present_flag)
+ vui->overscan_appropriate_flag = get_bits1(gb);
+
+ vui->video_signal_type_present_flag = get_bits1(gb);
+ if (vui->video_signal_type_present_flag) {
+ vui->video_format = get_bits(gb, 3);
+ vui->video_full_range_flag = get_bits1(gb);
+ vui->colour_description_present_flag = get_bits1(gb);
+ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
+ sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
+ if (vui->colour_description_present_flag) {
+ vui->colour_primaries = get_bits(gb, 8);
+ vui->transfer_characteristic = get_bits(gb, 8);
+ vui->matrix_coeffs = get_bits(gb, 8);
+
+ // Set invalid values to "unspecified"
+ if (!av_color_primaries_name(vui->colour_primaries))
+ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
+ if (!av_color_transfer_name(vui->transfer_characteristic))
+ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
+ if (!av_color_space_name(vui->matrix_coeffs))
+ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
+ if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
+ switch (sps->pix_fmt) {
+ case AV_PIX_FMT_YUV444P:
+ sps->pix_fmt = AV_PIX_FMT_GBRP;
+ break;
+ case AV_PIX_FMT_YUV444P10:
+ sps->pix_fmt = AV_PIX_FMT_GBRP10;
+ break;
+ case AV_PIX_FMT_YUV444P12:
+ sps->pix_fmt = AV_PIX_FMT_GBRP12;
+ break;
+ }
+ }
+ }
+ }
+
+ vui->chroma_loc_info_present_flag = get_bits1(gb);
+ if (vui->chroma_loc_info_present_flag) {
+ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb);
+ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
+ }
+
+ vui->neutra_chroma_indication_flag = get_bits1(gb);
+ vui->field_seq_flag = get_bits1(gb);
+ vui->frame_field_info_present_flag = get_bits1(gb);
+
+ // Backup context in case an alternate header is detected
+ memcpy(&backup, gb, sizeof(backup));
+ memcpy(&backup_vui, vui, sizeof(backup_vui));
+ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
+ vui->default_display_window_flag = 0;
+ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
+ } else
+ vui->default_display_window_flag = get_bits1(gb);
+
+ if (vui->default_display_window_flag) {
+ int vert_mult = 1 + (sps->chroma_format_idc < 2);
+ int horiz_mult = 1 + (sps->chroma_format_idc < 3);
+ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
+ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
+ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
+ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
+
+ if (apply_defdispwin &&
+ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
+ av_log(avctx, AV_LOG_DEBUG,
+ "discarding vui default display window, "
+ "original values are l:%u r:%u t:%u b:%u\n",
+ vui->def_disp_win.left_offset,
+ vui->def_disp_win.right_offset,
+ vui->def_disp_win.top_offset,
+ vui->def_disp_win.bottom_offset);
+
+ vui->def_disp_win.left_offset =
+ vui->def_disp_win.right_offset =
+ vui->def_disp_win.top_offset =
+ vui->def_disp_win.bottom_offset = 0;
+ }
+ }
+
+timing_info:
+ vui->vui_timing_info_present_flag = get_bits1(gb);
+
+ if (vui->vui_timing_info_present_flag) {
+        if (get_bits_left(gb) < 66 && !alt) {
+            // The alternate syntax seems to have the timing info located
+            // where def_disp_win is normally located
+ av_log(avctx, AV_LOG_WARNING,
+ "Strange VUI timing information, retrying...\n");
+ memcpy(vui, &backup_vui, sizeof(backup_vui));
+ memcpy(gb, &backup, sizeof(backup));
+ alt = 1;
+ goto timing_info;
+ }
+ vui->vui_num_units_in_tick = get_bits_long(gb, 32);
+ vui->vui_time_scale = get_bits_long(gb, 32);
+ if (alt) {
+ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
+ vui->vui_time_scale, vui->vui_num_units_in_tick);
+ }
+ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
+ if (vui->vui_poc_proportional_to_timing_flag)
+ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
+ vui->vui_hrd_parameters_present_flag = get_bits1(gb);
+ if (vui->vui_hrd_parameters_present_flag)
+ decode_hrd(gb, 1, sps->max_sub_layers);
+ }
+
+ vui->bitstream_restriction_flag = get_bits1(gb);
+ if (vui->bitstream_restriction_flag) {
+ if (get_bits_left(gb) < 8 && !alt) {
+ av_log(avctx, AV_LOG_WARNING,
+ "Strange VUI bitstream restriction information, retrying"
+ " from timing information...\n");
+ memcpy(vui, &backup_vui, sizeof(backup_vui));
+ memcpy(gb, &backup, sizeof(backup));
+ alt = 1;
+ goto timing_info;
+ }
+ vui->tiles_fixed_structure_flag = get_bits1(gb);
+ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
+ vui->restricted_ref_pic_lists_flag = get_bits1(gb);
+ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb);
+ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb);
+ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb);
+ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb);
+ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb);
+ }
+
+ if (get_bits_left(gb) < 1 && !alt) {
+ // XXX: Alternate syntax when sps_range_extension_flag != 0?
+ av_log(avctx, AV_LOG_WARNING,
+ "Overread in VUI, retrying from timing information...\n");
+ memcpy(vui, &backup_vui, sizeof(backup_vui));
+ memcpy(gb, &backup, sizeof(backup));
+ alt = 1;
+ goto timing_info;
+ }
+}
+
+static void set_default_scaling_list_data(ScalingList * const sl)
+{
+ int matrixId;
+
+ for (matrixId = 0; matrixId < 6; matrixId++) {
+ // 4x4 default is 16
+ memset(sl->sl[0][matrixId], 16, 16);
+ sl->sl_dc[0][matrixId] = 16; // default for 16x16
+ sl->sl_dc[1][matrixId] = 16; // default for 32x32
+ }
+
+ memcpy(sl->sl[1][0], default_scaling_list_intra, 64);
+ memcpy(sl->sl[1][1], default_scaling_list_intra, 64);
+ memcpy(sl->sl[1][2], default_scaling_list_intra, 64);
+
+ memcpy(sl->sl[1][3], default_scaling_list_inter, 64);
+ memcpy(sl->sl[1][4], default_scaling_list_inter, 64);
+ memcpy(sl->sl[1][5], default_scaling_list_inter, 64);
+
+ memcpy(sl->sl[2][0], default_scaling_list_intra, 64);
+ memcpy(sl->sl[2][1], default_scaling_list_intra, 64);
+ memcpy(sl->sl[2][2], default_scaling_list_intra, 64);
+
+ memcpy(sl->sl[2][3], default_scaling_list_inter, 64);
+ memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
+ memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
+
+ memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
+ memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
+ memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
+
+ memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
+ memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
+ memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
+}
+
+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
+ const HEVCRpiSPS * const sps)
+{
+ uint8_t scaling_list_pred_mode_flag;
+ int32_t scaling_list_dc_coef[2][6];
+ int size_id, matrix_id, pos;
+ int i;
+
+ for (size_id = 0; size_id < 4; size_id++)
+ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
+ scaling_list_pred_mode_flag = get_bits1(gb);
+ if (!scaling_list_pred_mode_flag) {
+ unsigned int delta = get_ue_golomb_long(gb);
+ /* Only need to handle non-zero delta. Zero means default,
+ * which should already be in the arrays. */
+ if (delta) {
+ // Copy from previous array.
+ delta *= (size_id == 3) ? 3 : 1;
+ if (matrix_id < delta) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid delta in scaling list data: %d.\n", delta);
+ return AVERROR_INVALIDDATA;
+ }
+
+ memcpy(sl->sl[size_id][matrix_id],
+ sl->sl[size_id][matrix_id - delta],
+ size_id > 0 ? 64 : 16);
+ if (size_id > 1)
+ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
+ }
+ } else {
+ int next_coef, coef_num;
+ int32_t scaling_list_delta_coef;
+
+ next_coef = 8;
+ coef_num = FFMIN(64, 1 << (4 + (size_id << 1)));
+ if (size_id > 1) {
+ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
+ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
+ sl->sl_dc[size_id - 2][matrix_id] = next_coef;
+ }
+ for (i = 0; i < coef_num; i++) {
+ if (size_id == 0)
+ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
+ ff_hevc_rpi_diag_scan4x4_x[i];
+ else
+ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
+ ff_hevc_rpi_diag_scan8x8_x[i];
+
+ scaling_list_delta_coef = get_se_golomb(gb);
+ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
+ sl->sl[size_id][matrix_id][pos] = next_coef;
+ }
+ }
+ }
+
+ if (sps->chroma_format_idc == 3) {
+ for (i = 0; i < 64; i++) {
+ sl->sl[3][1][i] = sl->sl[2][1][i];
+ sl->sl[3][2][i] = sl->sl[2][2][i];
+ sl->sl[3][4][i] = sl->sl[2][4][i];
+ sl->sl[3][5][i] = sl->sl[2][5][i];
+ }
+ sl->sl_dc[1][1] = sl->sl_dc[0][1];
+ sl->sl_dc[1][2] = sl->sl_dc[0][2];
+ sl->sl_dc[1][4] = sl->sl_dc[0][4];
+ sl->sl_dc[1][5] = sl->sl_dc[0][5];
+ }
+
+
+ return 0;
+}
+
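+/* Map bit depth / chroma format onto the Pi-specific column ("SAND") pixel
+ * formats. Only 4:2:0 at 8 or 10 bits is supported here; anything else
+ * leaves pix_fmt as AV_PIX_FMT_NONE. */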
+static int map_pixel_format(HEVCRpiSPS * const sps)
+{
+ const int cfmt = sps->chroma_format_idc;
+
+ sps->pix_fmt = AV_PIX_FMT_NONE;
+ switch (sps->bit_depth) {
+ case 8:
+ if (cfmt == 1)
+ sps->pix_fmt = AV_PIX_FMT_SAND128;
+ break;
+ case 10:
+ if (cfmt == 1)
+ sps->pix_fmt = AV_PIX_FMT_SAND64_10;
+ break;
+ default:
+ break;
+ }
+
+ sps->hshift[0] = sps->vshift[0] = 0;
+ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4
+ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2
+
+ sps->pixel_shift = sps->bit_depth > 8 ? 1 : 0;
+
+ return 0;
+}
+
+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
+ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
+{
+ HEVCRpiWindow *ow;
+ int ret = 0;
+ int log2_diff_max_min_transform_block_size;
+ int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
+ int i;
+
+ // Coded parameters
+
+ sps->vps_id = get_bits(gb, 4);
+ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (vps_list && !vps_list[sps->vps_id]) {
+ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
+ sps->vps_id);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sps->max_sub_layers = get_bits(gb, 3) + 1;
+ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
+ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
+ sps->max_sub_layers);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sps->temporal_id_nesting_flag = get_bits(gb, 1);
+
+ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
+ return ret;
+
+ *sps_id = get_ue_golomb_long(gb);
+ if (*sps_id >= HEVC_MAX_SPS_COUNT) {
+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sps->chroma_format_idc = get_ue_golomb_long(gb);
+ if (sps->chroma_format_idc > 3U) {
+ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (sps->chroma_format_idc == 3)
+ sps->separate_colour_plane_flag = get_bits1(gb);
+
+ if (sps->separate_colour_plane_flag)
+ sps->chroma_format_idc = 0;
+
+ sps->width = get_ue_golomb_long(gb);
+ sps->height = get_ue_golomb_long(gb);
+ if ((ret = av_image_check_size(sps->width,
+ sps->height, 0, avctx)) < 0)
+ return ret;
+
+ if (get_bits1(gb)) { // pic_conformance_flag
+ int vert_mult = 1 + (sps->chroma_format_idc < 2);
+ int horiz_mult = 1 + (sps->chroma_format_idc < 3);
+ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
+ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
+ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
+ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
+
+ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
+ av_log(avctx, AV_LOG_DEBUG,
+ "discarding sps conformance window, "
+ "original values are l:%u r:%u t:%u b:%u\n",
+ sps->pic_conf_win.left_offset,
+ sps->pic_conf_win.right_offset,
+ sps->pic_conf_win.top_offset,
+ sps->pic_conf_win.bottom_offset);
+
+ sps->pic_conf_win.left_offset =
+ sps->pic_conf_win.right_offset =
+ sps->pic_conf_win.top_offset =
+ sps->pic_conf_win.bottom_offset = 0;
+ }
+ sps->output_window = sps->pic_conf_win;
+ }
+
+ sps->bit_depth = get_ue_golomb_long(gb) + 8;
+ bit_depth_chroma = get_ue_golomb_long(gb) + 8;
+ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Luma bit depth (%d) is different from chroma bit depth (%d), "
+ "this is unsupported.\n",
+ sps->bit_depth, bit_depth_chroma);
+ return AVERROR_INVALIDDATA;
+ }
+
+ ret = map_pixel_format(sps);
+ if (ret < 0)
+ return ret;
+
+ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
+ if (sps->log2_max_poc_lsb > 16) {
+ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
+ sps->log2_max_poc_lsb - 4);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sublayer_ordering_info = get_bits1(gb);
+ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
+ for (i = start; i < sps->max_sub_layers; i++) {
+ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
+ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb);
+ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1;
+ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
+ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
+ sps->temporal_layer[i].max_dec_pic_buffering - 1U);
+ return AVERROR_INVALIDDATA;
+ }
+ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
+ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
+ sps->temporal_layer[i].num_reorder_pics);
+ if (avctx->err_recognition & AV_EF_EXPLODE ||
+ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
+ return AVERROR_INVALIDDATA;
+ }
+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
+ }
+ }
+
+ if (!sublayer_ordering_info) {
+ for (i = 0; i < start; i++) {
+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
+ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics;
+ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase;
+ }
+ }
+
+ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3;
+ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
+ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2;
+ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb);
+ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size +
+ sps->log2_min_tb_size;
+
+ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (sps->log2_diff_max_min_coding_block_size > 30) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ {
+ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
+        // Not a bitstream limitation as such, but a constraint in all currently defined profiles
+ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ // Inferred parameters
+ sps->log2_ctb_size = CtbLog2SizeY;
+// sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
+ }
+
+ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
+ sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
+
+ sps->scaling_list_enable_flag = get_bits1(gb);
+ if (sps->scaling_list_enable_flag) {
+ set_default_scaling_list_data(&sps->scaling_list);
+
+ if (get_bits1(gb)) {
+ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ sps->amp_enabled_flag = get_bits1(gb);
+ sps->sao_enabled = get_bits1(gb);
+
+ // Set pcm defaults (0) so we don't have to test _enabled when we
+ // want to use them
+ memset(&sps->pcm, 0, sizeof(sps->pcm));
+
+ if (get_bits1(gb)) // pcm_enabled_flag
+ {
+ const unsigned int limit_max_pcm = FFMIN(5,
+ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
+ sps->pcm.bit_depth = get_bits(gb, 4) + 1;
+ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
+ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
+ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
+ get_ue_golomb_long(gb);
+ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
+ av_log(avctx, AV_LOG_ERROR,
+ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
+ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
+ return AVERROR_INVALIDDATA;
+ }
+ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
+ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
+ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)",
+ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sps->pcm.loop_filter_disable_flag = get_bits1(gb);
+ }
+
+    // Could be based on min_pcm_cb_size, but the logic is much simpler if we
+    // just stick with 8 (and it costs us little)
+ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up
+ sps->pcm_height = (sps->height + 7) >> 3;
+
+ sps->nb_st_rps = get_ue_golomb_long(gb);
+ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
+ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
+ sps->nb_st_rps);
+ return AVERROR_INVALIDDATA;
+ }
+ for (i = 0; i < sps->nb_st_rps; i++) {
+ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
+ sps, 0)) < 0)
+ return ret;
+ }
+
+ sps->long_term_ref_pics_present_flag = get_bits1(gb);
+ if (sps->long_term_ref_pics_present_flag) {
+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
+ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
+ sps->num_long_term_ref_pics_sps);
+ return AVERROR_INVALIDDATA;
+ }
+ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
+ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb);
+ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
+ }
+ }
+
+ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb);
+ sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag
+ sps->vui.sar = (AVRational){0, 1};
+ vui_present = get_bits1(gb);
+ if (vui_present)
+ decode_vui(gb, avctx, apply_defdispwin, sps);
+
+ if (get_bits1(gb)) { // sps_extension_flag
+ int sps_extension_flag[1];
+ for (i = 0; i < 1; i++)
+ sps_extension_flag[i] = get_bits1(gb);
+ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
+ if (sps_extension_flag[0]) {
+ int extended_precision_processing_flag;
+ int cabac_bypass_alignment_enabled_flag;
+
+ sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
+ sps->transform_skip_context_enabled_flag = get_bits1(gb);
+ sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
+
+ sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
+
+ extended_precision_processing_flag = get_bits1(gb);
+ if (extended_precision_processing_flag)
+ av_log(avctx, AV_LOG_WARNING,
+ "extended_precision_processing_flag not yet implemented\n");
+
+ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag
+ sps->intra_filters_disable |= FILTER_EITHER;
+ sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
+
+ cabac_bypass_alignment_enabled_flag = get_bits1(gb);
+ if (cabac_bypass_alignment_enabled_flag)
+ av_log(avctx, AV_LOG_WARNING,
+ "cabac_bypass_alignment_enabled_flag not yet implemented\n");
+ }
+ }
+ if (apply_defdispwin) {
+ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset;
+ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset;
+ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset;
+ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
+ }
+
+ ow = &sps->output_window;
+ if (ow->left_offset >= INT_MAX - ow->right_offset ||
+ ow->top_offset >= INT_MAX - ow->bottom_offset ||
+ ow->left_offset + ow->right_offset >= sps->width ||
+ ow->top_offset + ow->bottom_offset >= sps->height) {
+ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
+ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
+ if (avctx->err_recognition & AV_EF_EXPLODE) {
+ return AVERROR_INVALIDDATA;
+ }
+ av_log(avctx, AV_LOG_WARNING,
+ "Displaying the whole video surface.\n");
+ memset(ow, 0, sizeof(*ow));
+ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
+ }
+
+ // Inferred parameters
+
+ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
+ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
+ sps->ctb_size = sps->ctb_width * sps->ctb_height;
+
+ sps->min_cb_width = sps->width >> sps->log2_min_cb_size;
+ sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
+ sps->min_tb_width = sps->width >> sps->log2_min_tb_size;
+ sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
+ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE;
+ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
+ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
+
+ sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
+ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
+
+ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
+ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
+ sps->max_transform_hierarchy_depth_inter);
+ return AVERROR_INVALIDDATA;
+ }
+ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
+ sps->max_transform_hierarchy_depth_intra);
+ return AVERROR_INVALIDDATA;
+ }
+ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
+ av_log(avctx, AV_LOG_ERROR,
+ "max transform block size out of range: %d\n",
+ sps->log2_max_trafo_size);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (get_bits_left(gb) < 0) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Overread SPS by %d bits\n", -get_bits_left(gb));
+ return AVERROR_INVALIDDATA;
+ }
+
+ return 0;
+}
+
+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
+ HEVCRpiParamSets *ps, int apply_defdispwin)
+{
+ HEVCRpiSPS *sps;
+ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
+ unsigned int sps_id;
+ int ret;
+ ptrdiff_t nal_size;
+
+ if (!sps_buf)
+ return AVERROR(ENOMEM);
+ sps = (HEVCRpiSPS*)sps_buf->data;
+
+ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
+
+ nal_size = gb->buffer_end - gb->buffer;
+ if (nal_size > sizeof(sps->data)) {
+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+ nal_size, sizeof(sps->data));
+ sps->data_size = sizeof(sps->data);
+ } else {
+ sps->data_size = nal_size;
+ }
+ memcpy(sps->data, gb->buffer, sps->data_size);
+
+ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id,
+ apply_defdispwin,
+ ps->vps_list, avctx);
+ if (ret < 0) {
+ av_buffer_unref(&sps_buf);
+ return ret;
+ }
+
+ if (avctx->debug & FF_DEBUG_BITSTREAM) {
+ av_log(avctx, AV_LOG_DEBUG,
+ "Parsed SPS: id %d; coded wxh: %dx%d; "
+ "cropped wxh: %dx%d; pix_fmt: %s.\n",
+ sps_id, sps->width, sps->height,
+ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
+ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
+ av_get_pix_fmt_name(sps->pix_fmt));
+ }
+
+ /* check if this is a repeat of an already parsed SPS, then keep the
+ * original one.
+ * otherwise drop all PPSes that depend on it */
+ if (ps->sps_list[sps_id] &&
+ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) {
+ av_buffer_unref(&sps_buf);
+ } else {
+ remove_sps(ps, sps_id);
+ ps->sps_list[sps_id] = sps_buf;
+ }
+
+ return 0;
+}
+
+static void hevc_pps_free(void *opaque, uint8_t *data)
+{
+ HEVCRpiPPS *pps = (HEVCRpiPPS*)data;
+
+ av_freep(&pps->column_width);
+ av_freep(&pps->row_height);
+ av_freep(&pps->col_bd);
+ av_freep(&pps->row_bd);
+ av_freep(&pps->col_idxX);
+ av_freep(&pps->ctb_addr_rs_to_ts);
+ av_freep(&pps->ctb_addr_ts_to_rs);
+ av_freep(&pps->tile_pos_ts);
+ av_freep(&pps->tile_size);
+ av_freep(&pps->tile_id);
+ av_freep(&pps->ctb_ts_flags);
+
+ av_freep(&pps);
+}
+
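+/* Read n_minus_1 + 1 signed QP offsets into offsets[], each required to lie
+ * in [-12, +12] (the legal range for cb/cr_qp_offset_list[] entries). */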
+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
+{
+ do
+ {
+ const int offset = get_se_golomb_long(gb);
+ if (offset < -12 || offset > 12) {
+ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset);
+ return AVERROR_INVALIDDATA;
+ }
+ *offsets++ = offset;
+ } while (n_minus_1-- != 0);
+ return 0;
+}
+
+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
+{
+ if (pps->transform_skip_enabled_flag) {
+ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
+ }
+ pps->cross_component_prediction_enabled_flag = get_bits1(gb);
+ if (pps->cross_component_prediction_enabled_flag &&
+ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
+ {
+ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
+ return AVERROR_INVALIDDATA;
+ }
+ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
+ if (pps->chroma_qp_offset_list_enabled_flag) {
+ int err;
+
+ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
+ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
+ if (pps->chroma_qp_offset_list_len_minus1 > 5) {
+ av_log(avctx, AV_LOG_ERROR,
+ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
+ return AVERROR_INVALIDDATA;
+ }
+ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
+
+ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
+ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
+ return err;
+ }
+
+ {
+ const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
+
+ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
+ if (pps->log2_sao_offset_scale_luma > max_offset) {
+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid");
+ return AVERROR_INVALIDDATA;
+ }
+ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
+ if (pps->log2_sao_offset_scale_chroma > max_offset) {
+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid");
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+    return 0;
+}
+
+static inline int setup_pps(AVCodecContext * const avctx,
+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
+{
+ int pic_area_in_ctbs;
+ int i, j, x, y, ctb_addr_rs, tile_id;
+
+ // Inferred parameters
+
+ // qp_y -> qp_u/qp_v tables
+ // The tables have at least -24,+24 overrun after adding offset here
+    // which should allow for clipless offsetting
+
+ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code
+ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
+
+ if (sps->chroma_format_idc == 1) {
+ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
+ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
+ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
+ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
+ }
+ else
+ {
+ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
+ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
+ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
+ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
+ }
+
+ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
+ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd));
+ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX));
+ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
+ return AVERROR(ENOMEM);
+
+ if (pps->uniform_spacing_flag) {
+ if (!pps->column_width) {
+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height));
+ }
+ if (!pps->column_width || !pps->row_height)
+ return AVERROR(ENOMEM);
+
+ for (i = 0; i < pps->num_tile_columns; i++) {
+ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
+ (i * sps->ctb_width) / pps->num_tile_columns;
+ }
+
+ for (i = 0; i < pps->num_tile_rows; i++) {
+ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
+ (i * sps->ctb_height) / pps->num_tile_rows;
+ }
+ }
+
+ {
+ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
+ pps->col_bd[0] = 0;
+ pps->tile_wpp_inter_disable = 0;
+ for (i = 0; i < pps->num_tile_columns; i++)
+ {
+ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
+
+            // Avoid trying tile parallel if the columns don't fall on cache boundaries
+            // (this causes too much pain syncing flushes with the QPU)
+            // Ignore the final (RHS of pic) tile boundary
+ if ((pps->col_bd[i] & td_mask) != 0) {
+ pps->tile_wpp_inter_disable = 1;
+ }
+ }
+
+ // If we can start the next row before finishing the first line of
+ // this one then we must wait at the end of the tile
+ // * if this happens a lot then there are better but more complicated
+ // conditions that we could apply
+ if (pps->tile_wpp_inter_disable) {
+ for (i = 0; i < pps->num_tile_rows; i++)
+ {
+ if (pps->row_height[i] <= RPI_MAX_JOBS) {
+ pps->tile_wpp_inter_disable = 2;
+ break;
+ }
+ }
+ }
+ }
+
+ pps->row_bd[0] = 0;
+ for (i = 0; i < pps->num_tile_rows; i++)
+ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
+
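+    // col_idxX maps a CTB x coordinate to the tile column that contains it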
+ for (i = 0, j = 0; i < sps->ctb_width; i++) {
+ if (i >= pps->col_bd[j + 1])
+ j++;
+ pps->col_idxX[i] = j;
+ }
+
+ /**
+ * 6.5
+ */
+ pic_area_in_ctbs = sps->ctb_size;
+
+ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts));
+ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs));
+ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id));
+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags));
+    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
+        !pps->tile_id || !pps->tile_pos_ts || !pps->tile_size ||
+        !pps->ctb_ts_flags) {
+ return AVERROR(ENOMEM);
+ }
+
+ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
+
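+    // Derive CtbAddrRsToTs as in 6.5.1: locate the tile containing each CTB,
+    // count the CTBs of all tiles that precede it in tile-scan order, then
+    // add the CTB's raster offset within its own tile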
+ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
+ int tb_x = ctb_addr_rs % sps->ctb_width;
+ int tb_y = ctb_addr_rs / sps->ctb_width;
+ int tile_x = 0;
+ int tile_y = 0;
+ int val = 0;
+
+ for (i = 0; i < pps->num_tile_columns; i++) {
+ if (tb_x < pps->col_bd[i + 1]) {
+ tile_x = i;
+ break;
+ }
+ }
+
+ for (i = 0; i < pps->num_tile_rows; i++) {
+ if (tb_y < pps->row_bd[i + 1]) {
+ tile_y = i;
+ break;
+ }
+ }
+
+ for (i = 0; i < tile_x; i++)
+ val += pps->row_height[tile_y] * pps->column_width[i];
+ for (i = 0; i < tile_y; i++)
+ val += sps->ctb_width * pps->row_height[i];
+
+ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
+ tb_x - pps->col_bd[tile_x];
+
+ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
+ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs;
+ }
+
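+    // Walk the tiles in tile-scan order setting per-CTB flags: tile-line
+    // start/end, top-of-tile rows, and the CABAC save/load/init points
+    // used when entropy coding sync (WPP) is enabled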
+ {
+ uint8_t * pflags = pps->ctb_ts_flags;
+ uint16_t * ptid = pps->tile_id;
+
+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
+ {
+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
+ {
+ const unsigned int tile_w = pps->column_width[i];
+
+ pflags[0] |= CTB_TS_FLAGS_CIREQ;
+
+ for (x = 0; x != tile_w; ++x) {
+ pflags[x] |= CTB_TS_FLAGS_TOT;
+ }
+
+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
+ {
+ pflags[0] |= CTB_TS_FLAGS_SOTL;
+
+ if (pps->entropy_coding_sync_enabled_flag)
+ {
+ if (pps->column_width[i] != 1)
+ pflags[1] |= CTB_TS_FLAGS_CSAVE;
+ else
+ pflags[0] |= CTB_TS_FLAGS_CIREQ;
+
+ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
+ pflags[0] |= CTB_TS_FLAGS_CLOAD;
+ }
+
+ for (x = 0; x != tile_w; ++x)
+ *ptid++ = tile_id;
+
+ pflags += tile_w;
+ pflags[-1] |= CTB_TS_FLAGS_EOTL;
+ if (i + 1 == pps->num_tile_columns)
+ pflags[-1] |= CTB_TS_FLAGS_EOL;
+ }
+
+ pflags[-1] |= CTB_TS_FLAGS_EOT;
+ }
+ }
+ }
+
+ {
+ unsigned int ts = 0;
+ for (j = 0; j < pps->num_tile_rows; j++)
+ for (i = 0; i < pps->num_tile_columns; i++)
+ {
+ const unsigned int size = pps->column_width[i] * pps->row_height[j];
+ pps->tile_size[j * pps->num_tile_columns + i] = size;
+ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
+ ts += size;
+ }
+ }
+
+ return 0;
+}
+
+int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
+ HEVCRpiParamSets * const ps)
+{
+ const HEVCRpiSPS *sps = NULL;
+ int i, ret = 0;
+ unsigned int pps_id = 0;
+ ptrdiff_t nal_size;
+ unsigned log2_parallel_merge_level_minus2;
+
+ AVBufferRef *pps_buf;
+ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
+
+ if (!pps)
+ return AVERROR(ENOMEM);
+
+ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
+ hevc_pps_free, NULL, 0);
+ if (!pps_buf) {
+ av_freep(&pps);
+ return AVERROR(ENOMEM);
+ }
+
+ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
+
+ nal_size = gb->buffer_end - gb->buffer;
+ if (nal_size > sizeof(pps->data)) {
+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+ nal_size, sizeof(pps->data));
+ pps->data_size = sizeof(pps->data);
+ } else {
+ pps->data_size = nal_size;
+ }
+ memcpy(pps->data, gb->buffer, pps->data_size);
+
+ // Default values
+ pps->loop_filter_across_tiles_enabled_flag = 1;
+ pps->num_tile_columns = 1;
+ pps->num_tile_rows = 1;
+ pps->uniform_spacing_flag = 1;
+ pps->disable_dbf = 0;
+ pps->beta_offset = 0;
+ pps->tc_offset = 0;
+ pps->log2_max_transform_skip_block_size = 2;
+
+ // Coded parameters
+ pps_id = get_ue_golomb_long(gb);
+ if (pps_id >= HEVC_MAX_PPS_COUNT) {
+ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->sps_id = get_ue_golomb_long(gb);
+ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ if (!ps->sps_list[pps->sps_id]) {
+ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
+
+ pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
+ pps->output_flag_present_flag = get_bits1(gb);
+ pps->num_extra_slice_header_bits = get_bits(gb, 3);
+
+ pps->sign_data_hiding_flag = get_bits1(gb);
+
+ pps->cabac_init_present_flag = get_bits1(gb);
+
+ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
+ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
+ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+
+ pps->pic_init_qp_minus26 = get_se_golomb(gb);
+ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
+ av_log(avctx, AV_LOG_ERROR,
+ "init_qp_minus26 %d is outside the valid range "
+ "[%d, %d].\n",
+ pps->pic_init_qp_minus26,
+ -(26 + sps->qp_bd_offset), 25);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+
+ pps->constrained_intra_pred_flag = get_bits1(gb);
+ pps->transform_skip_enabled_flag = get_bits1(gb);
+
+ pps->cu_qp_delta_enabled_flag = get_bits1(gb);
+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
+ if (pps->cu_qp_delta_enabled_flag)
+ {
+ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
+
+ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
+ av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
+ diff_cu_qp_delta_depth);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+
+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
+ }
+
+ pps->cb_qp_offset = get_se_golomb(gb);
+ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
+ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
+ pps->cb_qp_offset);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->cr_qp_offset = get_se_golomb(gb);
+ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
+ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
+ pps->cr_qp_offset);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
+
+ pps->weighted_pred_flag = get_bits1(gb);
+ pps->weighted_bipred_flag = get_bits1(gb);
+
+ pps->transquant_bypass_enable_flag = get_bits1(gb);
+ pps->tiles_enabled_flag = get_bits1(gb);
+ pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
+
+ if (pps->tiles_enabled_flag) {
+ pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
+ pps->num_tile_rows = get_ue_golomb_long(gb) + 1;
+ if (pps->num_tile_columns <= 0 ||
+ pps->num_tile_columns >= sps->width) {
+ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
+ pps->num_tile_columns - 1);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ if (pps->num_tile_rows <= 0 ||
+ pps->num_tile_rows >= sps->height) {
+ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
+ pps->num_tile_rows - 1);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+
+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height));
+ if (!pps->column_width || !pps->row_height) {
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+
+ pps->uniform_spacing_flag = get_bits1(gb);
+ if (!pps->uniform_spacing_flag) {
+ uint64_t sum = 0;
+ for (i = 0; i < pps->num_tile_columns - 1; i++) {
+ pps->column_width[i] = get_ue_golomb_long(gb) + 1;
+ sum += pps->column_width[i];
+ }
+ if (sum >= sps->ctb_width) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
+
+ sum = 0;
+ for (i = 0; i < pps->num_tile_rows - 1; i++) {
+ pps->row_height[i] = get_ue_golomb_long(gb) + 1;
+ sum += pps->row_height[i];
+ }
+ if (sum >= sps->ctb_height) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
+ }
+ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
+ }
+
+ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
+
+ pps->deblocking_filter_control_present_flag = get_bits1(gb);
+ if (pps->deblocking_filter_control_present_flag) {
+ pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
+ pps->disable_dbf = get_bits1(gb);
+ if (!pps->disable_dbf) {
+ int beta_offset_div2 = get_se_golomb(gb);
+            int tc_offset_div2 = get_se_golomb(gb);
+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
+ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
+ beta_offset_div2);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
+ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
+ tc_offset_div2);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->beta_offset = 2 * beta_offset_div2;
+ pps->tc_offset = 2 * tc_offset_div2;
+ }
+ }
+
+ pps->scaling_list_data_present_flag = get_bits1(gb);
+ if (pps->scaling_list_data_present_flag) {
+ set_default_scaling_list_data(&pps->scaling_list);
+ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
+ if (ret < 0)
+ goto err;
+ }
+ pps->lists_modification_present_flag = get_bits1(gb);
+ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb);
+ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
+ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n",
+ log2_parallel_merge_level_minus2);
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2;
+
+ pps->slice_header_extension_present_flag = get_bits1(gb);
+
+ if (get_bits1(gb)) { // pps_extension_present_flag
+ int pps_range_extensions_flag = get_bits1(gb);
+ skip_bits(gb, 7); // pps_extension_7bits
+ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
+ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
+ goto err;
+ }
+ }
+
+ ret = setup_pps(avctx, pps, sps);
+ if (ret < 0)
+ goto err;
+
+ if (get_bits_left(gb) < 0) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Overread PPS by %d bits\n", -get_bits_left(gb));
+ ret = AVERROR_INVALIDDATA;
+ goto err;
+ }
+
+ remove_pps(ps, pps_id);
+ ps->pps_list[pps_id] = pps_buf;
+
+ return 0;
+
+err:
+ av_buffer_unref(&pps_buf);
+ return ret;
+}
+
+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
+{
+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
+ int prev_poc_lsb = pocTid0 % max_poc_lsb;
+ int prev_poc_msb = pocTid0 - prev_poc_lsb;
+ int poc_msb;
+
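+    // 8.3.1: choose poc_msb so that the reconstructed POC lies within half
+    // the LSB wrap range (max_poc_lsb / 2) of the previous TID0 picture's POC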
+ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
+ poc_msb = prev_poc_msb + max_poc_lsb;
+ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
+ poc_msb = prev_poc_msb - max_poc_lsb;
+ else
+ poc_msb = prev_poc_msb;
+
+ // For BLA picture types, POCmsb is set to 0.
+ if (nal_unit_type == HEVC_NAL_BLA_W_LP ||
+ nal_unit_type == HEVC_NAL_BLA_W_RADL ||
+ nal_unit_type == HEVC_NAL_BLA_N_LP)
+ poc_msb = 0;
+
+ return poc_msb + poc_lsb;
+}
diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
new file mode 100644
index 0000000000..c725ebb9ca
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.h
@@ -0,0 +1,449 @@
+/*
+ * HEVC parameter set parsing
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVC_PS_H
+#define AVCODEC_RPI_HEVC_PS_H
+
+#include <stdint.h>
+
+#include "libavutil/buffer.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/rational.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "hevc.h"
+
+typedef struct ShortTermRPS {
+ unsigned int num_negative_pics;
+ int num_delta_pocs;
+ int rps_idx_num_delta_pocs;
+ int32_t delta_poc[32];
+ uint8_t used[32];
+} ShortTermRPS;
+
+typedef struct LongTermRPS {
+ int poc[32];
+ uint8_t used[32];
+ uint8_t nb_refs;
+} LongTermRPS;
+
+typedef struct RpiSliceHeader {
+ unsigned int pps_id;
+
+ ///< address (in raster order) of the first block in the current slice segment
+ unsigned int slice_segment_addr;
+ ///< address (in raster order) of the first block in the current slice
+ unsigned int slice_addr;
+
+ enum HEVCSliceType slice_type;
+
+ int pic_order_cnt_lsb;
+
+ uint8_t first_slice_in_pic_flag;
+ uint8_t dependent_slice_segment_flag;
+ uint8_t pic_output_flag;
+ uint8_t colour_plane_id;
+
+ ///< RPS coded in the slice header itself is stored here
+ int short_term_ref_pic_set_sps_flag;
+ int short_term_ref_pic_set_size;
+ ShortTermRPS slice_rps;
+ const ShortTermRPS *short_term_rps;
+ int long_term_ref_pic_set_size;
+ LongTermRPS long_term_rps;
+ unsigned int list_entry_lx[2][32];
+
+ uint8_t rpl_modification_flag[2];
+ uint8_t no_output_of_prior_pics_flag;
+ uint8_t slice_temporal_mvp_enabled_flag;
+
+ unsigned int nb_refs[2];
+
+ uint8_t slice_sample_adaptive_offset_flag[3];
+ uint8_t mvd_l1_zero_flag;
+
+ uint8_t cabac_init_flag;
+ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
+ uint8_t slice_loop_filter_across_slices_enabled_flag;
+ uint8_t collocated_list;
+
+ uint8_t no_dblk_boundary_flags;
+
+ unsigned int collocated_ref_idx;
+
+ int slice_qp_delta;
+ int slice_cb_qp_offset; // -12, +12
+ int slice_cr_qp_offset; // -12, +12
+
+ uint8_t cu_chroma_qp_offset_enabled_flag;
+
+ int beta_offset; ///< beta_offset_div2 * 2
+ int tc_offset; ///< tc_offset_div2 * 2
+
+ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
+
+ unsigned *entry_point_offset;
+ int * offset;
+ int * size;
+ int num_entry_point_offsets;
+ int offsets_allocated;
+
+ uint8_t offload_wpp;
+ uint8_t offload_tiles;
+
+ int8_t slice_qp;
+
+ uint8_t luma_log2_weight_denom;
+ uint8_t chroma_log2_weight_denom;
+
+ int16_t luma_weight_l0[16]; // -128, +255
+ int16_t luma_offset_l0[16];
+ int16_t chroma_weight_l0[16][2];
+ int16_t chroma_offset_l0[16][2];
+
+ int16_t luma_weight_l1[16];
+ int16_t luma_offset_l1[16];
+ int16_t chroma_weight_l1[16][2];
+ int16_t chroma_offset_l1[16][2];
+
+} RpiSliceHeader;
+
+typedef struct HEVCRpiWindow {
+ uint16_t left_offset;
+ uint16_t right_offset;
+ uint16_t top_offset;
+ uint16_t bottom_offset;
+} HEVCRpiWindow;
+
+typedef struct VUI {
+ AVRational sar;
+
+ int overscan_info_present_flag;
+ int overscan_appropriate_flag;
+
+ int video_signal_type_present_flag;
+ int video_format;
+ int video_full_range_flag;
+ int colour_description_present_flag;
+ uint8_t colour_primaries;
+ uint8_t transfer_characteristic;
+ uint8_t matrix_coeffs;
+
+ int chroma_loc_info_present_flag;
+ int chroma_sample_loc_type_top_field;
+ int chroma_sample_loc_type_bottom_field;
+ int neutra_chroma_indication_flag;
+
+ int field_seq_flag;
+ int frame_field_info_present_flag;
+
+ int default_display_window_flag;
+ HEVCRpiWindow def_disp_win;
+
+ int vui_timing_info_present_flag;
+ uint32_t vui_num_units_in_tick;
+ uint32_t vui_time_scale;
+ int vui_poc_proportional_to_timing_flag;
+ int vui_num_ticks_poc_diff_one_minus1;
+ int vui_hrd_parameters_present_flag;
+
+ int bitstream_restriction_flag;
+ int tiles_fixed_structure_flag;
+ int motion_vectors_over_pic_boundaries_flag;
+ int restricted_ref_pic_lists_flag;
+ int min_spatial_segmentation_idc;
+ int max_bytes_per_pic_denom;
+ int max_bits_per_min_cu_denom;
+ int log2_max_mv_length_horizontal;
+ int log2_max_mv_length_vertical;
+} VUI;
+
+typedef struct PTLCommon {
+ uint8_t profile_space;
+ uint8_t tier_flag;
+ uint8_t profile_idc;
+ uint8_t profile_compatibility_flag[32];
+ uint8_t level_idc;
+ uint8_t progressive_source_flag;
+ uint8_t interlaced_source_flag;
+ uint8_t non_packed_constraint_flag;
+ uint8_t frame_only_constraint_flag;
+} PTLCommon;
+
+typedef struct PTL {
+ PTLCommon general_ptl;
+ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
+
+ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
+ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
+} PTL;
+
+typedef struct HEVCRpiVPS {
+ uint8_t vps_temporal_id_nesting_flag;
+ int vps_max_layers;
+ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
+
+ PTL ptl;
+ int vps_sub_layer_ordering_info_present_flag;
+ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
+ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
+ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
+ int vps_max_layer_id;
+ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
+ uint8_t vps_timing_info_present_flag;
+ uint32_t vps_num_units_in_tick;
+ uint32_t vps_time_scale;
+ uint8_t vps_poc_proportional_to_timing_flag;
+ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
+ int vps_num_hrd_parameters;
+
+ uint8_t data[4096];
+ int data_size;
+} HEVCRpiVPS;
+
+typedef struct ScalingList {
+ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
+ * and size ID 3 only has 2 arrays, not 6. */
+ uint8_t sl[4][6][64];
+ uint8_t sl_dc[2][6];
+} ScalingList;
+
+typedef struct HEVCRpiSPS {
+ unsigned vps_id;
+ uint8_t chroma_format_idc;
+ uint8_t separate_colour_plane_flag;
+
+ HEVCRpiWindow output_window;
+
+ HEVCRpiWindow pic_conf_win;
+
+ uint16_t wp_offset_half_range; // WpOffsetHalfRange
+
+ uint8_t bit_depth;
+
+// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth
+ uint8_t pixel_shift;
+ enum AVPixelFormat pix_fmt;
+
+ unsigned int log2_max_poc_lsb;
+
+ int max_sub_layers;
+ struct {
+ int max_dec_pic_buffering;
+ int num_reorder_pics;
+ int max_latency_increase;
+ } temporal_layer[HEVC_MAX_SUB_LAYERS];
+ uint8_t temporal_id_nesting_flag;
+
+ uint8_t scaling_list_enable_flag;
+ ScalingList scaling_list;
+
+ unsigned int nb_st_rps;
+ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
+
+ uint8_t amp_enabled_flag;
+ uint8_t sao_enabled;
+
+ uint8_t long_term_ref_pics_present_flag;
+ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
+ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
+ uint8_t num_long_term_ref_pics_sps;
+
+ struct {
+ uint8_t bit_depth;
+ uint8_t bit_depth_chroma;
+ uint8_t log2_min_pcm_cb_size;
+ uint8_t log2_max_pcm_cb_size;
+ uint8_t loop_filter_disable_flag;
+ } pcm;
+ char sps_temporal_mvp_enabled_flag;
+//    char sps_strong_intra_smoothing_enable_flag; -> intra_filters_disable
+
+ uint8_t log2_min_cb_size; // 3..6
+ uint8_t log2_diff_max_min_coding_block_size;
+ uint8_t log2_min_tb_size; // 2..5
+ uint8_t log2_max_trafo_size;
+ uint8_t log2_ctb_size; // 4..6
+// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1)
+#define LOG2_MIN_PU_SIZE 2
+#define LOG2_MIN_CU_SIZE 3
+
+ uint8_t max_transform_hierarchy_depth_inter;
+ uint8_t max_transform_hierarchy_depth_intra;
+
+ char transform_skip_rotation_enabled_flag;
+ char transform_skip_context_enabled_flag;
+ char implicit_rdpcm_enabled_flag;
+ char explicit_rdpcm_enabled_flag;
+//    char intra_smoothing_disabled_flag; -> intra_filters_disable
+ char high_precision_offsets_enabled_flag;
+ char persistent_rice_adaptation_enabled_flag;
+
+ uint8_t intra_filters_disable;
+
+ ///< coded frame dimension in various units
+ int width;
+ int height;
+ int ctb_width;
+ int ctb_height;
+    int ctb_size;   // Pic size in CTBs, not size of a CTB
+ int min_cb_width;
+ int min_cb_height;
+ int min_tb_width;
+ int min_tb_height;
+ int min_pu_width;
+ int min_pu_height;
+ int pcm_width;
+ int pcm_height;
+ int tb_mask;
+
+ int hshift[3];
+ int vshift[3];
+
+ int qp_bd_offset;
+
+ uint8_t data[4096];
+ int data_size;
+
+ VUI vui;
+ PTL ptl;
+} HEVCRpiSPS;
+
+#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line
+#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line
+#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line
+#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile
+#define CTB_TS_FLAGS_CSAVE (1U << 4)
+#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request
+#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile
+#define CTB_TS_FLAGS_CLOAD (1U << 7)
+
+typedef struct HEVCRpiPPS {
+ unsigned int sps_id; ///< seq_parameter_set_id
+
+ uint8_t sign_data_hiding_flag;
+
+ uint8_t cabac_init_present_flag;
+
+ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
+ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
+ int pic_init_qp_minus26;
+
+ uint8_t constrained_intra_pred_flag;
+ uint8_t transform_skip_enabled_flag;
+
+ uint8_t cu_qp_delta_enabled_flag;
+ uint8_t log2_min_cu_qp_delta_size;
+ int cb_qp_offset; // -12..12
+ int cr_qp_offset; // -12..12
+ const uint8_t * qp_dblk_x[3];
+ const int8_t * qp_bd_x[3];
+
+ uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
+ uint8_t weighted_pred_flag;
+ uint8_t weighted_bipred_flag;
+ uint8_t output_flag_present_flag;
+ uint8_t transquant_bypass_enable_flag;
+
+ uint8_t dependent_slice_segments_enabled_flag;
+ uint8_t tiles_enabled_flag;
+ uint8_t entropy_coding_sync_enabled_flag;
+
+ uint8_t tile_wpp_inter_disable;
+ int num_tile_columns; ///< num_tile_columns_minus1 + 1
+ int num_tile_rows; ///< num_tile_rows_minus1 + 1
+ uint8_t uniform_spacing_flag;
+ uint8_t loop_filter_across_tiles_enabled_flag;
+
+ uint8_t seq_loop_filter_across_slices_enabled_flag;
+
+ uint8_t deblocking_filter_control_present_flag;
+ uint8_t deblocking_filter_override_enabled_flag;
+ uint8_t disable_dbf;
+ int beta_offset; ///< beta_offset_div2 * 2
+ int tc_offset; ///< tc_offset_div2 * 2
+
+ uint8_t scaling_list_data_present_flag;
+ ScalingList scaling_list;
+
+ uint8_t lists_modification_present_flag;
+ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
+ int num_extra_slice_header_bits;
+ uint8_t slice_header_extension_present_flag;
+ uint8_t log2_max_transform_skip_block_size;
+ uint8_t cross_component_prediction_enabled_flag;
+ uint8_t chroma_qp_offset_list_enabled_flag;
+ uint8_t diff_cu_chroma_qp_offset_depth;
+ uint8_t chroma_qp_offset_list_len_minus1;
+ int8_t cb_qp_offset_list[6];
+ int8_t cr_qp_offset_list[6];
+ uint8_t log2_sao_offset_scale_luma;
+ uint8_t log2_sao_offset_scale_chroma;
+
+ // Inferred parameters
+ uint16_t *column_width; ///< ColumnWidth
+ uint16_t *row_height; ///< RowHeight
+ uint16_t *col_bd; ///< ColBd
+ uint16_t *row_bd; ///< RowBd
+ uint16_t *col_idxX;
+
+ // We can limit these to uint16_t given our other size limits
+ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
+ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
+ uint16_t *tile_id; ///< TileId
+ uint16_t *tile_pos_ts; ///< TilePosRS
+ uint16_t *tile_size; ///< TileSize
+ uint8_t * ctb_ts_flags;
+
+ uint8_t data[4096];
+ int data_size;
+} HEVCRpiPPS;
+
+typedef struct HEVCRpiParamSets {
+ /* currently active parameter sets */
+ const HEVCRpiVPS *vps;
+ const HEVCRpiSPS *sps;
+ const HEVCRpiPPS *pps;
+
+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
+} HEVCRpiParamSets;
+
+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
+ HEVCRpiParamSets *ps);
+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
+ HEVCRpiParamSets *ps, int apply_defdispwin);
+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
+ HEVCRpiParamSets *ps);
+
+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
+
+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
+ uint8_t *buf, int buf_size);
+
+/**
+ * Compute POC of the current frame and return it.
+ */
+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
+
+#endif /* AVCODEC_RPI_HEVC_PS_H */
diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
new file mode 100644
index 0000000000..8cc5796cf0
--- /dev/null
+++ b/libavcodec/rpi_hevc_refs.c
@@ -0,0 +1,485 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "internal.h"
+#include "thread.h"
+#include "hevc.h"
+#include "rpi_hevcdec.h"
+
+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
+{
+ /* frame->frame can be NULL if context init failed */
+ if (!frame->frame || !frame->frame->buf[0])
+ return;
+
+ frame->flags &= ~flags;
+ if (!frame->flags) {
+ ff_thread_release_buffer(s->avctx, &frame->tf);
+
+ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL
+ frame->col_mvf = NULL;
+
+ frame->collocated_ref = NULL;
+ }
+}
+
+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
+{
+ int i;
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
+ ff_hevc_rpi_unref_frame(s, &s->DPB[i],
+ HEVC_FRAME_FLAG_SHORT_REF |
+ HEVC_FRAME_FLAG_LONG_REF);
+}
+
+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
+{
+ int i;
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
+}
+
+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
+{
+ int i, ret;
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame * const frame = &s->DPB[i];
+ if (frame->frame->buf[0])
+ continue;
+
+ ret = ff_thread_get_buffer(s->avctx, &frame->tf,
+ AV_GET_BUFFER_FLAG_REF);
+ if (ret < 0)
+ return NULL;
+
+ frame->col_mvf = NULL;
+ frame->col_mvf_buf = NULL;
+ if (s->used_for_ref && !s->is_irap)
+ {
+ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
+ if (!frame->col_mvf_buf)
+ goto fail;
+ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
+ }
+
+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+
+ return frame;
+
+fail:
+ ff_hevc_rpi_unref_frame(s, frame, ~0);
+ return NULL;
+ }
+ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
+ return NULL;
+}
+
+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
+{
+ HEVCRpiFrame *ref;
+ int i;
+
+ /* check that this POC doesn't already exist */
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+
+ if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
+ frame->poc == poc) {
+ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
+ poc);
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+ ref = alloc_frame(s);
+ if (!ref)
+ return AVERROR(ENOMEM);
+
+ *frame = ref->frame;
+ s->ref = ref;
+
+ if (s->sh.pic_output_flag)
+ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
+ else
+ ref->flags = HEVC_FRAME_FLAG_SHORT_REF;
+
+ ref->poc = poc;
+ ref->sequence = s->seq_decode;
+ ref->frame->crop_left = s->ps.sps->output_window.left_offset;
+ ref->frame->crop_right = s->ps.sps->output_window.right_offset;
+ ref->frame->crop_top = s->ps.sps->output_window.top_offset;
+ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
+
+ return 0;
+}
+
+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
+{
+ do {
+ int nb_output = 0;
+ int min_poc = INT_MAX;
+        int i, min_idx = 0, ret;
+
+ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
+ frame->sequence == s->seq_output) {
+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+ }
+ }
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
+ frame->sequence == s->seq_output) {
+ nb_output++;
+ if (frame->poc < min_poc || nb_output == 1) {
+ min_poc = frame->poc;
+ min_idx = i;
+ }
+ }
+ }
+
+ /* wait for more frames before output */
+ if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
+ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
+ return 0;
+
+ if (nb_output) {
+ HEVCRpiFrame *frame = &s->DPB[min_idx];
+ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
+ return 0;
+
+ ret = av_frame_ref(out, frame->frame);
+ if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
+ else
+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+ if (ret < 0)
+ return ret;
+ av_log(s->avctx, AV_LOG_DEBUG,
+ "Output frame with POC %d.\n", frame->poc);
+ return 1;
+ }
+
+ if (s->seq_output != s->seq_decode)
+ s->seq_output = (s->seq_output + 1) & 0xff;
+ else
+ break;
+ } while (1);
+
+ return 0;
+}
+
+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
+{
+ int dpb = 0;
+ int min_poc = INT_MAX;
+ int i;
+
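+    // DPB bumping: once the count of held pictures reaches the SPS
+    // max_dec_pic_buffering limit, flag the lowest-POC output-pending
+    // frames for output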
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+ if ((frame->flags) &&
+ frame->sequence == s->seq_output &&
+ frame->poc != s->poc) {
+ dpb++;
+ }
+ }
+
+ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+ if ((frame->flags) &&
+ frame->sequence == s->seq_output &&
+ frame->poc != s->poc) {
+ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
+ min_poc = frame->poc;
+ }
+ }
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+ if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
+ frame->sequence == s->seq_output &&
+ frame->poc <= min_poc) {
+ frame->flags |= HEVC_FRAME_FLAG_BUMPING;
+ }
+ }
+
+ dpb--;
+ }
+}
+
+static int init_slice_rpl(HEVCRpiContext *s)
+{
+ if (s->slice_idx >= s->rpl_tab_size)
+ return AVERROR_INVALIDDATA;
+
+ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
+ return 0;
+}
+
+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
+{
+ RpiSliceHeader *sh = &s->sh;
+
+ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
+ uint8_t list_idx;
+ int i, j, ret;
+
+ ret = init_slice_rpl(s);
+ if (ret < 0)
+ return ret;
+
+ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
+ s->rps[LT_CURR].nb_refs)) {
+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ for (list_idx = 0; list_idx < nb_list; list_idx++) {
+ RefPicList rpl_tmp = { { 0 } };
+ RefPicList *rpl = &s->refPicList[list_idx];
+
+ /* The order of the elements is
+ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
+ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
+ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
+ list_idx ? ST_CURR_BEF : ST_CURR_AFT,
+ LT_CURR };
+
+ /* concatenate the candidate lists for the current frame */
+ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
+ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
+ RefPicList *rps = &s->rps[cand_lists[i]];
+ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
+ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
+ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j];
+ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
+ rpl_tmp.nb_refs++;
+ }
+ }
+ }
+
+ /* reorder the references if necessary */
+ if (sh->rpl_modification_flag[list_idx]) {
+ for (i = 0; i < sh->nb_refs[list_idx]; i++) {
+ int idx = sh->list_entry_lx[list_idx][i];
+
+ if (idx >= rpl_tmp.nb_refs) {
+ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ rpl->list[i] = rpl_tmp.list[idx];
+ rpl->ref[i] = rpl_tmp.ref[idx];
+ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
+ rpl->nb_refs++;
+ }
+ } else {
+ memcpy(rpl, &rpl_tmp, sizeof(*rpl));
+ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
+ }
+
+ if (sh->collocated_list == list_idx &&
+ sh->collocated_ref_idx < rpl->nb_refs)
+ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
+ }
+
+ return 0;
+}
+
+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
+{
+ int i;
+ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
+
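+    // First pass matches on POC LSBs only (as long-term refs are signalled);
+    // the second also accepts an exact POC match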
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *ref = &s->DPB[i];
+ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
+ if ((ref->poc & LtMask) == poc)
+ return ref;
+ }
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *ref = &s->DPB[i];
+ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
+ if (ref->poc == poc || (ref->poc & LtMask) == poc)
+ return ref;
+ }
+ }
+
+ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Could not find ref with POC %d\n", poc);
+ return NULL;
+}
+
+static void mark_ref(HEVCRpiFrame *frame, int flag)
+{
+ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
+ frame->flags |= flag;
+}
+
+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
+{
+ HEVCRpiFrame *frame;
+ int i, x, y;
+
+ frame = alloc_frame(s);
+ if (!frame)
+ return NULL;
+
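+    // Fill the substitute frame with mid-grey (1 << (bit_depth - 1), half
+    // the sample range) so a missing reference decodes innocuously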
+ if (!s->ps.sps->pixel_shift) {
+ for (i = 0; frame->frame->buf[i]; i++)
+ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
+ frame->frame->buf[i]->size);
+ } else {
+ for (i = 0; frame->frame->data[i]; i++)
+ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
+ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
+ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
+ 1 << (s->ps.sps->bit_depth - 1));
+ }
+ }
+
+ frame->poc = poc;
+ frame->sequence = s->seq_decode;
+ frame->flags = 0;
+
+ ff_hevc_rpi_progress_set_all_done(frame);
+
+ return frame;
+}
+
+/* add a reference with the given poc to the list and mark it as used in DPB */
+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
+ int poc, int ref_flag)
+{
+ HEVCRpiFrame *ref = find_ref_idx(s, poc);
+
+ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
+ return AVERROR_INVALIDDATA;
+
+ if (!ref) {
+ ref = generate_missing_ref(s, poc);
+ if (!ref)
+ return AVERROR(ENOMEM);
+ }
+
+ list->list[list->nb_refs] = ref->poc;
+ list->ref[list->nb_refs] = ref;
+ list->nb_refs++;
+
+ mark_ref(ref, ref_flag);
+ return 0;
+}
+
+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
+{
+ const ShortTermRPS *short_rps = s->sh.short_term_rps;
+ const LongTermRPS *long_rps = &s->sh.long_term_rps;
+ RefPicList *rps = s->rps;
+ int i, ret = 0;
+
+ if (!short_rps) {
+ rps[0].nb_refs = rps[1].nb_refs = 0;
+ return 0;
+ }
+
+ /* clear the reference flags on all frames except the current one */
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ HEVCRpiFrame *frame = &s->DPB[i];
+
+ if (frame == s->ref)
+ continue;
+
+ mark_ref(frame, 0);
+ }
+
+ for (i = 0; i < NB_RPS_TYPE; i++)
+ rps[i].nb_refs = 0;
+
+ /* add the short refs */
+ for (i = 0; i < short_rps->num_delta_pocs; i++) {
+ int poc = s->poc + short_rps->delta_poc[i];
+ int list;
+
+ if (!short_rps->used[i])
+ list = ST_FOLL;
+ else if (i < short_rps->num_negative_pics)
+ list = ST_CURR_BEF;
+ else
+ list = ST_CURR_AFT;
+
+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
+ if (ret < 0)
+ goto fail;
+ }
+
+ /* add the long refs */
+ for (i = 0; i < long_rps->nb_refs; i++) {
+ int poc = long_rps->poc[i];
+ int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
+
+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
+ if (ret < 0)
+ goto fail;
+ }
+
+fail:
+ /* release any frames that are now unused */
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
+
+ return ret;
+}
+
+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
+{
+ int ret = 0;
+ int i;
+ const ShortTermRPS *rps = s->sh.short_term_rps;
+ LongTermRPS *long_rps = &s->sh.long_term_rps;
+
+ if (rps) {
+ for (i = 0; i < rps->num_negative_pics; i++)
+ ret += !!rps->used[i];
+ for (; i < rps->num_delta_pocs; i++)
+ ret += !!rps->used[i];
+ }
+
+ if (long_rps) {
+ for (i = 0; i < long_rps->nb_refs; i++)
+ ret += !!long_rps->used[i];
+ }
+ return ret;
+}
diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
new file mode 100644
index 0000000000..cd8149d58e
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.c
@@ -0,0 +1,368 @@
+/*
+ * HEVC Supplementary Enhancement Information messages
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2013 Vittorio Giovara
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "golomb.h"
+#include "rpi_hevc_ps.h"
+#include "rpi_hevc_sei.h"
+
+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
+{
+ int cIdx, i;
+ uint8_t hash_type;
+ //uint16_t picture_crc;
+ //uint32_t picture_checksum;
+ hash_type = get_bits(gb, 8);
+
+ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
+ if (hash_type == 0) {
+ s->is_md5 = 1;
+ for (i = 0; i < 16; i++)
+ s->md5[cIdx][i] = get_bits(gb, 8);
+ } else if (hash_type == 1) {
+ // picture_crc = get_bits(gb, 16);
+ skip_bits(gb, 16);
+ } else if (hash_type == 2) {
+ // picture_checksum = get_bits_long(gb, 32);
+ skip_bits(gb, 32);
+ }
+ }
+ return 0;
+}
+
+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
+{
+ int i;
+ // Mastering primaries
+ for (i = 0; i < 3; i++) {
+ s->display_primaries[i][0] = get_bits(gb, 16);
+ s->display_primaries[i][1] = get_bits(gb, 16);
+ }
+ // White point (x, y)
+ s->white_point[0] = get_bits(gb, 16);
+ s->white_point[1] = get_bits(gb, 16);
+
+ // Max and min luminance of mastering display
+ s->max_luminance = get_bits_long(gb, 32);
+ s->min_luminance = get_bits_long(gb, 32);
+
+ // As this SEI message comes before the first frame that references it,
+ // initialize the flag to 2 and decrement on IRAP access unit so it
+ // persists for the coded video sequence (e.g., between two IRAPs)
+ s->present = 2;
+ return 0;
+}
+
+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
+{
+ // Max and average light levels
+ s->max_content_light_level = get_bits_long(gb, 16);
+ s->max_pic_average_light_level = get_bits_long(gb, 16);
+ // As this SEI message comes before the first frame that references it,
+ // initialize the flag to 2 and decrement on IRAP access unit so it
+ // persists for the coded video sequence (e.g., between two IRAPs)
+ s->present = 2;
+ return 0;
+}
+
+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
+{
+ get_ue_golomb_long(gb); // frame_packing_arrangement_id
+ s->present = !get_bits1(gb);
+
+ if (s->present) {
+ s->arrangement_type = get_bits(gb, 7);
+ s->quincunx_subsampling = get_bits1(gb);
+ s->content_interpretation_type = get_bits(gb, 6);
+
+ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
+ skip_bits(gb, 3);
+ s->current_frame_is_frame0_flag = get_bits1(gb);
+ // frame0_self_contained_flag, frame1_self_contained_flag
+ skip_bits(gb, 2);
+
+ if (!s->quincunx_subsampling && s->arrangement_type != 5)
+ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
+ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte
+ skip_bits1(gb); // frame_packing_arrangement_persistence_flag
+ }
+ skip_bits1(gb); // upsampled_aspect_ratio_flag
+ return 0;
+}
+
+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
+{
+ s->present = !get_bits1(gb);
+
+ if (s->present) {
+ s->hflip = get_bits1(gb); // hor_flip
+ s->vflip = get_bits1(gb); // ver_flip
+
+ s->anticlockwise_rotation = get_bits(gb, 16);
+ skip_bits1(gb); // display_orientation_persistence_flag
+ }
+
+ return 0;
+}
+
+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
+ void *logctx, int size)
+{
+ HEVCSEIPictureTiming *h = &s->picture_timing;
+ HEVCRpiSPS *sps;
+
+ if (!ps->sps_list[s->active_seq_parameter_set_id])
+        return AVERROR(ENOMEM);
+ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
+
+ if (sps->vui.frame_field_info_present_flag) {
+ int pic_struct = get_bits(gb, 4);
+ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
+ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
+ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
+ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
+ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
+ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
+ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
+ }
+ get_bits(gb, 2); // source_scan_type
+ get_bits(gb, 1); // duplicate_flag
+ skip_bits1(gb);
+ size--;
+ }
+ skip_bits_long(gb, 8 * size);
+
+ return 0;
+}
+
+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
+ int size)
+{
+ int flag;
+ int user_data_type_code;
+ int cc_count;
+
+ if (size < 3)
+ return AVERROR(EINVAL);
+
+ user_data_type_code = get_bits(gb, 8);
+ if (user_data_type_code == 0x3) {
+ skip_bits(gb, 1); // reserved
+
+ flag = get_bits(gb, 1); // process_cc_data_flag
+ if (flag) {
+ skip_bits(gb, 1);
+ cc_count = get_bits(gb, 5);
+ skip_bits(gb, 8); // reserved
+ size -= 2;
+
+ if (cc_count && size >= cc_count * 3) {
+ const uint64_t new_size = (s->a53_caption_size + cc_count
+ * UINT64_C(3));
+ int i, ret;
+
+ if (new_size > INT_MAX)
+ return AVERROR(EINVAL);
+
+ /* Allow merging of the cc data from two fields. */
+ ret = av_reallocp(&s->a53_caption, new_size);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < cc_count; i++) {
+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+ }
+ skip_bits(gb, 8); // marker_bits
+ }
+ }
+ } else {
+ int i;
+ for (i = 0; i < size - 1; i++)
+ skip_bits(gb, 8);
+ }
+
+ return 0;
+}
+
+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
+ int size)
+{
+ uint32_t country_code;
+ uint32_t user_identifier;
+
+ if (size < 7)
+ return AVERROR(EINVAL);
+ size -= 7;
+
+ country_code = get_bits(gb, 8);
+ if (country_code == 0xFF) {
+ skip_bits(gb, 8);
+ size--;
+ }
+
+ skip_bits(gb, 8);
+ skip_bits(gb, 8);
+
+ user_identifier = get_bits_long(gb, 32);
+
+ switch (user_identifier) {
+ case MKBETAG('G', 'A', '9', '4'):
+ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
+ default:
+ skip_bits_long(gb, size * 8);
+ break;
+ }
+ return 0;
+}
+
+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
+{
+ int num_sps_ids_minus1;
+ int i;
+ unsigned active_seq_parameter_set_id;
+
+ get_bits(gb, 4); // active_video_parameter_set_id
+ get_bits(gb, 1); // self_contained_cvs_flag
+    get_bits(gb, 1); // no_parameter_set_update_flag
+ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
+
+ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
+ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
+ return AVERROR_INVALIDDATA;
+ }
+
+ active_seq_parameter_set_id = get_ue_golomb_long(gb);
+ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
+ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
+ return AVERROR_INVALIDDATA;
+ }
+ s->active_seq_parameter_set_id = active_seq_parameter_set_id;
+
+ for (i = 1; i <= num_sps_ids_minus1; i++)
+ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
+
+ return 0;
+}
+
+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
+{
+ s->present = 1;
+ s->preferred_transfer_characteristics = get_bits(gb, 8);
+ return 0;
+}
+
+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
+    /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
+     * and sizeID 3 only has 2 arrays, not 6. */
+ switch (type) {
+ case 256: // Mismatched value from HM 8.1
+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
+ case HEVC_SEI_TYPE_FRAME_PACKING:
+ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
+ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
+ return decode_nal_sei_display_orientation(&s->display_orientation, gb);
+ case HEVC_SEI_TYPE_PICTURE_TIMING:
+ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
+ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
+ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
+ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
+ return decode_nal_sei_content_light_info(&s->content_light, gb);
+ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
+ return decode_nal_sei_active_parameter_sets(s, gb, logctx);
+ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
+ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
+ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
+ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
+ default:
+ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+ skip_bits_long(gb, 8 * size);
+ return 0;
+ }
+}
+
+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
+ int type, int size)
+{
+ switch (type) {
+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
+ default:
+ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
+ skip_bits_long(gb, 8 * size);
+ return 0;
+ }
+}
+
+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
+ const HEVCRpiParamSets * const ps, const int nal_unit_type)
+{
+ int payload_type = 0;
+ int payload_size = 0;
+ int byte = 0xFF;
+ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
+
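+    // payload type and size use SEI byte-extension coding: each 0xFF byte
+    // adds 255 and the first byte below 0xFF terminates the value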
+ while (byte == 0xFF) {
+ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
+ return AVERROR_INVALIDDATA;
+ byte = get_bits(gb, 8);
+ payload_type += byte;
+ }
+ byte = 0xFF;
+ while (byte == 0xFF) {
+ if (get_bits_left(gb) < 8 + 8LL*payload_size)
+ return AVERROR_INVALIDDATA;
+ byte = get_bits(gb, 8);
+ payload_size += byte;
+ }
+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
+ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */
+ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
+ }
+}
+
+static int more_rbsp_data(GetBitContext *gb)
+{
+ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80;
+}
+
+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
+ const HEVCRpiParamSets *ps, int type)
+{
+ int ret;
+
+ do {
+ ret = decode_nal_sei_message(gb, logctx, s, ps, type);
+ if (ret < 0)
+ return ret;
+ } while (more_rbsp_data(gb));
+ return 1;
+}
+
+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
+{
+ s->a53_caption.a53_caption_size = 0;
+ av_freep(&s->a53_caption.a53_caption);
+}
diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
new file mode 100644
index 0000000000..d4ac348df9
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.h
@@ -0,0 +1,135 @@
+/*
+ * HEVC Supplementary Enhancement Information messages
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVC_SEI_H
+#define AVCODEC_RPI_HEVC_SEI_H
+
+#include <stdint.h>
+
+#include "libavutil/md5.h"
+
+#include "get_bits.h"
+
+/**
+ * SEI message types
+ */
+typedef enum {
+ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
+ HEVC_SEI_TYPE_PICTURE_TIMING = 1,
+ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
+ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
+ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
+ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
+ HEVC_SEI_TYPE_RECOVERY_POINT = 6,
+ HEVC_SEI_TYPE_SCENE_INFO = 9,
+ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
+ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
+ HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
+ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
+ HEVC_SEI_TYPE_FRAME_PACKING = 45,
+ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
+ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
+ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
+ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
+ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
+ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132,
+ HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
+ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
+ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
+ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
+ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
+} HEVC_SEI_Type;
+
+typedef struct HEVCSEIPictureHash {
+ uint8_t md5[3][16];
+ uint8_t is_md5;
+} HEVCSEIPictureHash;
+
+typedef struct HEVCSEIFramePacking {
+ int present;
+ int arrangement_type;
+ int content_interpretation_type;
+ int quincunx_subsampling;
+ int current_frame_is_frame0_flag;
+} HEVCSEIFramePacking;
+
+typedef struct HEVCSEIDisplayOrientation {
+ int present;
+ int anticlockwise_rotation;
+ int hflip, vflip;
+} HEVCSEIDisplayOrientation;
+
+typedef struct HEVCSEIPictureTiming {
+ int picture_struct;
+} HEVCSEIPictureTiming;
+
+typedef struct HEVCSEIA53Caption {
+ int a53_caption_size;
+ uint8_t *a53_caption;
+} HEVCSEIA53Caption;
+
+typedef struct HEVCSEIMasteringDisplay {
+ int present;
+ uint16_t display_primaries[3][2];
+ uint16_t white_point[2];
+ uint32_t max_luminance;
+ uint32_t min_luminance;
+} HEVCSEIMasteringDisplay;
+
+typedef struct HEVCSEIContentLight {
+ int present;
+ uint16_t max_content_light_level;
+ uint16_t max_pic_average_light_level;
+} HEVCSEIContentLight;
+
+typedef struct HEVCSEIAlternativeTransfer {
+ int present;
+ int preferred_transfer_characteristics;
+} HEVCSEIAlternativeTransfer;
+
+typedef struct HEVCSEIContext {
+ HEVCSEIPictureHash picture_hash;
+ HEVCSEIFramePacking frame_packing;
+ HEVCSEIDisplayOrientation display_orientation;
+ HEVCSEIPictureTiming picture_timing;
+ HEVCSEIA53Caption a53_caption;
+ HEVCSEIMasteringDisplay mastering_display;
+ HEVCSEIContentLight content_light;
+ int active_seq_parameter_set_id;
+ HEVCSEIAlternativeTransfer alternative_transfer;
+} HEVCSEIContext;
+
+struct HEVCRpiParamSets;
+
+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
+ const struct HEVCRpiParamSets *ps, int type);
+
+/**
+ * Reset SEI values that are stored on the Context.
+ * e.g. Caption data that was extracted during NAL
+ * parsing.
+ *
+ * @param s HEVCRpiContext.
+ */
+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
+
+#endif /* AVCODEC_RPI_HEVC_SEI_H */
diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c
new file mode 100644
index 0000000000..23b49a99ae
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader.c
@@ -0,0 +1,1537 @@
+#include "rpi_hevc_shader.h"
+
+#ifdef _MSC_VER
+ #include <stdint.h>
+ /* cast through uintptr_t to avoid warnings */
+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+#else
+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
+#endif
+
+#ifdef __cplusplus
+extern "C" { /* the types are probably wrong... */
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+__declspec(align(8))
+#elif defined(__GNUC__)
+__attribute__((aligned(8)))
+#endif
+unsigned int ff_hevc_rpi_shader[] = {
+// ::mc_setup_c_q0
+// ::mc_start
+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c_qn
+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
+// :1
+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+// ::mc_filter_c_p
+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
+// :1
+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_c_p_l1
+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
+// :1
+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_c_b
+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
+// :1
+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_sync_q0
+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q1
+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q2
+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q3
+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q4
+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q5
+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q6
+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q7
+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync_q8
+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q9
+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q10
+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync_q11
+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_qn
+// ::mc_exit_y_qn
+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c_q0
+// ::mc_exit_y_q0
+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_setup_y_q0
+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y_qn
+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+// :per_block_setup_8
+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
+// ::mc_filter_y_pxx
+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+// :1
+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_y_bxx
+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+// :1
+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_y_p00
+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
+// :1
+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_y_b00
+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
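+// Everything from mc_setup_c10_q0 onward is the 10-bit build of the same
+// kernel set: the ra_blk_height_pmax literal 0x000803ff encodes
+// ((1 << 10) - 1) | (8 << 16), i.e. v_bit_depth = 10 with 8-line blocks, and
+// rb_pmask widens to 0xffff to hold 16-bit pixels.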
+// ::mc_setup_c10_q0
+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_c10_qn
+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
+// :1
+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
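+// Chroma setup ends by priming both texture units - the :1 loop issues
+// PREREAD row fetches on t0s/t1s with the y coordinates clamped to
+// [0, rb_max_y] - and by zeroing the FIR pipeline registers ra4-ra7/rb4-rb7,
+// the last three sitting in the delay slots of the return through ra_link.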
+// ::mc_filter_c10_p
+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
+// :1
+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
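+// mc_filter_c10_p is the single-list chroma kernel: a 4-tap horizontal FIR
+// over ra0.8a-.8d (the rotated "<< 2/4 ... mul_used" operands walk the taps
+// across lanes), biased by rb_fir_off_h, followed by a 4-row vertical stage
+// through ra3's packed coefficients (rb10/rb11 mirror ra3.8c/.8d), then the
+// L0 weight/offset and the shared VDW write-out tail.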
+// ::mc_filter_c10_p_l1
+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
+// :1
+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
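+// mc_filter_c10_p_l1 appears to be the same kernel bound to the second
+// texture stream - it loads with ldtmu1 where _p used ldtmu0, with the
+// vrx_*/vra_* aliases resolving to the TMU1 side - but the filter body is
+// otherwise identical to mc_filter_c10_p above.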
+// ::mc_filter_c10_b
+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
+// :1
+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
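+// mc_filter_c10_b is the bi-directional chroma case: both reference streams
+// run through the 4-tap chain (ra0 taps for the t0s/ldtmu0 stream, ra1 taps
+// for t1s/ldtmu1), the two results are weighted with ra_wt_mul_l0/l1 plus the
+// ra_kmul_add correction, and "asr r1, r1, ra_wt_den_p7" normalises before
+// the common write-out.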
+// ::mc_sync10_q0
+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q1
+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q2
+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q3
+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q4
+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q5
+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q6
+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q7
+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_sync10_q8
+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q9
+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q10
+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+// ::mc_sync10_q11
+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
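+// The mc_sync10_q0..q11 entries above form a barrier across the twelve QPUs,
+// grouped in fours with q0/q4/q8 acting as masters. Assuming the usual VC4
+// semaphore semantics (sacq blocks until the counter is non-zero, then
+// decrements; srel increments), the visible pattern reads roughly as follows,
+// with illustrative names:
+//
+//   worker qN: mov -, vw_wait       // drain the VPM/VDW writer first
+//              srel(group_arrive)   // tell the group master we are here
+//              sacq(self)           // block until released
+//              srel(next_worker)    // pass the release token down the chain
+//
+//   master qM: mov -, vw_wait
+//              sacq(group_arrive) x3           // collect all three workers
+//              sacq(cross) ; srel(cross_next)  // rendezvous ring between the
+//              srel(first_worker)              // three masters, then restart
+//                                              // the group's release chain
+//
+// so no QPU crosses the barrier while any QPU still has DMA stores in flight.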
+// ::mc_exit_c10_q0
+// ::mc_exit_y10_q0
+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit_c10_qn
+// ::mc_exit_y10_qn
+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
+// :1
+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
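+// Both exit variants drain the TMU pipeline before ending the thread: the :1
+// loop pops the outstanding ldtmu0/ldtmu1 prefetch results, then vw_wait lets
+// the final DMA write land. The q0 variant additionally collects the other
+// QPUs' completion semaphore and raises the host interrupt before thrend; the
+// qn variant just ends quietly.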
+// ::mc_setup_y10_q0
+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+// ::mc_setup_y10_qn
+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+// :1
+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
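+// Luma setup mirrors the chroma path: it folds the x offsets of both
+// reference planes into ra_base/rb_base2 (clamp to [0, rb_max_x], align with
+// "and -4", split the remainder across rb_xpitch), derives the per-QPU VPM
+// and VDW-DMA slots from qpu_num, prefetches PREREAD rows on each TMU, and
+// zeroes the 8-tap vertical pipeline registers ra8-ra11/rb8-rb11 in the
+// return's delay slots.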
+// :per_block_setup_10
+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
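+// per_block_setup_10 is the prologue shared by the y10 pxx/bxx/b00 kernels
+// below (each brr's here first). It clamps and pitch-folds the per-block x
+// offsets for both sources, builds the DMA descriptor in ra_dma0/rb_dma1 from
+// the block width/height, and uses the "ror ..., ra8.8c/.8d" pairs to rotate
+// the packed 8-bit filter coefficients (rb_y_coeffs_2/3/5 and the immediate
+// words) into the ra0-ra3/rb4-rb11 tap registers for the two fractional
+// phases.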
+// ::mc_filter_y10_pxx
+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+// :1
+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
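+// The pxx loop computes the 8-tap luma FIR with lane rotations - the
+// "<< 1..15 ... @ mul_used" operand forms walk the taps across neighbouring
+// elements for the two interleaved pixel streams - folds in the rb_fir_off_h
+// rounding bias, runs the ra2/ra3 x rb5-rb11 vertical pipeline, and applies
+// the L0 weight and offset before the shared write-out tail.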
+// ::mc_filter_y10_p00
+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
+// :1
+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
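+// p00 is the integer-pel (no filter) luma case: a single TMU stream whose
+// pixels are scaled by ra_wt_mul_l0, biased by rb_wt_off (the L0 offset
+// pre-shifted by DENOM + 7), and rounded back with "asr r1, r1, DENOM + 8"
+// before the same VDW write-out tail as the filtered kernels.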
+// ::mc_filter_y10_bxx
+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+// :1
+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
+/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_filter_y10_b00
+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+// :1
+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, ff_hevc_rpi_shader)
+#endif
diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
new file mode 100644
index 0000000000..79651c9b6c
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader.h
@@ -0,0 +1,63 @@
+#ifndef rpi_hevc_shader_H
+#define rpi_hevc_shader_H
+
+extern unsigned int ff_hevc_rpi_shader[];
+
+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
+#define mc_start (ff_hevc_rpi_shader + 0)
+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
+#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
+#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
+#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
+#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
+#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
+#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
+#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
+#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
+#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
+#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
+#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
+#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
+#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
+#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
+#define mc_end (ff_hevc_rpi_shader + 2860)
+
+#endif
diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
new file mode 100644
index 0000000000..af5b59e181
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader.qasm
@@ -0,0 +1,1850 @@
+# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Written by Peter de Rivaz, John Cox
+
+
+
+# Inter pred asm
+#
+# Logic here should be good to 14 bits without modification
+# but only 8 & 10 are currently instantiated & tested
+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
+# in _p00 & _b00
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation with ra/rb registers. r0..3 can be
+# rotated through all 16 elems; ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here, as we do not want the
+# constants from the other half of the calc.
+
+# Number limits in P/B calculation
+#
+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
+# we offset our intermediates s.t. they always end up +ve before the next
+# multiply (may be -ve whilst summing but that doesn't matter).
+#
+# Range calc for up to 14 bits (Y-B pred):
+#
+# denom: [0, 7]
+# bmax = (1 << bits) - 1
+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
+#
+# wt_mul: [-128, 255]
+# wt_off = off * 2 + 1: [-bmax, bmax]
+#
+# pel: [0, bmax]
+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
+# mul_t = (V_l0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
+# mul_t - (V_l0 + V_l1) * 128: [-0xc28e00, 0x18396e4]
+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
+# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
+#
+# This all looks good and is mostly bit depth independent - and as we manage
+# to do unsigned multiplies everywhere (now) this should be good for any bit
+# depth up to 14 (we could probably do 16 - but that requires a few tweaks
+# to the shifts that we don't currently have logic for)
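+#
+# Illustrative sanity check at 8 bits (not part of the original derivation):
+# with pel in [0, 255] the H-filter output sits between
+# 0x4000 - 22*255 = 0x2a16 and 0x4000 + 88*255 = 0x97a8, inside the
+# [0x2a00, 0x97ff] bounds quoted above - i.e. always +ve going into the
+# next mul24.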
+
+# PREREAD is the number of requests that we have sitting in the TMU request
+# queue.
+#
+# There are 8 slots available in the TMU request Q for tm0s requests, but
+# only 4 output FIFO entries and overflow is bad (corruption or crash)
+# (If threaded then only 2 out FIFO entries, but we aren't.)
+# In s/w we are effectively limited to the min vertical read which is >= 4
+# so output FIFO is the limit.
+#
+# As the test for read-next is in the main part of the Luma loop (rather than
+# the preload FIFO part) we are limited to min_luma_height - 1
+# Min_luma_height is 4 so we can only have a preload of 3
+# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
+# in chroma without abandoning preload pretty much entirely (which would be bad)
+#
+# Timing tests vs preload of 4 suggest this doesn't hurt us much
+# Could have preread 4 for Chroma but when tested it didn't help
+
+.set PREREAD, 3
+
+# Offset added (effectively) at the exit of the H FIR filter
+# This is enough to force the result +ve
+# Is good if it is a power of 2 as that allows for >> without loss
+#
+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
+# Round up to next power of 2
+
+.set FIR_OFFSET, 0x4000
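+# (Worked through: 256*22*2 = 11264 = 0x2c00; the next power of 2 up is 0x4000)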
+
+# Block heights - 8 & 16 are the only numbers we currently support
+
+.set C_BLK_HEIGHT_8, 16
+.set C_BLK_HEIGHT_16, 8
+.set Y_BLK_HEIGHT_8, 16
+.set Y_BLK_HEIGHT_16, 8
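+# (The _8/_16 suffix is the bit depth group: 1-byte formats use 16-high
+# blocks, 2-byte formats 8-high - see the QPU count note below)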
+
+# QPU counts - depend on block size
+# If we have a 2-byte format & block_size > 8 then we can only afford
+# 8 QPUs
+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
+
+.set N_QPU_8, 12
+.set N_QPU_16, 12
+
+# Value to add to the weight multiplier to convert it into an unsigned value
+# Should be a power of two for convenience
+
+.set LOG2_MUL_ADD, 14
+.set MUL_ADD, (1 << LOG2_MUL_ADD)
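+# (With wt_mul in [-128, 255] - see the range calc above - wt_mul + MUL_ADD
+# lands in [0x3f80, 0x40ff], i.e. always +ve for mul24)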
+
+# Fixed denom (max that it can be set to)
+.set DENOM, 7
+
+# register allocation
+#
+
+# ra0-3
+# Used as temp and may be loop filter coeffs (split into .8s)
+# or temp in loop. Check usage on an individual basis.
+
+# ra4-11
+# V FIFO / temp / free
+
+# -- free -- ra12
+
+# -- free -- ra13
+
+# -- free -- ra14
+
+# -- free -- ra15
+
+# uniform: width:height
+.set ra_width_height, ra16
+.set ra_width, ra16.16b
+.set ra_height, ra16.16a
+
+# y:y2 same layout as y_y2_next so we can update both together
+.set ra_y_y2, ra17
+.set ra_y2, ra17.16a
+.set ra_y, ra17.16b
+
+# uniform: L1 weight (U on left, V on right)
+# Only used in Y B
+.set ra_wt_off_mul_l1, ra18
+.set ra_wt_off_l1, ra18.16b
+.set ra_wt_mul_l1, ra18.16a
+
+# y_next:y2_next same layout as y_y2 so we can update both together
+.set ra_y_y2_next, ra19
+.set ra_y_next, ra19.16b
+.set ra_y2_next, ra19.16a
+
+# Setup: consts - subdivide a single register
+.set ra_kff800100, ra20
+.set ra_k256, ra20.16a
+.set ra_k0, ra20.8a
+.set ra_k1, ra20.8b
+.set ra_k128, ra20.8c
+.set ra_k255, ra20.8d
+
+# Loop: xshifts
+.set ra_xshift, ra21.16a
+.set ra_xshift_next, ra21.16b
+
+# Loop var: L0 weight (U on left, V on right)
+# _off_ is not used in loop as we want to modify it before use
+.set ra_wt_off_mul_l0, ra22
+.set ra_wt_mul_l0, ra22.16a
+.set ra_wt_off_l0, ra22.16b
+
+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
+# * Could merge with rb_pmask. For 10 bit, logically pmask needs 0xff in the
+# 2nd byte, but as the source should never be > 3 there, 0x3ff should do
+.set ra_blk_height_pmax, ra23
+.set ra_pmax, ra23.16a
+.set ra_blk_height, ra23.8c
+# --free -- ra23.8d
+
+# Loop: src frame base (L0)
+.set ra_base, ra24
+
+# Misc offsets
+.set ra_fir_off_val_wt_den_p7, ra25
+.set ra_wt_den_p7, ra25.8a
+# -- free -- ra25.8b
+.set ra_fir_off_val, ra25.16b
+
+# As it happens these constants are the same
+.if FIR_OFFSET == MUL_ADD
+# Weight multiplier unsigned add
+.set ra_kmul_add, ra_fir_off_val
+.else
+.error "FIR_OFFSET != MUL_ADD: Need new register & init"
+.endif
+
+# Loop: next src frame base (L0)
+.set ra_base_next, ra26
+
+# Loop: height<<23 + width<<16 + vdw_setup_0
+.set ra_dma0, ra27
+
+# Loop: destination address
+.set ra_dest, ra28
+
+# Setup: Dup of rb_ef
+# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
+# (top bits are ignored by mul24)
+.set ra_ef, ra29
+
+# Use an even numbered register as a link register to avoid corrupting flags
+.set ra_link, ra30
+
+# -- free -- ra31
+
+.set rb_xshift2, rb0
+.set rb_xshift2_next, rb1
+
+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
+.set rb_elem_x, rb2
+
+# El Flags
+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
+# Duped into ra_ef as sometimes that is easier to use
+.set rb_ef, rb3
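+# (e.g. element 9 holds 3 << 30: doubled it gives 0x80000000 with carry out,
+# so N & C are both set = odd element, hi half)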
+
+# rb4-11
+# Loop: V filter FIFO or V filter coeff
+
+# Loop var: offset to add before shift (round + weighting offsets)
+# Exact value varies by loop
+.set rb_wt_off, rb12
+
+# -- free -- rb13
+
+# -- free -- rb14
+
+# Loop: src frame base (L1)
+.set rb_base2, rb15
+
+# Line pitch (128 for sand128)
+.set rb_pitch, rb16
+
+# Loop count - 2 (set up TMU for next xfer)
+.set rb_i_tmu, rb17
+
+# Loop count for min(height, 16)
+# Y will reset & loop again if height > 16
+.set rb_lcount, rb18
+
+# frame_base2_next
+.set rb_base2_next, rb19
+
+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
+# offset to the slice
+.set rb_xpitch, rb20
+
+# These 3 consts each save 1 instruction in Y loop setup
+# so whilst they are worthwhile they should be the 1st to die if we need
+# another b reg
+.set rb_y_coeffs_2, rb21 # 0x050b0a00
+.set rb_y_coeffs_3, rb22 # 0x11283a40
+.set rb_y_coeffs_5, rb23 # 0x0a0b0500
+
+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
+.set rb_pmask, rb24
+
+# vdw_setup_1(dst_pitch)
+.set rb_dma1_base, rb25
+
+# Setup: pic width - 1
+# In bytes, so 8 bit luma is (width - 1)*1, 16 bit chroma is (width - 1)*4 etc.
+.set rb_max_x, rb26
+
+# vdw_setup_0 (depends on QPU number)
+.set rb_dma0_base, rb27
+
+# Setup: vw_setup value to reset VPM write pointer
+.set rb_vpm_init, rb28
+
+# Loop: vdw_setup_1(dst_pitch-width) = stride
+.set rb_dma1, rb29
+
+# Setup: pic_height - 1
+.set rb_max_y, rb30
+
+# Setup: FIR H offset
+.set rb_fir_off_h, rb31
+
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
+.set i_shift23, -9
+.set i_shift30, -2
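+# (e.g. i_shift23 = -9 and -9 & 31 == 23; likewise -16 -> 16, -11 -> 21,
+# -2 -> 30)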
+
+# Much of the setup code is common between Y & C
+# Macros that express this - obviously these can't be overlapped
+# so are probably unsuitable for loop code
+
+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
+ mov r2, qpu_num
+.if v_bit_depth <= 8
+ # 8 bit version
+ asr r1, r2, 2
+ shl r1, r1, 6
+ and r0, r2, 3
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+ add r_vpm, r0, r1 # VPM 8bit storage
+
+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+ shl r0, r0, 5
+
+.else
+ # 16 bit version
+ # Limited to 8 QPUs if blk height > 8
+ asr r1, r2, 1
+.if v_blk_height <= 8
+ shl r1, r1, 4
+.else
+ shl r1, r1, 5
+.endif
+ and r0, r2, 1
+ or r0, r0, r1
+
+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
+ add r_vpm, r0, r1
+
+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
+ shl r0, r0, 6
+.endif
+ add r_dma, r0, r1 # DMA out
+.endm
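+# Illustrative trace of the 8-bit path (assuming qpu_num = 5):
+# r1 = (5 >> 2) << 6 = 0x40, r0 = (5 & 3) | 0x40 = 0x41 - QPUs in a quad take
+# adjacent byte lanes (B[1:0]) and each quad gets its own block of VPM rows.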
+
+
+.macro m_setup_q0
+ srel -, 12
+.endm
+
+# Code start label
+::mc_start
+
+################################################################################
+# mc_setup_c
+#
+# typedef struct qpu_mc_pred_c_s_s {
+# int16_t y;
+# int16_t x;
+# uint32_t base;
+# uint32_t pic_cw; // C Width (== Y width / 2)
+# uint32_t pic_ch; // C Height (== Y Height / 2)
+# uint32_t stride2;
+# uint32_t stride1;
+# uint32_t wdenom;
+# int16_t y2;
+# int16_t x2;
+# uint32_t base2;
+# uint32_t next_fn;
+# } qpu_mc_pred_c_s_t;
+
+.macro m_setup_c, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_pmask, 0xff
+.set v_blk_height, C_BLK_HEIGHT_8
+.else
+.set v_x_shift, 2
+.set v_pmask, 0xffff
+.set v_blk_height, C_BLK_HEIGHT_16
+.endif
+
+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
+
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base
+
+# Read image dimensions
+ sub r0, unif, 1 # pic c width
+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
+ sub rb_max_y, unif, 1 # pic c height
+
+# load constants
+ mov ra_kff800100, 0xff800100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+
+# get source pitch
+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2
+ mov rb_pitch, unif # stride1
+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
+
+ and r0, 1, elem_num
+ nop ; mul24 r0, r0, 5
+.if v_bit_depth <= 8
+ add rb_elem_x, r0, elem_num
+.else
+ add r0, r0, elem_num
+ add rb_elem_x, r0, r0
+.endif
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# ra_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
+ min r0, r0, rb_max_x
+
+# Get shift
+# Shift will always calculate as 0 for 9+ bit
+# Ideally we would optimize the shift out of the code in these cases but for
+# now it is tidier to leave it in
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.else
+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+.endif
+
+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2
+ add ra_base, ra_base, r0
+
+# Compute part of VPM to use for DMA output
+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+# And again for L1, but only worrying about frame2 stuff
+
+# Compute base address for first and second access
+# ra_base ends up with t0s base
+# rb_base2 ends up with t1s base
+
+ shl r0, ra0.16b, v_x_shift
+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
+ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2
+ min r0, r0, rb_max_x
+
+# Get shift (already zero if 9+ bit so ignore)
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+
+.if v_bit_depth <= 8
+ and r0, r0, -4
+.endif
+ sub r1, ra_k0, rb_pitch
+ and r1, r0, r1 ; mov r3, PREREAD
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r2, ra_y2
+ add rb_base2, rb_base2, r0 ; mov r0, ra_y
+
+# Do preloads
+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, rb_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ mov ra_link, unif # link
+# touch registers to keep the simulator happy (and fill in delay slots)
+ mov ra4, 0 ; mov rb4, 0
+ bra -, ra_link
+ mov ra5, 0 ; mov rb5, 0
+ mov ra6, 0 ; mov rb6, 0
+ mov ra7, 0 ; mov rb7, 0
+# >>> ra_link
+.endm
+
+::mc_setup_c_q0
+ m_setup_q0
+::mc_setup_c_qn
+ m_setup_c 8
+
+################################################################################
+#
+# mc_filter_c_p
+#
+# typedef struct qpu_mc_pred_c_p_s {
+# int16_t y;
+# int16_t x;
+# uint32_t base;
+# uint16_t h;
+# uint16_t w;
+# uint32_t coeffs_x;
+# uint32_t coeffs_y;
+# uint32_t wo_u;
+# uint32_t wo_v;
+# uint32_t dst_addr_c;
+# uint32_t next_fn;
+# } qpu_mc_pred_c_p_t;
+
+.macro m_filter_c_p, v_tmu, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_x_mul, 2
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_x_mul, 4
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in rb_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_tmu == 0
+.set vrx_xshift, rb_xshift2 # b side more convenient
+.set vrx_xshift_next, ra_xshift_next
+.set vra_y_next, ra_y_next
+.set vrx_base_next, ra_base_next
+.set vra_y, ra_y
+.set vra_base, ra_base
+.set vr_txs, t0s
+.else
+.set vrx_xshift, ra_xshift # a side more convenient
+.set vrx_xshift_next, rb_xshift2_next
+.set vra_y_next, ra_y2_next
+.set vrx_base_next, rb_base2_next
+.set vra_y, ra_y2
+.set vra_base, rb_base2
+.set vr_txs, t1s
+.endif
+
+# denom shift values
+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
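+# (With DENOM == 7 these come out as 12 & 13 for 8-bit, 10 & 11 for 10-bit)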
+
+# per-channel shifts were calculated on the *previous* invocation
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+
+.if v_bit_depth <= 8
+ shl vrx_xshift_next, r0, 3
+ and r0, r0, -4
+.endif
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
+ add vrx_base_next, r3, r0 ; mov r1, ra_height
+
+# set up VPM write
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
+
+# Misc final setup...
+
+ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr
+ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2)
+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
+ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
+ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
+ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
+ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4)
+ mov rb11, ra3.8d ; mov ra_link, unif # ; Link
+
+# r5 = -4 (loop counter)
+# ra_wt_mul_l0 = weight L0 + MUL_ADD (0x4000), now unsigned
+# rb_wt_off = (offset * 2 + 1) << (wt_den + 5)
+# rb31 = FIR value offset
+
+# FIFO: rb4, ra5, rb6, ra7
+# Coeffs in ra3.8a, ra3.8b, rb10, rb11
+
+# We want (r0r1)
+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
+# We fetch (after shift)
+# C0 : C3 : C1 : C4 : C2 : C5 : ...
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+.if v_tmu == 0
+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+.else
+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay]
+.endif
+
+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+ min r3, r3, rb_max_y ; mov.ifnc r0, r2
+
+ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
+.if v_tmu == 0
+ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes
+.else
+ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes
+.endif
+
+# apply horizontal filter
+# The filter coeffs for the two halves of this are the same (unlike in the
+# Y case) so it doesn't matter which ra0 we get them from
+# Also as the two halves are locked together we don't need to separate the 1st
+# r0 mul or the last r1 mul as they are valid for all QPUs
+
+ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
+ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+
+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
+# We would like to save the r5->r4 shift but we need a delay slot
+# for both r7 & r6 which we can't find anything to put in if we have
+# already multiplied r4 & r5!
+ brr.anyn -, r:1b
+ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post
+ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post
+ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
+# >>> .anyn 1b
+
+ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay]
+ sub r1, r1, r0 ; mul24 r0, ra7, rb11
+ sub r1, r1, r0
+
+ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop
+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop
+ brr.anyn -, r:1b
+ asr r1, r1, i_wt_den_p6
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_c_p
+ m_filter_c_p 0, 8
+
+::mc_filter_c_p_l1
+ m_filter_c_p 1, 8
+
+################################################################################
+#
+# mc_filter_c_b
+#
+# typedef struct qpu_mc_pred_c_b_s {
+# int16_t y;
+# int16_t x;
+# uint32_t base;
+# uint16_t h;
+# uint16_t w;
+# uint32_t coeffs_x1;
+# uint32_t coeffs_y1;
+# int16_t weight_u1;
+# int16_t weight_v1;
+# int16_t y2;
+# int16_t x2;
+# uint32_t base2;
+# uint32_t coeffs_x2;
+# uint32_t coeffs_y2;
+# uint32_t wo_u2;
+# uint32_t wo_v2;
+# uint32_t dst_addr_c;
+# uint32_t next_fn;
+# } qpu_mc_pred_c_b_t;
+
+.macro m_filter_c_b, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 1
+.set v_v_shift, 8
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 2
+.set v_v_shift, i_shift16
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+.set v_x_mul, (1 << v_x_shift)
+
+# denom shift values
+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
+
+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs
+
+.if v_bit_depth <= 8
+ shl ra_xshift_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs)
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
+
+# set up VPM write
+
+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
+
+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
+ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x
+
+# L1 - uniform layout could possibly be optimized
+
+ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<<shift ; L1 H filter coeffs
+ add r0, r0, rb_elem_x ; mov ra3, unif # ; L1 V filter coeffs
+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
+ max r0, r0, r5 ; mov ra9, rb_max_y
+ min r0, r0, rb_max_x ; mov r2, ra_kmul_add
+
+.if v_bit_depth <= 8
+ shl rb_xshift2_next, r0, 3
+.endif
+
+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
+ and r1, r0, r1 ; mov r5rep, -4
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dst_addr
+ add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
+
+ add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
+ add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
+ add r0, r0, r1 ; mov r1, ra_wt_off_l1 # ; L0 off unset
+ shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
+ sub rb_wt_off, r1, r0 ; mov ra_link, unif # ; link
+
+ mov ra10, rb_xshift2 ; mov rb7, ra2.8d
+
+# r5 loop counter (-4)
+# ra0 H coeffs L0
+# ra1 H coeffs L1
+# ra2 V coeffs L0
+# ra3 V coeffs L1
+# ra9 rb_max_y alias
+# ra10 rb_xshift2 alias
+
+:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
+ shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next # [ra_y delay]
+ add ra_y, 1, ra_y ; mov r3, ra_y
+
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # ; masks bytes
+
+# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
+
+ and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
+ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+
+ add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
+
+ shr r2, r4, ra10 ; mov rb5, rb6
+ shr r1, r2, v_v_shift ; mov r3, ra_y2
+ shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 # [r1 << delay]
+
+ add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+ min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
+
+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+ add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax # ; masks bytes
+
+# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
+
+ add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
+ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
+ sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+ add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+
+ brr.anyn -, r:1b
+ add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
+ mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
+ shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+# >>> .anyn 1b
+
+ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0
+ sub.setf -, r5, rb_lcount ; mov r0, ra4
+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+ add r1, r1, r0 ; mul24 r0, ra7, rb7
+
+ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1
+ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1
+ sub r2, r2, r0
+
+ shr r1, r1, 6
+ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
+ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
+ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
+ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop
+
+ brr.anyn -, r:1b
+ asr r1, r1, ra_wt_den_p7
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
+# >>> .anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_c_b
+ m_filter_c_b 8
+
+################################################################################
+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
+# conflicts
+
+.macro m_exit_drain
+.if PREREAD == 2
+# Special case 2 as loop is wasteful
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ nop ; nop ; ldtmu0
+ mov -, vw_wait ; nop ; ldtmu1
+.else
+ mov.setf r3, PREREAD - 1
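+# Countdown in the delay slot => the loop body runs PREREAD times, retiring
+# one ldtmu0/ldtmu1 pair per pass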
+:1
+ brr.anynz -, r:1b
+ nop ; nop ; ldtmu0
+ nop ; nop ; ldtmu1
+ sub.setf r3, r3, 1
+ # >>>
+ mov -, vw_wait
+.endif
+.endm
+
+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
+# All qpus start at the beginning and after that (group - 1) must have finished
+# before (group) can start
+#
+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
+# lockup otherwise)
+#
+# There is some, currently ill-defined, potential lockup if we have the VDM active
+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
+#
+# The code stalled when I had many waiters on a single sem so we have a
+# "ripple" of srels to restart. Unsure why - may have been a bug - but this works
+# and we currently have both the memory & sems to support it.
+.macro m_sync_q, n_qpu, n_quads
+# Do not generate code for qpu >= quads * 4 - fns should never be called
+.if n_qpu < n_quads * 4
+ mov ra_link, unif # Can only branch to an a reg (not r0)
+ mov -, vw_wait # [ra_link delay]
+
+.set n_sem_sync, n_qpu - (n_qpu % 4)
+.set n_sem_in, n_qpu
+.set n_sem_out, n_qpu + 1
+
+.if n_qpu % 4 == 0
+
+.set n_sem_quad_in, 12 + n_qpu / 4
+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
+
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ sacq -, n_sem_sync
+ bra -, ra_link
+ sacq -, n_sem_quad_in
+ srel -, n_sem_out
+ srel -, n_sem_quad_out
+
+.else
+ bra -, ra_link
+ srel -, n_sem_sync
+ sacq -, n_sem_in
+.if n_sem_out % 4 != 0
+ srel -, n_sem_out
+.else
+ nop
+.endif
+.endif
+.endif
+.endm
+
+.set v_quads8, N_QPU_8 / 4
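+# Illustrative expansion: m_sync_q 0, 3 gives n_sem_sync = 0, n_sem_out = 1,
+# n_sem_quad_in = 12, n_sem_quad_out = 13 - so QPU 0 collects three srels of
+# sem 0 (from QPUs 1-3), waits on quad sem 12, then releases sem 1 and quad
+# sem 13 to start QPU 1 and the next quad's leader.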
+
+::mc_sync_q0
+ m_sync_q 0, v_quads8
+::mc_sync_q1
+ m_sync_q 1, v_quads8
+::mc_sync_q2
+ m_sync_q 2, v_quads8
+::mc_sync_q3
+ m_sync_q 3, v_quads8
+::mc_sync_q4
+ m_sync_q 4, v_quads8
+::mc_sync_q5
+ m_sync_q 5, v_quads8
+::mc_sync_q6
+ m_sync_q 6, v_quads8
+::mc_sync_q7
+ m_sync_q 7, v_quads8
+::mc_sync_q8
+ m_sync_q 8, v_quads8
+::mc_sync_q9
+ m_sync_q 9, v_quads8
+::mc_sync_q10
+ m_sync_q 10, v_quads8
+::mc_sync_q11
+ m_sync_q 11, v_quads8
+
+# mc_exit()
+# Chroma & Luma the same now
+
+.macro m_exit_qn
+ m_exit_drain
+ nop ; nop ; thrend
+ nop
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_qn
+::mc_exit_y_qn
+ m_exit_qn
+
+
+
+# mc_interrupt_exit12()
+
+.macro m_exit_q0
+ m_exit_drain
+ sacq -, 12
+ nop ; nop ; thrend
+ mov interrupt, 1
+ nop
+# >>> thrend <<<
+.endm
+
+::mc_exit_c_q0
+::mc_exit_y_q0
+ m_exit_q0
+
+# LUMA CODE
+
+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8
+
+
+################################################################################
+# mc_setup
+#
+# typedef struct qpu_mc_pred_y_s_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t pic_h;
+# uint16_t pic_w;
+# uint32_t stride2;
+# uint32_t stride1;
+# uint32_t wdenom;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_s_t;
+
+.macro m_setup_y, v_bit_depth
+
+# Cannot use mul24 on x as x might be -ve, so must use shift
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_pmask, 0xff
+.set v_blk_height, Y_BLK_HEIGHT_8
+.else
+.set v_x_shift, 1
+.set v_pmask, 0xffff
+.set v_blk_height, Y_BLK_HEIGHT_16
+.endif
+
+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
+ mov ra9, unif # ref_y_base
+ mov ra1, unif # x2_y2
+
+
+# load constants
+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base
+
+ mov ra_kff800100, 0xff800100
+ mov rb_pmask, v_pmask
+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
+ mov rb_y_coeffs_2, 0x050b0a00
+ mov rb_y_coeffs_3, 0x11283a40
+ mov rb_y_coeffs_5, 0x0a0b0500
+
+# Compute part of VPM to use
+
+# Read image dimensions
+ mov ra3, unif # width_height
+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2
+.if v_x_shift == 0
+ sub rb_max_x, ra3.16b, 1
+.else
+ sub r0, ra3.16b, 1
+ shl rb_max_x, r0, v_x_shift
+.endif
+ sub rb_max_y, ra3.16a, 1
+ mov r3, elem_num ; mov rb_pitch, unif # stride1
+
+# get destination pitch
+ mov r1, vdw_setup_1(0) # [rb_pitch delay]
+ or rb_dma1_base, r1, rb_pitch
+
+# Compute base address for first and second access
+ add r0, ra0.16b, r3 # Load x + elem_num
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl ra_xshift_next, r0, 3 # Compute shifts
+
+# X is byte offset - we can only load words - mask
+
+ and r0, r0, -4 ; v8subs r2, r2, r2
+ sub r2, r2, rb_pitch
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add ra_base, ra9, r0
+
+ # r3 still contains elem_num
+ add r0, ra1.16b, r3 # Load x
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, 0
+ min r0, r0, rb_max_x
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+
+ # r2 still contains mask
+ and r0, r0, -4
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 # Add stripe offsets
+ add rb_base2, ra11, r0
+
+# Do preloads
+ nop ; mov r0, ra0.16a # ; r0 = y
+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
+
+:1
+ sub.setf r3, r3, 1
+ max r1, r0, 0
+ min r1, r1, rb_max_y
+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t0s, ra_base, r1 ; mov ra_y, r0
+
+ max r1, r2, 0
+ brr.anynz -, r:1b
+ min r1, r1, rb_max_y
+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+ add t1s, rb_base2, r1 ; mov ra_y2, r2
+# >>> .anynz 1b
+
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
+
+ mov ra_link, unif # Next fn
+
+# touch vertical context to keep simulator happy
+ mov ra8, 0 ; mov rb8, 0 # [ra_link delay]
+ bra -, ra_link
+ mov ra9, 0 ; mov rb9, 0
+ mov ra10, 0 ; mov rb10, 0
+ mov ra11, 0 ; mov rb11, 0
+# >>> ra_link
+.endm
+
+::mc_setup_y_q0
+ m_setup_q0
+::mc_setup_y_qn
+ m_setup_y 8
+
+################################################################################
+#
+# Start of per-block setup code
+# P and B blocks share the same setup code to save on I-cache space
+
+# get base addresses and per-channel shifts for *next* invocation
+# per-channel shifts were calculated on the *previous* invocation
+
+# 1st 3 instructions of per_block-setup in branch delay
+#
+# typedef struct qpu_mc_pred_y_p_s {
+# qpu_mc_src_t next_src1;
+# qpu_mc_src_t next_src2;
+# uint16_t h;
+# uint16_t w;
+# uint32_t mymx21;
+# uint32_t wo1;
+# uint32_t wo2;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p_t;
+#
+
+.macro m_luma_setup, v_bit_depth
+# Hack - QASM may well have label pasting but I have no idea how...
+.if v_bit_depth == 8
+ brr ra_link, r:per_block_setup_8
+.elif v_bit_depth == 10
+ brr ra_link, r:per_block_setup_10
+.endif
+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
+ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+.endm
+
+.macro m_per_block_setup, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+ min r0, r0, rb_max_x
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
+ and r0, r0, -4
+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base
+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
+
+ add r0, ra1.16b, r3 # Load x2
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
+ shl rb_xshift2_next, r0, 3 # Compute shifts
+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
+ add rb_base2_next, rb_base2_next, r0
+
+# get width,height of block (unif load above), r1 = width * pel_size
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
+ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
+ add rb_lcount, r0, (7-8)
+ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val
+ add r0, r0, r1 # Combine width and height of destination area
+ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
+ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
+ shl ra8, r0, 3 ; mov rb5, ra_k255
+
+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
+
+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
+# but I can't see a way of doing that which is cheap enough to be worth it
+
+# Picked out in a slightly random order to space out uniform loads
+
+ # 1
+ mov r1, 0x01040400 # [ra8 delay]
+ ror ra2.8b, r1, ra8.8d
+ ror ra0.8b, r1, ra8.8c
+ # 2
+ ror ra2.8c, rb_y_coeffs_2, ra8.8d
+ ror ra0.8c, rb_y_coeffs_2, ra8.8c
+ # 0
+ mov r1,0x00010100 # -ve [ra8 delay]
+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset
+ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
+ # 7
+ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
+ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address
+ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
+ # 3
+ ror ra2.8d, rb_y_coeffs_3, ra8.8d
+ ror ra0.8d, rb_y_coeffs_3, ra8.8c
+ # 5
+ ror ra3.8b, rb_y_coeffs_5, ra8.8d
+ ror ra1.8b, rb_y_coeffs_5, ra8.8c
+ # 6
+ mov r1,0x04040100
+ ror ra3.8c, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val
+
+ bra -, ra_link
+ # 4
+ mov r1,0x3a281100
+ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val
+ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
+# >>> branch ra_link
+
+# r5 = -8
+# r2 = fir_off_val
+# r3 = ra_kmul_add (0x4000)
+.endm
+
+:per_block_setup_8
+ m_per_block_setup 8
+
+
+
+################################################################################
+#
+# mc_filter_y_pxx
+#
+# Setup (& therefore uniform struct) shared with _bxx
+# Struct in m_luma_setup
+#
+# We can have 2 separate P reqs here as long as they mate to generate a
+# rectangular output block (i.e. h0 = h1, w0 = 8)
+#
+# At this point we have already issued PREREAD pairs of texture requests for the current block
+
+.macro m_filter_y_pxx, v_bit_depth
+
+# denom shift values
+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
+
+ m_luma_setup v_bit_depth
+
+ shl r1, ra_wt_off_l0, i_wt_den_p5
+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# This loop is identical to the B loop from here --->
+:1
+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+
+ max r2, ra_y, 0 ; mov r1, 0
+ min r2, r2, rb_max_y ; mov r3, ra_k1
+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+ add t0s, ra_base, r2 ; mov rb5, rb6
+ shr r0, r4, ra_xshift ; mov rb6, rb7
+
+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
+ shr r1, r4, rb_xshift2 ; mov rb7, ra8
+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+ add t1s, rb_base2, r2 ; mov ra8, ra9
+
+# apply horizontal filter
+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ brr.anyn -, r:1b
+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+ # >>> .anyn 1b (r5 + r5)
+
+ # apply vertical filter and write to VPM
+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
+
+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb8
+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+ add r1, r1, r0 ; mul24 r0, ra11, rb11
+# <--- to here
+ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
+ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
+ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
+
+ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
+
+ brr.anyn -, r:1b
+ asr r1, r1, i_wt_den_p6
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
+# >>> branch.anyn 1b (r5 - rb_lcount)
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_pxx
+ m_filter_y_pxx 8
+
+
+################################################################################
+
+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+#
+# Setup (& therefore uniform struct) shared with _pxx
+# Struct in m_luma_setup
+#
+# l0 calc in els 0-7, L1 in 8-15
+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh)
+#
+# At this point we have already issued PREREAD pairs of texture requests for the current block
+
+.macro m_filter_y_bxx, v_bit_depth
+
+# denom shift values
+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
+
+ m_luma_setup v_bit_depth
+
+ shl r1, ra_wt_off_l0, i_wt_den_p6
+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
+ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
+
+# This loop is identical to the P loop from here --->
+:1
+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
+
+ max r2, ra_y, 0 ; mov r1, 0
+ min r2, r2, rb_max_y ; mov r3, ra_k1
+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
+ add t0s, ra_base, r2 ; mov rb5, rb6
+ shr r0, r4, ra_xshift ; mov rb6, rb7
+
+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
+ shr r1, r4, rb_xshift2 ; mov rb7, ra8
+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
+ add t1s, rb_base2, r2 ; mov ra8, ra9
+
+# apply horizontal filter
+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+
+ brr.anyn -, r:1b
+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
+ # >>> .anyn 1b (r5 + r5)
+
+ # apply vertical filter and write to VPM
+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
+
+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb8
+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
+ add r1, r1, r0 ; mul24 r0, ra11, rb11
+# <--- to here
+ sub r1, r1, ra4
+ sub r1, r1, r0 ; mov r2, rb_wt_off
+
+ asr r1, r1, 6
+ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
+ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
+ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
+ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
+ add r1, r1, r2 ; mov r0, r1 << 8
+ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
+
+ brr.anyn -, r:1b
+ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
+# >>> branch.anyn 1b (r5 - rb_lcount)
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed block_height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link (ra_height - remaining height)
+
+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_bxx
+ m_filter_y_bxx 8
+
+################################################################################
+#
+# typedef struct qpu_mc_pred_y_p00_s {
+# qpu_mc_src_t next_src1;
+# uint16_t h;
+# uint16_t w;
+# uint32_t wo1;
+# uint32_t dst_addr;
+# uint32_t next_fn;
+# } qpu_mc_pred_y_p00_t;
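+#
+# (Matches qpu_mc_pred_y_p00_t in rpi_hevc_shader_cmd.h; the macro below
+# consumes these fields in order as successive uniform reads.)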
+
+.macro m_filter_y_p00, v_bit_depth
+
+.if v_bit_depth <= 8
+.set v_x_shift, 0
+.set v_x_mul, 1
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 7
+.set v_dma_wh_shift, i_shift16
+.else
+.set v_x_shift, 1
+.set v_x_mul, 2
+# Shifts to get width & height in the right place in ra_dma0
+.set v_dma_h_shift, 8
+.set v_dma_wh_shift, 15
+.endif
+
+ mov ra0, unif ; mov r0, elem_num # y_x
+ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0
+ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base
+.if v_x_shift != 0
+ shl r0, r0, v_x_shift
+.endif
+
+ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height
+ min r0, r0, rb_max_x ; mov ra_width_height, unif
+
+ shl ra_xshift_next, r0, 3 # Compute shifts
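+# (QPU shifts use only the bottom 5 bits of the shift amount, so this is in
+# effect 8 * (x & 3): the bit offset of the wanted byte within the 32-bit
+# word the TMU fetches)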
+ and r0, r0, -4
+ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset
+ and r1, r0, r2
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr
+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
+
+# get width,height of block (unif load above)
+# Compute vdw_setup1(dst_pitch-width)
+ shl r1, ra_width, v_x_shift
+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+ add r0, r0, r1 # Combine width and height of destination area
+ shl rb_wt_off, ra_wt_off_l0, DENOM + 7
+ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link
+ add ra_dma0, r0, rb_dma0_base
+
+:1
+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+ shl r1, r1, 8 ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, DENOM + 8
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_p00
+ m_filter_y_p00 8
+
+################################################################################
+
+.macro m_filter_y_b00, v_bit_depth
+# Luma setup does a fair bit more than we need here, calculating filter
+# coeffs that we will never use, but reusing it saves I-cache (and is simple!)
+ m_luma_setup v_bit_depth
+
+# Fix up vals that were expecting a filter (somewhat icky)
+ mov r2, 1
+ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want
+ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+
+:1
+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
+
+ max r2, ra_y2, 0
+ min r2, r2, rb_max_y
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8min masks out all but bottom byte
+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
+
+ shl r1, r1, 8 ; mov r3, ra_blk_height
+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+
+ brr.anyn -, r:1b
+ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate
+ min r1, r1, ra_pmax ; mov -, vw_wait
+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+# >>> branch.anyn 1b
+
+# r0 = remaining height (min 0)
+# r2 = r3 * rb_pitch
+# r3 = block_height
+
+# If looping again then we consumed 16 height last loop
+# rb_dma1 (stride) remains constant
+# rb_i_tmu remains const (based on total height)
+# recalc ra_dma0, rb_lcount based on new segment height
+
+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
+
+# DMA out
+ bra.anyz -, ra_link
+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
+ shl r1, r1, i_shift23
+# >>> .anyz ra_link
+
+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
+# We add to dma0 to reduce the number of output lines in the final block
+ brr -, r:1b
+ add rb_lcount, rb_lcount, r0
+ add ra_dma0, ra_dma0, r1
+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
+# >>> 1b
+.endm
+
+::mc_filter_y_b00
+ m_filter_y_b00 8
+
+################################################################################
+################################################################################
+# 10 BIT
+
+::mc_setup_c10_q0
+ m_setup_q0
+::mc_setup_c10_qn
+ m_setup_c 10
+
+::mc_filter_c10_p
+ m_filter_c_p 0, 10
+
+::mc_filter_c10_p_l1
+ m_filter_c_p 1, 10
+
+
+::mc_filter_c10_b
+ m_filter_c_b 10
+
+# Even though these fns are the same as for the other bit depths we want our
+# own copy, to keep the code we are using in a single lump and so avoid
+# (direct mapped) cache thrashing
+.set v_quads10, N_QPU_16 / 4
+
+::mc_sync10_q0
+ m_sync_q 0, v_quads10
+::mc_sync10_q1
+ m_sync_q 1, v_quads10
+::mc_sync10_q2
+ m_sync_q 2, v_quads10
+::mc_sync10_q3
+ m_sync_q 3, v_quads10
+::mc_sync10_q4
+ m_sync_q 4, v_quads10
+::mc_sync10_q5
+ m_sync_q 5, v_quads10
+::mc_sync10_q6
+ m_sync_q 6, v_quads10
+::mc_sync10_q7
+ m_sync_q 7, v_quads10
+::mc_sync10_q8
+ m_sync_q 8, v_quads10
+::mc_sync10_q9
+ m_sync_q 9, v_quads10
+::mc_sync10_q10
+ m_sync_q 10, v_quads10
+::mc_sync10_q11
+ m_sync_q 11, v_quads10
+
+::mc_exit_y10_q0
+::mc_exit_c10_q0
+ m_exit_q0
+
+::mc_exit_y10_qn
+::mc_exit_c10_qn
+ m_exit_qn
+
+::mc_setup_y10_q0
+ m_setup_q0
+::mc_setup_y10_qn
+ m_setup_y 10
+
+:per_block_setup_10
+ m_per_block_setup 10
+
+::mc_filter_y10_pxx
+ m_filter_y_pxx 10
+
+::mc_filter_y10_p00
+ m_filter_y_p00 10
+
+::mc_filter_y10_bxx
+ m_filter_y_bxx 10
+
+::mc_filter_y10_b00
+ m_filter_y_b00 10
+
+
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h
new file mode 100644
index 0000000000..89711d776b
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader_cmd.h
@@ -0,0 +1,165 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef RPI_SHADER_CMD_H
+#define RPI_SHADER_CMD_H
+
+#pragma pack(push, 4)
+
+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
+// If mixed then we are just confused and get a lot of warnings....
+typedef const uint8_t * qpu_mc_src_addr_t;
+typedef uint8_t * qpu_mc_dst_addr_t;
+#else
+typedef uint32_t qpu_mc_src_addr_t;
+typedef uint32_t qpu_mc_dst_addr_t;
+#endif
+
+typedef struct qpu_mc_src_s
+{
+ int16_t y;
+ int16_t x;
+ qpu_mc_src_addr_t base;
+} qpu_mc_src_t;
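+
+// NB: y before x means that, on a little-endian load, y is the low half and
+// x the high half of the single y_x uniform word the shaders read first.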
+
+
+typedef struct qpu_mc_pred_c_p_s {
+ qpu_mc_src_t next_src;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x;
+ uint32_t coeffs_y;
+ uint32_t wo_u;
+ uint32_t wo_v;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_p_t;
+
+typedef struct qpu_mc_pred_c_b_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t coeffs_x1;
+ uint32_t coeffs_y1;
+ int16_t weight_u1;
+ int16_t weight_v1;
+ qpu_mc_src_t next_src2;
+ uint32_t coeffs_x2;
+ uint32_t coeffs_y2;
+ uint32_t wo_u2;
+ uint32_t wo_v2;
+ qpu_mc_dst_addr_t dst_addr_c;
+ uint32_t next_fn;
+} qpu_mc_pred_c_b_t;
+
+typedef struct qpu_mc_pred_c_s_s {
+ qpu_mc_src_t next_src1;
+ uint32_t pic_cw; // C width (== Y width / 2)
+ uint32_t pic_ch; // C height (== Y height / 2)
+ uint32_t stride2;
+ uint32_t stride1;
+ qpu_mc_src_t next_src2;
+ uint32_t next_fn;
+} qpu_mc_pred_c_s_t;
+
+typedef struct qpu_mc_pred_c_s {
+ union {
+ qpu_mc_pred_c_p_t p;
+ qpu_mc_pred_c_b_t b;
+ qpu_mc_pred_c_s_t s;
+ };
+} qpu_mc_pred_c_t;
+
+
+typedef struct qpu_mc_pred_y_p_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t h;
+ uint16_t w;
+ uint32_t mymx21;
+ uint32_t wo1;
+ uint32_t wo2;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p_t;
+
+typedef struct qpu_mc_pred_y_p00_s {
+ qpu_mc_src_t next_src1;
+ uint16_t h;
+ uint16_t w;
+ uint32_t wo1;
+ qpu_mc_dst_addr_t dst_addr;
+ uint32_t next_fn;
+} qpu_mc_pred_y_p00_t;
+
+typedef struct qpu_mc_pred_y_s_s {
+ qpu_mc_src_t next_src1;
+ qpu_mc_src_t next_src2;
+ uint16_t pic_h;
+ uint16_t pic_w;
+ uint32_t stride2;
+ uint32_t stride1;
+ uint32_t next_fn;
+} qpu_mc_pred_y_s_t;
+
+typedef struct qpu_mc_pred_sync_s {
+ uint32_t next_fn;
+} qpu_mc_pred_sync_t;
+
+// Only a useful structure in that it allows us to return something other than a void *
+typedef struct qpu_mc_pred_y_s {
+ union {
+ qpu_mc_pred_y_p_t p;
+ qpu_mc_pred_y_p00_t p00;
+ qpu_mc_pred_y_s_t s;
+ };
+} qpu_mc_pred_y_t;
+
+typedef union qpu_mc_pred_cmd_u {
+ qpu_mc_pred_y_t y;
+ qpu_mc_pred_c_t c;
+ qpu_mc_pred_sync_t sync;
+} qpu_mc_pred_cmd_t;
+
+static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
+{
+ // Link is last el of previous cmd
+ ((uint32_t *)cmd)[-1] = fn;
+}
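+
+// Usage sketch (added for illustration; names here are hypothetical): the fn
+// that will process a command lives in the next_fn (i.e. last) word of the
+// command that precedes it in the stream, so the handler for a command is
+// registered by patching the command before it:
+//
+//   qpu_mc_pred_cmd_t * next_cmd = ...;  // one past the cmd just filled in
+//   qpu_mc_link_set(next_cmd, fn_code);  // fn_code will process *next_cmd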
+
+#define QPU_MC_PRED_N_Y8 12
+#define QPU_MC_PRED_N_C8 12
+
+#define QPU_MC_PRED_N_Y10 12
+#define QPU_MC_PRED_N_C10 12
+
+#define QPU_MC_DENOM 7
+
+#pragma pack(pop)
+
+#endif
+
diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
new file mode 100644
index 0000000000..77d8366eb8
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader_template.c
@@ -0,0 +1,88 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "hevc.h"
+#include "rpi_hevcdec.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "rpi_hevc_shader_cmd.h"
+#include "rpi_hevc_shader_template.h"
+
+typedef struct shader_track_s
+{
+ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ const struct qpu_mc_src_s *last_l0;
+ const struct qpu_mc_src_s *last_l1;
+ uint32_t width; // pic_width * PW
+ uint32_t height;
+ uint32_t stride2;
+ uint32_t stride1;
+} shader_track_t;
+
+static int wtoidx(const unsigned int w)
+{
+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+ return pel_weight[w];
+}
+
+static int fctom(uint32_t x)
+{
+ int rv;
+ // As it happens we can take the 2nd filter term & divide it by 8
+ // (dropping fractions) to get the fractional move
+ rv = 8 - ((x >> 11) & 0xf);
+ av_assert2(rv >= 0 && rv <= 7);
+ return rv;
+}
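+// (e.g. bits [14:11] of x holding 8 give rv == 0, i.e. no fractional move;
+// holding 1 they give the maximum move of 7)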
+
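+// Arithmetic bitfield extract: shift left then arithmetically right, sign
+// extending the result (assumes the arithmetic >> on signed values that
+// FFmpeg relies on elsewhere).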
+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
+{
+ return (x << shl) >> shr;
+}
+
+static inline int woff_p(HEVCRpiContext *const s, int32_t x)
+{
+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int woff_b(HEVCRpiContext *const s, int32_t x)
+{
+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
+}
+
+static inline int wweight(int32_t x)
+{
+ return ext(x, 16, 16);
+}
+
+
+#define PW 1
+#include "rpi_hevc_shader_template_fn.h"
+
+#undef PW
+#define PW 2
+#include "rpi_hevc_shader_template_fn.h"
+
diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
new file mode 100644
index 0000000000..0fc5a45e9f
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader_template.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
+
+struct HEVCRpiContext;
+struct HEVCRpiInterPredEnv;
+
+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
+ const struct HEVCRpiInterPredEnv *const ipe_y,
+ const struct HEVCRpiInterPredEnv *const ipe_c);
+
+void rpi_sand_dump8(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+void rpi_sand_dump16(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
+
+#endif
+
diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
new file mode 100644
index 0000000000..10c163a4b9
--- /dev/null
+++ b/libavcodec/rpi_hevc_shader_template_fn.h
@@ -0,0 +1,502 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
+
+#define PATCH_STRIDE (16 * PW)
+
+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
+ const pixel s = *(const pixel *)src;
+ pixel * d = (pixel *)dst;
+ for (unsigned int j = 0; j < w; j += PW) {
+ *d++ = s;
+ }
+ }
+}
+
+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
+{
+ for (unsigned int i = 0; i != h; ++i, dst += stride) {
+ memcpy(dst, src, w);
+ }
+}
+
+static void FUNC(get_patch_y)(const shader_track_t * const st,
+ uint8_t * dst, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
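+ // dl/dr/dt/db: pixels to duplicate on the left/right/top/bottom edges
+ // once the requested rectangle has been clamped into the picture.
+ // Example (added): x = -2, w = 16 with PW == 1 gives dl = 2; 14 columns
+ // are fetched from x = 0 and the leftmost one is replicated twice.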
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > st->width) {
+ if (x >= st->width)
+ x = st->width - PW;
+ dr = (x + w) - st->width;
+ w = st->width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > st->height) {
+ if (y >= st->height)
+ y = st->height - 1;
+ db = (y + h) - st->height;
+ h = st->height - y;
+ }
+
+ dst += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
+ if (dr != 0)
+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
+ w += dl + dr;
+ dst -= dl;
+
+ if (dt != 0)
+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
+ if (db != 0)
+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
+}
+
+
+
+static void FUNC(get_patch_c)(const shader_track_t * const st,
+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
+ const qpu_mc_src_t *src,
+ unsigned int _w, unsigned int _h)
+{
+ int x = src->x * PW;
+ int y = src->y;
+ int w = _w * PW;
+ int h = _h;
+ int dl = 0;
+ int dr = 0;
+ int dt = 0;
+ int db = 0;
+ const int width = st->width;
+ const int height = st->height;
+
+ if (x < 0) {
+ if (-x >= w)
+ x = PW - w;
+ dl = -x;
+ w += x;
+ x = 0;
+ }
+ if (x + w > width) {
+ if (x >= width)
+ x = width - PW;
+ dr = (x + w) - width;
+ w = width - x;
+ }
+
+ // Y
+ if (y < 0) {
+ if (-y >= h)
+ y = 1 - h;
+ dt = -y;
+ h += y;
+ y = 0;
+ }
+ if (y + h > height) {
+ if (y >= height)
+ y = height - 1;
+ db = (y + h) - height;
+ h = height - y;
+ }
+
+ dst_u += dl + dt * dst_stride;
+ dst_v += dl + dt * dst_stride;
+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
+
+ // Edge dup
+ if (dl != 0)
+ {
+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
+ }
+ if (dr != 0)
+ {
+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
+ }
+ w += dl + dr;
+ dst_u -= dl;
+ dst_v -= dl;
+
+ if (dt != 0)
+ {
+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
+ }
+ if (db != 0)
+ {
+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
+ }
+}
+
+// x, y, w, h in pixels
+// stride1, stride2 in bytes
+void FUNC(rpi_sand_dump)(const char * const name,
+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
+{
+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
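+ // (relies on stride1 being a power of two, as it is for sand format)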
+
+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
+
+ if (is_c) {
+ x *= 2;
+ w *= 2;
+ }
+
+ for (int i = y; i != y + h; ++i) {
+ for (int j = x; j != x + w; ++j) {
+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
+#if PW == 1
+ if (j < 0 || i < 0)
+ printf("..%c", sep);
+ else
+ printf("%02x%c", *(const pixel*)p, sep);
+#else
+ if (j < 0 || i < 0)
+ printf("...%c", sep);
+ else
+ printf("%03x%c", *(const pixel*)p, sep);
+#endif
+ }
+ printf("\n");
+ }
+}
+
+
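+// C-side interpreter for the QPU prediction command stream: for each QPU it
+// walks the chain of commands, dispatching each one to the equivalent
+// hevcdsp call, until every stream hits its exit command. (This is
+// presumably the path taken when the QPU is emulated; cf. RPI_QPU_EMU_C /
+// RPI_QPU_EMU_Y in rpi_hevc_shader_cmd.h.)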
+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
+ const HEVCRpiInterPredEnv *const ipe_y,
+ const HEVCRpiInterPredEnv *const ipe_c)
+{
+ for (int c_idx = 0; c_idx < 2; ++c_idx)
+ {
+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
+ unsigned int exit_n = 0;
+
+ if (ipe == NULL || !ipe->used) {
+ continue;
+ }
+
+ do {
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ const HEVCRpiInterPredQ * const q = ipe->q + i;
+ shader_track_t * const st = tracka + i;
+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+
+ for (;;) {
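+ // The fn that handles this cmd is the next_fn (last) word of the
+ // previous cmd; at the very start of a stream there is no previous
+ // cmd, so it behaves as code_setup.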
+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
+
+ if (link == q->code_setup) {
+ if (c_idx == 0) {
+ // Luma
+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
+
+ st->height = c->pic_h;
+ st->width = c->pic_w * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else {
+ // Chroma
+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
+
+ st->height = c->pic_ch;
+ st->width = c->pic_cw * PW;
+ st->stride1 = c->stride1;
+ st->stride2 = c->stride2;
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ }
+ else if (link == s->qpu.y_pxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+ const int w1 = FFMIN(c->w, 8);
+ const int w2 = c->w - w1;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ if (w2 > 0) {
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+ }
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
+ if (w2 > 0) {
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
+ }
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_bxx) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h + 7);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_p00) {
+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h + 7);
+
+ // wo[offset] = offset*2+1
+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
+
+ st->last_l0 = &c->next_src1;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.y_b00) {
+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ av_assert0(c->w <= 16 && c->h <= 64);
+
+ FUNC(get_patch_y)(st,
+ patch_y1, PATCH_STRIDE,
+ st->last_l0,
+ 16, c->h);
+ FUNC(get_patch_y)(st,
+ patch_y2, PATCH_STRIDE,
+ st->last_l1,
+ 16, c->h);
+
+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
+ patch_y3, patch_y1, PATCH_STRIDE,
+ c->h, 0, 0, c->w);
+
+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
+ 0, woff_b(s, c->wo2), 0, 0, c->w);
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_pxx_l1) {
+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
+ const int mx = fctom(c->coeffs_x);
+ const int my = fctom(c->coeffs_y);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l1 = &c->next_src;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == s->qpu.c_bxx) {
+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
+ const int mx1 = fctom(c->coeffs_x1);
+ const int my1 = fctom(c->coeffs_y1);
+ const int mx2 = fctom(c->coeffs_x2);
+ const int my2 = fctom(c->coeffs_y2);
+
+ uint8_t patch_u1[PATCH_STRIDE * 72];
+ uint8_t patch_v1[PATCH_STRIDE * 72];
+ uint8_t patch_u2[PATCH_STRIDE * 72];
+ uint8_t patch_v2[PATCH_STRIDE * 72];
+ uint8_t patch_u3[8 * 16 * PW];
+ uint8_t patch_v3[8 * 16 * PW];
+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
+
+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
+
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
+ c->h, mx1, my1, c->w);
+
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
+ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
+ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
+
+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
+
+ st->last_l0 = &c->next_src1;
+ st->last_l1 = &c->next_src2;
+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
+ }
+ else if (link == q->code_sync) {
+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
+ break;
+ }
+ else if (link == q->code_exit) {
+ // We expect exit to occur without other sync
+ av_assert0(i == exit_n);
+ ++exit_n;
+ break;
+ }
+ else {
+ av_assert0(0);
+ }
+ }
+
+ st->qpu_mc_curr = cmd;
+ }
+ } while (exit_n == 0);
+ }
+}
+
+#undef FUNC
+#undef pixel
+
diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
new file mode 100644
index 0000000000..3caef20137
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform.s
@@ -0,0 +1,444 @@
+# ******************************************************************************
+# Argon Design Ltd.
+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+#
+# Module : HEVC
+# Author : Peter de Rivaz
+# ******************************************************************************
+
+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
+.set USE_STACK, 0
+
+# Lines that fail to assemble start with #:
+# The script insert_magic_opcodes.sh inserts the machine code directly for these.
+# HEVC VPU Transform
+#
+# Transform matrix can be thought of as
+# output row vector = input row vector * transMatrix2
+#
+# The even rows of the matrix are symmetric
+# The odd rows of the matrix are antisymmetric
+#
+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+#
+# EXAMPLE
+# (a b c d) (1 2 2 1)
+# (3 4 -4 -3)
+# (5 6 6 5)
+# (7 8 -8 -7)
+#
+# x=(a c)(1 2) = 1a+5c 2a+6c
+# (5 6)
+#
+# y=(b d)(3 4) = 3b+7d 4b+8d
+# (7 8)
+#
+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+#
+# Final results are (u , v[::-1])
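+#
+# Worked instance (added for clarity): with a=1 b=2 c=3 d=4 and the matrix
+# above:
+#   x = (1a+5c, 2a+6c) = (16, 20)
+#   y = (3b+7d, 4b+8d) = (34, 40)
+#   u = x+y = (50, 60)   v = x-y = (-18, -20)
+#   (u , v[::-1]) = (50, 60, -20, -18)
+# which matches the direct row-vector * matrix product:
+#   1+6+15+28=50  2+8+18+32=60  2-8+18-32=-20  1-6+15-28=-18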
+#
+#
+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+# Apply the even matrix first and stop before rounding
+# Then apply the odd matrix in a full manner:
+#
+# First step is to compute partial products with the first input (16 cycles)
+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
+# 2a 4b 6c 8d
+# 2a -4b 6c -8d
+# 1a -3b 5c -7d
+#
+# Second step is to sum partial products into final position (8 cycles)
+# 1a+3b+5c+7d
+# 2a+4b+6c+8d
+# 2a-4b+6c-8d
+# 1a-3b+5c-7d
+#
+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+#
+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+#
+# For 8x8 we could compute two in parallel.
+#
+#
+
+# Columns are transformed first
+#
+# Store top left half of transMatrix2 in HX(32,0)
+# Store bottom left half of transMatrix2 in HX(32,32)
+#
+# For 16x16
+# HX(0:15,0) contains input data before transform
+# HY(0:15,0) contains 32bit output data after transform
+# HX(32,0) contains even rows of left half of transMatrix2
+# HX(32,32) contains odd rows of left half of transMatrix2
+# HY(48,0) contains partial products ready for summing
+#
+
+
+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 16x16 transforms to be done
+# coeffs32
+# num32: number of 32x32 transforms
+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
+#
+
+.equ TRANS_SHIFT, 20 - BIT_DEPTH
+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
+.equ TRANS_ASL2, 16 - TRANS_SHIFT
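+# (For BIT_DEPTH == 8 these evaluate to TRANS_SHIFT = 12, TRANS_RND2 = 2048,
+# TRANS_ASL2 = 4; for BIT_DEPTH == 10 to 10, 512 and 6.)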
+
+
+hevc_trans_16x16:
+ push r6-r15, lr # TODO cut down number of used registers
+ mov r14,r3 # coeffs32
+ mov r15,r4 # num32
+ mov r3, 16*2 # Stride of transMatrix2 in bytes
+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+
+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ # Now use r0 to describe which matrix we are working on.
+ # Allows us to prefetch the next block of coefficients for efficiency.
+ mov r0,0 # This describes the location where we read our coefficients from
+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+ mov r7,16*16*2 # Total block size
+ mov r8,64*16 # Value used to swap from current to next VRF location
+ mov r4,64 # Constant used for rounding first pass
+ mov r5,TRANS_RND2 # Constant used for rounding second pass
+
+ sub sp,sp,64+16*16*2 # Move the stack pointer down in case an interrupt occurs and uses the stack
+
+ add r11,sp,64 # Space for 32 bytes before, and rounding
+ lsr r11,5
+ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
+
+ lsr r10, r2, 16 # Number of compressed blocks stored in top short
+ extu r2,16
+ # At start of block r0,r1 point to the current block (that has already been loaded)
+ # r0 VRF location of current block
+ # r1 address of current block
+ # r2 number of 16*16 transforms to do
+ # r3 Stride of coefficients (==32)
+ # r4 TRANS_RND1 (64)
+ # r5 TRANS_RND2
+ # r6 temporary used inside col_trans16
+ # r7 16*16*2 total bytes in block
+ # r8 64*16 VRF switch locations
+ # r9 temporary in unpack_coeff for index
+ # r10 number of 16x16 transforms using compression
+ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
+ # r12 temporary counter in unpack_coeff
+ # r13
+ # r14 Save information for 32 bit transform (coeffs location)
+ # r15 Save information for 32 bit transform (number of transforms)
+ cmp r2,0
+ beq done16x16s
+block_loop:
+ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
+ cmp r10,0
+ mov r6, r1
+ beq not_compressed
+ sub r10, 1
+ bl unpack16x16
+not_compressed:
+ #mov r6,r1 # DEBUG without compress
+ vldh HX(0++,0)+r0,(r6 += r3) REP 16
+ #eor r0,r8
+ #add r1,r7
+ # Prefetch the next block
+ #bl unpack16x16
+ #vldh HX(0++,0)+r0,(r6 += r3) REP 16
+ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG
+ #eor r0,r8
+ #sub r1,r7
+
+ # Transform the current block
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
+
+ bl col_trans_16
+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
+
+ # Save results - note there has been a transposition during the processing so we save columns
+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
+
+ # Move onto next block
+ eor r0,r8
+ add r1,r7
+
+ addcmpbgt r2,-1,0,block_loop
+done16x16s:
+
+ add sp,sp,64+16*16*2 # Restore the stack pointer
+ # Now go and do any 32x32 transforms
+ b hevc_trans_32x32
+
+ pop r6-r15, pc
+# This returns a value in r6 that says where to load the data from.
+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
+unpack16x16:
+# Clear out destination
+ vmov HX(0,0)+r0,0
+ mov r6, r11
+ vsth HX(0,0)+r0,(r6 += r3) REP 16
+ mov r5, r1 # Moving pointer to input coefficients
+unpack_outer_loop:
+ # Loop until we find the end
+ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous?
+ sub r6,r11,32
+ #add r6,pc,packed_data-$ # Packed data
+ vsth HX(0,0)+r0,(r6) # Store into packed data
+ mov r12,0
+unpack_loop:
+ ld r4,(r6)
+ add r6,r6,4
+ lsr r9,r4,16 # r9 is destination value
+ cmp r4,0 # {value,index}
+ extu r4,8
+ beq done_unpack
+ sth r9,(r11, r4)
+ addcmpblt r12,1,8,unpack_loop
+# Read next 16
+ add r5,32
+ b unpack_outer_loop
+done_unpack:
+# Set new load location
+ mov r6, r11
+ #add r6,pc,unpacked_data-$
+# Restore constants
+ mov r4,64
+ mov r5,TRANS_RND2
+# pop r6-r15, pc
+ b lr
+
+# r1,r2,r3 r7,r8 should be preserved
+# HX(0++,0)+r0 is the block to be transformed
+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+# Use HY(48,0) for intermediate results
+# r0 can be used, but should be returned to its original value at the end
+col_trans_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+col_trans_odd_16:
+ add r6,r0,16 # Final value for this loop
+col_trans_odd_16_loop:
+ # First compute partial products for a single column
+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+ # Then sum up the results and place back
+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+ addcmpblt r0,1,r6,col_trans_odd_16_loop
+ sub r0,16 # put r0 back to its original value
+ b lr
+
+# r1/r10 input pointer
+# r0,r4,r5,r6 free
+# r8/r9 output storage
+#
+# Store packed coefficients at r9-32
+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows)
+unpack32x32:
+# Clear out destination
+ vmov HX(0,0),0
+ add r0, r9, 32*32*2 # Unpacked buffer
+ mov r4, 32
+ vsth HX(0,0),(r0 += r4) REP 64
+unpack_outer_loop32:
+ # Loop until we find the end
+ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous?
+ sub r6,r9,32
+ #add r6,pc,packed_data-$ # Packed data
+ vsth HX(0,0),(r6) # Store into packed data
+ mov r8,0
+unpack_loop32:
+ ld r4,(r6)
+ add r6,r6,4
+ lsr r5,r4,16 # r5 is destination value
+ cmp r4,0 # {value,index}
+ extu r4,10
+ beq done_unpack32
+ sth r5,(r0, r4)
+ addcmpblt r8,1,8,unpack_loop32
+# Read next 16
+ add r1,32
+ b unpack_outer_loop32
+done_unpack32:
+ b lr
+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+# num: number of 32x32 transforms to be done in low 16, number of packed in high 16
+#
+# Note that the 32x32 transforms are stored in reverse order, which means that the unpacked ones appear first!
+hevc_trans_32x32:
+ mov r1,r14 # coeffs
+ mov r2,r15 # num
+ lsr r15,r15,16 # Number that are packed
+ extu r2,16 # Total number
+
+ # Fetch odd transform matrix
+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+ #add r0, 16*16*2
+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+
+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+ mov r7, 16*16*2 # Total block size
+
+.if USE_STACK
+ # Stack base allocation
+ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
+ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
+ add r8,sp,63
+ lsr r8,5
+ lsl r8,5
+.else
+#:version r8
+ .half 0x00e8 #AUTOINSERTED
+ btst r8,16
+#:add r8,pc,intermediate_results-$
+ .half 0xbfe8
+ .half intermediate_results-($-2)
+ beq on_vpu1
+ add r8,r8,32*32*2*2+16*2 # Move to secondary storage
+on_vpu1:
+.endif
+ mov r9,r8 # Backup of the temporary storage
+ mov r10,r1 # Backup of the coefficient buffer
+
+ cmp r2,0
+ beq done32x32s
+block_loop32:
+
+ # Transform the first 16 columns
+ mov r1,r10 # Input Coefficient buffer
+ mov r8,r9 # Output temporary storage
+ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
+ cmp r2,r15
+ bgt not_compressed_32
+ bl unpack32x32
+ add r1,r9,32*32*2 # Uncompressed into temporary storage
+ mov r8,r9 # Transform into here
+not_compressed_32:
+ # COLUMN TRANSFORM
+ mov r4, 64 # Constant used for rounding first pass
+ mov r5, 9 # left shift used for rounding first pass
+
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ # ROW TRANSFORM
+ mov r4, TRANS_RND2 # Constant used for rounding second pass
+ mov r5, TRANS_ASL2 # left shift used for rounding second pass
+
+ mov r1,r9 # Input temporary storage
+ mov r8,r10 # Output Coefficient buffer
+ bl trans32
+ # Transform the second 16 columns
+ add r8,32*16*2
+ add r1,32
+ bl trans32
+
+ add r10, 32*32*2 # move onto next block of coefficients
+ addcmpbgt r2,-1,0,block_loop32
+done32x32s:
+
+.if USE_STACK
+ add sp,sp,32*32*4+64# Restore stack
+.endif
+
+ pop r6-r15, pc
+
+trans32:
+ push lr
+ # We can no longer afford the VRF space to do prefetching when doing 32x32
+ # Fetch the even rows
+ vldh HX(0++,0),(r1 += r3) REP 16
+ # Fetch the odd rows
+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+
+ # Transform the even rows using even matrix
+ mov r0, 0 # Even rows
+ bl col_trans_16
+
+ # Now transform the odd rows using odd matrix
+ mov r0, 64*16 # Odd rows
+ bl col_trans_odd_16
+
+ # Now apply butterfly to compute the first 16 results
+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ # 16bit results now in HX(48,32)
+ mov r0,r8
+ mov r6,32*2
+ vsth VX(48,32++),(r0+=r6) REP 16
+
+ # Now apply butterfly to compute the second 16 results (in reverse order)
+ vsub HY(63,0),HY(0 ,0),HY(16,0)
+ vsub HY(62,0),HY(1 ,0),HY(17,0)
+ vsub HY(61,0),HY(2 ,0),HY(18,0)
+ vsub HY(60,0),HY(3 ,0),HY(19,0)
+ vsub HY(59,0),HY(4 ,0),HY(20,0)
+ vsub HY(58,0),HY(5 ,0),HY(21,0)
+ vsub HY(57,0),HY(6 ,0),HY(22,0)
+ vsub HY(56,0),HY(7 ,0),HY(23,0)
+ vsub HY(55,0),HY(8 ,0),HY(24,0)
+ vsub HY(54,0),HY(9 ,0),HY(25,0)
+ vsub HY(53,0),HY(10,0),HY(26,0)
+ vsub HY(52,0),HY(11,0),HY(27,0)
+ vsub HY(51,0),HY(12,0),HY(28,0)
+ vsub HY(50,0),HY(13,0),HY(29,0)
+ vsub HY(49,0),HY(14,0),HY(30,0)
+ vsub HY(48,0),HY(15,0),HY(31,0)
+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
+ add r0,r8,32
+ vsth VX(48,32++),(r0+=r6) REP 16
+ pop pc
+
+.if USE_STACK == 0
+ .balign 32
+
+# .space directives generate 0's in the bin, so avoid unnecessary padding by
+# just setting the symbol to the appropriate value
+.equ intermediate_results, $+16*2
+
+# Layout goes:
+#
+#packed_buffer:
+# .space 16*2
+#intermediate_results:
+# .space 32*32*2
+#unpacked_buffer:
+# .space 32*32*2
+#
+#packed_buffer2:
+# .space 16*2
+#intermediate_results2:
+# .space 32*32*2
+#unpacked_buffer2:
+# .space 32*32*2
+.endif
+
+
diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
new file mode 100644
index 0000000000..1c364492d0
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform10.h
@@ -0,0 +1,94 @@
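+// Presumably the assembled VPU code from rpi_hevc_transform.s built with
+// BIT_DEPTH == 10 (the 8-bit build is rpi_hevc_transform8.h): raw
+// instruction bytes, 8 per line, with the offset in the trailing comment.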
+static const unsigned char rpi_hevc_transform10 [] = {
+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030
+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090
+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8
+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
+};
diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
new file mode 100644
index 0000000000..1128a2c054
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform8.h
@@ -0,0 +1,94 @@
+static const unsigned char rpi_hevc_transform8 [] = {
+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030
+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090
+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8
+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
+};
diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
new file mode 100644
index 0000000000..e651e5c565
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.c
@@ -0,0 +1,6134 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Mickael Raulet
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2012 - 2013 Wassim Hamidouche
+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/display.h"
+#include "libavutil/internal.h"
+#include "libavutil/mastering_display_metadata.h"
+#include "libavutil/md5.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/stereo3d.h"
+
+#include "decode.h"
+#include "bswapdsp.h"
+#include "bytestream.h"
+#include "golomb.h"
+#include "hevc.h"
+#include "rpi_hevc_data.h"
+#include "rpi_hevc_parse.h"
+#include "rpi_hevcdec.h"
+#include "rpi_hevc_cabac_fns.h"
+#include "profiles.h"
+#include "hwconfig.h"
+
+#include "rpi_zc_frames.h"
+#include "rpi_qpu.h"
+#include "rpi_hevc_shader.h"
+#include "rpi_hevc_shader_cmd.h"
+#include "rpi_hevc_shader_template.h"
+#include "rpi_zc.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#include "pthread.h"
+#include <stdatomic.h>
+
+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
+
+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
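+// e.g. PACK2(1, -1) == 0x0001ffff - hi in the top 16 bits, lo masked to 16 bits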
+
+#ifndef av_mod_uintp2
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+ return a & ((1 << p) - 1);
+}
+# define av_mod_uintp2 av_mod_uintp2_c
+#endif
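+// (e.g. av_mod_uintp2(0x1234, 8) == 0x34 - keeps only the low p bits)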
+
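+// Dense 0..9 index for each pel width that can occur; unlisted widths
+// default to 0 and are not expected to be looked up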
+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
+
+#define MC_DUMMY_X (-32)
+#define MC_DUMMY_Y (-32)
+
+// UV & Y both have min 4x4 pred (no 2x2 chroma)
+// Allow for even spread +1 for setup, +1 for rounding
+// As we have load sharing this can (in theory) be exceeded so we have to
+// check after each CTU, but it is a good base size
+
+// Worst case (all 4x4) commands per CTU
+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
+#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
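+// (A 64x64 CTU holds 16x16 = 256 4x4 luma preds; at 4:2:0 its chroma plane
+// is 32x32, i.e. 8x8 = 64 4x4 preds - hence the 16*16 and 8*8 figures above)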
+
+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
+
+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
+
+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
+
+// Total cmds to allocate - allow for slack & setup
+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
+
+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
+
+// The QPU code for UV blocks only works up to a block width of 8
+#define RPI_CHROMA_BLOCK_WIDTH 8
+
+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+
+
+// Actual filter goes -ve, +ve, +ve, -ve using these values
+static const uint32_t rpi_filter_coefs[8] = {
+ ENCODE_COEFFS( 0, 64, 0, 0),
+ ENCODE_COEFFS( 2, 58, 10, 2),
+ ENCODE_COEFFS( 4, 54, 16, 2),
+ ENCODE_COEFFS( 6, 46, 28, 4),
+ ENCODE_COEFFS( 4, 36, 36, 4),
+ ENCODE_COEFFS( 4, 28, 46, 6),
+ ENCODE_COEFFS( 2, 16, 54, 4),
+ ENCODE_COEFFS( 2, 10, 58, 2)
+};
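+// e.g. entry [1] packs (2, 58, 10, 2) which, with the sign pattern above,
+// gives the standard HEVC 4-tap chroma interpolation filter (-2, 58, 10, -2)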
+
+// Function arrays by QPU
+
+static const int * const inter_pred_setup_c_qpu[12] = {
+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
+};
+
+static const int * const inter_pred_setup_c10_qpu[12] = {
+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
+};
+
+static const int * const inter_pred_setup_y_qpu[12] = {
+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
+};
+
+static const int * const inter_pred_setup_y10_qpu[12] = {
+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
+};
+
+static const int * const inter_pred_sync_qpu[12] = {
+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
+};
+
+static const int * const inter_pred_sync10_qpu[12] = {
+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
+};
+
+static const int * const inter_pred_exit_c_qpu[12] = {
+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
+};
+
+static const int * const inter_pred_exit_c10_qpu[12] = {
+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
+};
+
+static const int * const inter_pred_exit_y_qpu[12] = {
+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
+};
+
+static const int * const inter_pred_exit_y10_qpu[12] = {
+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
+};
+
+typedef struct ipe_chan_info_s
+{
+ const uint8_t bit_depth;
+ const uint8_t n;
+ const int * const * setup_fns;
+ const int * const * sync_fns;
+ const int * const * exit_fns;
+} ipe_chan_info_t;
+
+typedef struct ipe_init_info_s
+{
+ ipe_chan_info_t luma;
+ ipe_chan_info_t chroma;
+} ipe_init_info_t;
+
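+// Fill a (1 << ln) x (1 << ln) square of bytes at b (rows stride apart) with
+// the value a - e.g. ln == 2 replicates a to 32 bits and stores 4 rows of 4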
+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
+{
+ switch (ln)
+ {
+ default: // normally 0
+ *b = a;
+ break;
+ case 1:
+ a |= a << 8;
+ *(uint16_t *)b = a;
+ b += stride;
+ *(uint16_t *)b = a;
+ break;
+ case 2:
+ a |= a << 8;
+ a |= a << 16;
+ *(uint32_t *)b = a;
+ b += stride;
+ *(uint32_t *)b = a;
+ b += stride;
+ *(uint32_t *)b = a;
+ b += stride;
+ *(uint32_t *)b = a;
+ break;
+ case 3:
+ {
+ unsigned int i;
+ uint64_t d;
+ a |= a << 8;
+ a |= a << 16;
+ d = ((uint64_t)a << 32) | a;
+ for (i = 0; i != 8; ++i, b += stride)
+ *(uint64_t *)b = d;
+ break;
+ }
+ case 4:
+ {
+ unsigned int i;
+ uint64_t d;
+ a |= a << 8;
+ a |= a << 16;
+ d = ((uint64_t)a << 32) | a;
+ for (i = 0; i != 16; ++i, b += stride)
+ {
+ *(uint64_t *)b = d;
+ *(uint64_t *)(b + 8) = d;
+ }
+ break;
+ }
+ }
+}
+
+// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3
+// (4 not required)
+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
+{
+ switch (ln)
+ {
+ default: // 0 or -1
+ *b_u = a;
+ *b_l = a;
+ break;
+ case 1:
+ a |= a << 8;
+ *(uint16_t *)b_u = a;
+ *(uint16_t *)b_l = a;
+ break;
+ case 2:
+ a |= a << 8;
+ a |= a << 16;
+ *(uint32_t *)b_u = a;
+ *(uint32_t *)b_l = a;
+ break;
+ case 3:
+ a |= a << 8;
+ a |= a << 16;
+ *(uint32_t *)b_u = a;
+ *(uint32_t *)(b_u + 4) = a;
+ *(uint32_t *)b_l = a;
+ *(uint32_t *)(b_l + 4) = a;
+ break;
+ case 4:
+ a |= a << 8;
+ a |= a << 16;
+ *(uint32_t *)b_u = a;
+ *(uint32_t *)(b_u + 4) = a;
+ *(uint32_t *)(b_u + 8) = a;
+ *(uint32_t *)(b_u + 12) = a;
+ *(uint32_t *)b_l = a;
+ *(uint32_t *)(b_l + 4) = a;
+ *(uint32_t *)(b_l + 8) = a;
+ *(uint32_t *)(b_l + 12) = a;
+ break;
+ }
+}
+
+static void zap_cabac_stash(uint8_t * b, const int ln)
+{
+ switch (ln)
+ {
+ default: // 0
+ *b = 0;
+ break;
+ case 1:
+ *(uint16_t *)b = 0;
+ break;
+ case 2:
+ *(uint32_t *)b = 0;
+ break;
+ case 3:
+ *(uint32_t *)b = 0;
+ *(uint32_t *)(b + 4) = 0;
+ break;
+ }
+}
+
+
+
+// Set a small square block of bits in a bitmap
+// Bits must be aligned on their size boundary (which will be true of all split CBs)
+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
+{
+ unsigned int n;
+ const unsigned int sh = (x & 7);
+
+ f += (x >> 3);
+
+ av_assert2(ln <= 3);
+ av_assert2((x & ((1 << ln) - 1)) == 0);
+
+ switch (ln)
+ {
+ default: // 1
+ f[0] |= 1 << sh;
+ break;
+ case 1: // 3 * 2
+ n = 3 << sh;
+ f[0] |= n;
+ f[stride] |= n;
+ break;
+ case 2: // 0xf * 4
+ n = 0xf << sh;
+ f[0] |= n;
+ f[stride] |= n;
+ f[stride * 2] |= n;
+ f[stride * 3] |= n;
+ break;
+ case 3: // 0xff * 8
+ for (n = 0; n != 8; ++n, f += stride)
+ *f = 0xff;
+ break;
+ }
+}
+
+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
+ { // 8
+ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
+ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
+ },
+ { // 9
+ .luma = {0},
+ .chroma = {0}
+ },
+ { // 10
+ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
+ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
+ }
+
+};
+
+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
+{
+ const unsigned int n = ici->n;
+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
+
+ ipe->n = n;
+ ipe->max_fill = q1_size - ipe->min_gap;
+ for(unsigned int i = 0; i < n; i++) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base =
+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
+ q->code_setup = qpu_fn(ici->setup_fns[i]);
+ q->code_sync = qpu_fn(ici->sync_fns[i]);
+ q->code_exit = qpu_fn(ici->exit_fns[i]);
+ }
+}
+
+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
+{
+ av_assert0(bit_depth >= 8 && bit_depth <= 16);
+
+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
+}
+
+// Unsigned Trivial MOD
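+// Correct only while x < 2*n, which is enough for wrapping an incremented
+// ring-buffer index (see pass_queue_inc_job_n below)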
+static inline unsigned int utmod(const unsigned int x, const unsigned int n)
+{
+ return x >= n ? x - n : x;
+}
+
+// returns pq->job_n++
+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
+{
+ unsigned int const x2 = pq->job_n;
+ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
+ return x2;
+}
+
+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
+{
+ pq->terminate = 0;
+ pq->job_n = 0;
+ pq->context = s;
+ pq->worker = worker;
+ pq->psem_out = psem_out;
+ pq->pass_n = n;
+ pq->started = 0;
+ sem_init(&pq->sem_in, 0, 0);
+}
+
+static void pass_queue_kill(HEVCRpiPassQueue * const pq)
+{
+ sem_destroy(&pq->sem_in);
+}
+
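+// sem_wait that retries if interrupted by a signal (EINTR); any other failure is fatal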
+static inline void rpi_sem_wait(sem_t * const sem)
+{
+ while (sem_wait(sem) != 0) {
+ av_assert0(errno == EINTR);
+ }
+}
+
+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
+{
+ sem_post(&pq->sem_in);
+}
+
+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ // Do the various passes - common with the worker code
+ for (unsigned int i = 0; i != RPI_PASSES; ++i) {
+ s->passq[i].worker(s, jb);
+ }
+}
+
+
+#if 0
+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
+{
+ int x;
+ sem_getvalue((sem_t *)&jbc->sem_out, &x);
+ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
+}
+#endif
+
+
+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
+{
+ HEVCRpiJob * jb;
+ HEVCRpiJobGlobal * const jbg = jbc->jbg;
+
+ pthread_mutex_lock(&jbg->lock);
+ // Check local 1st
+ if ((jb = jbc->jb1) != NULL)
+ {
+ // Only 1 - very easy :-)
+ jbc->jb1 = NULL;
+ }
+ else
+ {
+ // Now look for global free chain
+ if ((jb = jbg->free1) != NULL)
+ {
+ // Found one - unlink it
+ jbg->free1 = jb->next;
+ jb->next = NULL;
+ }
+ else
+ {
+ // Out of places to look - wait for one to become free - add to Qs
+
+ // Global
+ // If "good" lc then add after the last "good" el in the chain
+ // otherwise add to the tail
+ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
+ {
+ // Add to end as we had to wait last time or wait Q empty
+ if ((lc->jw_prev = jbg->wait_tail) == NULL)
+ jbg->wait_head = lc;
+ else
+ lc->jw_prev->jw_next = lc;
+ lc->jw_next = NULL;
+ jbg->wait_tail = lc;
+ }
+ else
+ {
+ // This is a "good" lc that we need to poke into the middle
+ // of the Q
+ // We know that the Q isn't empty and there is at least one
+            // !last_progress_good el in it from the previous test
+
+ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
+
+ if (p == NULL)
+ {
+ // No current good els - add to head
+ lc->jw_next = jbg->wait_head;
+ jbg->wait_head = lc;
+ }
+ else
+ {
+ lc->jw_next = p->jw_next;
+ p->jw_next = lc;
+ }
+
+ lc->jw_next->jw_prev = lc;
+ lc->jw_prev = p;
+ }
+
+ // If "good" then we are now the last good waiting el
+ if (lc->last_progress_good)
+ jbg->wait_good = lc;
+
+ // Local
+ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
+ jbc->lcw_head = lc;
+ else
+ lc->ljw_prev->ljw_next = lc;
+ lc->ljw_next = NULL;
+ jbc->lcw_tail = lc;
+ }
+ }
+
+ pthread_mutex_unlock(&jbg->lock);
+
+ if (jb == NULL) // Need to wait
+ {
+ rpi_sem_wait(&lc->jw_sem);
+ jb = lc->jw_job; // Set by free code
+ }
+
+ return jb;
+}
+
+
+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
+{
+    HEVCRpiJobGlobal * const jbg = jbc0->jbg;  // This jbc is only used to find jbg so we can get the lock
+ HEVCRpiJobCtl * jbc = jb->jbc_local;
+ HEVCRpiLocalContext * lc = NULL;
+
+ pthread_mutex_lock(&jbg->lock);
+
+ if (jbc != NULL)
+ {
+ av_assert1(jbc->jb1 == NULL);
+
+ // Release to Local if nothing waiting there
+ if ((lc = jbc->lcw_head) == NULL)
+ jbc->jb1 = jb;
+ }
+ else
+ {
+ // Release to global if nothing waiting there
+ if ((lc = jbg->wait_head) == NULL)
+ {
+ jb->next = jbg->free1;
+ jbg->free1 = jb;
+ }
+ else
+ {
+            // ? seems somehow mildly ugly...
+ jbc = lc->context->jbc;
+ }
+ }
+
+ if (lc != NULL)
+ {
+ // Something was waiting
+
+ // Unlink
+ // Global
+ if (lc->jw_next == NULL)
+ jbg->wait_tail = lc->jw_prev;
+ else
+ lc->jw_next->jw_prev = lc->jw_prev;
+
+ if (lc->jw_prev == NULL)
+ jbg->wait_head = lc->jw_next;
+ else
+ lc->jw_prev->jw_next = lc->jw_next;
+
+ // Local
+ if (lc->ljw_next == NULL)
+ jbc->lcw_tail = lc->ljw_prev;
+ else
+ lc->ljw_next->ljw_prev = lc->ljw_prev;
+
+ if (lc->ljw_prev == NULL)
+ jbc->lcw_head = lc->ljw_next;
+ else
+ lc->ljw_prev->ljw_next = lc->ljw_next;
+
+ // Update good if required
+ if (jbg->wait_good == lc)
+ jbg->wait_good = lc->jw_prev;
+
+ // Prod
+ lc->jw_job = jb;
+ sem_post(&lc->jw_sem);
+ }
+
+ pthread_mutex_unlock(&jbg->lock);
+}
+
+static void job_lc_kill(HEVCRpiLocalContext * const lc)
+{
+ sem_destroy(&lc->jw_sem);
+}
+
+static void job_lc_init(HEVCRpiLocalContext * const lc)
+{
+ lc->jw_next = NULL;
+ lc->jw_prev = NULL;
+ lc->ljw_next = NULL;
+ lc->ljw_prev = NULL;
+ lc->jw_job = NULL;
+ sem_init(&lc->jw_sem, 0, 0);
+}
+
+// Returns:
+// 0 if we have waited for MV or expect to wait for recon
+// 1 if we haven't waited for MV & do not need to wait for recon
+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
+{
+ if (jb->waited) // reset by rpi_begin
+ return 0;
+ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
+ {
+ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
+ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
+ return 0;
+ }
+ return 1;
+}
+
+// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
+{
+ HEVCRpiJobCtl *const jbc = s->jbc;
+ HEVCRpiJob * const jb = lc->jb0;
+
+ av_assert1(jb != NULL);
+
+ if (jb->ctu_ts_last < 0) {
+ return;
+ }
+
+ lc->last_progress_good = progress_good(s, jb);
+ jb->waited = !lc->last_progress_good;
+ lc->jb0 = NULL;
+
+ if (s->offload_recon)
+ {
+ pthread_mutex_lock(&jbc->in_lock);
+ jbc->offloadq[jbc->offload_in] = jb;
+ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
+ pthread_mutex_unlock(&jbc->in_lock);
+
+ pass_queue_submit_job(s->passq + 0); // Consumes job eventually
+ }
+ else
+ {
+ pass_queue_do_all(s, jb); // Consumes job before return
+ }
+}
+
+
+// Call worker_pass0_ready to wait until the lc->jb0 job slot becomes
+// available to receive the next job.
+//
+// Now safe against multiple callers - needed for tiles
+// "normal" and WPP will only call here one at a time
+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ HEVCRpiJobCtl * const jbc = s->jbc;
+
+ // It is legit for us to already have a job allocated - do nothing in this case
+ if (lc->jb0 != NULL)
+ return;
+
+ if (s->offload_recon)
+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much
+
+ lc->jb0 = job_alloc(jbc, lc);
+
+ rpi_begin(s, lc->jb0, lc->ts);
+}
+
+// Free up a job without submission
+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ HEVCRpiJobCtl * const jbc = s->jbc;
+ HEVCRpiJob * const jb = lc->jb0;
+
+ if (jb == NULL) {
+ return;
+ }
+
+ lc->jb0 = NULL;
+
+ job_free(jbc, jb);
+
+ // If offload then poke sem_out too
+ if (s->offload_recon) {
+ sem_post(&jbc->sem_out);
+ }
+}
+
+
+// Call this to wait for all jobs to have completed at the end of a frame
+// Slightly icky as there is no clean way to wait for a sem to count up
+// Not reentrant - call on main thread only
+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
+{
+ HEVCRpiJobCtl * const jbc = s->jbc;
+ int i = 0;
+
+ // We shouldn't reach here with an unsubmitted job
+ av_assert1(lc->jb0 == NULL);
+
+ // If no offload then there can't be anything to wait for
+ if (!s->offload_recon) {
+ return;
+ }
+
+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
+ {
+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
+ rpi_sem_wait(&jbc->sem_out);
+ }
+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
+ sem_post(&jbc->sem_out);
+ }
+ }
+}
+
+static void * pass_worker(void *arg)
+{
+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
+ HEVCRpiContext *const s = pq->context;
+
+ for (;;)
+ {
+ rpi_sem_wait(&pq->sem_in);
+
+ if (pq->terminate)
+ break;
+
+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
+ // * should really set jb->passes_done here
+
+ sem_post(pq->psem_out);
+ }
+ return NULL;
+}
+
+static void pass_queues_start_all(HEVCRpiContext *const s)
+{
+ unsigned int i;
+ HEVCRpiPassQueue * const pqs = s->passq;
+
+ for (i = 0; i != RPI_PASSES; ++i)
+ {
+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
+ pqs[i].started = 1;
+ }
+}
+
+static void pass_queues_term_all(HEVCRpiContext *const s)
+{
+ unsigned int i;
+ HEVCRpiPassQueue * const pqs = s->passq;
+
+ for (i = 0; i != RPI_PASSES; ++i)
+ pqs[i].terminate = 1;
+ for (i = 0; i != RPI_PASSES; ++i)
+ {
+ if (pqs[i].started)
+ sem_post(&pqs[i].sem_in);
+ }
+ for (i = 0; i != RPI_PASSES; ++i)
+ {
+ if (pqs[i].started) {
+ pthread_join(pqs[i].thread, NULL);
+ pqs[i].started = 0;
+ }
+ }
+}
+
+static void pass_queues_kill_all(HEVCRpiContext *const s)
+{
+ unsigned int i;
+ HEVCRpiPassQueue * const pqs = s->passq;
+
+ for (i = 0; i != RPI_PASSES; ++i)
+ pass_queue_kill(pqs + i);
+}
+
+
+static void worker_pic_free_one(HEVCRpiJob * const jb)
+{
+ // Free coeff stuff - allocation not the same for all buffers
+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+
+ if (cf->s[0].buf != NULL)
+ av_freep(&cf->mptr);
+ if (cf->s[2].buf != NULL)
+ gpu_free(&cf->gptr);
+ memset(cf, 0, sizeof(*cf));
+}
+
+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
+{
+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+
+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
+ goto fail;
+ cf->s[2].buf = (int16_t *)cf->gptr.arm;
+ cf->s[3].buf = cf->s[2].buf + coeff_count;
+
+ // Must be 64 byte aligned for our zero zapping code so over-allocate &
+ // round
+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
+ goto fail;
+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
+ return 0;
+
+fail:
+ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
+ worker_pic_free_one(jb);
+ return -1;
+}
+
+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
+{
+ unsigned int i;
+ for (i = 0; i != 4; ++i) {
+ cf->s[i].n = 0;
+#if RPI_COMPRESS_COEFFS
+ cf->s[i].packed = 1;
+ cf->s[i].packed_n = 0;
+#endif
+ }
+}
+
+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
+{
+ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
+ cfe->n += n;
+ return coeffs;
+}
+
+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const HEVCRpiFrame * const ref, const int val, const int field)
+{
+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
+ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
+ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
+ sem_t * sem = NULL;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ if (((volatile int *)ref->tf.progress->data)[field] < val) {
+ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
+
+ av_assert1(pwait->req == -1 && pwait->next == NULL);
+ jb->waited = 1; // Remember that we had to wait for later scheduling
+
+ pwait->req = val;
+ pwait->next = NULL;
+ if (pstate->first == NULL)
+ pstate->first = pwait;
+ else
+ pstate->last->next = pwait;
+ pstate->last = pwait;
+ sem = &pwait->sem;
+ }
+ pthread_mutex_unlock(&pstate->lock);
+
+ if (sem != NULL) {
+ rpi_sem_wait(sem);
+ }
+ }
+}
+
+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
+{
+ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
+
+ ((int *)s->ref->tf.progress->data)[field] = val;
+
+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+ {
+ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
+ HEVCRpiFrameProgressWait * pwait;
+
+ while ((pwait = *ppwait) != NULL) {
+ if (pwait->req > val)
+ {
+ ppwait = &pwait->next;
+ pstate->last = pwait;
+ }
+ else
+ {
+ *ppwait = pwait->next;
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_post(&pwait->sem);
+ }
+ }
+ }
+ pthread_mutex_unlock(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
+{
+ pstate->first = NULL;
+ pstate->last = NULL;
+ pthread_mutex_init(&pstate->lock, NULL);
+}
+
+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
+{
+ pwait->req = -1;
+ pwait->next = NULL;
+ sem_init(&pwait->sem, 0, 0);
+}
+
+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
+{
+ av_assert1(pstate->first == NULL);
+ pthread_mutex_destroy(&pstate->lock);
+}
+
+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
+{
+ sem_destroy(&pwait->sem);
+}
+
+
+/**
+ * NOTE: Each function hls_foo corresponds to the function foo in the
+ * specification (HLS stands for High Level Syntax).
+ */
+
+/**
+ * Section 5.7
+ */
+
+// Realloc the entry point arrays
+static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
+{
+ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
+ {
+ // Round up alloc to multiple of 32
+ int a = (n + 31) & ~31;
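+        // (e.g. n == 33 -> a == 64; n == 0 -> a == 0 and everything is freed)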
+
+ // We don't care about the previous contents so probably fastest to simply discard
+ av_freep(&sh->entry_point_offset);
+ av_freep(&sh->offset);
+ av_freep(&sh->size);
+
+ if (a != 0)
+ {
+ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
+ sh->offset = av_malloc_array(a, sizeof(int));
+ sh->size = av_malloc_array(a, sizeof(int));
+
+ if (!sh->entry_point_offset || !sh->offset || !sh->size) {
+ sh->num_entry_point_offsets = 0;
+ sh->offsets_allocated = 0;
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ sh->offsets_allocated = a;
+ }
+
+ return 0;
+}
+
+/* free everything allocated by pic_arrays_init() */
+static void pic_arrays_free(HEVCRpiContext *s)
+{
+ av_freep(&s->sao);
+ av_freep(&s->deblock);
+
+ av_freep(&s->cabac_stash_up);
+ s->cabac_stash_left = NULL; // freed with _up
+
+ av_freep(&s->mvf_up);
+ av_freep(&s->mvf_left);
+
+ av_freep(&s->is_pcm);
+ av_freep(&s->is_intra_store);
+ s->is_intra = NULL;
+ av_freep(&s->rpl_tab);
+ s->rpl_tab_size = 0;
+
+ av_freep(&s->qp_y_tab);
+ av_freep(&s->tab_slice_address);
+ av_freep(&s->filter_slice_edges);
+
+ av_freep(&s->bs_horizontal);
+    s->bs_vertical = NULL;      // freed with bs_horizontal
+ av_freep(&s->bsf_stash_left);
+ av_freep(&s->bsf_stash_up);
+
+ av_freep(&s->rpl_up);
+ av_freep(&s->rpl_left);
+
+ alloc_entry_points(&s->sh, 0);
+
+ av_buffer_pool_uninit(&s->col_mvf_pool);
+}
+
+/* allocate arrays that depend on frame dimensions */
+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
+{
+ const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
+ const unsigned int width = sps->width;
+ const unsigned int height = sps->height;
+ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) *
+ ((height >> log2_min_cb_size) + 1);
+ const unsigned int ctb_count = sps->ctb_size;
+
+ {
+ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
+ unsigned int h = ((height + 15) & ~15);
+
+ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
+ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
+ }
+
+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
+ if (!s->sao || !s->deblock)
+ goto fail;
+
+ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3));
+ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3);
+ if (s->cabac_stash_up == NULL)
+ goto fail;
+
+ // Round width up to max ctb size
+ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
+ // * Only needed if we have H tiles
+ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
+
+ // We can overread by 1 line & one byte in deblock so alloc & zero
+ // We don't need to zero the extra @ start of frame as it will never be
+ // written
+ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
+ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
+ if (s->is_pcm == NULL || s->is_intra_store == NULL)
+ goto fail;
+
+ s->filter_slice_edges = av_mallocz(ctb_count);
+ s->tab_slice_address = av_malloc_array(ctb_count,
+ sizeof(*s->tab_slice_address));
+ s->qp_y_tab = av_malloc_array(pic_size_in_cb,
+ sizeof(*s->qp_y_tab));
+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
+ goto fail;
+
+ s->bs_horizontal = av_mallocz(s->bs_size * 2);
+ s->bs_vertical = s->bs_horizontal + s->bs_size;
+ if (s->bs_horizontal == NULL)
+ goto fail;
+
+ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
+ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
+ if (s->rpl_left == NULL || s->rpl_up == NULL)
+ goto fail;
+
+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
+ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
+ goto fail;
+
+ s->col_mvf_stride = (width + 15) >> 4;
+ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
+ av_buffer_allocz);
+ if (s->col_mvf_pool == NULL)
+ goto fail;
+
+ return 0;
+
+fail:
+ pic_arrays_free(s);
+ return AVERROR(ENOMEM);
+}
+
+static void default_pred_weight_table(HEVCRpiContext * const s)
+{
+ unsigned int i;
+ const unsigned int wt = 1 << QPU_MC_DENOM;
+ s->sh.luma_log2_weight_denom = 0;
+ s->sh.chroma_log2_weight_denom = 0;
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+ s->sh.luma_weight_l0[i] = wt;
+ s->sh.luma_offset_l0[i] = 0;
+ s->sh.chroma_weight_l0[i][0] = wt;
+ s->sh.chroma_weight_l0[i][1] = wt;
+ s->sh.chroma_offset_l0[i][0] = 0;
+ s->sh.chroma_offset_l0[i][1] = 0;
+ }
+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+ s->sh.luma_weight_l1[i] = wt;
+ s->sh.luma_offset_l1[i] = 0;
+ s->sh.chroma_weight_l1[i][0] = wt;
+ s->sh.chroma_weight_l1[i][1] = wt;
+ s->sh.chroma_offset_l1[i][0] = 0;
+ s->sh.chroma_offset_l1[i][1] = 0;
+ }
+}
+
+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
+ const unsigned int refs,
+ int16_t * luma_weight, int16_t * luma_offset,
+ int16_t * chroma_weight, int16_t * chroma_offset)
+{
+ unsigned int luma_flags;
+ unsigned int chroma_flags;
+ unsigned int i;
+ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
+ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
+ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM;
+ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM;
+ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
+ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
+
+ if (refs == 0)
+ return 0;
+
+ luma_flags = get_bits(gb, refs);
+ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
+ i = 1 << (refs - 1);
+
+ do
+ {
+ if ((luma_flags & i) != 0)
+ {
+ const int delta_weight = get_se_golomb(gb);
+ const int offset = get_se_golomb(gb);
+ if (delta_weight < -128 || delta_weight > 127 ||
+ offset < -wp_offset_half_range || offset >= wp_offset_half_range)
+ {
+ return AVERROR_INVALIDDATA;
+ }
+ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift);
+ *luma_offset++ = offset << wp_offset_bd_shift;
+ }
+ else
+ {
+ *luma_weight++ = luma_weight_base;
+ *luma_offset++ = 0;
+ }
+
+ if ((chroma_flags & i) != 0)
+ {
+ unsigned int j;
+ for (j = 0; j != 2; ++j)
+ {
+ const int delta_weight = get_se_golomb(gb);
+ const int delta_offset = get_se_golomb(gb);
+
+ if (delta_weight < -128 || delta_weight > 127 ||
+ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
+ {
+ return AVERROR_INVALIDDATA;
+ }
+
+ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift);
+ *chroma_offset++ = av_clip(
+ wp_offset_half_range + delta_offset -
+ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
+ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift;
+ }
+ }
+ else
+ {
+ *chroma_weight++ = chroma_weight_base;
+ *chroma_weight++ = chroma_weight_base;
+ *chroma_offset++ = 0;
+ *chroma_offset++ = 0;
+ }
+ } while ((i >>= 1) != 0);
+
+ return 0;
+}
+
+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
+{
+ int err;
+ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb);
+ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb);
+
+ if (luma_log2_weight_denom > 7 ||
+ chroma_log2_weight_denom > 7)
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
+ luma_log2_weight_denom, chroma_log2_weight_denom);
+ return AVERROR_INVALIDDATA;
+ }
+
+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom;
+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
+
+ if ((err = get_weights(s, gb, s->sh.nb_refs[L0],
+ s->sh.luma_weight_l0, s->sh.luma_offset_l0,
+ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 ||
+ (err = get_weights(s, gb, s->sh.nb_refs[L1],
+ s->sh.luma_weight_l1, s->sh.luma_offset_l1,
+ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0)
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
+{
+ const HEVCRpiSPS *sps = s->ps.sps;
+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
+ int prev_delta_msb = 0;
+ unsigned int nb_sps = 0, nb_sh;
+ int i;
+
+ rps->nb_refs = 0;
+ if (!sps->long_term_ref_pics_present_flag)
+ return 0;
+
+ if (sps->num_long_term_ref_pics_sps > 0)
+ nb_sps = get_ue_golomb_long(gb);
+ nb_sh = get_ue_golomb_long(gb);
+
+ if (nb_sps > sps->num_long_term_ref_pics_sps)
+ return AVERROR_INVALIDDATA;
+ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
+ return AVERROR_INVALIDDATA;
+
+ rps->nb_refs = nb_sh + nb_sps;
+
+ for (i = 0; i < rps->nb_refs; i++) {
+ uint8_t delta_poc_msb_present;
+
+ if (i < nb_sps) {
+ uint8_t lt_idx_sps = 0;
+
+ if (sps->num_long_term_ref_pics_sps > 1)
+ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
+
+ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
+ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
+ } else {
+ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb);
+ rps->used[i] = get_bits1(gb);
+ }
+
+ delta_poc_msb_present = get_bits1(gb);
+ if (delta_poc_msb_present) {
+ int64_t delta = get_ue_golomb_long(gb);
+ int64_t poc;
+
+ if (i && i != nb_sps)
+ delta += prev_delta_msb;
+
+ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
+ if (poc != (int32_t)poc)
+ return AVERROR_INVALIDDATA;
+ rps->poc[i] = poc;
+ prev_delta_msb = delta;
+ }
+ }
+
+ return 0;
+}
+
+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
+ const HEVCRpiSPS *sps)
+{
+ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
+ const HEVCRpiWindow *ow = &sps->output_window;
+ unsigned int num = 0, den = 0;
+
+ avctx->pix_fmt = sps->pix_fmt;
+ avctx->coded_width = sps->width;
+ avctx->coded_height = sps->height;
+ avctx->width = sps->width - ow->left_offset - ow->right_offset;
+ avctx->height = sps->height - ow->top_offset - ow->bottom_offset;
+ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
+ avctx->profile = sps->ptl.general_ptl.profile_idc;
+ avctx->level = sps->ptl.general_ptl.level_idc;
+
+ ff_set_sar(avctx, sps->vui.sar);
+
+ if (sps->vui.video_signal_type_present_flag)
+ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
+ : AVCOL_RANGE_MPEG;
+ else
+ avctx->color_range = AVCOL_RANGE_MPEG;
+
+ if (sps->vui.colour_description_present_flag) {
+ avctx->color_primaries = sps->vui.colour_primaries;
+ avctx->color_trc = sps->vui.transfer_characteristic;
+ avctx->colorspace = sps->vui.matrix_coeffs;
+ } else {
+ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
+ avctx->color_trc = AVCOL_TRC_UNSPECIFIED;
+ avctx->colorspace = AVCOL_SPC_UNSPECIFIED;
+ }
+
+ if (vps->vps_timing_info_present_flag) {
+ num = vps->vps_num_units_in_tick;
+ den = vps->vps_time_scale;
+ } else if (sps->vui.vui_timing_info_present_flag) {
+ num = sps->vui.vui_num_units_in_tick;
+ den = sps->vui.vui_time_scale;
+ }
+
+ if (num != 0 && den != 0)
+ av_reduce(&avctx->framerate.den, &avctx->framerate.num,
+ num, den, 1 << 30);
+}
+
+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
+{
+ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts;
+
+ // Admit to no h/w formats
+
+ *fmt++ = sps->pix_fmt;
+ *fmt = AV_PIX_FMT_NONE;
+
+ return pix_fmts[0] == AV_PIX_FMT_NONE ? AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts);
+}
+
+static int is_sps_supported(const HEVCRpiSPS * const sps)
+{
+ return av_rpi_is_sand_format(sps->pix_fmt) &&
+ sps->width <= HEVC_RPI_MAX_WIDTH &&
+ sps->height <= HEVC_RPI_MAX_HEIGHT;
+}
+
+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
+ const enum AVPixelFormat pix_fmt)
+{
+ int ret;
+
+ pic_arrays_free(s);
+ s->ps.sps = NULL;
+ s->ps.vps = NULL;
+
+ if (sps == NULL)
+ return 0;
+
+ if (!is_sps_supported(sps))
+ return AVERROR_DECODER_NOT_FOUND;
+
+ ret = pic_arrays_init(s, sps);
+ if (ret < 0)
+ goto fail;
+
+ export_stream_params(s->avctx, &s->ps, sps);
+
+ s->avctx->pix_fmt = pix_fmt;
+
+ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
+ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
+
+ // * We don't support cross_component_prediction_enabled_flag but as that
+ // must be 0 unless we have 4:4:4 there is no point testing for it as we
+ // only deal with sand which is never 4:4:4
+ // [support wouldn't be hard]
+
+ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
+
+ av_freep(&s->sao_pixel_buffer_h[0]);
+ av_freep(&s->sao_pixel_buffer_v[0]);
+
+ if (sps->sao_enabled)
+ {
+ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
+ unsigned int c_idx;
+ size_t vsize[3] = {0};
+ size_t hsize[3] = {0};
+
+ for(c_idx = 0; c_idx < c_count; c_idx++) {
+ int w = sps->width >> ctx_hshift(s, c_idx);
+ int h = sps->height >> ctx_vshift(s, c_idx);
+            // ctb height & width are a min of 8 so this must be a multiple of 16
+ // so no point rounding up!
+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
+ }
+
+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
+ // when we have plaited chroma
+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
+ }
+
+ s->ps.sps = sps;
+ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
+
+ return 0;
+
+fail:
+ pic_arrays_free(s);
+ s->ps.sps = NULL;
+ return ret;
+}
+
+static inline int qp_offset_valid(const int qp_offset)
+{
+ return qp_offset >= -12 && qp_offset <= 12;
+}
+
+static int hls_slice_header(HEVCRpiContext * const s)
+{
+ GetBitContext * const gb = &s->HEVClc->gb;
+ RpiSliceHeader * const sh = &s->sh;
+ int i, ret;
+
+ // Coded parameters
+ sh->first_slice_in_pic_flag = get_bits1(gb);
+ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ if (IS_IDR(s))
+ ff_hevc_rpi_clear_refs(s);
+ }
+ sh->no_output_of_prior_pics_flag = 0;
+ if (IS_IRAP(s))
+ sh->no_output_of_prior_pics_flag = get_bits1(gb);
+
+ sh->pps_id = get_ue_golomb_long(gb);
+ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
+ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
+ return AVERROR_INVALIDDATA;
+ }
+ if (!sh->first_slice_in_pic_flag &&
+ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
+ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
+ return AVERROR_INVALIDDATA;
+ }
+ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
+ sh->no_output_of_prior_pics_flag = 1;
+
+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
+ const HEVCRpiSPS *last_sps = s->ps.sps;
+ enum AVPixelFormat pix_fmt;
+
+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
+ if (sps->width != last_sps->width || sps->height != last_sps->height ||
+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
+ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
+ sh->no_output_of_prior_pics_flag = 0;
+ }
+ ff_hevc_rpi_clear_refs(s);
+
+ ret = set_sps(s, sps, sps->pix_fmt);
+ if (ret < 0)
+ return ret;
+
+ pix_fmt = get_format(s, sps);
+ if (pix_fmt < 0)
+ return pix_fmt;
+
+// ret = set_sps(s, sps, pix_fmt);
+// if (ret < 0)
+// return ret;
+
+ s->avctx->pix_fmt = pix_fmt;
+
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ }
+
+ sh->dependent_slice_segment_flag = 0;
+ if (!sh->first_slice_in_pic_flag) {
+ int slice_address_length;
+
+ if (s->ps.pps->dependent_slice_segments_enabled_flag)
+ sh->dependent_slice_segment_flag = get_bits1(gb);
+
+ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
+ sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
+ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Invalid slice segment address: %u.\n",
+ sh->slice_segment_addr);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (!sh->dependent_slice_segment_flag) {
+ sh->slice_addr = sh->slice_segment_addr;
+ s->slice_idx++;
+ }
+ } else {
+ sh->slice_segment_addr = sh->slice_addr = 0;
+ s->slice_idx = 0;
+ s->slice_initialized = 0;
+ }
+
+ if (!sh->dependent_slice_segment_flag) {
+ s->slice_initialized = 0;
+
+ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
+ skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
+
+ sh->slice_type = get_ue_golomb_long(gb);
+ if (!(sh->slice_type == HEVC_SLICE_I ||
+ sh->slice_type == HEVC_SLICE_P ||
+ sh->slice_type == HEVC_SLICE_B)) {
+ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
+ sh->slice_type);
+ return AVERROR_INVALIDDATA;
+ }
+ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
+ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ // when flag is not present, picture is inferred to be output
+ sh->pic_output_flag = 1;
+ if (s->ps.pps->output_flag_present_flag)
+ sh->pic_output_flag = get_bits1(gb);
+
+ if (s->ps.sps->separate_colour_plane_flag)
+ sh->colour_plane_id = get_bits(gb, 2);
+
+ if (!IS_IDR(s)) {
+ int poc, pos;
+
+ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
+ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
+ if (!sh->first_slice_in_pic_flag && poc != s->poc) {
+ av_log(s->avctx, AV_LOG_WARNING,
+ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
+ if (s->avctx->err_recognition & AV_EF_EXPLODE)
+ return AVERROR_INVALIDDATA;
+ poc = s->poc;
+ }
+ s->poc = poc;
+
+ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
+ pos = get_bits_left(gb);
+ if (!sh->short_term_ref_pic_set_sps_flag) {
+ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
+ if (ret < 0)
+ return ret;
+
+ sh->short_term_rps = &sh->slice_rps;
+ } else {
+ int numbits, rps_idx;
+
+ if (!s->ps.sps->nb_st_rps) {
+ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
+ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
+ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
+ }
+ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
+
+ pos = get_bits_left(gb);
+ ret = decode_lt_rps(s, &sh->long_term_rps, gb);
+ if (ret < 0) {
+ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
+ if (s->avctx->err_recognition & AV_EF_EXPLODE)
+ return AVERROR_INVALIDDATA;
+ }
+ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
+
+ if (s->ps.sps->sps_temporal_mvp_enabled_flag)
+ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
+ else
+ sh->slice_temporal_mvp_enabled_flag = 0;
+ } else {
+ s->sh.short_term_rps = NULL;
+ s->poc = 0;
+ }
+
+ /* 8.3.1 */
+ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
+ s->nal_unit_type != HEVC_NAL_TRAIL_N &&
+ s->nal_unit_type != HEVC_NAL_TSA_N &&
+ s->nal_unit_type != HEVC_NAL_STSA_N &&
+ s->nal_unit_type != HEVC_NAL_RADL_N &&
+ s->nal_unit_type != HEVC_NAL_RADL_R &&
+ s->nal_unit_type != HEVC_NAL_RASL_N &&
+ s->nal_unit_type != HEVC_NAL_RASL_R)
+ s->pocTid0 = s->poc;
+
+ if (s->ps.sps->sao_enabled) {
+ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
+ if (ctx_cfmt(s) != 0) {
+ sh->slice_sample_adaptive_offset_flag[1] =
+ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+ }
+ } else {
+ sh->slice_sample_adaptive_offset_flag[0] = 0;
+ sh->slice_sample_adaptive_offset_flag[1] = 0;
+ sh->slice_sample_adaptive_offset_flag[2] = 0;
+ }
+
+ sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
+ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
+ int nb_refs;
+
+ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
+ if (sh->slice_type == HEVC_SLICE_B)
+ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
+
+ if (get_bits1(gb)) { // num_ref_idx_active_override_flag
+ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
+ if (sh->slice_type == HEVC_SLICE_B)
+ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
+ }
+ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
+ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
+ sh->nb_refs[L0], sh->nb_refs[L1]);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sh->rpl_modification_flag[0] = 0;
+ sh->rpl_modification_flag[1] = 0;
+ nb_refs = ff_hevc_rpi_frame_nb_refs(s);
+ if (!nb_refs) {
+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
+ sh->rpl_modification_flag[0] = get_bits1(gb);
+ if (sh->rpl_modification_flag[0]) {
+ for (i = 0; i < sh->nb_refs[L0]; i++)
+ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
+ }
+
+ if (sh->slice_type == HEVC_SLICE_B) {
+ sh->rpl_modification_flag[1] = get_bits1(gb);
+ if (sh->rpl_modification_flag[1] == 1)
+ for (i = 0; i < sh->nb_refs[L1]; i++)
+ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
+ }
+ }
+
+ if (sh->slice_type == HEVC_SLICE_B)
+ sh->mvd_l1_zero_flag = get_bits1(gb);
+
+ if (s->ps.pps->cabac_init_present_flag)
+ sh->cabac_init_flag = get_bits1(gb);
+ else
+ sh->cabac_init_flag = 0;
+
+ sh->collocated_ref_idx = 0;
+ if (sh->slice_temporal_mvp_enabled_flag) {
+ sh->collocated_list = L0;
+ if (sh->slice_type == HEVC_SLICE_B)
+ sh->collocated_list = !get_bits1(gb);
+
+ if (sh->nb_refs[sh->collocated_list] > 1) {
+ sh->collocated_ref_idx = get_ue_golomb_long(gb);
+ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Invalid collocated_ref_idx: %d.\n",
+ sh->collocated_ref_idx);
+ return AVERROR_INVALIDDATA;
+ }
+ }
+ }
+
+ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) ||
+ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
+ {
+ if ((ret = pred_weight_table(s, gb)) != 0)
+ return ret;
+ }
+ else
+ {
+ // Give us unit weights
+ default_pred_weight_table(s);
+ }
+
+ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
+ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Invalid number of merging MVP candidates: %d.\n",
+ sh->max_num_merge_cand);
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+ sh->slice_qp_delta = get_se_golomb(gb);
+
+ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
+ sh->slice_cb_qp_offset = get_se_golomb(gb);
+ sh->slice_cr_qp_offset = get_se_golomb(gb);
+ if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
+ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
+ !qp_offset_valid(sh->slice_cr_qp_offset) ||
+ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n",
+ sh->slice_cr_qp_offset, sh->slice_cr_qp_offset,
+ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset);
+ return AVERROR_INVALIDDATA;
+ }
+ } else
+ {
+ sh->slice_cb_qp_offset = 0;
+ sh->slice_cr_qp_offset = 0;
+ }
+
+ if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
+ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
+ else
+ sh->cu_chroma_qp_offset_enabled_flag = 0;
+
+ if (s->ps.pps->deblocking_filter_control_present_flag) {
+ int deblocking_filter_override_flag = 0;
+
+ if (s->ps.pps->deblocking_filter_override_enabled_flag)
+ deblocking_filter_override_flag = get_bits1(gb);
+
+ if (deblocking_filter_override_flag) {
+ sh->disable_deblocking_filter_flag = get_bits1(gb);
+ if (!sh->disable_deblocking_filter_flag) {
+ int beta_offset_div2 = get_se_golomb(gb);
+ int tc_offset_div2 = get_se_golomb(gb) ;
+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
+ tc_offset_div2 < -6 || tc_offset_div2 > 6) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Invalid deblock filter offsets: %d, %d\n",
+ beta_offset_div2, tc_offset_div2);
+ return AVERROR_INVALIDDATA;
+ }
+ sh->beta_offset = beta_offset_div2 * 2;
+ sh->tc_offset = tc_offset_div2 * 2;
+ }
+ } else {
+ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
+ sh->beta_offset = s->ps.pps->beta_offset;
+ sh->tc_offset = s->ps.pps->tc_offset;
+ }
+ } else {
+ sh->disable_deblocking_filter_flag = 0;
+ sh->beta_offset = 0;
+ sh->tc_offset = 0;
+ }
+
+ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
+ (sh->slice_sample_adaptive_offset_flag[0] ||
+ sh->slice_sample_adaptive_offset_flag[1] ||
+ !sh->disable_deblocking_filter_flag)) {
+ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
+ } else {
+ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
+ }
+ sh->no_dblk_boundary_flags =
+ (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
+ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
+ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
+ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
+
+
+ } else if (!s->slice_initialized) {
+ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ sh->num_entry_point_offsets = 0;
+ sh->offload_wpp = 0;
+ sh->offload_tiles = 0;
+
+ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
+ unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
+        // It would be possible to bound this more tightly but this is simpler
+ if (num_entry_point_offsets > get_bits_left(gb)) {
+ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
+ return AVERROR_INVALIDDATA;
+ }
+
+ sh->num_entry_point_offsets = num_entry_point_offsets;
+ if (sh->num_entry_point_offsets > 0) {
+ int offset_len = get_ue_golomb_long(gb) + 1;
+
+ if (offset_len < 1 || offset_len > 32) {
+ sh->num_entry_point_offsets = 0;
+ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
+ return ret;
+ }
+
+ for (i = 0; i < sh->num_entry_point_offsets; i++) {
+ uint32_t val_minus1 = get_bits_long(gb, offset_len);
+ if (val_minus1 > (1 << 28))
+ {
+ // We can declare offsets of > 2^28 bad without loss of generality
+ // Will check actual bounds wrt NAL later, but this keeps
+ // the values within bounds we can deal with easily
+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
+ return AVERROR_INVALIDDATA;
+ }
+ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
+ }
+
+ // Do we want to offload this
+ if (s->threads_type != 0)
+ {
+ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
+ s->ps.pps->num_tile_columns > 1;
+ // * We only cope with WPP in a single column
+ // Probably want to deal with that case as tiles rather than WPP anyway
+ // ?? Not actually sure that the main code deals with WPP + multi-col correctly
+ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
+ s->ps.pps->num_tile_columns == 1;
+ }
+ }
+ }
+
+ if (s->ps.pps->slice_header_extension_present_flag) {
+ unsigned int length = get_ue_golomb_long(gb);
+ if (length*8LL > get_bits_left(gb)) {
+ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
+ return AVERROR_INVALIDDATA;
+ }
+ for (i = 0; i < length; i++)
+ skip_bits(gb, 8); // slice_header_extension_data_byte
+ }
+
+ // Inferred parameters
+ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
+ if (sh->slice_qp > 51 ||
+ sh->slice_qp < -s->ps.sps->qp_bd_offset) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "The slice_qp %d is outside the valid range "
+ "[%d, 51].\n",
+ sh->slice_qp,
+ -s->ps.sps->qp_bd_offset);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (get_bits_left(gb) < 0) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Overread slice header by %d bits\n", -get_bits_left(gb));
+ return AVERROR_INVALIDDATA;
+ }
+
+ s->slice_initialized = 1;
+ return 0;
+}
+
+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
+{
+ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
+ int c_idx, i;
+
+ if (s->sh.slice_sample_adaptive_offset_flag[0] ||
+ s->sh.slice_sample_adaptive_offset_flag[1]) {
+ if ((lc->ctb_avail & AVAIL_L) != 0)
+ {
+ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
+ if (sao_merge_left_flag) {
+ *sao = sao[-1];
+ return;
+ }
+ }
+ if ((lc->ctb_avail & AVAIL_U) != 0)
+ {
+ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
+ if (sao_merge_up_flag) {
+ *sao = sao[-(int)s->ps.sps->ctb_width];
+ return;
+ }
+ }
+ }
+
+ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
+ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
+ s->ps.pps->log2_sao_offset_scale_chroma;
+ int offset_abs[4];
+ char offset_sign[4] = {0};
+
+ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
+ sao->type_idx[c_idx] = SAO_NOT_APPLIED;
+ continue;
+ }
+
+ if (c_idx == 2) {
+ sao->type_idx[2] = sao->type_idx[1];
+ sao->eo_class[2] = sao->eo_class[1];
+ } else {
+ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
+ }
+
+ // ** Could use BY22 here quite plausibly - this is all bypass stuff
+ // though only per CTB so not very timing critical
+
+ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
+ continue;
+
+ for (i = 0; i < 4; i++)
+ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
+
+ if (sao->type_idx[c_idx] == SAO_BAND) {
+ for (i = 0; i < 4; i++) {
+ if (offset_abs[i] != 0)
+ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
+ }
+ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
+ } else if (c_idx != 2) {
+ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
+ }
+
+ // Inferred parameters
+ sao->offset_val[c_idx][0] = 0;
+ for (i = 0; i < 4; i++) {
+ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
+ if (sao->type_idx[c_idx] == SAO_EDGE) {
+ if (i > 1)
+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
+ } else if (offset_sign[i]) {
+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
+ }
+ }
+ }
+}
+
+#if 0
+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
+ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4
+
+ if (log2_res_scale_abs_plus1 != 0) {
+ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
+ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
+ (1 - 2 * res_scale_sign_flag);
+ } else {
+ lc->tu.res_scale_val = 0;
+ }
+
+
+ return 0;
+}
+#endif
+
+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
+{
+ return jb->intra.cmds + jb->intra.n++;
+}
+
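+// Build a 16x16 table of intra-CTB availability flags, one entry per 4x4
+// block of a (maximum size, 64 pel) CTB.  A0 emits a single entry; each An
+// macro composes four A(n-1) quadrants, wiring up which of the U/L/UL/UR/DL
+// neighbours of a sub-block lie inside the enclosing block.  Availability
+// across the CTB edge itself is resolved from lc->ctb_avail at lookup time
+// (see ff_hevc_rpi_tb_avail_flags below).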
+#define A0(x, y, U, L, UL, UR, DL) \
+ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
+
+#define A1(x, y, U, L, UL, UR, DL) \
+ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\
+ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 )
+
+#define A2(x, y, U, L, UL, UR, DL) \
+ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\
+ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 )
+
+#define A3(x, y, U, L, UL, UR, DL) \
+ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\
+ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 )
+
+#define A4(x, y, U, L, UL, UR, DL) \
+ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\
+ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 )
+
+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
+
+unsigned int ff_hevc_rpi_tb_avail_flags(
+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
+{
+ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
+ const unsigned int tb_x = x & ~ctb_mask;
+ const unsigned int tb_y = y & ~ctb_mask;
+ const unsigned int ctb_avail = lc->ctb_avail;
+
+ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
+
+ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
+
+ // This deals with both the U & L edges
+ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
+ f |= AVAIL_UL;
+
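+    // UR (and DL below) come either from the CTB-level flags, with the
+    // relevant bit shifted into position, when the TB touches the CTB edge,
+    // or from the table entry for the top-right / bottom-left 4x4 of the TB.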
+ if (x + w < lc->end_of_ctb_x)
+ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
+ else if (tb_y == 0)
+ f |= (ctb_avail & AVAIL_UR);
+#if AVAIL_S_U - AVAIL_S_UR < 0
+#error Shift problem
+#endif
+
+    // Never any DL if y is beyond the end of the CTB
+ if (y + h < lc->end_of_ctb_y)
+ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
+#if AVAIL_S_DL - AVAIL_S_L < 0
+#error Shift problem
+#endif
+
+// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
+// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
+// lc->end_of_ctb_x, lc->end_of_ctb_y);
+
+ return f;
+}
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+
+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
+ unsigned int avail)
+{
+    // If rpi_enabled then the frame is in sand format - U & V are both done on the U call
+ if (c_idx <= 1)
+ {
+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
+ cmd->type = RPI_PRED_INTRA + c_idx;
+ cmd->size = log2_trafo_size;
+ cmd->avail = avail;
+ cmd->i_pred.x = x0;
+ cmd->i_pred.y = y0;
+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
+
+// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
+ }
+}
+
+#define CBF_CB0_S 0
+#define CBF_CB1_S 1 // CB1 must be CB0 + 1
+#define CBF_CR0_S 2
+#define CBF_CR1_S 3
+
+#define CBF_CB0 (1 << CBF_CB0_S)
+#define CBF_CR0 (1 << CBF_CR0_S)
+#define CBF_CB1 (1 << CBF_CB1_S)
+#define CBF_CR1 (1 << CBF_CR1_S)
+
+// * Only good for chroma_idx == 1
+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
+ const unsigned int blk_idx, const int cbf_luma,
+ const unsigned int cbf_chroma)
+{
+ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
+ const unsigned int x0_c = x0 & ~7;
+ const unsigned int y0_c = y0 & ~7;
+
+ enum ScanType scan_idx = SCAN_DIAG;
+ enum ScanType scan_idx_c = SCAN_DIAG;
+
+ if (lc->cu.pred_mode == MODE_INTRA)
+ {
+ const unsigned int trafo_size = 1 << log2_trafo_size;
+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
+
+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
+
+ if (log2_trafo_size > 2)
+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
+ else if (blk_idx == 3)
+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
+ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
+
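+        // Scan order for small intra TBs follows the prediction direction:
+        // near-horizontal modes (6..14) use the vertical scan, near-vertical
+        // modes (22..30) use the horizontal scan, everything else stays
+        // diagonal.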
+ if (log2_trafo_size < 4) {
+ if (lc->tu.intra_pred_mode >= 6 &&
+ lc->tu.intra_pred_mode <= 14) {
+ scan_idx = SCAN_VERT;
+ } else if (lc->tu.intra_pred_mode >= 22 &&
+ lc->tu.intra_pred_mode <= 30) {
+ scan_idx = SCAN_HORIZ;
+ }
+
+ if (lc->tu.intra_pred_mode_c >= 6 &&
+ lc->tu.intra_pred_mode_c <= 14) {
+ scan_idx_c = SCAN_VERT;
+ } else if (lc->tu.intra_pred_mode_c >= 22 &&
+ lc->tu.intra_pred_mode_c <= 30) {
+ scan_idx_c = SCAN_HORIZ;
+ }
+ }
+ }
+
+ if (!cbf_luma && cbf_chroma == 0)
+ return 0;
+
+ if (lc->tu.is_cu_qp_delta_wanted)
+ {
+ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
+ const unsigned int cb_mask = ~0U << log2_cb_size;
+
+ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
+ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1)))
+ {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "The cu_qp_delta %d is outside the valid range "
+ "[%d, %d].\n",
+ qp_delta,
+ -(26 + (s->ps.sps->qp_bd_offset >> 1)),
+ (25 + (s->ps.sps->qp_bd_offset >> 1)));
+ return AVERROR_INVALIDDATA;
+ }
+
+ lc->tu.is_cu_qp_delta_wanted = 0;
+ lc->tu.cu_qp_delta = qp_delta;
+ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
+ }
+
+    // * Not main profile & untested due to lack of conformance streams
+ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
+ !lc->cu.cu_transquant_bypass_flag) {
+ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
+ if (cu_chroma_qp_offset_flag) {
+ int cu_chroma_qp_offset_idx = 0;
+ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
+ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
+ }
+ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
+ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
+ }
+ lc->tu.cu_chroma_qp_offset_wanted = 0;
+ }
+
+ if (cbf_luma)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
+
+ if (log2_trafo_size > 2 || blk_idx == 3)
+ {
+ if ((cbf_chroma & CBF_CB0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
+ log2_trafo_size_c, scan_idx_c, 1);
+ if ((cbf_chroma & CBF_CR0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
+ log2_trafo_size_c, scan_idx_c, 2);
+ }
+
+ return 0;
+}
+
+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
+{
+ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3);
+}
+
+
+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_trafo_size,
+ const unsigned int trafo_depth, const unsigned int blk_idx,
+ const unsigned int cbf_c0)
+{
+    // When trafo_size == 2, hls_transform_unit uses c0, so put it in c1
+ unsigned int cbf_c1 = cbf_c0;
+ int split_transform_flag;
+ int ret;
+
+ if (lc->cu.intra_split_flag) {
+ if (trafo_depth == 1) {
+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
+ if (ctx_cfmt(s) == 3) {
+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
+ } else {
+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
+ }
+ }
+ } else {
+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
+ }
+
+ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
+ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
+ trafo_depth < lc->cu.max_trafo_depth &&
+ !(lc->cu.intra_split_flag && trafo_depth == 0))
+ {
+ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
+ } else {
+ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
+ lc->cu.pred_mode == MODE_INTER &&
+ lc->cu.part_mode != PART_2Nx2N &&
+ trafo_depth == 0;
+
+ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
+ (lc->cu.intra_split_flag && trafo_depth == 0) ||
+ inter_split;
+ }
+
+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
+ {
+ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
+ cbf_c1 = 0;
+
+ if ((cbf_c0 & CBF_CB0) != 0)
+ {
+ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
+ if (wants_c1)
+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
+ }
+
+ if ((cbf_c0 & CBF_CR0) != 0)
+ {
+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
+ if (wants_c1)
+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
+ }
+ }
+
+ if (split_transform_flag) {
+ const int trafo_size_split = 1 << (log2_trafo_size - 1);
+ const int x1 = x0 + trafo_size_split;
+ const int y1 = y0 + trafo_size_split;
+
+#define SUBDIVIDE(x, y, idx) \
+do { \
+ ret = hls_transform_tree(s, lc, x, y, \
+ log2_trafo_size - 1, trafo_depth + 1, idx, \
+ cbf_c1); \
+ if (ret < 0) \
+ return ret; \
+} while (0)
+
+ SUBDIVIDE(x0, y0, 0);
+ SUBDIVIDE(x1, y0, 1);
+ SUBDIVIDE(x0, y1, 2);
+ SUBDIVIDE(x1, y1, 3);
+
+#undef SUBDIVIDE
+ } else {
+ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
+ // trafo_size == 2 with depth == 0 the issue is moot
+ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
+ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
+
+ ret = hls_transform_unit(s, lc, x0, y0,
+ log2_trafo_size + trafo_depth, log2_trafo_size,
+ blk_idx, cbf_luma, cbf_c1);
+ if (ret < 0)
+ return ret;
+
+ if (!s->sh.disable_deblocking_filter_flag) {
+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
+ }
+ }
+ return 0;
+}
+
+
+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
+{
+ GetBitContext gb;
+ int ret;
+
+ ret = init_get_bits(&gb, pcm, length);
+ if (ret < 0)
+ return ret;
+
+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
+ frame_stride1(s->frame, 0),
+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+
+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
+ s->frame->linesize[1],
+ cb_size >> ctx_hshift(s, 1),
+ cb_size >> ctx_vshift(s, 1),
+ &gb, s->ps.sps->pcm.bit_depth_chroma);
+
+ return 0;
+}
+
+
+// x * 2^(y*2)
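+// e.g. xyexp2(bit_depth, log2_cb_size) == bit_depth << (2 * log2_cb_size),
+// the number of bits in a cb_size x cb_size plane of raw samples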
+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
+{
+ return x << (y * 2);
+}
+
+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
+{
+ // Length in bits
+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
+
+ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
+
+ if (!s->sh.disable_deblocking_filter_flag)
+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
+
+ // Copy coeffs
+ {
+ const int blen = (length + 7) >> 3;
+ // Round allocated bytes up to nearest 32 to avoid alignment confusion
+        // Allocation is in units of int16_t
+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
+ // sample this rounding doesn't affect the total size we need to allocate for
+ // the coeff buffer
+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
+ memcpy(coeffs, pcm, blen);
+
+        // Our coeff stash assumes that any partially allocated 64-byte lump
+ // is zeroed so make that true.
+ {
+ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
+ if ((-(intptr_t)eopcm & 63) != 0)
+ memset(eopcm, 0, -(intptr_t)eopcm & 63);
+ }
+
+ // Add command
+ {
+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
+ cmd->type = RPI_PRED_I_PCM;
+ cmd->size = log2_cb_size;
+ cmd->i_pcm.src = coeffs;
+ cmd->i_pcm.x = x0;
+ cmd->i_pcm.y = y0;
+ cmd->i_pcm.src_len = length;
+ }
+ return 0;
+ }
+}
+
+
+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
+ const MvXY xy, const int y0, const int height)
+{
+ if (s->threads_type != 0) {
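+        // Lowest luma row this prediction may read: integer-pel bottom edge
+        // plus a margin (the +9 presumably covers sub-pel filter run-out and
+        // in-loop filter lag on the reference)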
+ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
+
+ // Progress has to be attached to current job as the actual wait
+ // is in worker_core which can't use lc
+ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
+ if (*pr < y) {
+ *pr = y;
+ }
+ }
+}
+
+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const int x0, const int y0, const int nPbW,
+ const int nPbH,
+ HEVCRpiMvField * const mv)
+{
+ enum InterPredIdc inter_pred_idc = PRED_L0;
+ int mvp_flag;
+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
+
+ mv->pred_flag = 0;
+ if (s->sh.slice_type == HEVC_SLICE_B)
+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
+
+ if (inter_pred_idc != PRED_L1) {
+ MvXY mvd;
+
+ if (s->sh.nb_refs[L0])
+ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
+
+ mv->pred_flag = PF_L0;
+ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
+ mv, mvp_flag, 0);
+ mv->xy[0] = mvxy_add(mv->xy[0], mvd);
+ }
+
+ if (inter_pred_idc != PRED_L0) {
+ MvXY mvd = 0;
+
+ if (s->sh.nb_refs[L1])
+ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
+
+ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
+ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
+
+ mv->pred_flag += PF_L1;
+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
+ mv, mvp_flag, 1);
+ mv->xy[1] = mvxy_add(mv->xy[1], mvd);
+ }
+}
+
+
+static HEVCRpiInterPredQ *
+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
+{
+ HEVCRpiInterPredQ * yp = NULL;
+ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
+ const unsigned int max_fill = ipe->max_fill;
+ unsigned int load = UINT_MAX;
+
+ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
+ // We will always have enough room between the Qs but if we are
+ // running critically low due to poor scheduling then use fill size
+ // rather than load to determine QPU. This has obvious dire
+ // performance implications but (a) it is better than crashing
+ // and (b) it should (almost) never happen
+ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
+ const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
+
+ if (tload < load)
+ {
+ yp = ypt;
+ load = tload;
+ }
+ }
+
+ yp->load += load_val;
+ ipe->used_grp = 1;
+ qpu_mc_link_set(yp->qpu_mc_curr, fn);
+
+ return yp;
+}
+
+
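+// Append a sync command to every Q so that the QPUs rendezvous before the
+// next batch, and re-derive each Q's load from its current fill level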
+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
+{
+ for (unsigned int i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
+
+ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
+ }
+}
+
+// Returns 0 on success
+// We no longer check for Q fullness as we have emergency code in ctu alloc
+// * However it might be an idea to have some means of spotting that we've used it
+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
+{
+ if (!ipe->used_grp)
+ return 0;
+
+ if ((ipe->curr += ipe->n_grp) >= ipe->n)
+ {
+ ipe->curr = 0;
+ rpi_inter_pred_sync(ipe);
+ }
+ ipe->used = 1;
+ ipe->used_grp = 0;
+
+ return 0;
+}
+
+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+
+ ipe->curr = 0;
+ ipe->used = 0;
+ ipe->used_grp = 0;
+ for (i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const q = ipe->q + i;
+ q->qpu_mc_curr = q->qpu_mc_base;
+ q->load = 0;
+ q->last_l0 = NULL;
+ q->last_l1 = NULL;
+ }
+}
+
+static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
+ const unsigned int n_max, const unsigned int n_grp,
+ const unsigned int total_size, const unsigned int min_gap)
+{
+ int rv;
+
+ memset(ipe, 0, sizeof(*ipe));
+ if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL)
+ return AVERROR(ENOMEM);
+
+ ipe->n_grp = n_grp;
+ ipe->min_gap = min_gap;
+
+ if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0)
+ av_freep(&ipe->q);
+ return rv;
+}
+
+
+#if RPI_QPU_EMU_Y
+#define get_mc_address_y(f) ((f)->data[0])
+#else
+#define get_mc_address_y(f) get_vc_address_y(f)
+#endif
+#if RPI_QPU_EMU_C
+#define get_mc_address_u(f) ((f)->data[1])
+#else
+#define get_mc_address_u(f) get_vc_address_u(f)
+#endif
+
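+// Pack a weighted-pred offset & multiplier into a single 32-bit word for the
+// QPU (assuming PACK2 pairs two 16-bit halves).  The "* 2 + 1" (P) and "+ 1"
+// (B) terms fold the rounding constant of the spec's weighted-sample
+// formulas into the packed offset.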
+static inline uint32_t pack_wo_p(const int off, const int mul)
+{
+ return PACK2(off * 2 + 1, mul);
+}
+
+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
+{
+ return PACK2(off0 + off1 + 1, mul);
+}
+
+
+static void
+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
+ const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const MvXY mv_xy,
+ const int weight_mul,
+ const int weight_offset,
+ AVFrame *const src_frame)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const unsigned int mx = MV_X(mv_xy) & 3;
+ const unsigned int my = MV_Y(mv_xy) & 3;
+ const unsigned int my_mx = (my << 8) | mx;
+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
+ const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+
+ if (my_mx == 0)
+ {
+ const int x1 = x0 + (MV_X(mv_xy) >> 2);
+ const int y1 = y0 + (MV_Y(mv_xy) >> 2);
+ const int bh = nPbH;
+
+ for (int start_x = 0; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
+ ++ts->y_pred1_x0y0;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src_vc_address_y;
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->wo1 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
+ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
+ const unsigned int bh = nPbH;
+ int start_x = 0;
+
+#if 1
+        // As Y-pred operates on two independent 8-wide src blocks we can merge
+        // this pred with the previous one if the previous one is 8 pel wide,
+ // the same height as the current block, immediately to the left of our
+ // current dest block and mono-pred.
+
+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
+ {
+ const int bw = FFMIN(nPbW, 8);
+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
+
+ last_y8_src2->x = x1_m3;
+ last_y8_src2->y = y1_m3;
+ last_y8_src2->base = src_vc_address_y;
+ last_y8_p->w += bw;
+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
+ last_y8_p->wo2 = wo;
+
+ jb->last_y8_p = NULL;
+ jb->last_y8_l1 = NULL;
+ start_x = bw;
+#if RPI_TSTATS
+ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
+#endif
+ }
+#endif
+
+ for (; start_x < nPbW; start_x += 16)
+ {
+ const int bw = FFMIN(nPbW - start_x, 16);
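+            // Load estimate: presumably source rows fetched, i.e. block
+            // height plus the 7 run-in/out rows of the 8-tap luma filter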
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
+ if (mx == 0 && my == 0)
+ ++ts->y_pred1_x0y0;
+ else if (mx == 0)
+ ++ts->y_pred1_x0;
+ else if (my == 0)
+ ++ts->y_pred1_y0;
+ else
+ ++ts->y_pred1_xy;
+
+ if (nPbW > 8)
+ ++ts->y_pred1_wgt8;
+ else
+ ++ts->y_pred1_wle8;
+
+ if (nPbH > 16)
+ ++ts->y_pred1_hgt16;
+ else
+ ++ts->y_pred1_hle16;
+ }
+#endif
+ src1->x = x1_m3 + start_x;
+ src1->y = y1_m3;
+ src1->base = src_vc_address_y;
+ if (bw <= 8)
+ {
+ src2->x = MC_DUMMY_X;
+ src2->y = MC_DUMMY_Y;
+#if RPI_QPU_EMU_Y
+ src2->base = s->qpu_dummy_frame_emu;
+#else
+ src2->base = s->qpu_dummy_frame_qpu;
+#endif
+ }
+ else
+ {
+ src2->x = x1_m3 + start_x + 8;
+ src2->y = y1_m3;
+ src2->base = src_vc_address_y;
+ }
+ cmd_y->w = bw;
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo;
+ cmd_y->wo2 = wo;
+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+
+ if (bw == 8) {
+ jb->last_y8_l1 = src2;
+ jb->last_y8_p = cmd_y;
+ }
+ }
+ }
+}
+
+static void
+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const int x0, const int y0,
+ const int nPbW, const int nPbH,
+ const struct HEVCRpiMvField *const mv_field,
+ const AVFrame *const src_frame,
+ const AVFrame *const src_frame2)
+{
+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+ const MvXY mv = mv_field->xy[0];
+ const MvXY mv2 = mv_field->xy[1];
+
+ const unsigned int mx = MV_X(mv) & 3;
+ const unsigned int my = MV_Y(mv) & 3;
+ const unsigned int my_mx = (my<<8) | mx;
+ const unsigned int mx2 = MV_X(mv2) & 3;
+ const unsigned int my2 = MV_Y(mv2) & 3;
+ const unsigned int my2_mx2 = (my2<<8) | mx2;
+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
+ const unsigned int ref_idx0 = mv_field->ref_idx[0];
+ const unsigned int ref_idx1 = mv_field->ref_idx[1];
+ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
+ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
+
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
+
+ if (my2_mx2_my_mx == 0)
+ {
+ const int x1 = x0 + (MV_X(mv) >> 2);
+ const int y1 = y0 + (MV_Y(mv) >> 2);
+ const int x2 = x0 + (MV_X(mv2) >> 2);
+ const int y2 = y0 + (MV_Y(mv2) >> 2);
+ const int bh = nPbH;
+
+        // Can do chunks a full 16 pels wide if we don't want the H filter
+ for (int start_x=0; start_x < nPbW; start_x += 16)
+ {
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
+ ++ts->y_pred2_x0y0;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 16);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = 0;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+ else
+ {
+ // Filter requires a run-up of 3
+ const int x1 = x0 + (MV_X(mv) >> 2) - 3;
+ const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
+ const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
+ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
+ const int bh = nPbH;
+
+ for (int start_x=0; start_x < nPbW; start_x += 8)
+ { // B blocks work 8 at a time
+ // B weights aren't doubled as the QPU code does the same
+ // amount of work as it does for P
+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
+ qpu_mc_src_t *const src1 = yp->last_l0;
+ qpu_mc_src_t *const src2 = yp->last_l1;
+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
+ const unsigned int mmx = mx | mx2;
+ const unsigned int mmy = my | my2;
+ if (mmx == 0 && mmy == 0)
+ ++ts->y_pred2_x0y0;
+ else if (mmx == 0)
+ ++ts->y_pred2_x0;
+ else if (mmy == 0)
+ ++ts->y_pred2_y0;
+ else
+ ++ts->y_pred2_xy;
+
+ if (nPbH > 16)
+ ++ts->y_pred2_hgt16;
+ else
+ ++ts->y_pred2_hle16;
+ }
+#endif
+ src1->x = x1 + start_x;
+ src1->y = y1;
+ src1->base = src1_base;
+ src2->x = x2 + start_x;
+ src2->y = y2;
+ src2->base = src2_base;
+ cmd_y->w = FFMIN(nPbW - start_x, 8);
+ cmd_y->h = bh;
+ cmd_y->mymx21 = my2_mx2_my_mx;
+ cmd_y->wo1 = wo1;
+ cmd_y->wo2 = wo2;
+ cmd_y->dst_addr = dst + (start_x << xshl);
+ yp->last_l0 = &cmd_y->next_src1;
+ yp->last_l1 = &cmd_y->next_src2;
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+ }
+ }
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const unsigned int lx, const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const MvXY mv,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ AVFrame * const src_frame)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // = s->ps.sps->hshift[1];
+ const int vshift = 1; // = s->ps.sps->vshift[1];
+
+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
+ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
+ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
+
+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
+ {
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
+ qpu_mc_src_t * const last_lx = *plast_lx;
+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ last_lx->x = x1_c + start_x;
+ last_lx->y = y1_c;
+ last_lx->base = src_base_u;
+ cmd_c->h = bh;
+ cmd_c->w = bw;
+ cmd_c->coeffs_x = x_coeffs;
+ cmd_c->coeffs_y = y_coeffs;
+ cmd_c->wo_u = wo_u;
+ cmd_c->wo_v = wo_v;
+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
+ *plast_lx = &cmd_c->next_src;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
+ }
+ return;
+}
+
+// h/v shifts fixed at one as that is all the qasm copes with
+static void
+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const int x0_c, const int y0_c,
+ const int nPbW_c, const int nPbH_c,
+ const struct HEVCRpiMvField * const mv_field,
+ const int16_t * const c_weights,
+ const int16_t * const c_offsets,
+ const int16_t * const c_weights2,
+ const int16_t * const c_offsets2,
+ AVFrame * const src_frame,
+ AVFrame * const src_frame2)
+{
+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
+ const int hshift = 1; // s->ps.sps->hshift[1];
+ const int vshift = 1; // s->ps.sps->vshift[1];
+ const MvXY mv = mv_field->xy[0];
+ const MvXY mv2 = mv_field->xy[1];
+
+ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
+ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
+
+ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
+ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
+
+ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
+    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1;
+
+ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]);
+ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
+
+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
+ const unsigned int bh = nPbH_c;
+
+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
+ {
+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+
+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
+ qpu_mc_src_t * const src_l0 = cp->last_l0;
+ qpu_mc_src_t * const src_l1 = cp->last_l1;
+
+ src_l0->x = x1_c + start_x;
+ src_l0->y = y1_c;
+ src_l0->base = src1_base;
+ src_l1->x = x2_c + start_x;
+ src_l1->y = y2_c;
+ src_l1->base = src2_base;
+
+ u[0].h = bh;
+ u[0].w = bw;
+ u[0].coeffs_x1 = coefs0_x;
+ u[0].coeffs_y1 = coefs0_y;
+ u[0].weight_u1 = c_weights[0]; // Weight L0 U
+ u[0].weight_v1 = c_weights[1]; // Weight L0 V
+ u[0].coeffs_x2 = coefs1_x;
+ u[0].coeffs_y2 = coefs1_y;
+ u[0].wo_u2 = wo_u2;
+ u[0].wo_v2 = wo_v2;
+ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
+
+ cp->last_l0 = &u[0].next_src1;
+ cp->last_l1 = &u[0].next_src2;
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+}
+
+
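+// Collocated MVs are stored on a 16x16 grid.  x/y round up and w/h count the
+// grid points inside the PU, so only 16x16 blocks whose top-left corner lies
+// within the PU are recorded (e.g. x0=8, w0=16 covers grid column 1 only:
+// x=1, w=1).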
+static inline void
+col_stash(const HEVCRpiContext * const s,
+ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
+ const HEVCRpiMvField * const mvf)
+{
+ ColMvField * const col_mvf = s->ref->col_mvf;
+ const unsigned int x = (x0 + 15) >> 4;
+ const unsigned int y = (y0 + 15) >> 4;
+ const unsigned int w = ((x0 + 15 + w0) >> 4) - x;
+ const unsigned int h = ((y0 + 15 + h0) >> 4) - y;
+
+ if (col_mvf != NULL && w != 0 && h != 0)
+ {
+ // Only record MV from the top left of the 16x16 block
+
+ const RefPicList * const rpl = s->refPicList;
+ const ColMvField cmv = {
+ .L = {
+ {
+ .poc = (mvf->pred_flag & PF_L0) == 0 ?
+ COL_POC_INTRA :
+ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
+ .xy = mvf->xy[0]
+ },
+ {
+ .poc = (mvf->pred_flag & PF_L1) == 0 ?
+ COL_POC_INTRA :
+ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
+ .xy = mvf->xy[1]
+ }
+ }
+ };
+
+ ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
+ const unsigned int stride = s->col_mvf_stride - w;
+ unsigned int j = h;
+
+ do
+ {
+ unsigned int k = w;
+ do
+ {
+ *p++ = cmv;
+ } while (--k != 0);
+ p += stride;
+ } while (--j != 0);
+ }
+}
+
+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int nPbW, const unsigned int nPbH,
+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+{
+ HEVCRpiJob * const jb = lc->jb0;
+
+ struct HEVCRpiMvField current_mv = {{0}};
+ const RefPicList *const refPicList = s->refPicList;
+ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
+
+ if (lc->cu.pred_mode != MODE_SKIP)
+ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
+
+ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
+ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 :
+ ff_hevc_rpi_merge_idx_decode(s, lc);
+
+ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
+ partIdx, merge_idx, &current_mv);
+ } else {
+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
+ }
+
+ {
+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
+ unsigned int i, j;
+
+ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
+ {
+ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
+ p[i] = current_mv;
+ p += MVF_STASH_WIDTH_PU;
+ }
+ }
+
+ col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
+
+ if (current_mv.pred_flag & PF_L0) {
+ ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
+ if (!ref0)
+ return;
+ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
+ }
+ if (current_mv.pred_flag & PF_L1) {
+ ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
+ if (!ref1)
+ return;
+ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
+ }
+
+ if (current_mv.pred_flag == PF_L0) {
+ const int x0_c = x0 >> ctx_hshift(s, 1);
+ const int y0_c = y0 >> ctx_vshift(s, 1);
+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
+
+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+
+ if (ctx_cfmt(s) != 0) {
+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ ref0->frame);
+ return;
+ }
+ } else if (current_mv.pred_flag == PF_L1) {
+ const int x0_c = x0 >> ctx_hshift(s, 1);
+ const int y0_c = y0 >> ctx_vshift(s, 1);
+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
+
+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+
+ if (ctx_cfmt(s) != 0) {
+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref1->frame);
+ return;
+ }
+ } else if (current_mv.pred_flag == PF_BI) {
+ const int x0_c = x0 >> ctx_hshift(s, 1);
+ const int y0_c = y0 >> ctx_vshift(s, 1);
+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
+
+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+
+ if (ctx_cfmt(s) != 0) {
+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
+ &current_mv,
+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+ ref0->frame,
+ ref1->frame);
+ return;
+ }
+ }
+}
+
+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_cb_size,
+ const unsigned int ipm)
+{
+ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
+ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
+
+ {
+ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
+ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
+ }
+
+ // If IRAP then everything is Intra & we avoid ever looking at these
+ // stashes so don't bother setting them
+ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
+ {
+ if (s->is_intra != NULL)
+ {
+ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
+ }
+
+ {
+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
+ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
+ unsigned int n = size_in_pus;
+
+ do
+ {
+ memset(p, 0, size_in_pus * sizeof(*p));
+ p += MVF_STASH_WIDTH_PU;
+ } while (--n != 0);
+ }
+
+
+ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
+ {
+ // Only record top left stuff
+            // Blocks should always be aligned on size boundaries
+ // so cannot have overflow from a small block
+
+ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
+ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
+ const unsigned int stride = s->col_mvf_stride - size_in_col;
+ unsigned int j = size_in_col;
+
+ do
+ {
+ unsigned int k = size_in_col;
+ do
+ {
+ p->L[0].poc = COL_POC_INTRA;
+ p->L[0].xy = 0;
+ p->L[1].poc = COL_POC_INTRA;
+ p->L[1].xy = 0;
+ ++p;
+ } while (--k != 0);
+ p += stride;
+ } while (--j != 0);
+ }
+ }
+}
+
+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_cb_size)
+{
+ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
+}
+
+
+/**
+ * 8.4.1
+ */
+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ int x0, int y0, int log2_pu_size,
+ int prev_intra_luma_pred_flag,
+ const unsigned int idx)
+{
+ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
+ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
+ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
+
+    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
+    // lc we can just keep 1 CTB's worth of stashes
+ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
+ const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
+ const unsigned int cand_left = lc->ipm_left[yb_pu];
+
+ unsigned int intra_pred_mode;
+ unsigned int a, b, c;
+
+ if (cand_left == cand_up) {
+ if (cand_left < 2) {
+ a = INTRA_PLANAR;
+ b = INTRA_DC;
+ c = INTRA_ANGULAR_26;
+ } else {
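+            // Adjacent angular modes, wrapped within the angular range
+            // (2..33) as in the standard MPM derivation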
+ a = cand_left;
+ b = 2 + ((cand_left - 2 - 1 + 32) & 31);
+ c = 2 + ((cand_left - 2 + 1) & 31);
+ }
+ } else {
+ a = cand_left;
+ b = cand_up;
+ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?
+ INTRA_PLANAR :
+ (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
+ INTRA_DC :
+ INTRA_ANGULAR_26;
+ }
+
+ if (prev_intra_luma_pred_flag) {
+ intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;
+ } else {
+ // Sort lowest 1st
+ if (a > b)
+ FFSWAP(int, a, b);
+ if (a > c)
+ FFSWAP(int, a, c);
+ if (b > c)
+ FFSWAP(int, b, c);
+
+ intra_pred_mode = idx;
+ if (intra_pred_mode >= a)
+ intra_pred_mode++;
+ if (intra_pred_mode >= b)
+ intra_pred_mode++;
+ if (intra_pred_mode >= c)
+ intra_pred_mode++;
+ }
+
+ /* write the intra prediction units into the mv array */
+ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
+ return intra_pred_mode;
+}
+
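+// Map a 4:4:4 chroma intra mode to its nearest 4:2:2 equivalent (the spec's
+// mode-substitution table for chroma format 2)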
+static const uint8_t tab_mode_idx[] = {
+ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20,
+ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
+
+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_cb_size)
+{
+ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
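+    // Candidate chroma modes 0..3 are Planar(0), Vertical(26), Horizontal(10)
+    // and DC(1); 4 means "same as luma".  A candidate that collides with the
+    // luma mode is replaced by angular 34.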
+ uint8_t prev_intra_luma_pred_flag[4];
+ int split = lc->cu.part_mode == PART_NxN;
+ const unsigned int split_size = (1 << (log2_cb_size - 1));
+ int chroma_mode;
+ const unsigned int n = split ? 4 : 1;
+ unsigned int i;
+
+ for (i = 0; i != n; i++)
+ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
+
+ for (i = 0; i < n; i++) {
+        // Depending on the flag, idx is either mpm_idx or rem_intra_luma_pred_mode
+ const unsigned int idx = prev_intra_luma_pred_flag[i] ?
+ ff_hevc_rpi_mpm_idx_decode(lc) :
+ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
+
+ lc->pu.intra_pred_mode[i] =
+ luma_intra_pred_mode(s, lc,
+ x0 + ((i & 1) == 0 ? 0 : split_size),
+ y0 + ((i & 2) == 0 ? 0 : split_size),
+ log2_cb_size - split,
+ prev_intra_luma_pred_flag[i], idx);
+ }
+
+ if (ctx_cfmt(s) == 3) {
+ for (i = 0; i < n; i++) {
+ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
+ if (chroma_mode != 4) {
+ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode])
+ lc->pu.intra_pred_mode_c[i] = 34;
+ else
+ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
+ } else {
+ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
+ }
+ }
+ } else if (ctx_cfmt(s) == 2) {
+ int mode_idx;
+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
+ if (chroma_mode != 4) {
+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+ mode_idx = 34;
+ else
+ mode_idx = intra_chroma_table[chroma_mode];
+ } else {
+ mode_idx = lc->pu.intra_pred_mode[0];
+ }
+ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
+ } else if (ctx_cfmt(s) != 0) {
+ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
+ if (chroma_mode != 4) {
+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+ lc->pu.intra_pred_mode_c[0] = 34;
+ else
+ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
+ } else {
+ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
+ }
+ }
+}
+
+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
+{
+ const unsigned int cb_size = 1 << log2_cb_size;
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
+ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
+ const unsigned int x_cb = x0 >> log2_min_cb_size;
+ const unsigned int y_cb = y0 >> log2_min_cb_size;
+ const unsigned int idx = log2_cb_size - 2;
+ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
+ int skip_flag = 0;
+
+ lc->cu.x = x0;
+ lc->cu.y = y0;
+ lc->cu.x_split = x0;
+ lc->cu.y_split = y0;
+
+ lc->cu.pred_mode = MODE_INTRA;
+ lc->cu.part_mode = PART_2Nx2N;
+ lc->cu.intra_split_flag = 0;
+ lc->cu.cu_transquant_bypass_flag = 0;
+ lc->pu.intra_pred_mode[0] = 1;
+ lc->pu.intra_pred_mode[1] = 1;
+ lc->pu.intra_pred_mode[2] = 1;
+ lc->pu.intra_pred_mode[3] = 1;
+
+ if (s->ps.pps->transquant_bypass_enable_flag) {
+ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
+ if (lc->cu.cu_transquant_bypass_flag)
+ set_deblocking_bypass(s, x0, y0, log2_cb_size);
+ }
+
+ if (s->sh.slice_type != HEVC_SLICE_I) {
+ lc->cu.pred_mode = MODE_INTER;
+ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
+ }
+
+ if (skip_flag) {
+ lc->cu.pred_mode = MODE_SKIP;
+
+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
+
+ if (!s->sh.disable_deblocking_filter_flag)
+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
+ } else {
+ int pcm_flag = 0;
+
+ if (s->sh.slice_type != HEVC_SLICE_I)
+ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
+ if (lc->cu.pred_mode != MODE_INTRA ||
+ log2_cb_size == s->ps.sps->log2_min_cb_size) {
+ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
+ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
+ lc->cu.pred_mode == MODE_INTRA;
+ }
+
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ if (lc->cu.part_mode == PART_2Nx2N &&
+ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled
+ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
+ ff_hevc_rpi_pcm_flag_decode(lc) != 0)
+ {
+ int ret;
+ pcm_flag = 1;
+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
+ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
+ return ret;
+
+ if (s->ps.sps->pcm.loop_filter_disable_flag)
+ set_deblocking_bypass(s, x0, y0, log2_cb_size);
+ } else {
+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
+ }
+ } else {
+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
+ switch (lc->cu.part_mode) {
+ case PART_2Nx2N:
+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
+ break;
+ case PART_2NxN:
+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
+ lc->cu.y_split = y0 + cb_size / 2;
+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
+ break;
+ case PART_Nx2N:
+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
+ lc->cu.x_split = x0 + cb_size / 2;
+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
+ break;
+ case PART_2NxnU:
+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
+ lc->cu.y_split = y0 + cb_size / 4;
+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
+ break;
+ case PART_2NxnD:
+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
+ lc->cu.y_split = y0 + cb_size / 4 * 3;
+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx);
+ break;
+ case PART_nLx2N:
+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
+ lc->cu.x_split = x0 + cb_size / 4;
+ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
+ break;
+ case PART_nRx2N:
+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
+ lc->cu.x_split = x0 + cb_size / 4 * 3;
+ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
+ break;
+ case PART_NxN:
+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
+ lc->cu.x_split = x0 + cb_size / 2;
+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
+ lc->cu.y_split = y0 + cb_size / 2;
+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
+ break;
+ }
+ }
+
+ if (!pcm_flag) {
+ int rqt_root_cbf = 1;
+
+ if (lc->cu.pred_mode != MODE_INTRA &&
+ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
+ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
+ }
+ if (rqt_root_cbf) {
+ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
+ int ret;
+
+ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
+ s->ps.sps->max_transform_hierarchy_depth_inter;
+ // transform_tree does deblock_boundary_strengths
+ ret = hls_transform_tree(s, lc, x0, y0,
+ log2_cb_size, 0, 0, cbf_c);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!s->sh.disable_deblocking_filter_flag)
+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
+ }
+ }
+ }
+
+ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
+ if (lc->tu.is_cu_qp_delta_wanted)
+ ff_hevc_rpi_set_qPy(s, lc, x0, y0);
+
+    if (((x0 + (1 << log2_cb_size)) & qp_block_mask) == 0 &&
+        ((y0 + (1 << log2_cb_size)) & qp_block_mask) == 0) {
+ lc->qPy_pred = lc->qp_y;
+ }
+
+ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
+
+ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
+
+ return 0;
+}
+
+// Returns:
+// < 0 Error
+// 0 More data wanted
+// 1 EoSlice / EoPicture
+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
+ const int log2_cb_size, const unsigned int cb_depth)
+{
+ const int cb_size = 1 << log2_cb_size;
+ int ret;
+ int split_cu;
+
+ lc->ct_depth = cb_depth;
+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
+ if (x0 + cb_size <= s->ps.sps->width &&
+ y0 + cb_size <= s->ps.sps->height &&
+ split_cu)
+ {
+ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
+ }
+
+    // Qp delta (and offset) need to remain wanted if cb_size < min until
+    // a coded block is found so we still set the initial state at depth 0
+    // (outside this fn) and only reset here
+ if (s->ps.pps->cu_qp_delta_enabled_flag &&
+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
+ {
+ lc->tu.is_cu_qp_delta_wanted = 1;
+ lc->tu.cu_qp_delta = 0;
+ }
+ if (s->sh.cu_chroma_qp_offset_enabled_flag &&
+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
+ {
+ lc->tu.cu_chroma_qp_offset_wanted = 1;
+ }
+
+ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
+ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
+ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
+
+ if (split_cu) {
+ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
+ const int cb_size_split = cb_size >> 1;
+ const int x1 = x0 + cb_size_split;
+ const int y1 = y0 + cb_size_split;
+
+ int more_data = 0;
+
+ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
+ if (more_data < 0)
+ return more_data;
+
+ if (more_data && x1 < s->ps.sps->width) {
+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
+ if (more_data < 0)
+ return more_data;
+ }
+ if (more_data && y1 < s->ps.sps->height) {
+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
+ if (more_data < 0)
+ return more_data;
+ }
+ if (more_data && x1 < s->ps.sps->width &&
+ y1 < s->ps.sps->height) {
+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
+ if (more_data < 0)
+ return more_data;
+ }
+
+        if (((x0 + (1 << log2_cb_size)) & qp_block_mask) == 0 &&
+            ((y0 + (1 << log2_cb_size)) & qp_block_mask) == 0)
+ lc->qPy_pred = lc->qp_y;
+
+ if (more_data)
+ return ((x1 + cb_size_split) < s->ps.sps->width ||
+ (y1 + cb_size_split) < s->ps.sps->height);
+ else
+ return 0;
+ } else {
+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
+ if (ret < 0)
+ return ret;
+ if ((!((x0 + cb_size) %
+ (1 << (s->ps.sps->log2_ctb_size))) ||
+ (x0 + cb_size >= s->ps.sps->width)) &&
+ (!((y0 + cb_size) %
+ (1 << (s->ps.sps->log2_ctb_size))) ||
+ (y0 + cb_size >= s->ps.sps->height))) {
+ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
+ return !end_of_slice_flag;
+ } else {
+ return 1;
+ }
+ }
+
+ return 0; // NEVER
+}
+
+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const int x_ctb, const int y_ctb, const int ctb_addr_ts)
+{
+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice
+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
+ const unsigned int line_w = s->ps.sps->ctb_width;
+
+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
+
+ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
+ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
+
+ lc->boundary_flags = 0;
+
+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
+ lc->boundary_flags |= BOUNDARY_LEFT_TILE;
+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
+ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
+ lc->boundary_flags |= BOUNDARY_UPPER_TILE;
+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
+
+ // Use line width rather than tile width for addr_in_slice test as
+ // addr_in_slice is in raster units
+
+ lc->ctb_avail =
+ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
+ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
+ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
+ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
+ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
+ (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
+ // Down-left never avail at CTB level
+}
+
+
+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
+
+ // Signal
+ if (y > 0) {
+ // Cast away const as progress is held in s, but this really shouldn't confuse anything
+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
+ }
+
+ // Job done now
+ // ? Move outside this fn
+ job_free(s->jbc, jb);
+}
+
+// I-pred, transform_and_add for all block types done here
+// All ARM
+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ unsigned int i;
+ HEVCRpiIntraPredEnv * const iap = &jb->intra;
+ const HEVCPredCmd *cmd = iap->cmds;
+
+#if !RPI_WORKER_WAIT_PASS_0
+ rpi_sem_wait(&jb->sem);
+ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1
+#endif
+
+ for (i = iap->n; i > 0; i--, cmd++)
+ {
+ switch (cmd->type)
+ {
+ case RPI_PRED_INTRA:
+ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
+ break;
+ case RPI_PRED_INTRA_C:
+ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
+ break;
+ case RPI_PRED_ADD_RESIDUAL:
+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC:
+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_U:
+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_V:
+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
+ break;
+ case RPI_PRED_ADD_RESIDUAL_C:
+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
+ case RPI_PRED_ADD_DC_U:
+ case RPI_PRED_ADD_DC_V:
+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
+ break;
+
+ case RPI_PRED_I_PCM:
+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
+ break;
+
+ default:
+ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
+ abort();
+ }
+ }
+
+ // Mark done
+ iap->n = 0;
+}
+
+
+// Set initial uniform job values & zero ctu_count
+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
+{
+ unsigned int i;
+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
+ const HEVCRpiSPS * const sps = s->ps.sps;
+
+ const uint16_t pic_width_y = sps->width;
+ const uint16_t pic_height_y = sps->height;
+
+ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
+ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
+
+ // We expect the pointer to change if we use another sps
+ if (sps != jb->sps)
+ {
+ worker_pic_free_one(jb);
+
+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
+
+ {
+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
+ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
+ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
+ }
+
+ jb->sps = sps;
+ }
+
+ jb->waited = 0;
+ jb->ctu_ts_first = ctu_ts_first;
+ jb->ctu_ts_last = -1;
+
+ rpi_inter_pred_reset(cipe);
+ for (i = 0; i < cipe->n; i++) {
+ HEVCRpiInterPredQ * const cp = cipe->q + i;
+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
+
+ u->next_src1.x = 0;
+ u->next_src1.y = 0;
+ u->next_src1.base = 0;
+ u->pic_cw = pic_width_c;
+ u->pic_ch = pic_height_c;
+ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ cp->last_l0 = &u->next_src1;
+
+ u->next_fn = 0;
+ u->next_src2.x = 0;
+ u->next_src2.y = 0;
+ u->next_src2.base = 0;
+ cp->last_l1 = &u->next_src2;
+
+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
+ }
+
+ rpi_inter_pred_reset(yipe);
+ for (i = 0; i < yipe->n; i++) {
+ HEVCRpiInterPredQ * const yp = yipe->q + i;
+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
+
+ y->next_src1.x = 0;
+ y->next_src1.y = 0;
+ y->next_src1.base = 0;
+ y->next_src2.x = 0;
+ y->next_src2.y = 0;
+ y->next_src2.base = 0;
+ y->pic_h = pic_height_y;
+ y->pic_w = pic_width_y;
+ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
+ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
+ y->next_fn = 0;
+ yp->last_l0 = &y->next_src1;
+ yp->last_l1 = &y->next_src2;
+
+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
+ }
+
+ jb->last_y8_p = NULL;
+ jb->last_y8_l1 = NULL;
+
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
+ jb->progress_req[i] = -1;
+ }
+
+ worker_pic_reset(&jb->coeffs);
+}
+
+
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
+ unsigned int max_block = 0;
+
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
+
+ if (block_size > max_block)
+ max_block = block_size;
+
+ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit);
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_qpu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_qpu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+
+ // Add to mailbox list
+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
+ mail[i][1] = yp->code_setup;
+ }
+
+ // We don't need invalidate here as the uniforms aren't changed by the QPU
+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing
+ // new values which seems to give us a small performance advantage
+ //
+ // In most cases we will not have a completely packed set of uniforms and as
+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the
+ // fullest
+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
+ ipe->n, ipe->max_fill + ipe->min_gap);
+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
+
+ return 1;
+}
+#endif
+
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
+ const vpu_qpu_job_h vqj,
+ rpi_cache_flush_env_t * const rfe,
+ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
+ if (!ipe->used) {
+ return 0;
+ }
+
+ if (ipe->curr != 0) {
+ rpi_inter_pred_sync(ipe);
+ }
+
+ // Add final commands to Q
+ for(i = 0; i != ipe->n; ++i) {
+ HEVCRpiInterPredQ * const yp = ipe->q + i;
+ qpu_mc_src_t *const p0 = yp->last_l0;
+ qpu_mc_src_t *const p1 = yp->last_l1;
+
+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
+
+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+ p0->x = MC_DUMMY_X;
+ p0->y = MC_DUMMY_Y;
+ p0->base = s->qpu_dummy_frame_emu;
+ p1->x = MC_DUMMY_X;
+ p1->y = MC_DUMMY_Y;
+ p1->base = s->qpu_dummy_frame_emu;
+
+ yp->last_l0 = NULL;
+ yp->last_l1 = NULL;
+ }
+
+ return 1;
+}
+#endif
+
+
+#if RPI_QPU_EMU_Y
+#define mc_terminate_add_y mc_terminate_add_emu
+#else
+#define mc_terminate_add_y mc_terminate_add_qpu
+#endif
+#if RPI_QPU_EMU_C
+#define mc_terminate_add_c mc_terminate_add_emu
+#else
+#define mc_terminate_add_c mc_terminate_add_qpu
+#endif
+
+
+static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
+{
+ rpi_cache_buf_t cbuf;
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ rpi_cache_flush_finish(rfe);
+}
+
+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
+ const unsigned int ctb_width = s->ps.sps->ctb_width;
+ RpiBlk *const bounds = &jb->bounds;
+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
+
+ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x);
+ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y);
+}
+
+#if RPI_PASSES == 2
+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ // Perform intra prediction and residual reconstruction
+ rpi_execute_pred_cmds(s, jb);
+
+ // Perform deblocking for CTBs in this row
+ rpi_execute_dblk_cmds(s, jb);
+}
+#endif
+
+// Core execution tasks
+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
+{
+ int pred_y, pred_c;
+ vpu_qpu_job_env_t qvbuf;
+ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
+#if RPI_WORKER_WAIT_PASS_0
+ int do_wait;
+#endif
+
+ {
+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+ if (cf->s[3].n + cf->s[2].n != 0)
+ {
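+ // Counts below are in whole transform blocks: s[2] appears to hold
+ // 16x16 coefficient sets (hence n >> 8: 256 coeffs each) and s[3]
+ // 32x32 sets (n >> 10: 1024 coeffs each); only these two sizes are
+ // handed to the VPU here.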
+ const unsigned int csize = sizeof(cf->s[3].buf[0]);
+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
+ unsigned int n16 = (cf->s[2].n >> 8);
+ unsigned int n32 = (cf->s[3].n >> 10);
+#if RPI_COMPRESS_COEFFS
+ if (cf->s[2].packed) {
+ n16 = n16 | (n16<<16);
+ } else {
+ const unsigned int npack16 = (cf->s[2].packed_n>>8);
+ n16 = n16 | (npack16<<16);
+ }
+ if (cf->s[3].packed) {
+ n32 = n32 | (n32<<16);
+ } else {
+ const unsigned int npack32 = (cf->s[3].packed_n>>10);
+ n32 = n32 | (npack32<<16);
+ }
+#endif
+ vpu_qpu_job_add_vpu(vqj,
+ vpu_get_fn(s->ps.sps->bit_depth),
+ vpu_get_constants(),
+ cf->gptr.vc,
+ n16,
+ cf->gptr.vc + offset32,
+ n32,
+ 0);
+
+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
+ }
+ }
+
+ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
+
+// We could take a sync here and try to locally overlap QPU processing with ARM
+// but testing showed a slightly negative benefit with noticeable extra complexity
+
+ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
+
+ // Returns 0 if nothing to do, 1 if sync added
+#if RPI_WORKER_WAIT_PASS_0
+ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
+#else
+ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
+ sem_post(&jb->sem);
+#endif
+
+ rpi_cache_flush_execute(jb->rfe);
+
+ // Await progress as required
+ // jb->waited will only be clear if we have already tested the progress values
+ // (in worker_submit_job) and found we don't have to wait
+ if (jb->waited)
+ {
+ unsigned int i;
+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
+ if (jb->progress_req[i] >= 0) {
+ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
+ }
+ }
+ }
+
+ vpu_qpu_job_finish(vqj);
+
+ // We always work on a rectangular block
+ if (pred_y || pred_c)
+ {
+ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
+ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
+ ctx_vshift(s, 1), pred_y, pred_c);
+ }
+
+ // If we have emulated VPU ops - do it here
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ if (av_rpi_is_sand8_frame(s->frame))
+ {
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
+#else
+ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
+#endif
+ }
+ else
+ {
+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
+#elif RPI_QPU_EMU_Y
+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
+#else
+ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
+#endif
+ }
+#endif
+
+#if RPI_WORKER_WAIT_PASS_0
+ if (do_wait)
+ rpi_sem_wait(&jb->sem);
+ rpi_cache_flush_execute(jb->rfe);
+#endif
+}
+
+
+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
+{
+ av_freep(&ipe->q);
+ gpu_free(&ipe->gptr);
+}
+
+static HEVCRpiJob * job_new(void)
+{
+ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
+
+ if (jb == NULL)
+ return NULL;
+
+ sem_init(&jb->sem, 0, 0);
+ jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
+
+ jb->intra.n = 0;
+ if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
+ goto fail1;
+
+ // * Sizeof the union structure might be overkill but at the moment it
+ // is correct (it certainly isn't going to be too small)
+ // Set max fill to slack/2 from the end of the Q
+ // If we exceed this in any Q then we will schedule by size (which should
+ // mean that we never use that Q again apart from syncs)
+ // * Given how aggressive the overflow response is we could maybe put the
+ // threshold even nearer the end, but I don't expect us to ever hit
+ // it on any real stream anyway.
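+ // (Concretely: the chroma Q below gets QPU_C_COMMANDS commands with a
+ // fill threshold QPU_C_CMD_SLACK_PER_Q/2 commands short of the end;
+ // the luma Q is set up the same way with the _Y_ constants.)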
+
+ if (rpi_inter_pred_alloc(&jb->chroma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
+ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
+ goto fail2;
+ if (rpi_inter_pred_alloc(&jb->luma_ip,
+ QPU_N_MAX, QPU_N_GRP,
+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
+ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
+ goto fail3;
+
+ return jb;
+
+fail3:
+ rpi_free_inter_pred(&jb->luma_ip);
+fail2:
+ av_freep(&jb->intra.cmds);
+fail1:
+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
+ rpi_cache_flush_finish(jb->rfe);
+ sem_destroy(&jb->sem);
+ return NULL;
+}
+
+static void job_delete(HEVCRpiJob * const jb)
+{
+ worker_pic_free_one(jb);
+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
+ rpi_free_inter_pred(&jb->chroma_ip);
+ rpi_free_inter_pred(&jb->luma_ip);
+ av_freep(&jb->intra.cmds);
+ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing
+ sem_destroy(&jb->sem);
+ av_free(jb);
+}
+
+static void jbg_delete(HEVCRpiJobGlobal * const jbg)
+{
+ HEVCRpiJob * jb;
+
+ if (jbg == NULL)
+ return;
+
+ jb = jbg->free1;
+ while (jb != NULL)
+ {
+ HEVCRpiJob * const jb2 = jb;
+ jb = jb2->next;
+ job_delete(jb2);
+ }
+
+ pthread_mutex_destroy(&jbg->lock);
+ av_free(jbg);
+}
+
+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
+{
+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
+ if (jbg == NULL)
+ return NULL;
+
+ pthread_mutex_init(&jbg->lock, NULL);
+
+ while (job_count-- != 0)
+ {
+ HEVCRpiJob * const jb = job_new();
+ if (jb == NULL)
+ goto fail;
+
+ jb->next = jbg->free1;
+ jbg->free1 = jb;
+ }
+
+ return jbg;
+
+fail:
+ jbg_delete(jbg);
+ return NULL;
+}
+
+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
+{
+ HEVCRpiJobGlobal * jbg;
+
+ if (jbc == NULL)
+ return;
+
+ jbg = jbc->jbg;
+
+ if (jbc->jb1 != NULL)
+ job_delete(jbc->jb1);
+
+ pthread_mutex_destroy(&jbc->in_lock);
+ sem_destroy(&jbc->sem_out);
+ av_free(jbc);
+
+ // Deref the global job context
+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
+ jbg_delete(jbg);
+}
+
+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
+{
+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
+
+ if (jbc == NULL)
+ return NULL;
+
+ jbc->jbg = jbg;
+ atomic_fetch_add(&jbg->ref_count, 1);
+
+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
+ pthread_mutex_init(&jbc->in_lock, NULL);
+
+ if ((jbc->jb1 = job_new()) == NULL)
+ goto fail;
+ jbc->jb1->jbc_local = jbc;
+
+ return jbc;
+
+fail:
+ rpi_job_ctl_delete(jbc);
+ return NULL;
+}
+
+
+
+static av_cold void hevc_init_worker(HEVCRpiContext * const s)
+{
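+ // Pass pipeline as wired below: pass 0 runs worker_core (VPU coeff +
+ // QPU motion-comp submission); with 2 passes, pass 1 fuses intra
+ // pred/recon and deblock in worker_core2; with 3, each gets its own queue.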
+#if RPI_PASSES == 2
+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
+#elif RPI_PASSES == 3
+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
+#else
+#error Passes confused
+#endif
+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
+
+ pass_queues_start_all(s);
+}
+
+static av_cold void hevc_exit_worker(HEVCRpiContext *s)
+{
+ pass_queues_term_all(s);
+
+ pass_queues_kill_all(s);
+
+ rpi_job_ctl_delete(s->jbc);
+ s->jbc = NULL;
+}
+
+
+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
+{
+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
+ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
+
+ // Check for obvious disasters
+ if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
+ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ // If dependent then ctb_addr_ts != 0 from previous check
+ if (s->sh.dependent_slice_segment_flag) {
+ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
+ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
+ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
+ tile_id + s->sh.num_entry_point_offsets >= tiles)
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ // Tiled stuff must start at start of tile if it has multiple entry points
+ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
+ s->sh.num_entry_point_offsets != 0 &&
+ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ ff_hevc_rpi_cabac_init_decoder(lc);
+
+ // Setup any required decode vars
+ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
+
+// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
+ lc->qp_y = s->sh.slice_qp;
+
+ // General setup
+ lc->bt_line_no = 0;
+ lc->ts = ctb_addr_ts;
+ return 0;
+}
+
+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
+{
+ const GetBitContext * const gb = &s->HEVClc->gb;
+ RpiSliceHeader * const sh = &s->sh;
+ int i, j;
+
+ const unsigned int length = nal->size;
+ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte
+ unsigned int cmpt;
+ unsigned int startheader;
+
+ if (sh->num_entry_point_offsets == 0) {
+ s->data = NULL;
+ return 0;
+ }
+
+ // offset in slice header includes emulation prevention bytes.
+ // Unfortunately those have been removed by the time we get here so we
+ // have to compensate. The nal layer keeps track of where they were.
+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+ startheader--;
+ cmpt++;
+ }
+ }
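+ // e.g. a single stripped 0x03 emulation-prevention byte lying between
+ // 'offset' and the first entry point bumps cmpt to 1 and pulls
+ // startheader back a byte, shortening that segment to match.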
+
+ for (i = 1; i < sh->num_entry_point_offsets; i++) {
+ offset += (sh->entry_point_offset[i - 1] - cmpt);
+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+ startheader--;
+ cmpt++;
+ }
+ }
+ if (sh->entry_point_offset[i] <= cmpt) {
+ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
+ return AVERROR_INVALIDDATA;
+ }
+ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
+ sh->offset[i - 1] = offset;
+ }
+
+ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
+ if (length < offset) {
+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
+ return AVERROR_INVALIDDATA;
+ }
+ sh->size[sh->num_entry_point_offsets - 1] = length - offset;
+ sh->offset[sh->num_entry_point_offsets - 1] = offset;
+
+ // Remember data start pointer as we won't have nal later
+ s->data = nal->data;
+ return 0;
+}
+
+
+// Return
+// < 0 Error
+// 0 OK
+//
+// jb->ctu_ts_last < 0 Job still filling
+// jb->ctu_ts_last >= 0 Job ready
+
+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
+{
+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
+ const unsigned int ctb_size = (1 << log2_ctb_size);
+ HEVCRpiJob * const jb = lc->jb0;
+ int more_data = 1;
+ unsigned int ctb_addr_ts = lc->ts;
+ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
+ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
+
+ lc->unit_done = 0;
+
+ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
+ {
+ int q_full;
+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
+
+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
+
+ ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
+
+ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
+
+ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
+ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
+ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
+ // Zap stashes if neighbour not available
+ // Zap stashes if navail
+ if ((lc->ctb_avail & AVAIL_U) == 0)
+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
+ if ((lc->ctb_avail & AVAIL_L) == 0)
+ {
+ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
+ }
+#if MVF_STASH_WIDTH > 64
+ // Restore left mvf stash at start of tile if not at start of line
+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
+ {
+ unsigned int i;
+ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
+ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
+ {
+ *dst = *src++;
+ dst += MVF_STASH_WIDTH_PU;
+ }
+ }
+#endif
+
+ // Set initial tu states
+ lc->tu.cu_qp_delta = 0;
+ lc->tu.is_cu_qp_delta_wanted = 0;
+ lc->tu.cu_chroma_qp_offset_wanted = 0;
+
+ // Decode
+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
+
+ if (ff_hevc_rpi_cabac_overflow(lc))
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n");
+ more_data = AVERROR_INVALIDDATA;
+ }
+
+ if (more_data < 0) {
+ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken
+ return more_data;
+ }
+
+ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
+ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
+ {
+ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
+ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
+ {
+ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n");
+ return -1;
+ }
+ }
+
+ // --- Post CTB processing
+
+ // Stash rpl top/left for deblock that needs to remember such things cross-slice
+ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
+ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
+
+ if (!s->is_irap)
+ {
+ // Copy MVF up to up-left & stash to up
+ {
+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
+ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
+
+ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
+
+ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
+ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
+ }
+ // Stash sideways if end of tile line but not end of line (no point)
+ // ** Could/should do this @ end of fn
+#if MVF_STASH_WIDTH > 64
+ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
+#endif
+ {
+ unsigned int i;
+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
+ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
+ {
+ *dst++ = *src;
+ src += MVF_STASH_WIDTH_PU;
+ }
+ }
+ }
+
+ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
+ ff_hevc_rpi_save_states(s, lc);
+
+ // Report progress so we can use our MVs in other frames
+ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
+ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
+
+ // End of line || End of tile line || End of tile
+ // (EoL covers end of frame for our purposes here)
+ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
+
+ // Allocate QPU chunks on fixed size 64 pel boundaries rather than
+ // whatever ctb_size is today.
+ // * We might quite like to continue to 64 pel vertical too but that
+ // currently confuses WPP
+ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
+ {
+ int overflow = 0;
+ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
+ overflow = 1;
+ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
+ overflow = 1;
+ if (overflow)
+ {
+ // * This is very annoying (and slow) to cope with in WPP so
+ // we treat it as an error there (no known stream triggers this
+ // with the current buffer sizes). Non-wpp should cope fine.
+ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
+ q_full = 1;
+ }
+ }
+
+ // Inc TS to next.
+ ctb_addr_ts++;
+ ctb_addr_rs++;
+ x_ctb += ctb_size;
+
+ if (q_full)
+ {
+ // Do job
+ // Prep for submission
+ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced
+ job_gen_bounds(s, jb);
+ break;
+ }
+
+ // If max_blocks started as 0 then this will never be true
+ if (--max_blocks == 0)
+ break;
+ }
+
+ lc->unit_done = (more_data <= 0);
+ lc->ts = ctb_addr_ts;
+ return 0;
+}
+
+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
+{
+ lc->context = s;
+ lc->jb0 = NULL;
+ lc->lc_n = n;
+ lc->bt_terminate = 0;
+ lc->bt_psem_out = NULL;
+ sem_init(&lc->bt_sem_in, 0, 0);
+}
+
+#define TRACE_WPP 0
+#if RPI_EXTRA_BIT_THREADS > 0
+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
+{
+ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
+ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
+}
+
+// Move local context parameters from an aux bit thread back to the main
+// thread at the end of a slice as processing is going to continue there.
+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
+{
+ if (src_lc == dst_lc) {
+ return;
+ }
+
+ // Move the job
+ // We will still have an active job if the final line terminates early
+ // Dest should always be null by now
+ av_assert1(dst_lc->jb0 == NULL);
+ dst_lc->jb0 = src_lc->jb0;
+ src_lc->jb0 = NULL;
+
+ // Always need to store where we are in the bitstream
+ dst_lc->ts = src_lc->ts;
+ dst_lc->gb = src_lc->gb;
+ // Cabac init request will be built at start of next slice
+
+ // Need to store context if we might have a dependent seg
+ if (is_dep)
+ {
+ dst_lc->qPy_pred = src_lc->qPy_pred;
+ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
+ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
+ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
+ }
+}
+
+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
+{
+ rpi_sem_wait(&lc->bt_sem_in);
+ return lc->bt_terminate;
+}
+
+// Do one WPP line
+// Will not work correctly over horizontal tile boundaries - vertical should be OK
+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
+{
+ const int is_tile = lc->bt_is_tile;
+ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
+ const unsigned int line = lc->bt_line_no;
+ const unsigned int line_inc = lc->bt_line_inc;
+ const int is_last = (line >= lc->bt_last_line);
+
+ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
+ const unsigned int ts_next =
+ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
+ INT_MAX :
+ is_tile ?
+ s->ps.pps->tile_pos_ts[tile_id + line_inc] :
+ lc->ts + lc->bt_line_width * line_inc;
+ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
+ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
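+ // (WPP saves the CABAC state for the row below after the 2nd CTU of a
+ // row, so a granularity of >= 2 CTUs keeps the dependent thread safely
+ // behind the one above it.)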
+ unsigned int ts_prev;
+ int loop_n = 0;
+ int err = 0;
+
+ av_assert1(line <= s->sh.num_entry_point_offsets);
+
+#if TRACE_WPP
+ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
+ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
+ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
+ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
+#endif
+ if (line != 0)
+ {
+ const uint8_t * const data = s->data + s->sh.offset[line - 1];
+ const unsigned int len = s->sh.size[line - 1];
+ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
+ return err;
+
+ ff_init_cabac_decoder(&lc->cc, data, len);
+ }
+
+ // We should never be processing a dependent slice here so reset is good
+ // ?? These probably shouldn't be needed (as they should be set by later
+ // logic) but do seem to be required
+ lc->qp_y = s->sh.slice_qp;
+
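+ // Per-chunk handshake: once the previous chunk is done (loop_n > 1)
+ // poke bt_psem_out so the line below can advance, then wait on
+ // bt_sem_in for the line above (the wait for chunk 0 was done in
+ // bit_thread; the kick-off caller skips waits entirely).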
+ do
+ {
+ if (!is_last && loop_n > 1) {
+#if TRACE_WPP
+ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
+#endif
+ sem_post(lc->bt_psem_out);
+ }
+ // The wait for loop_n == 0 has been done in bit_thread
+ if (!is_first && loop_n != 0)
+ {
+#if TRACE_WPP
+ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
+#endif
+ if (wait_bt_sem_in(lc) != 0)
+ return AVERROR_EXIT;
+ }
+
+#if TRACE_WPP
+ {
+ int n;
+ sem_getvalue(&lc->bt_sem_in, &n);
+ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
+ }
+#endif
+
+ ts_prev = lc->ts;
+
+ // If we have had an error - do no further decode but do continue
+ // moving signals around so the other threads continue to operate
+ // correctly (or at least as correctly as they can with this line missing)
+ //
+ // Errors in WPP/Tile are less fatal than normal as we have a good idea
+ // of how to restart on the next line so there is no need to give up totally
+ if (err != 0)
+ {
+ lc->unit_done = 0;
+ lc->ts += partial_size;
+ }
+ else
+ {
+ worker_pass0_ready(s, lc);
+
+ if ((err = fill_job(s, lc, partial_size)) < 0 ||
+ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
+ {
+ if (err == 0) {
+ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
+ err = AVERROR_INVALIDDATA;
+ }
+ worker_free(s, lc);
+ lc->ts = ts_prev + partial_size; // Pretend we did all that
+ lc->unit_done = 0;
+ }
+ else if (is_tile)
+ {
+ worker_submit_job(s, lc);
+ }
+ }
+
+ ++loop_n;
+ } while (lc->ts < ts_eol && !lc->unit_done);
+
+ // If we are on the last line & we didn't get a whole line we must wait for
+ // and sink the sem_posts from the line above / tile to the left.
+ while ((ts_prev += partial_size) < ts_eol)
+ {
+#if TRACE_WPP
+ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
+#endif
+ if (wait_bt_sem_in(lc) != 0)
+ return AVERROR_EXIT;
+ }
+
+ lc->bt_line_no += line_inc;
+
+ if (!is_tile && err == 0)
+ worker_submit_job(s, lc);
+
+ if (!is_last) {
+ lc->ts = ts_next;
+
+#if TRACE_WPP
+ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
+#endif
+ sem_post(lc->bt_psem_out);
+ if (loop_n > 1) {
+#if TRACE_WPP
+ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
+#endif
+ sem_post(lc->bt_psem_out);
+ }
+ }
+ else
+ {
+ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT
+#if MVF_STASH_WIDTH > 64
+ // Horrid calculations to work out what we want but luckily this should almost never execute
+ // **** Move to movlc
+ if (!s->is_irap)
+ {
+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
+ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
+ {
+ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
+ unsigned int i;
+ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
+ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
+
+ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
+ {
+ *d_mvf = *s_mvf;
+ d_mvf += MVF_STASH_WIDTH_PU;
+ s_mvf += MVF_STASH_WIDTH_PU;
+ }
+
+ }
+ }
+#endif
+ // When all done poke the thread 0 sem_in one final time
+#if TRACE_WPP
+ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
+#endif
+ sem_post(&s->HEVClcList[0]->bt_sem_in);
+ }
+
+#if TRACE_WPP
+ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
+#endif
+ return err;
+}
+
+static void wpp_setup_lcs(HEVCRpiContext * const s)
+{
+ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ const unsigned int line_width = line_ts_width(s, ts);
+
+ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
+ {
+ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
+ lc->ts = ts;
+ lc->bt_is_tile = 0;
+ lc->bt_line_no = i;
+ lc->bt_line_width = line_width;
+ lc->bt_last_line = s->sh.num_entry_point_offsets;
+ lc->bt_line_inc = RPI_BIT_THREADS;
+ ts += line_width;
+ }
+}
+
+
+// Can only process tile single row at once
+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
+{
+ const HEVCRpiPPS * const pps = s->ps.pps;
+ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ const unsigned int tile0 = pps->tile_id[ts0];
+ const unsigned int col0 = tile0 % pps->num_tile_columns;
+
+ const unsigned int col = (slice_row == 0) ? col0 : 0;
+ unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
+ const unsigned int last_line = FFMIN(
+ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
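+ // e.g. with 4 tile columns and a slice starting at column 2: slice row
+ // 0 covers lines 0..1 (columns 2,3); slice row 1 restarts at column 0
+ // with line = num_tile_columns - col0 = 2.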
+
+ const unsigned int par =
+ FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
+#if TRACE_WPP
+ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
+ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
+#endif
+ for (unsigned int i = 0; i != par; ++i, ++line)
+ {
+ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
+ const unsigned int tile = tile0 + line;
+
+ lc->ts = pps->tile_pos_ts[tile];
+ lc->bt_line_no = line;
+ lc->bt_is_tile = 1;
+ lc->bt_line_width = line_ts_width(s, lc->ts);
+ lc->bt_last_line = last_line;
+ lc->bt_line_inc = par;
+ }
+}
+
+
+static void * bit_thread(void * v)
+{
+ HEVCRpiLocalContext * const lc = v;
+ HEVCRpiContext *const s = lc->context;
+
+ while (wait_bt_sem_in(lc) == 0)
+ {
+ int err;
+
+ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp
+ if (lc->bt_terminate) {
+ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
+ break;
+ }
+ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
+ }
+ }
+
+ return NULL;
+}
+
+static int bit_threads_start(HEVCRpiContext * const s)
+{
+ if (s->bt_started)
+ return 0;
+
+ for (int i = 1; i < RPI_BIT_THREADS; ++i)
+ {
+ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
+ if (s->HEVClcList[i] == NULL) {
+ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
+ return -1;
+ }
+
+ bt_lc_init(s, s->HEVClcList[i], i);
+ job_lc_init(s->HEVClcList[i]);
+ }
+
+ // Link the sems in a circle
+ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
+ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
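+ // e.g. with 3 bit threads: lc[0] -> lc[1] -> lc[2] -> lc[0], each
+ // thread poking the sem of the one working the line below it.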
+
+ // Init all lc before starting any threads
+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
+ {
+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
+ return -1;
+ }
+
+ s->bt_started = 1;
+ return 0;
+}
+
+static int bit_threads_kill(HEVCRpiContext * const s)
+{
+ if (!s->bt_started)
+ return 0;
+ s->bt_started = 0;
+
+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
+ {
+ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
+ if (lc == NULL)
+ break;
+
+ lc->bt_terminate = 1;
+ sem_post(&lc->bt_sem_in);
+ pthread_join(s->bit_threads[i], NULL);
+
+ sem_destroy(&lc->bt_sem_in);
+ job_lc_kill(lc);
+ }
+ return 0;
+}
+#endif
+
+
+// If we are at EoT and the row is shorter than the number of jobs
+// we can Q, we have to wait for it to finish, otherwise we risk cache/QPU
+// disasters
+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
+{
+ return
+ s->ps.pps->tile_wpp_inter_disable >= 2 &&
+ s->sh.slice_type != HEVC_SLICE_I &&
+ n >= 0 &&
+ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
+}
+
+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+{
+ HEVCRpiContext * const s = avctxt->priv_data;
+ HEVCRpiLocalContext * const lc = s->HEVClc;
+ int err;
+
+ // Start of slice
+ if ((err = slice_start(s, lc)) != 0)
+ return err;
+
+#if RPI_EXTRA_BIT_THREADS > 0
+
+ if (s->sh.offload_tiles)
+ {
+ unsigned int slice_row = 0;
+
+#if TRACE_WPP
+ printf("%s: Do Tiles\n", __func__);
+#endif
+ // Generate & start extra bit threads if they aren't already running
+ bit_threads_start(s);
+
+ do
+ {
+ // Reset lc lines etc.
+ tile_one_row_setup_lcs(s, slice_row);
+
+#if TRACE_WPP
+ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
+#endif
+
+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
+#if TRACE_WPP
+ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
+#endif
+
+ while (lc->bt_line_no <= lc->bt_last_line) {
+ rpi_sem_wait(&lc->bt_sem_in);
+ rpi_run_one_line(s, lc, 0);
+ }
+#if TRACE_WPP
+ printf("%s: Done body\n", __func__);
+#endif
+
+ // Wait for everything else to finish
+ rpi_sem_wait(&lc->bt_sem_in);
+
+ ++slice_row;
+ } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
+
+
+#if TRACE_WPP
+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
+#endif
+ }
+ else if (s->sh.offload_wpp)
+ {
+#if TRACE_WPP
+ printf("%s: Do WPP\n", __func__);
+#endif
+ // Generate & start extra bit threads if they aren't already running
+ bit_threads_start(s);
+
+ // Reset lc lines etc.
+ wpp_setup_lcs(s);
+
+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
+#if TRACE_WPP
+ printf("%s: Done 1st\n", __func__);
+#endif
+
+ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
+ rpi_sem_wait(&lc->bt_sem_in);
+ rpi_run_one_line(s, lc, 0);
+ }
+#if TRACE_WPP
+ printf("%s: Done body\n", __func__);
+#endif
+
+ // Wait for everything else to finish
+ rpi_sem_wait(&lc->bt_sem_in);
+
+#if TRACE_WPP
+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
+#endif
+ }
+ else
+#endif
+ {
+#if TRACE_WPP
+ printf("%s: Single start: ts=%d\n", __func__, lc->ts);
+#endif
+ // Single bit thread
+ do {
+ // Make sure we have space to prepare the next job
+ worker_pass0_ready(s, lc);
+
+ if ((err = fill_job(s, lc, 0)) < 0)
+ goto fail;
+
+ worker_submit_job(s, lc);
+
+ if (tile_needs_wait(s, lc->ts - 1))
+ worker_wait(s, lc);
+
+ } while (!lc->unit_done);
+
+#if TRACE_WPP
+ printf("%s: Single end: ts=%d\n", __func__, lc->ts);
+#endif
+ }
+
+ // If we have reached the end of the frame
+ // then wait for the worker to finish all its jobs
+ if (lc->ts >= s->ps.sps->ctb_size)
+ worker_wait(s, lc);
+
+#if RPI_TSTATS
+ {
+ HEVCRpiStats *const ts = &s->tstats;
+
+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+ ts->y_pred2_hgt16, ts->y_pred2_hle16);
+ memset(ts, 0, sizeof(*ts));
+ }
+#endif
+
+ return lc->ts;
+
+fail:
+ // Cleanup
+ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
+ // Free our job & wait for termination
+ worker_free(s, lc);
+ worker_wait(s, lc);
+ return err;
+}
+
+
+static void set_no_backward_pred(HEVCRpiContext * const s)
+{
+ int i, j;
+ const RefPicList *const refPicList = s->refPicList;
+
+ s->no_backward_pred_flag = 0;
+ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
+ return;
+
+ for (j = 0; j < 2; j++) {
+ for (i = 0; i < refPicList[j].nb_refs; i++) {
+ if (refPicList[j].list[i] > s->poc) {
+ s->no_backward_pred_flag = 1;
+ return;
+ }
+ }
+ }
+}
+
+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
+{
+ int err;
+ if ((err = gen_entry_points(s, nal)) < 0)
+ return err;
+
+ set_no_backward_pred(s);
+
+ return rpi_decode_entry(s->avctx, NULL);
+}
+
+static int set_side_data(HEVCRpiContext *s)
+{
+ AVFrame *out = s->ref->frame;
+
+ if (s->sei.frame_packing.present &&
+ s->sei.frame_packing.arrangement_type >= 3 &&
+ s->sei.frame_packing.arrangement_type <= 5 &&
+ s->sei.frame_packing.content_interpretation_type > 0 &&
+ s->sei.frame_packing.content_interpretation_type < 3) {
+ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
+ if (!stereo)
+ return AVERROR(ENOMEM);
+
+ switch (s->sei.frame_packing.arrangement_type) {
+ case 3:
+ if (s->sei.frame_packing.quincunx_subsampling)
+ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
+ else
+ stereo->type = AV_STEREO3D_SIDEBYSIDE;
+ break;
+ case 4:
+ stereo->type = AV_STEREO3D_TOPBOTTOM;
+ break;
+ case 5:
+ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
+ break;
+ }
+
+ if (s->sei.frame_packing.content_interpretation_type == 2)
+ stereo->flags = AV_STEREO3D_FLAG_INVERT;
+
+ if (s->sei.frame_packing.arrangement_type == 5) {
+ if (s->sei.frame_packing.current_frame_is_frame0_flag)
+ stereo->view = AV_STEREO3D_VIEW_LEFT;
+ else
+ stereo->view = AV_STEREO3D_VIEW_RIGHT;
+ }
+ }
+
+ if (s->sei.display_orientation.present &&
+ (s->sei.display_orientation.anticlockwise_rotation ||
+ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
+ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
+ AVFrameSideData *rotation = av_frame_new_side_data(out,
+ AV_FRAME_DATA_DISPLAYMATRIX,
+ sizeof(int32_t) * 9);
+ if (!rotation)
+ return AVERROR(ENOMEM);
+
+ av_display_rotation_set((int32_t *)rotation->data, angle);
+ av_display_matrix_flip((int32_t *)rotation->data,
+ s->sei.display_orientation.hflip,
+ s->sei.display_orientation.vflip);
+ }
+
+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
+ // so the side data persists for the entire coded video sequence.
+ if (s->sei.mastering_display.present > 0 &&
+ IS_IRAP(s) && s->no_rasl_output_flag) {
+ s->sei.mastering_display.present--;
+ }
+ if (s->sei.mastering_display.present) {
+ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
+ const int mapping[3] = {2, 0, 1};
+ const int chroma_den = 50000;
+ const int luma_den = 10000;
+ int i;
+ AVMasteringDisplayMetadata *metadata =
+ av_mastering_display_metadata_create_side_data(out);
+ if (!metadata)
+ return AVERROR(ENOMEM);
+
+ for (i = 0; i < 3; i++) {
+ const int j = mapping[i];
+ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
+ metadata->display_primaries[i][0].den = chroma_den;
+ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
+ metadata->display_primaries[i][1].den = chroma_den;
+ }
+ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
+ metadata->white_point[0].den = chroma_den;
+ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
+ metadata->white_point[1].den = chroma_den;
+
+ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
+ metadata->max_luminance.den = luma_den;
+ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
+ metadata->min_luminance.den = luma_den;
+ metadata->has_luminance = 1;
+ metadata->has_primaries = 1;
+
+ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
+ av_log(s->avctx, AV_LOG_DEBUG,
+ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
+ av_q2d(metadata->display_primaries[0][0]),
+ av_q2d(metadata->display_primaries[0][1]),
+ av_q2d(metadata->display_primaries[1][0]),
+ av_q2d(metadata->display_primaries[1][1]),
+ av_q2d(metadata->display_primaries[2][0]),
+ av_q2d(metadata->display_primaries[2][1]),
+ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
+ av_log(s->avctx, AV_LOG_DEBUG,
+ "min_luminance=%f, max_luminance=%f\n",
+ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
+ }
+ // Decrement the content light level flag when IRAP frame has no_rasl_output_flag=1
+ // so the side data persists for the entire coded video sequence.
+ if (s->sei.content_light.present > 0 &&
+ IS_IRAP(s) && s->no_rasl_output_flag) {
+ s->sei.content_light.present--;
+ }
+ if (s->sei.content_light.present) {
+ AVContentLightMetadata *metadata =
+ av_content_light_metadata_create_side_data(out);
+ if (!metadata)
+ return AVERROR(ENOMEM);
+ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
+ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
+
+ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
+ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
+ metadata->MaxCLL, metadata->MaxFALL);
+ }
+
+ if (s->sei.a53_caption.a53_caption) {
+ AVFrameSideData* sd = av_frame_new_side_data(out,
+ AV_FRAME_DATA_A53_CC,
+ s->sei.a53_caption.a53_caption_size);
+ if (sd)
+ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
+ av_freep(&s->sei.a53_caption.a53_caption);
+ s->sei.a53_caption.a53_caption_size = 0;
+ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+ }
+
+ if (s->sei.alternative_transfer.present &&
+ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
+ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
+ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
+ }
+
+ return 0;
+}
+
+static int hevc_frame_start(HEVCRpiContext * const s)
+{
+ int ret;
+
+ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too
+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
+
+ // Only need to remember intra for CIP
+ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
+ s->is_intra = NULL;
+ else
+ {
+ s->is_intra = s->is_intra_store;
+ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+ }
+
+ s->is_decoded = 0;
+ s->first_nal_type = s->nal_unit_type;
+
+ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
+
+ if (s->pkt.nb_nals > s->rpl_tab_size)
+ {
+ // In most cases it will be faster to free & realloc as that doesn't
+ // require (an unwanted) copy
+ av_freep(&s->rpl_tab);
+ s->rpl_tab_size = 0;
+ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
+ goto fail;
+ s->rpl_tab_size = s->pkt.nb_nals;
+ }
+ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
+
+ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
+ if (ret < 0)
+ goto fail;
+
+ // Resize rpl_tab to max that we might want
+ ret = ff_hevc_rpi_frame_rps(s);
+ if (ret < 0) {
+ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
+ goto fail;
+ }
+
+ s->ref->frame->key_frame = IS_IRAP(s);
+
+ ret = set_side_data(s);
+ if (ret < 0)
+ goto fail;
+
+ s->frame->pict_type = 3 - s->sh.slice_type;
+
+ if (!IS_IRAP(s))
+ ff_hevc_rpi_bump_frame(s);
+
+ av_frame_unref(s->output_frame);
+ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
+ if (ret < 0)
+ goto fail;
+
+ ff_thread_finish_setup(s->avctx);
+
+ return 0;
+
+fail:
+ if (s->ref)
+ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
+ s->ref = NULL;
+ return ret;
+}
+
+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
+{
+ // From Table 7-1
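+ // ~0xe clears bits 1..3, so the test passes iff no other bit is set,
+ // i.e. for exactly the even values 0..14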
+ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14
+}
+
+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
+{
+ GetBitContext * const gb = &s->HEVClc->gb;
+ int ctb_addr_ts, ret;
+
+ *gb = nal->gb;
+ s->nal_unit_type = nal->type;
+ s->temporal_id = nal->temporal_id;
+
+ switch (s->nal_unit_type) {
+ case HEVC_NAL_VPS:
+ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
+ if (ret < 0)
+ goto fail;
+ break;
+ case HEVC_NAL_SPS:
+ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
+ s->apply_defdispwin);
+ if (ret < 0)
+ goto fail;
+ break;
+ case HEVC_NAL_PPS:
+ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
+ if (ret < 0)
+ goto fail;
+ break;
+ case HEVC_NAL_SEI_PREFIX:
+ case HEVC_NAL_SEI_SUFFIX:
+ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
+ if (ret < 0)
+ goto fail;
+ break;
+ case HEVC_NAL_TRAIL_R:
+ case HEVC_NAL_TRAIL_N:
+ case HEVC_NAL_TSA_N:
+ case HEVC_NAL_TSA_R:
+ case HEVC_NAL_STSA_N:
+ case HEVC_NAL_STSA_R:
+ case HEVC_NAL_BLA_W_LP:
+ case HEVC_NAL_BLA_W_RADL:
+ case HEVC_NAL_BLA_N_LP:
+ case HEVC_NAL_IDR_W_RADL:
+ case HEVC_NAL_IDR_N_LP:
+ case HEVC_NAL_CRA_NUT:
+ case HEVC_NAL_RADL_N:
+ case HEVC_NAL_RADL_R:
+ case HEVC_NAL_RASL_N:
+ case HEVC_NAL_RASL_R:
+ ret = hls_slice_header(s);
+ if (ret < 0)
+ return ret;
+
+ // The definition of _N unit types is "non-reference for other frames
+ // with the same temporal_id" so they may/will be ref frames for pics
+ // with a higher temporal_id.
+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
+ !is_non_ref_unit_type(s->nal_unit_type);
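+ // e.g. a TRAIL_N at temporal_id 0 in a stream with max_sub_layers == 2
+ // is still used_for_ref as temporal layer 1 may reference it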
+ s->offload_recon = s->threads_type != 0 && s->used_for_ref;
+ s->is_irap = IS_IRAP(s);
+
+#if DEBUG_DECODE_N
+ {
+ static int z = 0;
+ if (IS_IDR(s)) {
+ z = 1;
+ }
+ if (z != 0 && z++ > DEBUG_DECODE_N) {
+ s->is_decoded = 0;
+ break;
+ }
+ }
+#endif
+ if (
+ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
+ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
+ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
+ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
+ {
+ s->is_decoded = 0;
+ break;
+ }
+
+ if (s->sh.first_slice_in_pic_flag) {
+ if (s->max_ra == INT_MAX) {
+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
+ s->max_ra = s->poc;
+ } else {
+ if (IS_IDR(s))
+ s->max_ra = INT_MIN;
+ }
+ }
+
+ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
+ s->poc <= s->max_ra) {
+ s->is_decoded = 0;
+ break;
+ } else {
+ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
+ s->max_ra = INT_MIN;
+ }
+
+ ret = hevc_frame_start(s);
+ if (ret < 0)
+ return ret;
+ } else if (!s->ref) {
+ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
+ goto fail;
+ }
+
+ if (s->nal_unit_type != s->first_nal_type) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Non-matching NAL types of the VCL NALUs: %d %d\n",
+ s->first_nal_type, s->nal_unit_type);
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (!s->sh.dependent_slice_segment_flag &&
+ s->sh.slice_type != HEVC_SLICE_I) {
+ ret = ff_hevc_rpi_slice_rpl(s);
+ if (ret < 0) {
+ av_log(s->avctx, AV_LOG_WARNING,
+ "Error constructing the reference lists for the current slice.\n");
+ goto fail;
+ }
+ }
+
+ ctb_addr_ts = hls_slice_data(s, nal);
+ if (ctb_addr_ts >= s->ps.sps->ctb_size) {
+ s->is_decoded = 1;
+ }
+
+ if (ctb_addr_ts < 0) {
+ ret = ctb_addr_ts;
+ goto fail;
+ }
+ break;
+ case HEVC_NAL_EOS_NUT:
+ case HEVC_NAL_EOB_NUT:
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ break;
+ case HEVC_NAL_AUD:
+ case HEVC_NAL_FD_NUT:
+ break;
+ default:
+ av_log(s->avctx, AV_LOG_INFO,
+ "Skipping NAL unit %d\n", s->nal_unit_type);
+ }
+
+ return 0;
+fail:
+ if (s->avctx->err_recognition & AV_EF_EXPLODE)
+ return ret;
+ return 0;
+}
+
+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
+{
+ int i, ret = 0;
+ int eos_at_start = 1;
+
+ s->ref = NULL;
+ s->last_eos = s->eos;
+ s->eos = 0;
+
+ /* split the input packet into NAL units, so we know the upper bound on the
+ * number of slices in the frame */
+ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
+ s->nal_length_size, s->avctx->codec_id, 0, 0);
+ if (ret < 0) {
+ av_log(s->avctx, AV_LOG_ERROR,
+ "Error splitting the input into NAL units.\n");
+ return ret;
+ }
+
+ for (i = 0; i < s->pkt.nb_nals; i++) {
+ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
+ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
+ if (eos_at_start) {
+ s->last_eos = 1;
+ } else {
+ s->eos = 1;
+ }
+ } else {
+ eos_at_start = 0;
+ }
+ }
+
+ /* decode the NAL units */
+ for (i = 0; i < s->pkt.nb_nals; i++) {
+ ret = decode_nal_unit(s, &s->pkt.nals[i]);
+ if (ret < 0) {
+ av_log(s->avctx, AV_LOG_WARNING,
+ "Error parsing NAL unit #%d.\n", i);
+ goto fail;
+ }
+ }
+
+fail: // Also success path
+ if (s->ref != NULL) {
+ if (s->used_for_ref && s->threads_type != 0) {
+ ff_hevc_rpi_progress_signal_all_done(s);
+ }
+ else {
+ // Flush frame to real memory as we expect to be able to pass
+ // it straight on to mmal
+ flush_frame(s, s->frame);
+ }
+ }
+ return ret;
+}
+
+static void print_md5(void *log_ctx, int level, uint8_t md5[16])
+{
+ int i;
+ for (i = 0; i < 16; i++)
+ av_log(log_ctx, level, "%02"PRIx8, md5[i]);
+}
+
+static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+ int pixel_shift;
+ int i, j;
+
+ if (!desc)
+ return AVERROR(EINVAL);
+
+ pixel_shift = desc->comp[0].depth > 8;
+
+ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
+ s->poc);
+
+ /* the checksums are LE, so we have to byteswap for >8bpp formats
+ * on BE arches */
+#if HAVE_BIGENDIAN
+ if (pixel_shift && !s->checksum_buf) {
+ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
+ FFMAX3(frame->linesize[0], frame->linesize[1],
+ frame->linesize[2]));
+ if (!s->checksum_buf)
+ return AVERROR(ENOMEM);
+ }
+#endif
+
+ for (i = 0; frame->data[i]; i++) {
+ int width = s->avctx->coded_width;
+ int height = s->avctx->coded_height;
+ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
+ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
+ uint8_t md5[16];
+
+ av_md5_init(s->md5_ctx);
+ for (j = 0; j < h; j++) {
+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
+#if HAVE_BIGENDIAN
+ if (pixel_shift) {
+ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
+ (const uint16_t *) src, w);
+ src = s->checksum_buf;
+ }
+#endif
+ av_md5_update(s->md5_ctx, src, w << pixel_shift);
+ }
+ av_md5_final(s->md5_ctx, md5);
+
+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
+ print_md5(s->avctx, AV_LOG_DEBUG, md5);
+ av_log (s->avctx, AV_LOG_DEBUG, "; ");
+ } else {
+ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
+ print_md5(s->avctx, AV_LOG_ERROR, md5);
+ av_log (s->avctx, AV_LOG_ERROR, " != ");
+ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
+ av_log (s->avctx, AV_LOG_ERROR, "\n");
+ return AVERROR_INVALIDDATA;
+ }
+ }
+
+ av_log(s->avctx, AV_LOG_DEBUG, "\n");
+
+ return 0;
+}
+
+static int all_sps_supported(const HEVCRpiContext * const s)
+{
+ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
+ if (s->ps.sps_list[i] != NULL)
+ {
+ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
+ if (!is_sps_supported(sps))
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
+{
+ int ret, i;
+
+ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
+ &s->nal_length_size, s->avctx->err_recognition,
+ s->apply_defdispwin, s->avctx);
+ if (ret < 0)
+ return ret;
+
+ /* export stream parameters from the first SPS */
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
+ if (first && s->ps.sps_list[i]) {
+ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
+ export_stream_params(s->avctx, &s->ps, sps);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
+ AVPacket *avpkt)
+{
+ int ret;
+ int new_extradata_size;
+ uint8_t *new_extradata;
+ HEVCRpiContext *s = avctx->priv_data;
+
+ if (!avpkt->size) {
+ ret = ff_hevc_rpi_output_frame(s, data, 1);
+ if (ret < 0)
+ return ret;
+
+ *got_output = ret;
+ return 0;
+ }
+
+ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
+ &new_extradata_size);
+ if (new_extradata && new_extradata_size > 0) {
+ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ s->ref = NULL;
+ ret = decode_nal_units(s, avpkt->data, avpkt->size);
+ if (ret < 0)
+ return ret;
+
+ /* verify the SEI checksum */
+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
+ s->sei.picture_hash.is_md5) {
+ ret = verify_md5(s, s->ref->frame);
+ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
+ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
+ return ret;
+ }
+ }
+ s->sei.picture_hash.is_md5 = 0;
+
+ if (s->is_decoded) {
+ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
+ s->is_decoded = 0;
+ }
+
+ if (s->output_frame->buf[0]) {
+ av_frame_move_ref(data, s->output_frame);
+ *got_output = 1;
+ }
+
+ return avpkt->size;
+}
+
+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
+{
+ int ret;
+
+ ret = ff_thread_ref_frame(&dst->tf, &src->tf);
+ if (ret < 0)
+ return ret;
+
+ if (src->col_mvf_buf != NULL)
+ {
+ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
+ if (!dst->col_mvf_buf)
+ goto fail;
+ }
+ dst->col_mvf = src->col_mvf;
+
+ dst->poc = src->poc;
+ dst->flags = src->flags;
+ dst->sequence = src->sequence;
+ return 0;
+
+fail:
+ ff_hevc_rpi_unref_frame(s, dst, ~0);
+ return AVERROR(ENOMEM);
+}
+
+
+static av_cold int hevc_decode_free(AVCodecContext *avctx)
+{
+ HEVCRpiContext * const s = avctx->priv_data;
+ int i;
+
+ pic_arrays_free(s);
+
+ av_freep(&s->md5_ctx);
+
+ av_freep(&s->cabac_save);
+
+#if RPI_EXTRA_BIT_THREADS
+ bit_threads_kill(s);
+#endif
+
+ hevc_exit_worker(s);
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
+ }
+ job_lc_kill(s->HEVClc);
+
+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
+ av_freep(&s->sao_pixel_buffer_v[0]);
+ av_frame_free(&s->output_frame);
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
+ av_frame_free(&s->DPB[i].frame);
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
+ av_buffer_unref(&s->ps.vps_list[i]);
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
+ av_buffer_unref(&s->ps.sps_list[i]);
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
+ av_buffer_unref(&s->ps.pps_list[i]);
+ s->ps.sps = NULL;
+ s->ps.pps = NULL;
+ s->ps.vps = NULL;
+
+ // Free separately from sLists as used that way by RPI WPP
+ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
+ av_freep(s->HEVClcList + i);
+ }
+ s->HEVClc = NULL; // Allocated as part of HEVClcList
+
+ ff_h2645_packet_uninit(&s->pkt);
+
+ if (s->qpu_init_ok)
+ vpu_qpu_term();
+ s->qpu_init_ok = 0;
+
+ return 0;
+}
+
+
+static av_cold int hevc_init_context(AVCodecContext *avctx)
+{
+ HEVCRpiContext *s = avctx->priv_data;
+ int i;
+
+ s->avctx = avctx;
+
+ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
+ if (!s->HEVClc)
+ goto fail;
+ s->HEVClcList[0] = s->HEVClc;
+
+ if (vpu_qpu_init() != 0)
+ goto fail;
+ s->qpu_init_ok = 1;
+
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ {
+ static const uint32_t dframe[1] = {0x80808080};
+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
+ }
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ s->qpu_dummy_frame_qpu = qpu_dummy();
+#endif
+
+ bt_lc_init(s, s->HEVClc, 0);
+ job_lc_init(s->HEVClc);
+
+ for (i = 0; i != 2; ++i) {
+ ff_hevc_rpi_progress_init_state(s->progress_states + i);
+ }
+
+ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
+ goto fail;
+
+ if ((s->output_frame = av_frame_alloc()) == NULL)
+ goto fail;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ s->DPB[i].frame = av_frame_alloc();
+ if (!s->DPB[i].frame)
+ goto fail;
+ s->DPB[i].tf.f = s->DPB[i].frame;
+ s->DPB[i].dpb_no = i;
+ }
+
+ s->max_ra = INT_MAX;
+
+ if ((s->md5_ctx = av_md5_alloc()) == NULL)
+ goto fail;
+
+ s->context_initialized = 1;
+ s->eos = 0;
+
+ ff_hevc_rpi_reset_sei(&s->sei);
+
+ return 0;
+
+fail:
+ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
+ hevc_decode_free(avctx);
+ return AVERROR(ENOMEM);
+}
+
+#if HAVE_THREADS
+static int hevc_update_thread_context(AVCodecContext *dst,
+ const AVCodecContext *src)
+{
+ HEVCRpiContext *s = dst->priv_data;
+ HEVCRpiContext *s0 = src->priv_data;
+ int i, ret;
+
+ av_assert0(s->context_initialized);
+
+ // dst == src can happen according to the comments and in that case
+ // there is nothing to do here
+ if (dst == src)
+ return 0;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
+ if (s0->DPB[i].frame->buf[0]) {
+ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ if (s->ps.sps != s0->ps.sps)
+ s->ps.sps = NULL;
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
+ av_buffer_unref(&s->ps.vps_list[i]);
+ if (s0->ps.vps_list[i]) {
+ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
+ if (!s->ps.vps_list[i])
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
+ av_buffer_unref(&s->ps.sps_list[i]);
+ if (s0->ps.sps_list[i]) {
+ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
+ if (!s->ps.sps_list[i])
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
+ av_buffer_unref(&s->ps.pps_list[i]);
+ if (s0->ps.pps_list[i]) {
+ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
+ if (!s->ps.pps_list[i])
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ if (s->ps.sps != s0->ps.sps)
+ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
+ return ret;
+
+ s->seq_decode = s0->seq_decode;
+ s->seq_output = s0->seq_output;
+ s->pocTid0 = s0->pocTid0;
+ s->max_ra = s0->max_ra;
+ s->eos = s0->eos;
+ s->no_rasl_output_flag = s0->no_rasl_output_flag;
+
+ s->is_nalff = s0->is_nalff;
+ s->nal_length_size = s0->nal_length_size;
+
+ s->threads_type = s0->threads_type;
+
+ if (s0->eos) {
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ }
+
+ s->sei.frame_packing = s0->sei.frame_packing;
+ s->sei.display_orientation = s0->sei.display_orientation;
+ s->sei.mastering_display = s0->sei.mastering_display;
+ s->sei.content_light = s0->sei.content_light;
+ s->sei.alternative_transfer = s0->sei.alternative_transfer;
+
+ // We do this here as it allows us to easily locate our parent's
+ // global job pool, but there really should be a less nasty way
+ if (s->jbc == NULL)
+ {
+ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
+ hevc_init_worker(s);
+ }
+
+ return 0;
+}
+#endif
+
+#include <sys/stat.h>
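+// The rpivid device node should only exist where the later rpivid HEVC
+// block is present (Pi4-class), on which this QPU decoder is not wanted,
+// so treat its absence as a usable Pi3-class device.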
+static int qpu_ok(void)
+{
+ static int is_pi3 = -1;
+ if (is_pi3 == -1)
+ {
+ struct stat sb;
+ is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0);
+ }
+ return is_pi3;
+}
+
+static av_cold int hevc_decode_init(AVCodecContext *avctx)
+{
+ HEVCRpiContext *s = avctx->priv_data;
+ int ret;
+
+ if (!qpu_ok())
+ return AVERROR_DECODER_NOT_FOUND;
+
+ if ((ret = hevc_init_context(avctx)) < 0)
+ return ret;
+
+ // If we are a child context then stop now
+ // Everything after this point is either 1st decode setup or global alloc
+ // that must not be repeated
+ // Global info will be copied into children in update_thread_context (we
+ // can't do it here as we have no way of finding the parent context)
+ if (avctx->internal->is_copy)
+ return 0;
+
+ // Job allocation requires VCSM alloc to work so ensure that we have it
+ // initialised by this point
+ {
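+ // Size the global job pool at 3 jobs per frame thread, with a floor of 5.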
+ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
+ if (jbg == NULL) {
+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) {
+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ }
+
+ hevc_init_worker(s);
+
+ s->eos = 1;
+
+ if (avctx->extradata_size > 0 && avctx->extradata) {
+ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0)
+ goto fail;
+
+ if (!all_sps_supported(s)) {
+ ret = AVERROR_DECODER_NOT_FOUND;
+ goto fail;
+ }
+ }
+
+ if ((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+ s->threads_type = FF_THREAD_FRAME;
+ else
+ s->threads_type = 0;
+
+ return 0;
+
+fail:
+ hevc_decode_free(avctx);
+ return ret;
+}
+
+static void hevc_decode_flush(AVCodecContext *avctx)
+{
+ HEVCRpiContext *s = avctx->priv_data;
+ ff_hevc_rpi_flush_dpb(s);
+ s->max_ra = INT_MAX;
+ s->eos = 1;
+}
+
+typedef struct hwaccel_rpi3_qpu_env_s {
+ const AVClass *av_class;
+ AVZcEnvPtr zc;
+} hwaccel_rpi3_qpu_env_t;
+
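+// If ZC buffers are already in use (the user installed a ZC-aware
+// get_buffer2) then defer to it; otherwise allocate from the decoder's
+// internal ZC environment.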
+static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
+{
+ hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
+ int rv;
+
+ if (av_rpi_zc_in_use(s))
+ {
+ rv = s->get_buffer2(s, frame, 0);
+ }
+ else
+ {
+ rv = av_rpi_zc_get_buffer(r3->zc, frame);
+ if (rv == 0)
+ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc
+ }
+
+ if (rv == 0 &&
+ (rv = ff_attach_decode_data(frame)) < 0)
+ {
+ av_frame_unref(frame);
+ }
+
+ return rv;
+}
+
+static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
+{
+ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
+ av_rpi_zc_int_env_freep(&r3->zc);
+ return 0;
+}
+
+static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
+{
+ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
+
+ if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL)
+ goto fail;
+
+ return 0;
+
+fail:
+ av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
+ hwaccel_rpi3_qpu_free(avctx);
+ return AVERROR(ENOMEM);
+}
+
+
+#define OFFSET(x) offsetof(HEVCRpiContext, x)
+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+
+static const AVOption options[] = {
+ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
+ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
+ { NULL },
+};
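+// Note: both entries deliberately share OFFSET(apply_defdispwin); the
+// hyphenated name mirrors the mainline hevc decoder's option set.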
+
+static const AVClass hevc_rpi_decoder_class = {
+ .class_name = "HEVC RPI decoder",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
+ AV_PIX_FMT_SAND128,
+ AV_PIX_FMT_SAND64_10,
+ AV_PIX_FMT_NONE
+};
+
+
+static const AVHWAccel hwaccel_rpi3_qpu = {
+ .name = "Pi3 QPU Hwaccel",
+ .alloc_frame = hwaccel_alloc_frame,
+ .init = hwaccel_rpi3_qpu_init,
+ .uninit = hwaccel_rpi3_qpu_free,
+ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
+};
+
+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
+{
+ .public = {
+ .pix_fmt = AV_PIX_FMT_SAND128,
+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
+ .device_type = AV_HWDEVICE_TYPE_NONE,
+ },
+ .hwaccel = &hwaccel_rpi3_qpu
+};
+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
+{
+ .public = {
+ .pix_fmt = AV_PIX_FMT_SAND64_10,
+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
+ .device_type = AV_HWDEVICE_TYPE_NONE,
+ },
+ .hwaccel = &hwaccel_rpi3_qpu
+};
+
+
+static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
+ &hevc_rpi_hw_config_sand128,
+ &hevc_rpi_hw_config_sand64_10,
+ NULL
+};
+
+
+AVCodec ff_hevc_rpi_decoder = {
+ .name = "hevc_rpi",
+ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
+ .type = AVMEDIA_TYPE_VIDEO,
+ .id = AV_CODEC_ID_HEVC,
+ .priv_data_size = sizeof(HEVCRpiContext),
+ .priv_class = &hevc_rpi_decoder_class,
+ .init = hevc_decode_init,
+ .close = hevc_decode_free,
+ .decode = hevc_rpi_decode_frame,
+ .flush = hevc_decode_flush,
+ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+ AV_CODEC_CAP_HARDWARE |
+ AV_CODEC_CAP_AVOID_PROBING |
+#if 0
+ // Debugging is often easier without threads getting in the way
+ 0,
+#warning H265 threading turned off
+#else
+ // We only have decent optimisation for frame - so only admit to that
+ AV_CODEC_CAP_FRAME_THREADS,
+#endif
+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE |
+ FF_CODEC_CAP_EXPORTS_CROPPING |
+ FF_CODEC_CAP_ALLOCATE_PROGRESS,
+ .pix_fmts = hevc_rpi_pix_fmts,
+ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
+ .hw_configs = hevc_rpi_hw_configs,
+// .wrapper_name = "hevc_rpi",
+};
+
diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
new file mode 100644
index 0000000000..1f94d18673
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.h
@@ -0,0 +1,1091 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVCDEC_H
+#define AVCODEC_RPI_HEVCDEC_H
+
+#include "config.h"
+
+#include <stdatomic.h>
+
+#include "libavutil/buffer.h"
+
+#include "avcodec.h"
+#include "bswapdsp.h"
+#include "cabac.h"
+#include "get_bits.h"
+#include "rpi_hevcpred.h"
+#include "h2645_parse.h"
+#include "hevc.h"
+#include "rpi_hevc_mv.h"
+#include "rpi_hevc_ps.h"
+#include "rpi_hevc_sei.h"
+#include "rpi_hevcdsp.h"
+#include "internal.h"
+#include "thread.h"
+#include "videodsp.h"
+
+#if ARCH_ARM
+#include "arm/rpi_hevc_misc_neon.h"
+#endif
+
+#define MAX_NB_THREADS 16
+#define SHIFT_CTB_WPP 2
+
+//TODO: check if this is really the maximum
+#define MAX_TRANSFORM_DEPTH 5
+
+#define MAX_TB_SIZE 32
+#define MAX_QP 51
+#define DEFAULT_INTRA_TC_OFFSET 2
+
+#define HEVC_CONTEXTS 199
+
+#define MRG_MAX_NUM_CANDS 5
+
+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
+
+// Size of DPB array
+#define HEVC_DPB_ELS 32
+
+#define L0 0
+#define L1 1
+
+#define EPEL_EXTRA_BEFORE 1
+#define EPEL_EXTRA_AFTER 2
+#define EPEL_EXTRA 3
+#define QPEL_EXTRA_BEFORE 3
+#define QPEL_EXTRA_AFTER 4
+#define QPEL_EXTRA 7
+
+#define EDGE_EMU_BUFFER_STRIDE 80
+
+#include <semaphore.h>
+#include "rpi_qpu.h"
+
+// Max jobs per frame thread. Actual usage will be limited by the size
+// of the global job pool
+// ?? Limits
+#define RPI_MAX_JOBS 8
+
+// This is the number of _extra_ bit threads - we will have
+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
+//
+// 0 is legitimate and will disable our WPP processing
+//#define RPI_EXTRA_BIT_THREADS 0
+#define RPI_EXTRA_BIT_THREADS 2
+
+// Number of separate threads/passes in worker
+// 2 and 3 are the currently valid numbers
+// At the moment 3 seems fractionally faster
+//#define RPI_PASSES 2
+#define RPI_PASSES 3
+
+// Print out various usage stats
+#define RPI_TSTATS 0
+
+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
+#define RPI_COMPRESS_COEFFS 1
+
+// Wait for VPU/QPU to finish in worker pass 0
+// If 0 then the wait is in pass 1
+//
+// One might expect the better place to wait would be in pass 1 however
+// testing shows that pass 0 produces overall faster decode.
+// Interestingly it is QPU/VPU limited streams that seem to suffer
+// from pass 1 waits, CPU limited ones tend to show a very mild gain.
+// This define exists so it is easy to test this.
+#define RPI_WORKER_WAIT_PASS_0 1
+
+// Use ARM emulation of QPU pred
+// These are for debug only as the emulation makes only limited
+// effort to be fast
+#define RPI_QPU_EMU_Y 0
+#define RPI_QPU_EMU_C 0
+
+// Max width & height we are prepared to consider
+// Sand frame shape calc becomes confused with large frames
+// Some buffer alloc also depends on this
+#define HEVC_RPI_MAX_WIDTH 2048
+#define HEVC_RPI_MAX_HEIGHT 1088
+
+
+// Min CTB size is 16
+#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
+
+/**
+ * Value of the luma sample at position (x, y) in the 2D array tab.
+ */
+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
+
+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
+ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
+
+enum RPSType {
+ ST_CURR_BEF = 0,
+ ST_CURR_AFT,
+ ST_FOLL,
+ LT_CURR,
+ LT_FOLL,
+ NB_RPS_TYPE,
+};
+
+enum SyntaxElement {
+ SAO_MERGE_FLAG = 0,
+ SAO_TYPE_IDX,
+ SAO_EO_CLASS,
+ SAO_BAND_POSITION,
+ SAO_OFFSET_ABS,
+ SAO_OFFSET_SIGN,
+ END_OF_SLICE_FLAG,
+ SPLIT_CODING_UNIT_FLAG,
+ CU_TRANSQUANT_BYPASS_FLAG,
+ SKIP_FLAG,
+ CU_QP_DELTA,
+ PRED_MODE_FLAG,
+ PART_MODE,
+ PCM_FLAG,
+ PREV_INTRA_LUMA_PRED_FLAG,
+ MPM_IDX,
+ REM_INTRA_LUMA_PRED_MODE,
+ INTRA_CHROMA_PRED_MODE,
+ MERGE_FLAG,
+ MERGE_IDX,
+ INTER_PRED_IDC,
+ REF_IDX_L0,
+ REF_IDX_L1,
+ ABS_MVD_GREATER0_FLAG,
+ ABS_MVD_GREATER1_FLAG,
+ ABS_MVD_MINUS2,
+ MVD_SIGN_FLAG,
+ MVP_LX_FLAG,
+ NO_RESIDUAL_DATA_FLAG,
+ SPLIT_TRANSFORM_FLAG,
+ CBF_LUMA,
+ CBF_CB_CR,
+ TRANSFORM_SKIP_FLAG,
+ EXPLICIT_RDPCM_FLAG,
+ EXPLICIT_RDPCM_DIR_FLAG,
+ LAST_SIGNIFICANT_COEFF_X_PREFIX,
+ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
+ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
+ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
+ SIGNIFICANT_COEFF_GROUP_FLAG,
+ SIGNIFICANT_COEFF_FLAG,
+ COEFF_ABS_LEVEL_GREATER1_FLAG,
+ COEFF_ABS_LEVEL_GREATER2_FLAG,
+ COEFF_ABS_LEVEL_REMAINING,
+ COEFF_SIGN_FLAG,
+ LOG2_RES_SCALE_ABS,
+ RES_SCALE_SIGN_FLAG,
+ CU_CHROMA_QP_OFFSET_FLAG,
+ CU_CHROMA_QP_OFFSET_IDX,
+};
+
+enum PartMode {
+ PART_2Nx2N = 0,
+ PART_2NxN = 1,
+ PART_Nx2N = 2,
+ PART_NxN = 3,
+ PART_2NxnU = 4,
+ PART_2NxnD = 5,
+ PART_nLx2N = 6,
+ PART_nRx2N = 7,
+};
+
+enum PredMode {
+ MODE_INTER = 0,
+ MODE_INTRA,
+ MODE_SKIP,
+};
+
+enum InterPredIdc {
+ PRED_L0 = 0,
+ PRED_L1,
+ PRED_BI,
+};
+
+enum PredFlag {
+ PF_INTRA = 0,
+ PF_L0,
+ PF_L1,
+ PF_BI,
+};
+
+enum SAOType {
+ SAO_NOT_APPLIED = 0,
+ SAO_BAND,
+ SAO_EDGE,
+ SAO_APPLIED
+};
+
+enum SAOEOClass {
+ SAO_EO_HORIZ = 0,
+ SAO_EO_VERT,
+ SAO_EO_135D,
+ SAO_EO_45D,
+};
+
+enum ScanType {
+ SCAN_DIAG = 0,
+ SCAN_HORIZ,
+ SCAN_VERT,
+};
+
+typedef struct RefPicList {
+ struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
+ int list[HEVC_MAX_REFS];
+ uint8_t isLongTerm[HEVC_MAX_REFS];
+ int nb_refs;
+} RefPicList;
+
+typedef struct RefPicListTab {
+ RefPicList refPicList[2];
+} RefPicListTab;
+
+typedef struct RpiCodingUnit {
+ unsigned int x; // Passed to deblock
+ unsigned int y;
+ unsigned int x_split;
+ unsigned int y_split;
+
+ enum PredMode pred_mode; ///< PredMode
+ enum PartMode part_mode; ///< PartMode
+
+ // Inferred parameters
+ uint8_t intra_split_flag; ///< IntraSplitFlag
+ uint8_t max_trafo_depth; ///< MaxTrafoDepth
+ uint8_t cu_transquant_bypass_flag;
+} RpiCodingUnit;
+
+typedef struct RpiPredictionUnit {
+ uint8_t intra_pred_mode[4];
+ uint8_t intra_pred_mode_c[4];
+ uint8_t chroma_mode_c[4];
+ uint8_t merge_flag;
+} RpiPredictionUnit;
+
+typedef struct HEVCRpiTransformUnit {
+ int8_t cu_qp_delta;
+
+ // Inferred parameters;
+ uint8_t intra_pred_mode;
+ uint8_t intra_pred_mode_c;
+ uint8_t chroma_mode_c;
+ uint8_t is_cu_qp_delta_wanted;
+ uint8_t cu_chroma_qp_offset_wanted;
+ const int8_t * qp_divmod6[3];
+} HEVCRpiTransformUnit;
+
+typedef struct DBParams {
+ int8_t beta_offset; // -12 to +12
+ int8_t tc_offset; // -12 to +12
+} DBParams;
+
+#define HEVC_FRAME_FLAG_OUTPUT (1 << 0)
+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
+#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
+#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
+
+struct HEVCRpiJob;
+
+typedef struct HEVCRpiFrame {
+ AVFrame *frame;
+ ThreadFrame tf;
+ ColMvField *col_mvf;
+ int poc;
+ struct HEVCRpiFrame *collocated_ref;
+
+ AVBufferRef *col_mvf_buf;
+
+ /**
+ * A sequence counter, so that old frames are output first
+ * after a POC reset
+ */
+ uint16_t sequence;
+
+ /**
+ * A combination of HEVC_FRAME_FLAG_*
+ */
+ uint8_t flags;
+
+ // Entry no in DPB - can be used as a small unique
+ // frame identifier (within the current thread)
+ uint8_t dpb_no;
+} HEVCRpiFrame;
+
+typedef struct HEVCRpiLocalContext {
+ HEVCRpiTransformUnit tu;
+
+ CABACContext cc;
+
+ // Vars that allow us to locate everything from just an lc
+ struct HEVCRpiContext * context; // ??? make const ???
+ unsigned int lc_n; // lc list el no
+
+ // Job wait links
+ struct HEVCRpiLocalContext * jw_next;
+ struct HEVCRpiLocalContext * jw_prev;
+ struct HEVCRpiLocalContext * ljw_next;
+ struct HEVCRpiLocalContext * ljw_prev;
+ struct HEVCRpiJob * volatile jw_job;
+ sem_t jw_sem;
+
+ // ?? Wrap in structure ??
+ sem_t bt_sem_in;
+ sem_t * bt_psem_out;
+ volatile int bt_terminate;
+ unsigned int ts;
+ unsigned int bt_last_line; // Last line in this bit_thread chunk
+ unsigned int bt_line_no;
+ unsigned int bt_line_width;
+ unsigned int bt_line_inc;
+
+ struct HEVCRpiJob * jb0;
+ char unit_done; // Set once we have dealt with this slice
+ char bt_is_tile;
+ char last_progress_good;
+ char cabac_init_req;
+
+ uint8_t cabac_state[HEVC_CONTEXTS];
+ uint8_t stat_coeff[4];
+ GetBitContext gb;
+
+ uint8_t ct_depth;
+ int8_t qp_y;
+ int8_t curr_qp_y;
+ int8_t qPy_pred;
+
+// N.B. Used by asm (neon) - do not change
+#define AVAIL_S_UR 0
+#define AVAIL_S_U 1
+#define AVAIL_S_UL 2
+#define AVAIL_S_L 3
+#define AVAIL_S_DL 4
+
+#define AVAIL_U (1 << AVAIL_S_U)
+#define AVAIL_L (1 << AVAIL_S_L)
+#define AVAIL_UL (1 << AVAIL_S_UL)
+#define AVAIL_UR (1 << AVAIL_S_UR)
+#define AVAIL_DL (1 << AVAIL_S_DL)
+
+// Intra filters - same number space as avail
+#define FILTER_LIGHT 0x40
+#define FILTER_STRONG 0x80
+#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG)
+
+ uint8_t ctb_avail;
+ int end_of_ctb_x;
+ int end_of_ctb_y;
+
+ RpiCodingUnit cu;
+ RpiPredictionUnit pu;
+
+#define BOUNDARY_LEFT_SLICE (1 << 0)
+#define BOUNDARY_LEFT_TILE (1 << 1)
+#define BOUNDARY_UPPER_SLICE (1 << 2)
+#define BOUNDARY_UPPER_TILE (1 << 3)
+ /* properties of the boundary of the current CTB for the purposes
+ * of the deblocking filter */
+ unsigned int boundary_flags;
+
+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
+ uint8_t ipm_left[IPM_TAB_SIZE];
+ uint8_t ipm_up[IPM_TAB_SIZE];
+
+//#define MVF_STASH_WIDTH 128
+#define MVF_STASH_WIDTH 64
+#define MVF_STASH_HEIGHT 64
+#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
+#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
+ HEVCRpiMvField mvf_ul[1];
+ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
+
+ /* +7 is for subpixel interpolation, *2 for high bit depths */
+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+ /* The extended size between the new edge emu buffer is abused by SAO */
+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+
+} HEVCRpiLocalContext;
+
+// Each block can have an intra prediction and an add_residual command
+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
+
+// Sand only has 2 planes (Y/C)
+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
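+// With the current limits that is 2 * (64/4) * 2 * (2048/4) = 32768 commands.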
+
+// Command for intra prediction and transform_add of predictions to coefficients
+enum rpi_pred_cmd_e
+{
+ RPI_PRED_ADD_RESIDUAL,
+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
+ RPI_PRED_ADD_DC,
+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
+ RPI_PRED_ADD_DC_V,
+ RPI_PRED_INTRA,
+ RPI_PRED_INTRA_C,
+ RPI_PRED_I_PCM,
+ RPI_PRED_CMD_MAX
+};
+
+typedef struct HEVCPredCmd {
+ uint8_t type;
+ uint8_t size; // log2 "size" used by all variants
+ uint8_t avail; // i_pred - but left here as they pack well
+ uint8_t dummy;
+ union {
+ struct { // TRANSFORM_ADD
+ uint8_t * dst;
+ const int16_t * buf;
+ uint16_t stride; // Should be good enough for all pic fmts we use
+ int16_t dc;
+ } ta;
+ struct {
+ uint8_t * dst;
+ uint32_t stride;
+ int dc;
+ } dc;
+ struct { // INTRA
+ uint16_t x;
+ uint16_t y;
+ enum IntraPredMode mode;
+ } i_pred;
+ struct { // I_PCM
+ uint16_t x;
+ uint16_t y;
+ const void * src;
+ uint32_t src_len;
+ } i_pcm;
+ };
+} HEVCPredCmd;
+
+union qpu_mc_pred_cmd_s;
+struct qpu_mc_pred_y_p_s;
+struct qpu_mc_src_s;
+
+typedef struct HEVCRpiInterPredQ
+{
+ union qpu_mc_pred_cmd_u *qpu_mc_base;
+ union qpu_mc_pred_cmd_u *qpu_mc_curr;
+ struct qpu_mc_src_s *last_l0;
+ struct qpu_mc_src_s *last_l1;
+ unsigned int load;
+ uint32_t code_setup;
+ uint32_t code_sync;
+ uint32_t code_exit;
+} HEVCRpiInterPredQ;
+
+typedef struct HEVCRpiInterPredEnv
+{
+ HEVCRpiInterPredQ * q;
+ uint8_t n; // Number of Qs
+ uint8_t n_grp; // Number of Q in a group
+ uint8_t curr; // Current Q number (0..n-1)
+ uint8_t used; // 0 if nothing in any Q, 1 otherwise
+ uint8_t used_grp; // 0 if nothing in any Q in the current group
+ unsigned int max_fill;
+ unsigned int min_gap;
+ GPU_MEM_PTR_T gptr;
+} HEVCRpiInterPredEnv;
+
+typedef struct HEVCRpiIntraPredEnv {
+ unsigned int n; // Number of commands
+ HEVCPredCmd * cmds;
+} HEVCRpiIntraPredEnv;
+
+typedef struct HEVCRpiCoeffEnv {
+ unsigned int n;
+#if RPI_COMPRESS_COEFFS
+ unsigned int packed; // 1 while coefficients are being stored in packed form
+ unsigned int packed_n; // Value of n at the point packed was cleared (i.e. the amount sent compressed); only valid if packed==0
+#endif
+ int16_t * buf;
+} HEVCRpiCoeffEnv;
+
+typedef struct HEVCRpiCoeffsEnv {
+ HEVCRpiCoeffEnv s[4];
+ GPU_MEM_PTR_T gptr;
+ void * mptr;
+} HEVCRpiCoeffsEnv;
+
+typedef struct HEVCRpiFrameProgressWait {
+ int req;
+ struct HEVCRpiFrameProgressWait * next;
+ sem_t sem;
+} HEVCRpiFrameProgressWait;
+
+typedef struct HEVCRpiFrameProgressState {
+ struct HEVCRpiFrameProgressWait * first;
+ struct HEVCRpiFrameProgressWait * last;
+ pthread_mutex_t lock;
+} HEVCRpiFrameProgressState;
+
+typedef struct RpiBlk
+{
+ unsigned int x;
+ unsigned int y;
+ unsigned int w;
+ unsigned int h;
+} RpiBlk;
+
+typedef struct HEVCRpiJob {
+ struct HEVCRpiJob * next; // Free chain
+ struct HEVCRpiJobCtl * jbc_local;
+ const HEVCRpiSPS * sps; // sps used to set up this job
+
+ int waited;
+ int ctu_ts_first;
+ int ctu_ts_last;
+ RpiBlk bounds; // Bounding box of job
+
+ struct qpu_mc_pred_y_p_s * last_y8_p;
+ struct qpu_mc_src_s * last_y8_l1;
+ rpi_cache_flush_env_t * rfe;
+
+ HEVCRpiInterPredEnv chroma_ip;
+ HEVCRpiInterPredEnv luma_ip;
+ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
+ HEVCRpiIntraPredEnv intra;
+ HEVCRpiCoeffsEnv coeffs;
+ HEVCRpiFrameProgressWait progress_wait;
+ sem_t sem;
+ rpi_cache_buf_t flush_buf;
+} HEVCRpiJob;
+
+struct HEVCRpiContext;
+
+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
+
+typedef struct HEVCRpiPassQueue
+{
+// int pending;
+ volatile int terminate;
+ sem_t sem_in;
+ sem_t * psem_out;
+ unsigned int job_n;
+ struct HEVCRpiContext * context; // Context pointer as we only get to pass a single "void * this" to the thread
+ HEVCRpiWorkerFn * worker;
+ pthread_t thread;
+ uint8_t pass_n; // Pass number - debug
+ uint8_t started;
+} HEVCRpiPassQueue;
+
+
+struct HEVCRpiJobGlobal;
+
+typedef struct HEVCRpiJobCtl
+{
+ sem_t sem_out;
+
+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
+ struct HEVCRpiJobGlobal * jbg;
+
+ HEVCRpiLocalContext * lcw_head;
+ HEVCRpiLocalContext * lcw_tail;
+
+ pthread_mutex_t in_lock;
+ int offload_in;
+
+ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
+} HEVCRpiJobCtl;
+
+
+typedef struct HEVCRpiJobGlobal
+{
+ intptr_t ref_count;
+ pthread_mutex_t lock;
+ HEVCRpiJob * free1; // Singly linked list of free jobs
+ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job
+ HEVCRpiLocalContext * wait_good; // Last good tail
+ HEVCRpiLocalContext * wait_tail;
+
+} HEVCRpiJobGlobal;
+
+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
+
+#if RPI_TSTATS
+typedef struct HEVCRpiStats {
+ int y_pred1_y8_merge;
+ int y_pred1_xy;
+ int y_pred1_x0;
+ int y_pred1_y0;
+ int y_pred1_x0y0;
+ int y_pred1_wle8;
+ int y_pred1_wgt8;
+ int y_pred1_hle16;
+ int y_pred1_hgt16;
+ int y_pred2_xy;
+ int y_pred2_x0;
+ int y_pred2_y0;
+ int y_pred2_x0y0;
+ int y_pred2_hle16;
+ int y_pred2_hgt16;
+} HEVCRpiStats;
+#endif
+
+typedef struct HEVCRpiCabacState
+{
+ uint8_t rice[4];
+ uint8_t state[HEVC_CONTEXTS];
+} HEVCRpiCabacState;
+
+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
+#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
+#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1)
+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
+#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
+#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
+#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
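+// With the values above: an el is 2 bits and covers 4 pels, a byte holds
+// 4 els (16 pels), so one 64-pel stride1 chunk is 4 bytes and each bitmap
+// row covers 8 vertical pels.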
+
+typedef struct HEVCRpiContext {
+ const AVClass *c; // needed by private avoptions
+ AVCodecContext *avctx;
+
+ uint8_t threads_type;
+ char qpu_init_ok;
+
+ /** 1 if the independent slice segment header was successfully parsed */
+ uint8_t slice_initialized;
+ char used_for_ref; // rpi
+ char is_irap;
+ char offload_recon;
+ uint8_t eos; ///< current packet contains an EOS/EOB NAL
+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL
+ uint8_t no_backward_pred_flag;
+ uint8_t is_decoded;
+ uint8_t no_rasl_output_flag;
+
+
+ /**
+ * Sequence counters for decoded and output frames, so that old
+ * frames are output first after a POC reset
+ */
+ uint16_t seq_decode;
+ uint16_t seq_output;
+
+ int width;
+ int height;
+
+ HEVCRpiJobCtl * jbc;
+ // cabac stash
+ // b0 skip flag
+ // b1+ ct_depth
+ uint8_t * cabac_stash_left;
+ uint8_t * cabac_stash_up;
+
+ // Function pointers
+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
+ const uint8_t * qpu_dummy_frame_emu;
+#endif
+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
+#endif
+ HEVCRpiQpu qpu;
+
+ HEVCRpiFrameProgressState progress_states[2];
+
+ HEVCRpiCabacState *cabac_save;
+
+ AVFrame *frame;
+ AVFrame *output_frame;
+ uint8_t *sao_pixel_buffer_h[3];
+ uint8_t *sao_pixel_buffer_v[3];
+
+ unsigned int col_mvf_stride;
+ AVBufferPool *col_mvf_pool;
+
+ RpiSAOParams *sao;
+ DBParams *deblock;
+ enum HEVCNALUnitType nal_unit_type;
+ int temporal_id; ///< temporal_id_plus1 - 1
+ HEVCRpiFrame *ref;
+ int poc;
+ int pocTid0;
+ int slice_idx; ///< number of the slice being currently decoded
+ int max_ra;
+
+ int8_t *qp_y_tab;
+
+ // Deblocking block strength bitmaps
+ unsigned int bs_stride2;
+ unsigned int bs_size;
+ uint8_t *bs_horizontal;
+ uint8_t *bs_vertical;
+ uint8_t *bsf_stash_up;
+ uint8_t *bsf_stash_left;
+
+#if HEVC_RPI_MAX_CTBS >= 0xffff
+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0
+ uint32_t *tab_slice_address;
+#else
+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0
+ uint16_t *tab_slice_address;
+#endif
+
+ // Bitfield 1 bit per 8 pels (min pcm size)
+ uint8_t *is_pcm;
+ // Bitfield 1 bit per 8 pels (min cb size)
+ // Only needed for CIP as CIP processing is async to the main thread
+ uint8_t *is_intra;
+
+ // PU
+ HEVCRpiMvField *mvf_up;
+ HEVCRpiMvField *mvf_left;
+
+ const RefPicList **rpl_up;
+ const RefPicList **rpl_left;
+ RefPicList * refPicList;
+
+ // CTB-level flags affecting loop filter operation
+ uint8_t *filter_slice_edges;
+
+ /** used on BE to byteswap the lines for checksumming */
+ uint8_t *checksum_buf;
+ int checksum_buf_size;
+
+ const uint8_t *data;
+
+ H2645Packet pkt;
+ // type of the first VCL NAL of the current frame
+ enum HEVCNALUnitType first_nal_type;
+
+ uint8_t context_initialized;
+ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
+ ///< as a format defined in 14496-15
+ int apply_defdispwin;
+
+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
+ int nuh_layer_id;
+
+ struct AVMD5 *md5_ctx;
+
+ RefPicListTab * rpl_tab;
+ unsigned int rpl_tab_size;
+
+ uint8_t *is_intra_store;
+
+ RpiSliceHeader sh;
+
+ HEVCRpiParamSets ps;
+
+ HEVCRpiLocalContext *HEVClc;
+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
+
+ HEVCRpiFrame DPB[HEVC_DPB_ELS];
+
+ /** candidate references for the current frame */
+ RefPicList rps[5];
+
+ HEVCRpiPredContext hpc;
+ HEVCDSPContext hevcdsp;
+
+ HEVCSEIContext sei;
+
+ // Put structures that allocate non-trivial storage at the end
+ // These are mostly used indirectly so position in the structure doesn't matter
+ HEVCRpiPassQueue passq[RPI_PASSES];
+#if RPI_EXTRA_BIT_THREADS > 0
+ int bt_started;
+ // This simply contains thread descriptors - task setup is held elsewhere
+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
+#endif
+#if RPI_TSTATS
+ HEVCRpiStats tstats;
+#endif
+} HEVCRpiContext;
+
+/**
+ * Mark all frames in DPB as unused for reference.
+ */
+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
+
+/**
+ * Drop all frames currently in DPB.
+ */
+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
+
+/**
+ * Construct the reference picture sets for the current frame.
+ */
+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
+
+/**
+ * Construct the reference picture list(s) for the current slice.
+ */
+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
+
+
+/**
+ * Get the number of candidate references for the current frame.
+ */
+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
+
+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
+
+/**
+ * Find next frame in output order and put a reference to it in frame.
+ * @return 1 if a frame was output, 0 otherwise
+ */
+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
+
+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
+
+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
+
+unsigned int ff_hevc_rpi_tb_avail_flags(
+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
+
+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
+ int nPbH, int log2_cb_size, int part_idx,
+ int merge_idx, HEVCRpiMvField * const mv);
+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int nPbW, const unsigned int nPbH,
+ const unsigned int avail,
+ HEVCRpiMvField * const mv,
+ const unsigned int mvp_lx_flag, const unsigned int LX);
+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int log2_trafo_size, const int is_coded_block);
+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
+
+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
+extern const uint8_t ff_hevc_rpi_qpel_extra[4];
+
+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
+
+// arm/hevc_misc_neon.S
+// Neon coeff zap fn
+#if HAVE_NEON
+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+#endif
+
+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const HEVCRpiFrame * const ref, const int val, const int field);
+
+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
+
+// All of these expect that s->threads_type == FF_THREAD_FRAME
+
+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const HEVCRpiFrame * const ref, const int y)
+{
+ if (s->threads_type != 0)
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
+}
+
+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
+{
+ if (s->used_for_ref && s->threads_type != 0)
+ ff_hevc_rpi_progress_signal_field(s, y, 1);
+}
+
+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const HEVCRpiFrame * const ref, const int y)
+{
+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
+}
+
+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
+{
+ if (s->used_for_ref && s->threads_type != 0)
+ {
+ ff_hevc_rpi_progress_signal_field(s, y, 0);
+ }
+}
+
+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
+{
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
+}
+
+
+// Set all done - signal nothing (used in missing refs)
+// Works for both rpi & non-rpi
+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
+{
+ if (ref->tf.progress != NULL)
+ {
+ int * const p = (int *)ref->tf.progress->data;
+ p[0] = INT_MAX;
+ p[1] = INT_MAX;
+ }
+}
+
+#define HEVC_RPI_420_ONLY 1
+#define HEVC_RPI_SAND128_ONLY 1
+
+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
+{
+#if HEVC_RPI_420_ONLY
+ return cidx == 0 ? 0 : 1;
+#else
+ return s->ps.sps->hshift[cidx];
+#endif
+}
+
+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
+{
+#if HEVC_RPI_420_ONLY
+ return cidx == 0 ? 0 : 1;
+#else
+ return s->ps.sps->vshift[cidx];
+#endif
+}
+
+static inline int ctx_cfmt(const HEVCRpiContext * const s)
+{
+#if HEVC_RPI_420_ONLY
+ return 1;
+#else
+ return s->ps.sps->chroma_format_idc;
+#endif
+}
+
+static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
+{
+#if HEVC_RPI_SAND128_ONLY
+ return 128;
+#else
+ return frame->linesize[c_idx];
+#endif
+}
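+// (SAND128 stores the image as vertical stripes 128 bytes wide, so the
+// line-to-line stride within a stripe is always 128.)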
+
+#if HEVC_RPI_SAND128_ONLY
+// Propagate this decision to later zc includes
+#define RPI_ZC_SAND128_ONLY 1
+#endif
+
+#ifndef ff_hevc_rpi_copy_vert
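+// Copy a single-element-wide vertical strip; pixel_shift gives the log2
+// element size in bytes (0 -> 1 byte, 1 -> 2 bytes, 2 -> 4 bytes).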
+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
+ int pixel_shift, int height,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
+{
+ int i;
+ switch (pixel_shift)
+ {
+ case 2:
+ for (i = 0; i < height; i++) {
+ *(uint32_t *)dst = *(uint32_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ case 1:
+ for (i = 0; i < height; i++) {
+ *(uint16_t *)dst = *(uint16_t *)src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ default:
+ for (i = 0; i < height; i++) {
+ *dst = *src;
+ dst += stride_dst;
+ src += stride_src;
+ }
+ break;
+ }
+}
+#endif
+
+
+#if MVF_STASH_WIDTH == 64
+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x, const unsigned int y)
+{
+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));
+}
+
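+// Return the MV field store holding neighbour (x,y) relative to the CTB
+// containing (x0,y0): the saved up-left corner, the row above, the column
+// to the left, or the in-CTB stash.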
+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int x, const unsigned int y)
+{
+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
+ const unsigned int x0_ctb = x0 & mask_cs_hi;
+ const unsigned int y0_ctb = y0 & mask_cs_hi;
+
+ return (HEVCRpiMvField *)((y < y0_ctb) ?
+ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :
+ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :
+ lc->mvf_stash +
+ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
+ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
+}
+
+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
+ const unsigned int x0,
+ const unsigned int x)
+{
+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
+ const unsigned int x0_ctb = x0 & mask_cs_hi;
+ return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU;
+}
+
+#else
+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x, const unsigned int y)
+{
+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
+}
+
+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
+ const unsigned int x0, const unsigned int y0,
+ const unsigned int x, const unsigned int y)
+{
+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
+
+ const unsigned int x0_ctb = x0 & mask_cs_hi;
+ const unsigned int y0_ctb = y0 & mask_cs_hi;
+
+ // If not in the same CTB for Y assume up
+ if (y < y0_ctb) {
+ // If not in the same CTB for X too assume up-left
+ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
+ }
+ return mvf_stash_ptr(s, lc, x, y);
+}
+
+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
+ const unsigned int x0,
+ const unsigned int x)
+{
+ return MVF_STASH_WIDTH_PU;
+}
+#endif
+
+#endif /* AVCODEC_RPI_HEVCDEC_H */
diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
new file mode 100644
index 0000000000..87f3cc9d14
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.c
@@ -0,0 +1,450 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rpi_hevcdsp.h"
+#include "rpi_hevc_mv.h"
+
+static const int8_t transform[32][32] = {
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
+ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
+ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
+ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
+ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
+ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
+ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
+ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
+ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
+ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
+ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
+ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
+ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
+ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
+ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
+ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
+ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
+ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
+ { -2, 58, 10, -2},
+ { -4, 54, 16, -2},
+ { -6, 46, 28, -4},
+ { -4, 36, 36, -4},
+ { -4, 28, 46, -6},
+ { -2, 16, 54, -4},
+ { -2, 10, 58, -2},
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
+ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
+ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
+ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
+};
+
+#define BIT_DEPTH 8
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+ int in_inc0, int in_inc1)
+{
+ int shift = 32;
+ uint32_t bs = 0;
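+ // Strengths are packed 2 bits per output unit into bs, newest entry at
+ // the top; the final ">> shift" right-justifies however many entries
+ // were actually written.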
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+ int nr_idx0 = neigh->ref_idx[0];
+ int nr_idx1 = neigh->ref_idx[1];
+ int neigh_refL0 = neigh_rpl0[nr_idx0];
+ int neigh_refL1 = neigh_rpl1[nr_idx1];
+
+ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
+ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
+
+#if 1 // This more directly matches the original implementation
+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+ // same L0 and L1
+ if (curr_refL0 == neigh_refL0 &&
+ curr_refL0 == curr_refL1 &&
+ neigh_refL0 == neigh_refL1) {
+ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
+ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL0 == curr_refL0 &&
+ neigh_refL1 == curr_refL1) {
+ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else if (neigh_refL1 == curr_refL0 &&
+ neigh_refL0 == curr_refL1) {
+ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else {
+ strength = 1;
+ }
+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+ MvXY curr_mv0, neigh_mv0;
+
+ if (curr->pred_flag & 1) {
+ curr_mv0 = curr->xy[0];
+ } else {
+ curr_mv0 = curr->xy[1];
+ curr_refL0 = curr_refL1;
+ }
+
+ if (neigh->pred_flag & 1) {
+ neigh_mv0 = neigh->xy[0];
+ } else {
+ neigh_mv0 = neigh->xy[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ if (curr_refL0 == neigh_refL0) {
+ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
+ strength = 1;
+ else
+ strength = 0;
+ } else
+ strength = 1;
+ } else
+ strength = 1;
+#else // This has exactly the same effect, but is more suitable for vectorisation
+ MvXY curr_mv[2];
+ MvXY neigh_mv[2];
+ memcpy(curr_mv, curr->xy, sizeof curr_mv);
+ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
+
+ if (!(curr->pred_flag & 2)) {
+ curr_mv[1] = curr_mv[0];
+ curr_refL1 = curr_refL0;
+ }
+ if (!(neigh->pred_flag & 2)) {
+ neigh_mv[1] = neigh_mv[0];
+ neigh_refL1 = neigh_refL0;
+ }
+ if (!(curr->pred_flag & 1)) {
+ curr_mv[0] = curr_mv[1];
+ curr_refL0 = curr_refL1;
+ }
+ if (!(neigh->pred_flag & 1)) {
+ neigh_mv[0] = neigh_mv[1];
+ neigh_refL0 = neigh_refL1;
+ }
+
+ strength = 1;
+
+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
+
+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
+
+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+#endif
+
+ curr += in_inc0 / sizeof (HEVCRpiMvField);
+ neigh += in_inc1 / sizeof (HEVCRpiMvField);
+
+ for (out = dup; out > 0; out--)
+ {
+ bs = (bs >> 2) | (strength << 30);
+ shift -= 2;
+ }
+ }
+ return bs >> shift;
+}
+
+
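+// Plain rectangle copy: unaligned 64-bit copies if any pointer or stride
+// misses 16-byte alignment, otherwise aligned 128-bit copies. Widths are
+// rounded up to the copy unit.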
+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
+{
+ unsigned int i, j;
+
+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j+=8)
+ AV_COPY64U(dst+j, src+j);
+ dst += stride_dst;
+ src += stride_src;
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j+=16)
+ AV_COPY128(dst+j, src+j);
+ dst += stride_dst;
+ src += stride_src;
+ }
+ }
+}
+
+
+
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+{
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
+
+#undef PEL_FUNC
+#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
+ for(i = 0 ; i < 10 ; i++) \
+{ \
+ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
+}
+
+#undef EPEL_FUNCS
+#define EPEL_FUNCS(depth) \
+ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
+ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
+ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
+ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
+
+#undef EPEL_UNI_FUNCS
+#define EPEL_UNI_FUNCS(depth) \
+ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
+ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
+ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
+ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
+ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
+ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
+ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
+ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
+
+#undef EPEL_BI_FUNCS
+#define EPEL_BI_FUNCS(depth) \
+ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
+ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
+ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
+ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
+ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
+ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
+ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
+ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
+
+#undef QPEL_FUNCS
+#define QPEL_FUNCS(depth) \
+ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
+ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
+ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
+ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
+
+#undef QPEL_UNI_FUNCS
+#define QPEL_UNI_FUNCS(depth) \
+ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
+ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
+ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
+ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
+
+#undef QPEL_BI_FUNCS
+#define QPEL_BI_FUNCS(depth) \
+ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
+ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
+ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
+ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
+
+#define SLICED_ADD_RESIDUAL(depth)\
+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
+#define SLICED_LOOP_FILTERS(depth)\
+ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
+#define SLICED_SAO(depth)\
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
+ } \
+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
+
+#define HEVC_DSP(depth) \
+ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
+ SLICED_ADD_RESIDUAL(depth); \
+ hevcdsp->dequant = FUNC(dequant, depth); \
+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
+ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
+ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
+ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
+ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
+ \
+ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
+ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
+ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
+ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
+ \
+ for (i = 0; i != SAO_FILTER_N; ++i) { \
+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
+ } \
+ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
+ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
+ SLICED_SAO(depth); \
+ \
+ QPEL_FUNCS(depth); \
+ QPEL_UNI_FUNCS(depth); \
+ QPEL_BI_FUNCS(depth); \
+ EPEL_FUNCS(depth); \
+ EPEL_UNI_FUNCS(depth); \
+ EPEL_BI_FUNCS(depth); \
+ \
+ SLICED_LOOP_FILTERS(depth); \
+ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
+ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
+ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
+ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
+ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \
+ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
+ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
+ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
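+
+// HEVC_DSP(depth) populates the whole context with the template
+// instantiations for one bit depth (e.g. HEVC_DSP(10) selects the _10
+// variants built from rpi_hevcdsp_template.c).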
+    int i = 0;
+
+ switch (bit_depth) {
+ case 9:
+ HEVC_DSP(9);
+ break;
+ case 10:
+ HEVC_DSP(10);
+ break;
+ case 12:
+ HEVC_DSP(12);
+ break;
+ default:
+ HEVC_DSP(8);
+ break;
+ }
+
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+ hevcdsp->cpy_blk = cpy_blk;
+
+ if (ARCH_PPC)
+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
+ if (ARCH_X86)
+ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
+ if (ARCH_ARM)
+ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
+ if (ARCH_MIPS)
+ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
+}
diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
new file mode 100644
index 0000000000..5a7cdeeb66
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.h
@@ -0,0 +1,177 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVCDSP_H
+#define AVCODEC_RPI_HEVCDSP_H
+
+#include "hevc.h"
+#include "get_bits.h"
+
+struct HEVCRpiMvField;
+
+#define MAX_PB_SIZE 64
+
+#define RPI_HEVC_SAO_BUF_STRIDE 160
+
+
+typedef struct RpiSAOParams {
+ uint8_t band_position[3]; ///< sao_band_position (Y,U,V)
+ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V)
+ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V)
+
+ int16_t offset_val[3][5]; ///< SaoOffsetVal (Y,U,V)
+
+} RpiSAOParams;
+
+
+// This controls how many SAO dsp functions there are
+// N=5 covers widths 8, 16, 32, 48, 64
+// N=6 adds a function for width 24 (in fn array element 5, so existing code
+// still works)
+#define SAO_FILTER_N 6
+
+
+typedef struct HEVCDSPContext {
+ void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
+
+ void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
+
+ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
+
+ void (*dequant)(int16_t *coeffs, int16_t log2_size);
+
+ void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
+
+ void (*transform_4x4_luma)(int16_t *coeffs);
+
+ void (*idct[4])(int16_t *coeffs, int col_limit);
+
+ void (*idct_dc[4])(int16_t *coeffs);
+
+ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height);
+
+ /* implicit stride_src parameter has the value RPI_HEVC_SAO_BUF_STRIDE (in bytes) */
+ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
+
+ void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+
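+    // The prediction function pointer arrays below are indexed as
+    // [size][!!my][!!mx]: the first index is a block-width bucket, and the
+    // last two select vertical and horizontal filtering, so [0][0] is a
+    // plain pixel copy and [1][1] the combined hv filter.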
+ void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int ox0, int wx1,
+ int ox1, intptr_t mx, intptr_t my, int width);
+
+ void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc,
+ uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc,
+ uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc,
+ uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc,
+ uint8_t *no_p, uint8_t *no_q);
+ void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q);
+ void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q);
+ void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
+ uint8_t * _pix_l);
+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+ unsigned int no_f);
+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f);
+
+ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+ int in_inc0, int in_inc1);
+
+ void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
+} HEVCDSPContext;
+
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+
+extern const int8_t ff_hevc_rpi_epel_filters[7][4];
+extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
+
+void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
+void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
+#endif /* AVCODEC_RPI_HEVCDSP_H */
diff --git a/libavcodec/rpi_hevcdsp_template.c b/libavcodec/rpi_hevcdsp_template.c
new file mode 100644
index 0000000000..dea4e55e4b
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp_template.c
@@ -0,0 +1,2279 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "get_bits.h"
+#include "rpi_hevcdec.h"
+
+#include "bit_depth_template.c"
+#include "rpi_hevcdsp.h"
+
+#include "rpi_hevc_shader_template.h"
+
+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+ GetBitContext *gb, int pcm_bit_depth)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+}
+
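+// Plaited chroma variant of put_pcm: U and V samples are interleaved in
+// one plane, so the U samples land at even offsets and the V samples at
+// odd ones, unpacked by the two passes below.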
+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+ GetBitContext *gb, int pcm_bit_depth)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+
+ dst = (pixel *)_dst + 1;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ dst[x] = av_clip_pixel(dst[x] + *res);
+ res++;
+ }
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ dst[x] = av_clip_pixel(dst[x] + dc);
+ }
+ dst += stride;
+ }
+}
+
+
+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_v, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + *res);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ res++;
+ }
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, const int dc_u, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
+ res++;
+ }
+ dst += stride;
+ }
+}
+
+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
+ ptrdiff_t stride, unsigned int size)
+{
+ unsigned int x, y;
+ pixel *dst = (pixel *)_dst;
+ const int16_t * ru = res;
+ const int16_t * rv = res + size * size;
+
+// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
+// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
+// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
+ }
+ dst += stride;
+ }
+
+// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
+}
+
+
+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+ const int dc_v = dc >> 16;
+ const int dc_u = (dc << 16) >> 16;
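+    // dc packs two 16-bit offsets into one int32_t: V in the high half and
+    // U in the low half; the shifts above sign-extend each half back to int.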
+
+ stride /= sizeof(pixel);
+
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size * 2; x += 2) {
+ dst[x] = av_clip_pixel(dst[x] + dc_u);
+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+ }
+ dst += stride;
+ }
+}
+
+
+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual)(_dst, res, stride, 32);
+}
+
+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+{
+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
+}
+
+// -- U -- (plaited)
+
+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
+}
+
+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
+}
+
+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
+}
+
+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_u)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- V -- (plaited)
+
+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
+}
+
+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
+}
+
+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
+}
+
+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride, int dc_v)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- C -- (plaited - both U & V)
+
+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
+}
+
+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
+}
+
+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
+}
+
+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+
+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
+{
+ int16_t *coeffs = (int16_t *) _coeffs;
+ int x, y;
+ int size = 1 << log2_size;
+
+ if (mode) {
+ coeffs += size;
+ for (y = 0; y < size - 1; y++) {
+ for (x = 0; x < size; x++)
+ coeffs[x] += coeffs[x - size];
+ coeffs += size;
+ }
+ } else {
+ for (y = 0; y < size; y++) {
+ for (x = 1; x < size; x++)
+ coeffs[x] += coeffs[x - 1];
+ coeffs += size;
+ }
+ }
+}
+
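+// dequant applies the size- and bit-depth-dependent normalisation:
+// shift = 15 - BIT_DEPTH - log2_size, e.g. 5 for a 4x4 block at 8 bits
+// (a rounded right shift), going negative for large blocks at high bit
+// depths (32x32 at 12 bits gives shift = -2, i.e. a left shift by 2).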
+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
+{
+ int shift = 15 - BIT_DEPTH - log2_size;
+ int x, y;
+ int size = 1 << log2_size;
+
+ if (shift > 0) {
+ int offset = 1 << (shift - 1);
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ *coeffs = (*coeffs + offset) >> shift;
+ coeffs++;
+ }
+ }
+ } else {
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ *coeffs = *coeffs << -shift;
+ coeffs++;
+ }
+ }
+ }
+}
+
+#define SET(dst, x) (dst) = (x)
+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
+
+#define TR_4x4_LUMA(dst, src, step, assign) \
+ do { \
+ int c0 = src[0 * step] + src[2 * step]; \
+ int c1 = src[2 * step] + src[3 * step]; \
+ int c2 = src[0 * step] - src[3 * step]; \
+ int c3 = 74 * src[1 * step]; \
+ \
+ assign(dst[2 * step], 74 * (src[0 * step] - \
+ src[2 * step] + \
+ src[3 * step])); \
+ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
+ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
+ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
+ } while (0)
+
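+// TR_4x4_LUMA above is the inverse of the DST-style transform HEVC uses
+// for 4x4 intra luma blocks, factorised so that only the constants 29,
+// 55 and 74 are needed.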
+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
+{
+ int i;
+ int shift = 7;
+ int add = 1 << (shift - 1);
+ int16_t *src = coeffs;
+
+ for (i = 0; i < 4; i++) {
+ TR_4x4_LUMA(src, src, 4, SCALE);
+ src++;
+ }
+
+ shift = 20 - BIT_DEPTH;
+ add = 1 << (shift - 1);
+ for (i = 0; i < 4; i++) {
+ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
+ coeffs += 4;
+ }
+}
+
+#undef TR_4x4_LUMA
+
+#define TR_4(dst, src, dstep, sstep, assign, end) \
+ do { \
+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
+ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
+ \
+ assign(dst[0 * dstep], e0 + o0); \
+ assign(dst[1 * dstep], e1 + o1); \
+ assign(dst[2 * dstep], e1 - o1); \
+ assign(dst[3 * dstep], e0 - o0); \
+ } while (0)
+
+#define TR_8(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_8[4]; \
+ int o_8[4] = { 0 }; \
+ for (i = 0; i < 4; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
+ \
+ for (i = 0; i < 4; i++) { \
+ assign(dst[i * dstep], e_8[i] + o_8[i]); \
+ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
+ } \
+ } while (0)
+
+#define TR_16(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_16[8]; \
+ int o_16[8] = { 0 }; \
+ for (i = 0; i < 8; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
+ \
+ for (i = 0; i < 8; i++) { \
+ assign(dst[i * dstep], e_16[i] + o_16[i]); \
+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
+ } \
+ } while (0)
+
+#define TR_32(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_32[16]; \
+ int o_32[16] = { 0 }; \
+ for (i = 0; i < 16; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_32[i] += transform[j][i] * src[j * sstep]; \
+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
+ \
+ for (i = 0; i < 16; i++) { \
+ assign(dst[i * dstep], e_32[i] + o_32[i]); \
+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
+ } \
+ } while (0)
+
+#define IDCT_VAR4(H) \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H) \
+ int limit = FFMIN(col_limit, H); \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H) IDCT_VAR8(H)
+#define IDCT_VAR32(H) IDCT_VAR8(H)
+
+#define IDCT(H) \
+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
+ int col_limit) \
+{ \
+ int i; \
+ int shift = 7; \
+ int add = 1 << (shift - 1); \
+ int16_t *src = coeffs; \
+ IDCT_VAR ## H(H); \
+ \
+ for (i = 0; i < H; i++) { \
+ TR_ ## H(src, src, H, H, SCALE, limit2); \
+ if (limit2 < H && i%4 == 0 && !!i) \
+ limit2 -= 4; \
+ src++; \
+ } \
+ \
+ shift = 20 - BIT_DEPTH; \
+ add = 1 << (shift - 1); \
+ for (i = 0; i < H; i++) { \
+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
+ coeffs += H; \
+ } \
+}
+
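+// DC-only fast path: when just coeffs[0] is non-zero every output sample
+// is identical, so the whole block is filled with one rounded shift of
+// the DC coefficient instead of running the full transform.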
+#define IDCT_DC(H) \
+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
+{ \
+ int i, j; \
+ int shift = 14 - BIT_DEPTH; \
+ int add = 1 << (shift - 1); \
+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
+ \
+ for (j = 0; j < H; j++) { \
+ for (i = 0; i < H; i++) { \
+ coeffs[i + j * H] = coeff; \
+ } \
+ } \
+}
+
+IDCT( 4)
+IDCT( 8)
+IDCT(16)
+IDCT(32)
+
+IDCT_DC( 4)
+IDCT_DC( 8)
+IDCT_DC(16)
+IDCT_DC(32)
+
+#undef TR_4
+#undef TR_8
+#undef TR_16
+#undef TR_32
+
+#undef SET
+#undef SCALE
+
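+// SAO band filter: samples are classified into 32 equal bands by their
+// top 5 bits (shift = BIT_DEPTH - 5) and an offset is applied to the 4
+// consecutive bands starting at sao_left_class; all other bands keep the
+// zero offset from the table initialisation.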
+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class,
+ int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int offset_table[32] = { 0 };
+ int k, y, x;
+ int shift = BIT_DEPTH - 5;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+
+ for (k = 0; k < 4; k++)
+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ dst += stride_dst;
+ src += stride_src;
+ }
+}
+
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
+
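+// SAO edge filter: each sample is compared with its two neighbours along
+// the eo direction (horizontal, vertical or one of the two diagonals);
+// the two sign comparisons give a value in [0, 4] which edge_idx maps to
+// one of the five offsets.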
+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+ int eo, int width, int height) {
+
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ int diff0 = CMP(src[x], src[x + a_stride]);
+ int diff1 = CMP(src[x], src[x + b_stride]);
+ int offset_val = edge_idx[2 + diff0 + diff1];
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+}
+
+
+#if BIT_DEPTH == 10
+// We need a 32-bit variation for the _c restores, so hijack bit depth 10
+#undef pixel
+#undef BIT_DEPTH
+#define pixel uint32_t
+#define BIT_DEPTH 32
+// All 16 bit variations are the same
+#define sao_edge_restore_0_10 sao_edge_restore_0_9
+#define sao_edge_restore_1_10 sao_edge_restore_1_9
+#define sao_edge_restore_0_11 sao_edge_restore_0_9
+#define sao_edge_restore_1_11 sao_edge_restore_1_9
+#define sao_edge_restore_0_12 sao_edge_restore_0_9
+#define sao_edge_restore_1_12 sao_edge_restore_1_9
+#define sao_edge_restore_0_13 sao_edge_restore_0_9
+#define sao_edge_restore_1_13 sao_edge_restore_1_9
+#define sao_edge_restore_0_14 sao_edge_restore_0_9
+#define sao_edge_restore_1_14 sao_edge_restore_1_9
+#define sao_edge_restore_0_15 sao_edge_restore_0_9
+#define sao_edge_restore_1_15 sao_edge_restore_1_9
+#define sao_edge_restore_0_16 sao_edge_restore_0_9
+#define sao_edge_restore_1_16 sao_edge_restore_1_9
+#endif
+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
+ int *borders, int _width, int _height,
+ int c_idx, uint8_t *vert_edge,
+ uint8_t *horiz_edge, uint8_t *diag_edge)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int sao_eo_class = sao->eo_class[c_idx];
+ int init_x = 0, width = _width, height = _height;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+
+ if (sao_eo_class != SAO_EO_VERT) {
+ if (borders[0]) {
+ for (y = 0; y < height; y++) {
+ dst[y * stride_dst] = src[y * stride_src];
+ }
+ init_x = 1;
+ }
+ if (borders[2]) {
+ int offset = width - 1;
+ for (x = 0; x < height; x++) {
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
+ }
+ width--;
+ }
+ }
+ if (sao_eo_class != SAO_EO_HORIZ) {
+ if (borders[1]) {
+ for (x = init_x; x < width; x++)
+ dst[x] = src[x];
+ }
+ if (borders[3]) {
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
+ for (x = init_x; x < width; x++)
+ dst[x + y_stride_dst] = src[x + y_stride_src];
+ height--;
+ }
+ }
+}
+
+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
+ int *borders, int _width, int _height,
+ int c_idx, uint8_t *vert_edge,
+ uint8_t *horiz_edge, uint8_t *diag_edge)
+{
+ int x, y;
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int sao_eo_class = sao->eo_class[c_idx];
+ int init_x = 0, init_y = 0, width = _width, height = _height;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+
+ if (sao_eo_class != SAO_EO_VERT) {
+ if (borders[0]) {
+ for (y = 0; y < height; y++) {
+ dst[y * stride_dst] = src[y * stride_src];
+ }
+ init_x = 1;
+ }
+ if (borders[2]) {
+ int offset = width - 1;
+ for (x = 0; x < height; x++) {
+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
+ }
+ width--;
+ }
+ }
+ if (sao_eo_class != SAO_EO_HORIZ) {
+ if (borders[1]) {
+ for (x = init_x; x < width; x++)
+ dst[x] = src[x];
+ init_y = 1;
+ }
+ if (borders[3]) {
+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+ ptrdiff_t y_stride_src = stride_src * (height - 1);
+ for (x = init_x; x < width; x++)
+ dst[x + y_stride_dst] = src[x + y_stride_src];
+ height--;
+ }
+ }
+
+ {
+ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
+ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
+ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
+ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
+
+ // Restore pixels that can't be modified
+ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
+ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
+ dst[y*stride_dst] = src[y*stride_src];
+ }
+ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
+ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
+ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
+ }
+
+ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
+ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
+ dst[x] = src[x];
+ }
+ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
+ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
+ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
+ }
+ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
+ dst[0] = src[0];
+ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
+ dst[width-1] = src[width-1];
+ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
+ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
+ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
+ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
+
+ }
+}
+#endif
+#if BIT_DEPTH == 32
+#undef BIT_DEPTH
+#undef pixel
+#define BIT_DEPTH 10
+#define pixel uint16_t
+#endif
+
+// --- Plaited chroma versions
+
+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+ int width, int height)
+{
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int offset_table_u[32] = { 0 };
+ int offset_table_v[32] = { 0 };
+ int k, y, x;
+ int shift = BIT_DEPTH - 5;
+
+ stride_dst /= sizeof(pixel);
+ stride_src /= sizeof(pixel);
+ width *= 2;
+
+ for (k = 0; k < 4; k++)
+ {
+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+ }
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2)
+ {
+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
+ // *** The & 31 mask should not be needed, but at present we generate
+ // broken input that crashes us in the 10-bit case
+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
+ }
+ dst += stride_dst;
+ src += stride_src;
+ }
+}
+
+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+ int eo, int width, int height) {
+
+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+ static const int8_t pos[4][2][2] = {
+ { { -1, 0 }, { 1, 0 } }, // horizontal
+ { { 0, -1 }, { 0, 1 } }, // vertical
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+ };
+ pixel *dst = (pixel *)_dst;
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
+
+ stride_dst /= sizeof(pixel);
+ width *= 2;
+
+ av_assert0(width <= 64);
+
+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 2) {
+ int diff0u = CMP(src[x], src[x + a_stride]);
+ int diff1u = CMP(src[x], src[x + b_stride]);
+ int offset_valu = edge_idx[2 + diff0u + diff1u];
+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+ int offset_valv = edge_idx[2 + diff0v + diff1v];
+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
+ }
+ src += stride_src;
+ dst += stride_dst;
+ }
+}
+
+// Do once
+#if BIT_DEPTH == 8
+// Any old 2 byte 'normal' restore will work for these
+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
+// We need 32 bit for 9 bit+
+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
+#endif
+
+#undef CMP
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
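+// First-pass prediction functions store 14-bit intermediates
+// (src << (14 - BIT_DEPTH)) in a 16-bit buffer with a fixed MAX_PB_SIZE
+// stride; the uni variants clip straight back to pixels while the bi
+// variants add the second (14-bit) source with one extra bit of
+// rounding shift.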
+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = src[x] << (14 - BIT_DEPTH);
+ src += srcstride;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ for (y = 0; y < height; y++) {
+ memcpy(dst, src, width * sizeof(pixel));
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
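+// Explicit weighted bi-prediction: the two 14-bit predictions are
+// combined as (p0 * wx0 + p1 * wx1 + (ox0 + ox1 + 1) << log2Wd)
+// >> (log2Wd + 1), with the offsets pre-scaled to the operating bit
+// depth.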
+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
+ }
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
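+// QPEL_FILTER applies the 8-tap luma interpolation filter; the tap set
+// is selected by the fractional position (mx or my in 1..3) via
+// ff_hevc_rpi_qpel_filters, and the >> (BIT_DEPTH - 8) at each call
+// site keeps the 16-bit intermediates in range.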
+#define QPEL_FILTER(src, stride) \
+ (filter[0] * src[x - 3 * stride] + \
+ filter[1] * src[x - 2 * stride] + \
+ filter[2] * src[x - stride] + \
+ filter[3] * src[x ] + \
+ filter[4] * src[x + stride] + \
+ filter[5] * src[x + 2 * stride] + \
+ filter[6] * src[x + 3 * stride] + \
+ filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ dst += MAX_PB_SIZE;
+ }
+}
+
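+// The hv cases are separable: a horizontal pass fills a 16-bit tmp
+// buffer with QPEL_EXTRA extra rows around the block, then a vertical
+// pass over tmp produces the output; the second pass shifts by a fixed
+// 6 because the intermediates are already bit-depth-normalised.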
+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
+ uint8_t *_src,
+ ptrdiff_t _srcstride,
+ int height, intptr_t mx,
+ intptr_t my, int width)
+{
+ int x, y;
+ const int8_t *filter;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+
+ src -= QPEL_EXTRA_BEFORE * srcstride;
+ filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height + QPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_qpel_filters[my - 1];
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+ tmp += MAX_PB_SIZE;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+ int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+
+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ const int8_t *filter;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= QPEL_EXTRA_BEFORE * srcstride;
+ filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height + QPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ const int8_t *filter;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= QPEL_EXTRA_BEFORE * srcstride;
+ filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height + QPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ const int8_t *filter;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= QPEL_EXTRA_BEFORE * srcstride;
+ filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height + QPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ const int8_t *filter;
+ pixel *src = (pixel*)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ src -= QPEL_EXTRA_BEFORE * srcstride;
+ filter = ff_hevc_rpi_qpel_filters[mx - 1];
+ for (y = 0; y < height + QPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
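+// EPEL_FILTER is the 4-tap chroma counterpart of QPEL_FILTER, with tap
+// sets indexed by the 1/8-pel fractional position via
+// ff_hevc_rpi_epel_filters.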
+#define EPEL_FILTER(src, stride) \
+ (filter[0] * src[x - stride] + \
+ filter[1] * src[x] + \
+ filter[2] * src[x + stride] + \
+ filter[3] * src[x + 2 * stride])
+
+static void FUNC(put_hevc_epel_h)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_v)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+
+ src -= EPEL_EXTRA_BEFORE * srcstride;
+
+ for (y = 0; y < height + EPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+ tmp += MAX_PB_SIZE;
+ dst += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+ }
+ dst += dststride;
+ src += srcstride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+ int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+ src += srcstride;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+ dst += dststride;
+ src += srcstride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= EPEL_EXTRA_BEFORE * srcstride;
+
+ for (y = 0; y < height + EPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= EPEL_EXTRA_BEFORE * srcstride;
+
+ for (y = 0; y < height + EPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+ }
+ dst += dststride;
+ src += srcstride;
+ }
+}
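+
+// Weighted uni-prediction (sketch): dst = ((p * wx + offset) >> shift) + ox
+// with shift == denom + 14 - BIT_DEPTH and ox pre-scaled to the pixel
+// range, matching the HEVC explicit weighted-prediction formula.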
+
+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
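+
+// Weighted bi-prediction as used above (spec form):
+//   dst = (p1*wx1 + p0*wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)
+// Worked example at 8 bits with denom == 0: shift == 7 so log2Wd == 6,
+// and with wx0 == wx1 == 1, ox0 == ox1 == 0 this reduces to the plain
+// bi average (p0 + p1 + 64) >> 7.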
+
+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+ }
+ dst += dststride;
+ src += srcstride;
+ }
+}
+
+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+ src += srcstride;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+ int offset = 1 << (shift - 1);
+#else
+ int offset = 0;
+#endif
+
+ src -= EPEL_EXTRA_BEFORE * srcstride;
+
+ for (y = 0; y < height + EPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ ox = ox * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ }
+}
+
+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ pixel *dst = (pixel *)_dst;
+ ptrdiff_t dststride = _dststride / sizeof(pixel);
+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+ int16_t *tmp = tmp_array;
+ int shift = 14 + 1 - BIT_DEPTH;
+ int log2Wd = denom + shift - 1;
+
+ src -= EPEL_EXTRA_BEFORE * srcstride;
+
+ for (y = 0; y < height + EPEL_EXTRA; y++) {
+ for (x = 0; x < width; x++)
+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+ src += srcstride;
+ tmp += MAX_PB_SIZE;
+ }
+
+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+ filter = ff_hevc_rpi_epel_filters[my - 1];
+
+ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+ tmp += MAX_PB_SIZE;
+ dst += dststride;
+ src2 += MAX_PB_SIZE;
+ }
+}
+
+// line zero
+#define P3 pix[-4 * xstride]
+#define P2 pix[-3 * xstride]
+#define P1 pix[-2 * xstride]
+#define P0 pix[-1 * xstride]
+#define Q0 pix[0 * xstride]
+#define Q1 pix[1 * xstride]
+#define Q2 pix[2 * xstride]
+#define Q3 pix[3 * xstride]
+
+// line three, used only for the deblocking decision
+#define TP3 pix[-4 * xstride + 3 * ystride]
+#define TP2 pix[-3 * xstride + 3 * ystride]
+#define TP1 pix[-2 * xstride + 3 * ystride]
+#define TP0 pix[-1 * xstride + 3 * ystride]
+#define TQ0 pix[0 * xstride + 3 * ystride]
+#define TQ1 pix[1 * xstride + 3 * ystride]
+#define TQ2 pix[2 * xstride + 3 * ystride]
+#define TQ3 pix[3 * xstride + 3 * ystride]
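+
+// Sample layout assumed by the macros above (one line of the edge):
+//
+//   P3 P2 P1 P0 | Q0 Q1 Q2 Q3
+//               ^ block edge
+//
+// xstride steps across the edge and ystride along it, so the same code
+// serves horizontal and vertical edges; the T* forms read line three,
+// which together with line zero drives the filter on/off decision.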
+
+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
+ ptrdiff_t _xstride, ptrdiff_t _ystride,
+ int beta, int *_tc,
+ uint8_t *_no_p, uint8_t *_no_q)
+{
+ int d, j;
+ pixel *pix = (pixel *)_pix;
+ ptrdiff_t xstride = _xstride / sizeof(pixel);
+ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+ beta <<= BIT_DEPTH - 8;
+
+ for (j = 0; j < 2; j++) {
+ const int dp0 = abs(P2 - 2 * P1 + P0);
+ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
+ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
+ const int d0 = dp0 + dq0;
+ const int d3 = dp3 + dq3;
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ const int no_p = _no_p[j];
+ const int no_q = _no_q[j];
+
+ if (d0 + d3 >= beta) {
+ pix += 4 * ystride;
+ continue;
+ } else {
+ const int beta_3 = beta >> 3;
+ const int beta_2 = beta >> 2;
+ const int tc25 = ((tc * 5 + 1) >> 1);
+
+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+ // strong filtering
+ const int tc2 = tc << 1;
+ for (d = 0; d < 4; d++) {
+ const int p3 = P3;
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ const int q3 = Q3;
+ if (!no_p) {
+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+ }
+ if (!no_q) {
+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+ }
+ pix += ystride;
+ }
+ } else { // normal filtering
+ int nd_p = 1;
+ int nd_q = 1;
+ const int tc_2 = tc >> 1;
+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+ nd_p = 2;
+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+ nd_q = 2;
+
+ for (d = 0; d < 4; d++) {
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+ if (abs(delta0) < 10 * tc) {
+ delta0 = av_clip(delta0, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ if (!no_p && nd_p > 1) {
+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+ P1 = av_clip_pixel(p1 + deltap1);
+ }
+ if (!no_q && nd_q > 1) {
+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+ Q1 = av_clip_pixel(q1 + deltaq1);
+ }
+ }
+ pix += ystride;
+ }
+ }
+ }
+ }
+}
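+
+// Decision structure above (summary of the HEVC deblocking rules):
+// dp0/dq0 and dp3/dq3 are second differences measuring local activity on
+// lines 0 and 3; d0 + d3 >= beta means the segment looks textured and is
+// left alone.  The stricter flatness tests (beta_3, beta_2, tc25) select
+// the strong filter, which may move P0..P2/Q0..Q2 by up to 2*tc; the
+// normal filter moves at most P0/Q0 by tc and P1/Q1 by tc/2.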
+
+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
+ ptrdiff_t _ystride, int *_tc,
+ uint8_t *_no_p, uint8_t *_no_q)
+{
+ int d, j, no_p, no_q;
+ pixel *pix = (pixel *)_pix;
+ ptrdiff_t xstride = _xstride / sizeof(pixel);
+ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+ for (j = 0; j < 2; j++) {
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ if (tc <= 0) {
+ pix += 4 * ystride;
+ continue;
+ }
+ no_p = _no_p[j];
+ no_q = _no_q[j];
+
+ for (d = 0; d < 4; d++) {
+ int delta0;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ pix += ystride;
+ }
+ }
+}
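+
+// Chroma deblocking (sketch of the spec formula): a single
+//   delta0 = clip3(-tc, tc, (((q0 - p0) * 4) + p1 - q1 + 4) >> 3)
+// is added to p0 and subtracted from q0, so only one pel each side of
+// the edge moves; tc <= 0 skips the whole 4-line segment.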
+
+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q)
+{
+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
+}
+
+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+ int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q)
+{
+ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
+}
+
+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q)
+{
+ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
+ beta, tc, no_p, no_q);
+}
+
+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+ int beta, int32_t *tc, uint8_t *no_p,
+ uint8_t *no_q)
+{
+ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
+ beta, tc, no_p, no_q);
+}
+
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
+
+// line zero
+#define P3 pix_l[0 * xstride]
+#define P2 pix_l[1 * xstride]
+#define P1 pix_l[2 * xstride]
+#define P0 pix_l[3 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+#define Q2 pix_r[2 * xstride]
+#define Q3 pix_r[3 * xstride]
+
+// line three, used only for the deblocking decision
+#define TP3 pix_l[0 * xstride + 3 * ystride]
+#define TP2 pix_l[1 * xstride + 3 * ystride]
+#define TP1 pix_l[2 * xstride + 3 * ystride]
+#define TP0 pix_l[3 * xstride + 3 * ystride]
+#define TQ0 pix_r[0 * xstride + 3 * ystride]
+#define TQ1 pix_r[1 * xstride + 3 * ystride]
+#define TQ2 pix_r[2 * xstride + 3 * ystride]
+#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
+// This is identical to hevc_loop_filter_luma except that the P/Q
+// components are on separate pointers
+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
+ uint8_t * _pix_l)
+{
+ int d, j;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ const ptrdiff_t xstride = 1;
+ const ptrdiff_t ystride = _stride / sizeof(pixel);
+
+ beta <<= BIT_DEPTH - 8;
+
+ for (j = 0; j < 2; j++) {
+ const int dp0 = abs(P2 - 2 * P1 + P0);
+ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
+ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
+ const int d0 = dp0 + dq0;
+ const int d3 = dp3 + dq3;
+ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8);
+ const int no_p = no_f & 1;
+ const int no_q = no_f & 2;
+
+ if (d0 + d3 >= beta) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ } else {
+ const int beta_3 = beta >> 3;
+ const int beta_2 = beta >> 2;
+ const int tc25 = ((tc * 5 + 1) >> 1);
+
+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+ // strong filtering
+ const int tc2 = tc << 1;
+ for (d = 0; d < 4; d++) {
+ const int p3 = P3;
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ const int q3 = Q3;
+ if (!no_p) {
+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+ }
+ if (!no_q) {
+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ } else { // normal filtering
+ int nd_p = 1;
+ int nd_q = 1;
+ const int tc_2 = tc >> 1;
+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+ nd_p = 2;
+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+ nd_q = 2;
+
+ for (d = 0; d < 4; d++) {
+ const int p2 = P2;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ const int q2 = Q2;
+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+ if (abs(delta0) < 10 * tc) {
+ delta0 = av_clip(delta0, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ if (!no_p && nd_p > 1) {
+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+ P1 = av_clip_pixel(p1 + deltap1);
+ }
+ if (!no_q && nd_q > 1) {
+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+ Q1 = av_clip_pixel(q1 + deltaq1);
+ }
+ }
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+ }
+ }
+}
+
+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
+{
+ // Just call the non-2 function having massaged the parameters
+ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16};
+ uint8_t no_p[2] = {no_f & 1, no_f & 1};
+ uint8_t no_q[2] = {no_f & 2, no_f & 2};
+ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
+}
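+
+// Parameter packing assumed by the *2 entry points (sketch): tc2 carries
+// the two per-segment tc values as 16-bit halves, i.e.
+// tc2 == (tc[1] << 16) | tc[0], while no_f uses bit 0 for "skip the P
+// side" and bit 1 for "skip the Q side", applied to both segments.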
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
+
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+
+#define P1 pix_l[0 * xstride]
+#define P0 pix_l[1 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+
+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
+ ptrdiff_t _ystride, const int32_t *_tc,
+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
+{
+ int d, j, no_p, no_q;
+ pixel *pix_l = (pixel *)_pix_l;
+ pixel *pix_r = (pixel *)_pix_r;
+ ptrdiff_t xstride = _xstride / sizeof(pixel);
+ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+ for (j = 0; j < 2; j++) {
+ const int tc = _tc[j] << (BIT_DEPTH - 8);
+ if (tc <= 0) {
+ pix_l += 4 * ystride;
+ pix_r += 4 * ystride;
+ continue;
+ }
+ no_p = _no_p[j];
+ no_q = _no_q[j];
+
+ for (d = 0; d < 4; d++) {
+ int delta0;
+ const int p1 = P1;
+ const int p0 = P0;
+ const int q0 = Q0;
+ const int q1 = Q1;
+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+ if (!no_p)
+ P0 = av_clip_pixel(p0 + delta0);
+ if (!no_q)
+ Q0 = av_clip_pixel(q0 - delta0);
+ pix_l += ystride;
+ pix_r += ystride;
+ }
+ }
+}
+
+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
+}
+
+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+ uint8_t * src_l,
+ unsigned int no_f)
+{
+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
+}
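+
+// Packing for the interleaved-chroma entry points (sketch): tc4 holds
+// four 8-bit tc values - the two U segments in the low bytes, then the
+// two V segments - and no_f carries no_p in bits 0-1 and no_q in bits
+// 2-3.  The two calls then walk U and V separately through the same
+// UV-interleaved buffer via a sizeof(pixel) offset and a doubled xstride.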
+
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+
diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
new file mode 100644
index 0000000000..0aa8809a4b
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.c
@@ -0,0 +1,161 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rpi_hevcdec.h"
+
+#include "rpi_hevcpred.h"
+#if (ARCH_ARM)
+#include "arm/rpi_hevcpred_arm.h"
+#endif
+
+#define PRED_C 0
+#define BIT_DEPTH 8
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+
+#define PRED_C 1
+#define BIT_DEPTH 8
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+
+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
+{
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
+
+#undef FUNCC
+#define FUNCC(a, depth) a ## _ ## depth ## _c
+
+#define HEVC_PRED_Y(depth) \
+ hpc->intra_pred = FUNC(intra_pred, depth); \
+ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
+ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
+ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
+ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
+ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
+ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
+ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
+ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
+ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
+ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
+ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
+ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
+ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
+ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
+ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
+ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
+ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \
+ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \
+ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \
+ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth);
+
+#define HEVC_PRED_C(depth) \
+ hpc->intra_pred_c = FUNCC(intra_pred, depth); \
+ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
+ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
+ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
+ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
+ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
+ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
+ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
+ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
+ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
+ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
+ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
+ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
+ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \
+ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \
+ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \
+ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth);
+
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth); \
+ HEVC_PRED_C(depth);
+
+ switch (bit_depth) {
+ case 9:
+ HEVC_PRED(9);
+ break;
+ case 10:
+ HEVC_PRED(10);
+ break;
+ case 12:
+ HEVC_PRED(12);
+ break;
+ default:
+ HEVC_PRED(8);
+ break;
+ }
+
+#if (ARCH_ARM)
+ ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
+#elif (ARCH_MIPS)
+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
+#endif
+}
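+
+// Expansion sketch (assuming FUNC2 pastes its arguments as in the stock
+// FFmpeg bit-depth templates): each inclusion of rpi_hevcpred_template.c
+// above instantiates one bit-depth/plane variant, so FUNC(intra_pred, 10)
+// resolves to intra_pred_10 while FUNCC(intra_pred, 10) resolves to
+// intra_pred_10_c, the name the template emitted while PRED_C was 1.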
diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
new file mode 100644
index 0000000000..9f0edb8798
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.h
@@ -0,0 +1,123 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RPI_HEVCPRED_H
+#define AVCODEC_RPI_HEVCPRED_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "config.h"
+
+struct HEVCRpiContext;
+struct HEVCRpiLocalContext;
+
+enum IntraPredMode {
+ INTRA_PLANAR = 0,
+ INTRA_DC,
+ INTRA_ANGULAR_2,
+ INTRA_ANGULAR_3,
+ INTRA_ANGULAR_4,
+ INTRA_ANGULAR_5,
+ INTRA_ANGULAR_6,
+ INTRA_ANGULAR_7,
+ INTRA_ANGULAR_8,
+ INTRA_ANGULAR_9,
+ INTRA_ANGULAR_10,
+ INTRA_ANGULAR_11,
+ INTRA_ANGULAR_12,
+ INTRA_ANGULAR_13,
+ INTRA_ANGULAR_14,
+ INTRA_ANGULAR_15,
+ INTRA_ANGULAR_16,
+ INTRA_ANGULAR_17,
+ INTRA_ANGULAR_18,
+ INTRA_ANGULAR_19,
+ INTRA_ANGULAR_20,
+ INTRA_ANGULAR_21,
+ INTRA_ANGULAR_22,
+ INTRA_ANGULAR_23,
+ INTRA_ANGULAR_24,
+ INTRA_ANGULAR_25,
+ INTRA_ANGULAR_26,
+ INTRA_ANGULAR_27,
+ INTRA_ANGULAR_28,
+ INTRA_ANGULAR_29,
+ INTRA_ANGULAR_30,
+ INTRA_ANGULAR_31,
+ INTRA_ANGULAR_32,
+ INTRA_ANGULAR_33,
+ INTRA_ANGULAR_34,
+};
+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26
+
+typedef void intra_filter_fn_t(
+ uint8_t * const left, uint8_t * const top,
+ const unsigned int req, const unsigned int avail,
+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur,
+ const unsigned int stride,
+ const unsigned int top_right_size, const unsigned int down_left_size);
+
+typedef struct HEVCRpiPredContext {
+ void (*intra_pred)(const struct HEVCRpiContext * const s,
+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
+ const unsigned int avail, const unsigned int log2_size);
+
+ intra_filter_fn_t *intra_filter[4];
+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
+ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
+ ptrdiff_t stride);
+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
+
+ void (*intra_pred_c)(const struct HEVCRpiContext * const s,
+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
+ const unsigned int avail, const unsigned int log2_size);
+ intra_filter_fn_t *intra_filter_c[4];
+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
+ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
+ ptrdiff_t stride);
+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
+ int mode);
+ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
+} HEVCRpiPredContext;
+
+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_RPI_HEVCPRED_H */
diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
new file mode 100644
index 0000000000..f2ebcad332
--- /dev/null
+++ b/libavcodec/rpi_hevcpred_template.c
@@ -0,0 +1,1407 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/rpi_sand_fns.h"
+#include "bit_depth_template.c"
+
+#include "rpi_hevcdec.h"
+#include "rpi_hevcpred.h"
+
+#define DUMP_PRED 0
+
+#define POS(x, y) src[(x) + stride * (y)]
+
+// INCLUDED_ONCE defined at EOF
+#ifndef INCLUDED_ONCE
+typedef uint8_t (* c8_dst_ptr_t)[2];
+typedef const uint8_t (* c8_src_ptr_t)[2];
+typedef uint16_t (* c16_dst_ptr_t)[2];
+typedef const uint16_t (* c16_src_ptr_t)[2];
+
+// *** On ARM make these NEON registers
+typedef struct pixel4_16 {
+ uint16_t x[4];
+} pixel4_16;
+typedef struct pixel4_32 {
+ uint32_t x[4];
+} pixel4_32;
+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
+{
+ pixel4_16 t = {{x, x, x, x}};
+ return t;
+}
+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
+{
+ pixel4_32 t = {{x, x, x, x}};
+ return t;
+}
+#endif
+
+#if PRED_C
+// For chroma we double pixel size so we copy pairs
+#undef pixel
+#undef pixel2
+#undef pixel4
+#undef dctcoef
+#undef INIT_CLIP
+#undef no_rnd_avg_pixel4
+#undef rnd_avg_pixel4
+#undef AV_RN2P
+#undef AV_RN4P
+#undef AV_RN4PA
+#undef AV_WN2P
+#undef AV_WN4P
+#undef AV_WN4PA
+#undef CLIP
+#undef FUNC
+#undef FUNCC
+#undef av_clip_pixel
+#undef PIXEL_SPLAT_X4
+
+#if BIT_DEPTH == 8
+#define pixel uint16_t
+#define pixel4 pixel4_16
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
+#define cpel uint8_t
+#define c_src_ptr_t c8_src_ptr_t
+#define c_dst_ptr_t c8_dst_ptr_t
+#else
+#define pixel uint32_t
+#define pixel4 pixel4_32
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
+#define cpel uint16_t
+#define c_src_ptr_t c16_src_ptr_t
+#define c_dst_ptr_t c16_dst_ptr_t
+#endif
+#define AV_RN4P(p) (*(pixel4*)(p))
+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
+#endif
+
+
+// Get PW prior to horrid PRED_C trickery
+#if BIT_DEPTH == 8
+#define PW 1
+#else
+#define PW 2
+#endif
+
+
+#if DUMP_PRED && !defined(INCLUDED_ONCE)
+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+{
+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
+ for (unsigned int x = 0; x != size; x++) {
+ printf("%4d", data[x * 2]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+}
+#endif
+
+#ifndef INCLUDED_ONCE
+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n)
+{
+ if ((n >>= 2) != 0) {
+ uint32_t v4 = v | (v << 8);
+ uint32_t * p = (uint32_t *)ptr;
+ v4 = v4 | (v4 << 16);
+ do {
+ *p++ = v4;
+ } while (--n != 0);
+ }
+}
+
+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
+{
+ if ((n >>= 2) != 0) {
+ uint32_t v2 = v | (v << 16);
+ uint32_t * p = (uint32_t *)ptr;
+ do {
+ *p++ = v2;
+ *p++ = v2;
+ } while (--n != 0);
+ }
+}
+
+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
+{
+ if ((n >>= 2) != 0) {
+ uint32_t * p = (uint32_t *)ptr;
+ do {
+ *p++ = v;
+ *p++ = v;
+ *p++ = v;
+ *p++ = v;
+ } while (--n != 0);
+ }
+}
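+
+// The extend_* helpers splat one value over n pels in 4-pel chunks
+// (sketch): n is assumed to be a multiple of 4, which holds here since
+// every intra edge handled is at least 4 pels long.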
+
+// Beware that this inverts the avail ordering
+// For CIP it seems easier this way round
+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
+ unsigned int s0, unsigned int odd_s)
+{
+ const unsigned int n = 1 << log2_intra_bits;
+ unsigned int fa = 0;
+ unsigned int i;
+
+ size >>= 2; // Now in 4-pel units
+ s0 >>= 2;
+
+ if ((avail & AVAIL_DL) != 0)
+ fa |= ((1 << s0) - 1) << (size - s0);
+ if ((avail & AVAIL_L) != 0)
+ fa |= ((1 << size) - 1) << size;
+ if ((avail & AVAIL_UL) != 0)
+ fa |= 1 << (size << 1);
+
+ if (odd_s) {
+ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
+ fa &= ~1;
+ is_intra += i_stride;
+ }
+
+ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) {
+ const unsigned int m = ((1 << n) - 1) << i;
+ if ((fa & m) != 0 && (*is_intra & i_mask) == 0)
+ fa &= ~m;
+ }
+
+ return fa;
+}
+
+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
+ unsigned int s1, unsigned int odd_s)
+{
+ if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
+ {
+ return 0;
+ }
+ else
+ {
+ const unsigned int n = 1 << log2_intra_bits;
+ unsigned int fa = 0;
+ unsigned int i;
+ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift;
+
+ size >>= 2; // Now in 4-pel units
+ s1 >>= 2;
+
+ if ((avail & AVAIL_U) != 0)
+ fa |= ((1 << size) - 1);
+ if ((avail & AVAIL_UR) != 0)
+ fa |= ((1 << s1) - 1) << size;
+
+ if (odd_s) {
+ fa &= im | ~1;
+ im >>= 1;
+ }
+
+ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) {
+ const unsigned int m = ((1 << n) - 1) << i;
+ if ((im & 1) == 0)
+ fa &= ~m;
+ }
+ return fa;
+ }
+}
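+
+// The cip_avail_* helpers build a bitmask with one bit per 4-pel group
+// of edge samples (sketch): a bit ends up set only where the neighbour
+// is both available in the usual sense and marked intra in is_intra,
+// which is what constrained intra prediction requires.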
+
+
+
+static inline unsigned int rmbd(unsigned int x)
+{
+#if 1
+ return __builtin_ctz(x);
+#else
+ unsigned int n = 0;
+ if ((x & 0xffff) == 0) {
+ x >>= 16;
+ n += 16;
+ }
+ if ((x & 0xff) == 0) {
+ x >>= 8;
+ n += 8;
+ }
+ if ((x & 0xf) == 0) {
+ x >>= 4;
+ n += 4;
+ }
+ if ((x & 0x3) == 0) {
+ x >>= 2;
+ n += 2;
+ }
+
+ return (x & 1) == 0 ? n + 1 : n;
+#endif
+}
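+
+// rmbd() returns the index of the least significant set bit, e.g.
+// rmbd(0x14) == 2.  Callers must ensure x != 0: __builtin_ctz(0) is
+// undefined and the fallback would return a meaningless 31.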
+#endif
+
+
+static void FUNC(cip_fill)(pixel * const left, pixel * const top,
+ const unsigned int avail_l, const unsigned int avail_u,
+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
+ const unsigned int stride,
+ const unsigned int size)
+{
+ pixel a;
+ unsigned int i;
+
+ // 1st find DL value
+ if ((avail_l & 1) == 0) {
+ if (avail_l != 0)
+ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride];
+ else
+ {
+ // (avail_l | avail_u) != 0 so this must be good
+ const unsigned int n = rmbd(avail_u)*4;
+ a = (n >= size) ? src_ur[n - size] : src_u[n];
+ }
+ }
+
+ // L
+ {
+ pixel * d = left + size * 2 - 1;
+ const pixel * s = src_l + (size * 2 - 1) * stride;
+ unsigned int x = avail_l;
+ for (i = 0; i < size * 2; i += 4, x >>= 1)
+ {
+ if ((x & 1) != 0) {
+ // Avail
+ *d-- = *s;
+ s -= stride;
+ *d-- = *s;
+ s -= stride;
+ *d-- = *s;
+ s -= stride;
+ *d-- = a = *s;
+ s -= stride;
+ }
+ else
+ {
+ *d-- = a;
+ *d-- = a;
+ *d-- = a;
+ *d-- = a;
+ s -= stride * 4;
+ }
+ }
+ // UL
+ *d = a = (x & 1) != 0 ? *s : a;
+ }
+
+ // U
+ {
+ pixel * d = top;
+ const pixel * s = src_u;
+ unsigned int x = avail_u;
+
+ for (i = 0; i < size; i += 4, x >>= 1)
+ {
+ if ((x & 1) != 0) {
+ // Avail
+ *d++ = *s++;
+ *d++ = *s++;
+ *d++ = *s++;
+ *d++ = a = *s++;
+ }
+ else
+ {
+ *d++ = a;
+ *d++ = a;
+ *d++ = a;
+ *d++ = a;
+ s += 4;
+ }
+ }
+
+ // UR
+ s = src_ur;
+ for (i = 0; i < size; i += 4, x >>= 1)
+ {
+ if ((x & 1) != 0) {
+ // Avail
+ *d++ = *s++;
+ *d++ = *s++;
+ *d++ = *s++;
+ *d++ = a = *s++;
+ }
+ else
+ {
+ *d++ = a;
+ *d++ = a;
+ *d++ = a;
+ *d++ = a;
+ s += 4;
+ }
+ }
+ }
+}
+
+
+#if !PRED_C && PW == 1
+#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
+#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
+#else
+#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
+#endif
+
+// Reqs:
+//
+// Planar: DL[0], L, ul, U, UR[0]
+// DC: dl, L, ul, U, ur
+// A2-9: DL, L, ul, u, ur
+// A10: dl, L, ul, u, ur
+// A11-17 dl, L, UL, U, ur
+// A18-25 dl, L, Ul, U, ur
+// A26 dl, l, ul, U, ur
+// A27-34 dl, l, ul, U, UR
+
+#ifndef INCLUDED_ONCE
+
+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
+
+static const uint8_t req_avail_c[35] =
+{
+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
+ AVAIL_L | 0 | AVAIL_U, // DC
+ AVAIL_DL | AVAIL_L, // 2
+ AVAIL_DL | AVAIL_L, // 3
+ AVAIL_DL | AVAIL_L, // 4
+ AVAIL_DL | AVAIL_L, // 5
+ AVAIL_DL | AVAIL_L, // 6
+ AVAIL_DL | AVAIL_L, // 7
+ AVAIL_DL | AVAIL_L, // 8
+ AVAIL_DL | AVAIL_L, // 9
+ AVAIL_L, // 10 (H)
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
+ AVAIL_U, // 26 (V)
+ AVAIL_U | AVAIL_UR, // 27
+ AVAIL_U | AVAIL_UR, // 28
+ AVAIL_U | AVAIL_UR, // 29
+ AVAIL_U | AVAIL_UR, // 30
+ AVAIL_U | AVAIL_UR, // 31
+ AVAIL_U | AVAIL_UR, // 32
+ AVAIL_U | AVAIL_UR, // 33
+ AVAIL_U | AVAIL_UR // 34
+};
+
+static const uint8_t req_avail[4][35] = {
+{
+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
+ AVAIL_L | 0 | AVAIL_U, // DC
+ AVAIL_DL | AVAIL_L, // 2
+ AVAIL_DL | AVAIL_L, // 3
+ AVAIL_DL | AVAIL_L, // 4
+ AVAIL_DL | AVAIL_L, // 5
+ AVAIL_DL | AVAIL_L, // 6
+ AVAIL_DL | AVAIL_L, // 7
+ AVAIL_DL | AVAIL_L, // 8
+ AVAIL_DL | AVAIL_L, // 9
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H)
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
+ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V)
+ AVAIL_U | AVAIL_UR, // 27
+ AVAIL_U | AVAIL_UR, // 28
+ AVAIL_U | AVAIL_UR, // 29
+ AVAIL_U | AVAIL_UR, // 30
+ AVAIL_U | AVAIL_UR, // 31
+ AVAIL_U | AVAIL_UR, // 32
+ AVAIL_U | AVAIL_UR, // 33
+ AVAIL_U | AVAIL_UR // 34
+},
+{ // 3
+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
+ AVAIL_L | 0 | AVAIL_U, // DC
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
+ AVAIL_DL | AVAIL_L | 0, // 3
+ AVAIL_DL | AVAIL_L | 0, // 4
+ AVAIL_DL | AVAIL_L | 0, // 5
+ AVAIL_DL | AVAIL_L | 0, // 6
+ AVAIL_DL | AVAIL_L | 0, // 7
+ AVAIL_DL | AVAIL_L | 0, // 8
+ AVAIL_DL | AVAIL_L | 0, // 9
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
+ AVAIL_U | AVAIL_UR | 0, // 27
+ AVAIL_U | AVAIL_UR | 0, // 28
+ AVAIL_U | AVAIL_UR | 0, // 29
+ AVAIL_U | AVAIL_UR | 0, // 30
+ AVAIL_U | AVAIL_UR | 0, // 31
+ AVAIL_U | AVAIL_UR | 0, // 32
+ AVAIL_U | AVAIL_UR | 0, // 33
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
+},
+{ // 4
+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
+ AVAIL_L | 0 | AVAIL_U, // DC
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7
+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8
+ AVAIL_DL | AVAIL_L | 0, // 9
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
+ AVAIL_U | AVAIL_UR | 0, // 27
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
+},
+{ // 5
+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
+ AVAIL_L | 0 | AVAIL_U, // DC
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
+ AVAIL_L | 0, // 10 (H)
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
+ AVAIL_U | 0, // 26 (V)
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
+ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
+}
+};
+
+
+#endif
+
+#define filter_light1 FUNC(filter_light1)
+static inline pixel filter_light1(pixel a, pixel b, pixel c)
+{
+ return (a + b*2 + c + 2) >> 2;
+}
+
+#define filter_light FUNC(filter_light)
+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
+{
+ pixel p0;
+ pixel p2 = *src;
+    // Allow for final pel - it is just clearer to have the call take the actual number of output pels
+ unsigned int n_minus_1 = n - 1;
+
+ do
+ {
+ src += sstride;
+ p0 = p1;
+ p1 = p2;
+ p2 = *src;
+ *dst++ = filter_light1(p0, p1, p2);
+ } while (--n_minus_1 != 0);
+ *dst = filter_light1(p1, p2, pn);
+}
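+
+// filter_light applies the standard [1 2 1]/4 smoothing with rounding
+// over n output pels: p1 supplies the sample just before the start and
+// pn the one just past the end, so the edge pels get proper neighbours.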
+
+#define filter_strong FUNC(filter_strong)
+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
+{
+ unsigned int a = 64 * p0 + 32;
+ const int v = p1 - p0;
+
+ do
+ {
+ *dst++ = (a += v) >> 6;
+ } while (--n != 0);
+}
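+
+// filter_strong is fixed-point linear interpolation (sketch): output i
+// is (64*p0 + (i + 1)*(p1 - p0) + 32) >> 6, i.e. a ramp from p0 towards
+// p1 in 1/64th steps with round-to-nearest - the HEVC "strong"
+// (bilinear) reference smoothing used for 32x32 intra blocks.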
+
+#define intra_filter FUNC(intra_filter)
+static av_always_inline void intra_filter(
+ pixel * const left, pixel * const top,
+ const unsigned int req, const unsigned int avail,
+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
+ const unsigned int stride,
+ const unsigned int top_right_size, const unsigned int down_left_size,
+ const unsigned int log2_size)
+{
+ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
+ const unsigned int size = 1 << log2_size;
+
+ // a_ is the first pel in a section working round dl -> ur
+ // b_ is the last
+ // Beware that top & left work out from UL so usage of a_ & b_ may
+ // swap between them. It is a bad naming scheme but I have found no
+ // better
+ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
+ const pixel * b_dl = src_l + size * stride;
+ const pixel * a_l = src_l + (size - 1) * stride;
+ const pixel * b_l = src_l;
+ const pixel * ab_ul = src_l - stride;
+ const pixel * a_u = src_u;
+ const pixel * b_u = src_u + size - 1;
+ const pixel * a_ur = src_ur;
+ const pixel * b_ur = src_ur + top_right_size - 1;
+
+ const unsigned int want = req & ~avail;
+ const unsigned int have = req & avail;
+ unsigned int i;
+
+ if ((avail & AVAIL_DL) == 0)
+ {
+ a_dl = a_ur;
+ if ((avail & AVAIL_U) != 0)
+ a_dl = a_u;
+ if ((avail & AVAIL_UL) != 0)
+ a_dl = ab_ul;
+ if ((avail & AVAIL_L) != 0)
+ a_dl = a_l;
+ b_dl = a_dl;
+ }
+
+ if ((avail & AVAIL_L) == 0)
+ {
+ a_l = b_dl;
+ b_l = b_dl;
+ }
+ if ((avail & AVAIL_UL) == 0)
+ {
+ ab_ul = b_l;
+ }
+ if ((avail & AVAIL_U) == 0)
+ {
+ a_u = ab_ul;
+ b_u = ab_ul;
+ }
+ if ((avail & AVAIL_UR) == 0)
+ {
+ a_ur = b_u;
+ b_ur = b_u;
+ }
+
+ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints
+ {
+ if ((req & AVAIL_UL) != 0)
+ left[-1] = *ab_ul;
+
+ if ((want & AVAIL_L) != 0)
+ EXTEND(left, *a_l, size);
+ if ((want & AVAIL_DL) != 0)
+ EXTEND(left + size, *a_dl, size);
+ if ((want & AVAIL_U) != 0)
+ EXTEND(top, *a_u, size);
+ if ((want & AVAIL_UR) != 0)
+ EXTEND(top + size, *a_ur, size);
+
+ if ((have & AVAIL_U) != 0)
+ // Always good - even with sand
+ memcpy(top, a_u, size * sizeof(pixel));
+ if ((have & AVAIL_UR) != 0)
+ {
+ memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
+ EXTEND(top + size + top_right_size, *b_ur,
+ size - top_right_size);
+ }
+ if ((have & AVAIL_L) != 0)
+ {
+ for (i = 0; i < size; i++)
+ left[i] = b_l[stride * i];
+ }
+ if ((have & AVAIL_DL) != 0)
+ {
+ for (i = 0; i < down_left_size; i++)
+ left[i + size] = b_dl[stride * i];
+ EXTEND(left + size + down_left_size, *a_dl,
+ size - down_left_size);
+ }
+ }
+ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
+ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
+ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
+ {
+ if ((req & (AVAIL_U | AVAIL_UR)) != 0)
+ filter_strong(top, *ab_ul, *b_ur, size * 2);
+ left[-1] = *ab_ul;
+ if ((req & (AVAIL_L | AVAIL_DL)) != 0)
+ filter_strong(left, *ab_ul, *a_dl, size*2);
+ }
+ else
+ {
+ // Same code for both have & want for UL
+ if ((req & AVAIL_UL) != 0)
+ {
+ left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
+ }
+
+ if ((want & AVAIL_L) != 0)
+ {
+ EXTEND(left, *a_l, size);
+ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
+ }
+ if ((want & AVAIL_DL) != 0)
+ {
+            // If we want DL then it cannot be avail, so a_dl == a_l and no edge rounding is needed
+ EXTEND(left + size, *a_l, size);
+ }
+ if ((want & AVAIL_U) != 0)
+ {
+ EXTEND(top, *a_u, size);
+ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
+ }
+ if ((want & AVAIL_UR) != 0)
+ {
+            // If we want UR then it cannot be avail, so a_ur == b_u and no edge rounding is needed
+ EXTEND(top + size, *a_ur, size);
+ }
+
+ if ((have & AVAIL_U) != 0)
+ {
+ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
+ }
+ if ((have & AVAIL_UR) != 0) {
+ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
+ top[size*2 - 1] = *b_ur;
+ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
+ }
+ if ((have & AVAIL_L) != 0)
+ {
+ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
+ }
+ if ((have & AVAIL_DL) != 0)
+ {
+ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
+ left[size*2 - 1] = *a_dl;
+ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
+ }
+ }
+}
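+
+// Control flow above (summary): req is what the prediction mode needs,
+// want == req & ~avail are edges that must be synthesised by replicating
+// the nearest available sample (borrowed from around the DL -> UR ring),
+// and have == req & avail are edges copied from the frame and, depending
+// on req, given the light [1 2 1] or the strong bilinear filter.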
+
+#define INTRA_FILTER(log2_size) \
+static void FUNC(intra_filter_ ## log2_size)( \
+ uint8_t * const left, uint8_t * const top, \
+ const unsigned int req, const unsigned int avail, \
+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
+ const unsigned int stride, \
+ const unsigned int top_right_size, const unsigned int down_left_size) \
+{ \
+ intra_filter((pixel *)left, (pixel *)top, req, avail, \
+ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
+}
+
+INTRA_FILTER(2)
+INTRA_FILTER(3)
+INTRA_FILTER(4)
+INTRA_FILTER(5)
+
+#undef intra_filter
+#undef INTRA_FILTER
+
+static void FUNC(intra_pred)(const HEVCRpiContext * const s,
+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
+ const unsigned int log2_size)
+{
+    // c_idx will always be 1 for _c versions and 0 for y
+ const unsigned int c_idx = PRED_C;
+ const unsigned int hshift = ctx_hshift(s, c_idx);
+ const unsigned int vshift = ctx_vshift(s, c_idx);
+ const unsigned int size = (1 << log2_size);
+ const unsigned int x = x0 >> hshift;
+ const unsigned int y = y0 >> vshift;
+
+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
+ pixel *const src = c_idx == 0 ?
+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+
+ // Align so we can do multiple loads in the asm
+ // Padded to 16 byte boundary so as not to confuse anything
+ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
+ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
+
+ pixel * const left = left_array + 16 / sizeof(pixel);
+ const pixel * top_pred = top;
+
+ const pixel * src_l = src - 1;
+ const pixel * src_u = src - stride;
+ const pixel * src_ur = src_u + size;
+#if !PRED_C
+ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
+#else
+ const unsigned int req = req_avail_c[mode];
+#endif
+
+ // If we have nothing to pred from then fill with grey
+ // This isn't a common case but dealing with it here means we don't have to
+ // test for it later
+ if (avail == 0)
+ {
+dc_only:
+#if !PRED_C
+ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
+#else
+ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
+#endif
+ return;
+ }
+
+ {
+        // N.B. stride is in pixels (not bytes), or pixel-pairs in the case of chroma
+ const AVFrame * const frame = s->frame;
+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
+ if ((x & mask) == 0)
+ src_l -= stripe_adj;
+ if (((x + size) & mask) == 0)
+ src_ur += stripe_adj;
+ }
+
+ // Can deal with I-slices in 'normal' code even if CIP
+ // This also means that we don't need to generate (elsewhere) is_intra
+ // for IRAP frames
+ if (s->ps.pps->constrained_intra_pred_flag == 1 &&
+ s->sh.slice_type != HEVC_SLICE_I)
+ {
+ // * If we ever actually care about CIP performance then we should
+ // special case out size 4 stuff (can be done by 'normal') and
+ // have 8-pel avail masks
+ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
+ -(int)(s->ps.sps->pcm_width),
+ 1 << (((x - 1) >> (3 - hshift)) & 7),
+ 1 - hshift,
+ avail,
+ size,
+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
+ vshift != 0 ? 0 : (y >> 2) & 1);
+
+ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
+ (x >> (3 - hshift)) & 7,
+ 1 - hshift,
+ avail,
+ size,
+ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
+ hshift != 0 ? 0 : (x >> 2) & 1);
+
+ // Anything left?
+ if ((avail_l | avail_u) == 0)
+ goto dc_only;
+
+ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
+
+#if !PRED_C
+ if ((req & FILTER_LIGHT) != 0)
+ {
+ const unsigned threshold = 1 << (BIT_DEPTH - 5);
+ if ((req & FILTER_STRONG) != 0 &&
+ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold &&
+ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
+ {
+ filter_strong(top, left[-1], top[63], 64);
+ filter_strong(left, left[-1], left[63], 64);
+ } else
+ {
+ // LHS writes UL too so copy for top
+ const pixel p_ul = left[-1];
+ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
+ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
+ }
+ }
+#endif
+ }
+ else
+ {
+ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
+ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
+ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
+ {
+ top_pred = src_u;
+ }
+ else
+ {
+#if !PRED_C
+ s->hpc.intra_filter[log2_size - 2]
+#else
+ s->hpc.intra_filter_c[log2_size - 2]
+#endif
+ ((uint8_t *)left, (uint8_t *)top, req, avail,
+ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
+ ur_size,
+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
+ }
+ }
+
+
+#if !PRED_C
+ switch (mode) {
+ case INTRA_PLANAR:
+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_ANGULAR_HORIZONTAL:
+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ case INTRA_ANGULAR_VERTICAL:
+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ default:
+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
+#else
+ switch (mode) {
+ case INTRA_PLANAR:
+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_ANGULAR_HORIZONTAL:
+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ case INTRA_ANGULAR_VERTICAL:
+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ default:
+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
+
+#if DUMP_PRED
+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
+#endif
+#endif
+}
+
+#if !PRED_C
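+// Planar prediction: each output pel averages a horizontal blend (between
+// left[y] and top[size]) with a vertical blend (between top[x] and
+// left[size]), with rounding, per the HEVC planar intra mode.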
+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left, ptrdiff_t stride,
+ int trafo_size)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ const pixel *top = (const pixel *)_top;
+ const pixel *left = (const pixel *)_left;
+ int size = 1 << trafo_size;
+ for (y = 0; y < size; y++)
+ for (x = 0; x < size; x++)
+ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
+ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1);
+}
+#else
+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
+ const uint8_t * _left, ptrdiff_t stride,
+ int trafo_size)
+{
+ int x, y;
+ int size = 1 << trafo_size;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ for (y = 0; y < size; y++, src += stride)
+ {
+ for (x = 0; x < size; x++)
+ {
+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] +
+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
+ }
+ }
+}
+#endif
+
+#define PRED_PLANAR(size)\
+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
+ const uint8_t *left, ptrdiff_t stride) \
+{ \
+ FUNC(pred_planar)(src, top, left, stride, size + 2); \
+}
+
+PRED_PLANAR(0)
+PRED_PLANAR(1)
+PRED_PLANAR(2)
+PRED_PLANAR(3)
+
+#undef PRED_PLANAR
+
+#if !PRED_C
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int log2_size)
+{
+ int i, j, x, y;
+ int size = (1 << log2_size);
+ pixel *src = (pixel *)_src;
+ const pixel *top = (const pixel *)_top;
+ const pixel *left = (const pixel *)_left;
+ int dc = size;
+ pixel4 a;
+ for (i = 0; i < size; i++)
+ dc += left[i] + top[i];
+
+ dc >>= log2_size + 1;
+
+ a = PIXEL_SPLAT_X4(dc);
+
+ for (i = 0; i < size; i++)
+ for (j = 0; j < size; j+=4)
+ AV_WN4P(&POS(j, i), a);
+
+// Was: if (c_idx == 0 && size < 32)
+// As we now have separate fns for y & c, only the size test is needed
+ if (size < 32)
+ {
+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
+ for (x = 1; x < size; x++)
+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
+ for (y = 1; y < size; y++)
+ POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
+ }
+}
+#else
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride, int log2_size)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const c_src_ptr_t top = (c_src_ptr_t)_top;
+ const c_src_ptr_t left = (c_src_ptr_t)_left;
+ unsigned int dc0 = size;
+ unsigned int dc1 = size;
+
+ for (i = 0; i < size; i++)
+ {
+ dc0 += left[i][0] + top[i][0];
+ dc1 += left[i][1] + top[i][1];
+ }
+
+ dc0 >>= log2_size + 1;
+ dc1 >>= log2_size + 1;
+
+ for (i = 0; i < size; i++, src += stride)
+ {
+ for (j = 0; j < size; ++j)
+ {
+ src[j][0] = dc0;
+ src[j][1] = dc1;
+
+ }
+ }
+}
+#endif
+
+#define PRED_DC(size)\
+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \
+ const uint8_t *left, ptrdiff_t stride) \
+{ \
+ FUNC(pred_dc)(src, top, left, stride, size + 2); \
+}
+
+PRED_DC(0)
+PRED_DC(1)
+PRED_DC(2)
+PRED_DC(3)
+
+#undef PRED_DC
+
+
+
+
+#if !PRED_C
+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
+{
+ int i, j;
+ int size = (1 << log2_size);
+ pixel *src = (pixel *)_src;
+ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
+
+ for (i = 0; i < size; i++)
+ for (j = 0; j < size; j+=4)
+ AV_WN4P(&POS(j, i), a);
+}
+#else
+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ const pixel a = (1 << (BIT_DEPTH - 1));
+
+ for (i = 0; i < size; i++, src += stride)
+ {
+ for (j = 0; j < size; ++j)
+ {
+ src[j][0] = a;
+ src[j][1] = a;
+ }
+ }
+}
+#endif
+
+#define PRED_DC0(size)\
+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \
+{ \
+ FUNC(pred_dc0)(src, stride, size + 2); \
+}
+
+PRED_DC0(0)
+PRED_DC0(1)
+PRED_DC0(2)
+PRED_DC0(3)
+
+#undef PRED_DC0
+
+
+
+
+#ifndef ANGLE_CONSTS
+#define ANGLE_CONSTS
+static const int intra_pred_angle[] = {
+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
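+// inv_angle[] covers modes 11..25: each entry is -round(8192 / |angle|)
+// (e.g. 8192/2 = 4096, 8192/26 ~= 315) and is used to project reference
+// samples across the top-left corner when the prediction angle is negative.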
+static const int inv_angle[] = {
+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+ -630, -910, -1638, -4096
+};
+#endif
+
+#if !PRED_C
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
+ pixel *src = (pixel *)_src;
+ const pixel *top = (const pixel *)_top;
+ const pixel *left = (const pixel *)_left;
+
+ int angle = intra_pred_angle[mode - 2];
+ pixel ref_array[3 * MAX_TB_SIZE + 4];
+ pixel *ref_tmp = ref_array + size;
+ const pixel *ref;
+ int last = (size * angle) >> 5;
+
+ if (mode >= 18) {
+ ref = top - 1;
+
+ if (angle < 0)
+ {
+ memcpy(ref_tmp + 1, top, size * PW);
+ ref_tmp[0] = left[-1];
+
+ for (x = last; x <= -1; x++)
+ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
+ ref = ref_tmp;
+ }
+
+ for (y = 0; y < size; y++) {
+ int idx = ((y + 1) * angle) >> 5;
+ int fact = ((y + 1) * angle) & 31;
+ if (fact) {
+ for (x = 0; x < size; x += 4) {
+ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
+ fact * ref[x + idx + 2] + 16) >> 5;
+ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
+ fact * ref[x + 1 + idx + 2] + 16) >> 5;
+ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
+ fact * ref[x + 2 + idx + 2] + 16) >> 5;
+ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
+ fact * ref[x + 3 + idx + 2] + 16) >> 5;
+ }
+ } else {
+ for (x = 0; x < size; x += 4)
+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
+ }
+ }
+ if (mode == 26 && size < 32) {
+ for (y = 0; y < size; y++)
+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
+ }
+
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
+ for (x = 0; x <= size; x += 4)
+ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
+ // Inv angle <= -256 so top offset >= 0
+ for (x = last; x <= -1; x++)
+ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
+ ref = ref_tmp;
+ }
+
+ for (x = 0; x < size; x++) {
+ int idx = ((x + 1) * angle) >> 5;
+ int fact = ((x + 1) * angle) & 31;
+ if (fact) {
+ for (y = 0; y < size; y++) {
+ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
+ fact * ref[y + idx + 2] + 16) >> 5;
+ }
+ } else {
+ for (y = 0; y < size; y++)
+ POS(x, y) = ref[y + idx + 1];
+ }
+ }
+ if (mode == 10 && size < 32) {
+ for (x = 0; x < size; x += 4) {
+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1));
+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
+ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
+ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
+ }
+ }
+ }
+}
+#else
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
+ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
+ c_src_ptr_t top = (c_src_ptr_t)_top;
+ c_src_ptr_t left = (c_src_ptr_t)_left;
+
+ const int angle = intra_pred_angle[mode - 2];
+ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
+ c_dst_ptr_t ref_tmp = ref_array + size;
+ c_src_ptr_t ref;
+ const int last = (size * angle) >> 5;
+
+ if (mode >= 18) {
+ ref = top - 1;
+ if (angle < 0) {
+ memcpy(ref_tmp + 1, top, size * 2 * PW);
+ ref_tmp[0][0] = left[-1][0];
+ ref_tmp[0][1] = left[-1][1];
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (y = 0; y < size; y++, src += stride) {
+ const int idx = ((y + 1) * angle) >> 5;
+ const int fact = ((y + 1) * angle) & 31;
+ if (fact) {
+ for (x = 0; x < size; ++x) {
+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
+ fact * ref[x + idx + 2][0] + 16) >> 5;
+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
+ fact * ref[x + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ memcpy(src, ref + idx + 1, size * 2 * PW);
+ }
+ }
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
+ for (x = last; x <= -1; x++)
+ {
+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+ }
+ ref = (c_src_ptr_t)ref_tmp;
+ }
+
+ for (x = 0; x < size; x++, src++) {
+ const int idx = ((x + 1) * angle) >> 5;
+ const int fact = ((x + 1) * angle) & 31;
+ if (fact) {
+ for (y = 0; y < size; y++) {
+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
+ fact * ref[y + idx + 2][0] + 16) >> 5;
+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
+ fact * ref[y + idx + 2][1] + 16) >> 5;
+ }
+ } else {
+ for (y = 0; y < size; y++)
+ {
+ src[y * stride][0] = ref[y + idx + 1][0];
+ src[y * stride][1] = ref[y + idx + 1][1];
+ }
+ }
+ }
+ }
+}
+#endif
+
+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
+ ptrdiff_t stride, int mode)
+{
+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
+}
+
+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
+ ptrdiff_t stride, int mode)
+{
+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
+}
+
+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
+ ptrdiff_t stride, int mode)
+{
+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
+}
+
+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
+ ptrdiff_t stride, int mode)
+{
+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
+}
+
+#undef cpel
+#undef c_src_ptr_t
+#undef c_dst_ptr_t
+
+#undef EXTEND
+#undef POS
+#undef PW
+
+#undef filter_light1
+#undef filter_light
+#undef filter_strong
+#undef ref_gen
+
+#ifndef INCLUDED_ONCE
+#define INCLUDED_ONCE
+#endif
+
diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
new file mode 100644
index 0000000000..98a0b104b7
--- /dev/null
+++ b/libavcodec/rpi_mailbox.c
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+
+#include <linux/ioctl.h>
+
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+#include "rpi_mailbox.h"
+//#include <interface/vctypes/vc_image_structs.h>
+
+/*
+ * use ioctl to send mbox property message
+ */
+
+static int mbox_property(int file_desc, void *buf)
+{
+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+ if (ret_val < 0) {
+ printf("ioctl_set_msg failed:%d\n", ret_val);
+ }
+
+#ifdef DEBUG
+    unsigned *p = buf;
+    unsigned size = *(unsigned *)buf;
+    int i;
+    for (i = 0; i < size / 4; i++)
+        printf("%04zx: 0x%08x\n", i * sizeof *p, p[i]);
+#endif
+ return ret_val;
+}
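+// Property buffers follow the firmware mailbox layout used above:
+//   u32[0] total buffer size in bytes   u32[1] request/response code (0 = request)
+//   u32[2] tag id                       u32[3] value buffer size
+//   u32[4] request/response data size   u32[5..] tag value buffer
+// terminated by a zero end tag - see mbox_get_image_params() below.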
+
+#define GET_VCIMAGE_PARAMS 0x30044
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
+{
+ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
+ uint32_t * p = buf;
+ void * rimg;
+ int rv;
+
+ *p++ = 0; // size
+ *p++ = 0; // process request
+ *p++ = GET_VCIMAGE_PARAMS;
+ *p++ = sizeof(*img);
+ *p++ = sizeof(*img);
+ rimg = p;
+ memcpy(p, img, sizeof(*img));
+ p += sizeof(*img) / sizeof(*p);
+ *p++ = 0; // End tag
+ buf[0] = (p - buf) * sizeof(*p);
+
+ rv = mbox_property(fd, buf);
+ memcpy(img, rimg, sizeof(*img));
+
+ return rv;
+}
+
+
+#define SET_CLOCK_RATE 0x00038002
+#define GET_MAX_CLOCK 0x00030004
+#define CLOCK_HEVC 11
+
+static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
+{
+ uint32_t buf[32];
+ uint32_t * p = buf;
+ int rv;
+
+ *p++ = 0; // size
+ *p++ = 0; // process request
+ *p++ = command;
+ *p++ = 8;
+ *p++ = 8;
+ *p++ = *word0;
+ *p++ = *word1;
+ *p++ = 0; // End tag
+ buf[0] = (p - buf) * sizeof(*p);
+
+ rv = mbox_property(fd, buf);
+ *word0 = buf[6];
+ *word1 = buf[7];
+ return rv;
+}
+
+int mbox_open(void) {
+ int file_desc;
+
+ // open a char device file used for communicating with kernel mbox driver
+ file_desc = open(DEVICE_FILE_NAME, 0);
+ if (file_desc < 0) {
+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+ }
+ return file_desc;
+}
+
+void mbox_close(int file_desc) {
+ close(file_desc);
+}
+
+int mbox_request_clock(int fd) {
+ int rv;
+ unsigned word0, word1 = 0;
+ word0 = CLOCK_HEVC;
+ rv = mbox_property_generic(fd, GET_MAX_CLOCK, &word0, &word1);
+ if (rv != 0)
+ return rv;
+ word1 = word0;
+ word0 = CLOCK_HEVC;
+ rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
+ return rv;
+}
+
+int mbox_release_clock(int fd) {
+ int rv;
+ unsigned word0, word1 = 0;
+ word0 = CLOCK_HEVC;
+ word1 = 0;
+ rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
+ return rv;
+}
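+// Typical usage (sketch): raise the HEVC block to its max clock for the
+// life of the decoder, then drop the request again:
+//   int fd = mbox_open();
+//   mbox_request_clock(fd);    // query max HEVC clock, then request it
+//   /* ... decode ... */
+//   mbox_release_clock(fd);    // request rate 0 again
+//   mbox_close(fd);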
diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
new file mode 100644
index 0000000000..b2654ef01e
--- /dev/null
+++ b/libavcodec/rpi_mailbox.h
@@ -0,0 +1,58 @@
+#ifndef RPI_MAILBOX_H
+#define RPI_MAILBOX_H
+
+/* The image structure. */
+typedef struct vc_image_extra_uv_s {
+ void *u, *v;
+ int vpitch;
+} VC_IMAGE_EXTRA_UV_T;
+
+typedef union {
+ VC_IMAGE_EXTRA_UV_T uv;
+// VC_IMAGE_EXTRA_RGBA_T rgba;
+// VC_IMAGE_EXTRA_PAL_T pal;
+// VC_IMAGE_EXTRA_TF_T tf;
+// VC_IMAGE_EXTRA_BAYER_T bayer;
+// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
+// VC_IMAGE_EXTRA_CODEC_T codec;
+// VC_IMAGE_EXTRA_OPENGL_T opengl;
+} VC_IMAGE_EXTRA_T;
+
+
+typedef struct VC_IMAGE_T {
+ unsigned short type; /* should restrict to 16 bits */
+ unsigned short info; /* format-specific info; zero for VC02 behaviour */
+ unsigned short width; /* width in pixels */
+ unsigned short height; /* height in pixels */
+ int pitch; /* pitch of image_data array in bytes */
+ int size; /* number of bytes available in image_data array */
+ void *image_data; /* pixel data */
+ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
+ void *metadata; /* metadata header for the image */
+  void *pool_object; /* non-NULL if image was allocated from a vc_pool */
+ int mem_handle; /* the mem handle for relocatable memory storage */
+ int metadata_size; /* size of metadata of each channel in bytes */
+ int channel_offset; /* offset of consecutive channels in bytes */
+  uint32_t video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
+ uint8_t num_channels; /* number of channels (2 for stereo) */
+ uint8_t current_channel;/* the channel this header is currently pointing to */
+ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
+  uint8_t is_channel_linked; /* Track whether the above structure has been used to link the header
+                                into a linked-multichannel image */
+ uint8_t channel_index; /* index of the channel this header represents while
+ it is being linked. */
+ uint8_t _dummy[3]; /* pad struct to 64 bytes */
+} VC_IMAGE_T;
+
+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
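+// Compile-time size check: the array has length 1 when sizeof(VC_IMAGE_T)
+// is exactly 64, and an (illegal) length of -1 otherwise.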
+
+
+extern int mbox_open(void);
+extern void mbox_close(int file_desc);
+
+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+
+int mbox_request_clock(int fd);
+int mbox_release_clock(int fd);
+
+#endif
diff --git a/libavcodec/rpi_mem.c b/libavcodec/rpi_mem.c
new file mode 100644
index 0000000000..812921f665
--- /dev/null
+++ b/libavcodec/rpi_mem.c
@@ -0,0 +1,326 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/vctypes/vc_image_types.h>
+#include <interface/vcsm/user-vcsm.h>
+#pragma GCC diagnostic pop
+
+#include "rpi_mem.h"
+#include "rpi_zc_frames.h"
+
+
+#define OPT_PREFER_CMA 0
+
+struct rpi_cache_flush_env_s {
+ struct vcsm_user_clean_invalid2_s v;
+};
+
+
+// GPU memory alloc fns (internal)
+
+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
+{
+ if (p->arm != NULL)
+ vcsm_unlock_ptr(p->arm);
+ if (p->vcsm_handle != 0)
+ vcsm_free(p->vcsm_handle);
+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
+}
+
+
+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
+ const int numbytes, const unsigned int cache_type, const char * const name)
+{
+ memset(p, 0, sizeof(*p));
+ p->numbytes = (numbytes + 255) & ~255; // Round up
+
+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
+ goto fail;
+ }
+ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name);
+ goto fail;
+ }
+ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
+ goto fail;
+ }
+ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ gpu_free_internal(p);
+ return AVERROR(ENOMEM);
+}
+
+// Public gpu fns
+
+// Allocate memory on GPU
+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success.
+// This allocates memory that will not be cached in ARM's data cache.
+// Therefore safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
+}
+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
+}
+
+void gpu_free(GPU_MEM_PTR_T * const p) {
+ gpu_free_internal(p);
+}
+
+void rpi_mem_gpu_uninit(void)
+{
+ vcsm_exit();
+ bcm_host_deinit();
+}
+
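+// Initialise VCSM, trying the FKMS-preferred allocator first and falling
+// back to the other; the return value (1 or 2, cf. GPU_INIT_xxx) records
+// which attempt succeeded, or an AVERROR on failure.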
+int rpi_mem_gpu_init(const unsigned int flags)
+{
+ const int wants_cma = bcm_host_is_fkms_active();
+ int use_cma;
+
+ (void)flags;
+
+ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0)
+ use_cma = 1;
+ else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0)
+ use_cma = 0;
+ else
+ return AVERROR(EINVAL);
+
+ bcm_host_init();
+
+ return use_cma + 1;
+}
+
+// ----------------------------------------------------------------------------
+//
+// Cache flush functions
+
+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
+{
+ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
+ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}};
+ return rfe;
+}
+
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
+ // Nothing needed
+}
+
+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
+{
+ int rc = 0;
+ if (rfe->v.op_count != 0) {
+ if (vcsm_clean_invalid2(&rfe->v) != 0)
+ {
+ const int err = errno;
+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
+ rc = AVERROR(err);
+ }
+ rfe->v.op_count = 0;
+ }
+ return rc;
+}
+
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
+{
+    int rc = rpi_cache_flush_execute(rfe);
+
+ return rc;
+}
+
+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
+{
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+
+ av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
+
+ b->invalidate_mode = mode;
+ b->block_count = blocks;
+ b->start_address = gm->arm + offset0;
+ b->block_size = block_size;
+ b->inter_block_stride = block_stride;
+}
+
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset, const unsigned int size)
+{
+ // Deal with empty pointer trivially
+ if (gm == NULL || size == 0)
+ return;
+
+ av_assert1(offset <= gm->numbytes);
+ av_assert1(size <= gm->numbytes);
+ av_assert1(offset + size <= gm->numbytes);
+
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
+}
+
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+{
+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
+}
+
+
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+{
+#if !RPI_ONE_BUF
+#error Fixme! (NIF)
+#endif
+ if (gpu_is_buf1(frame)) {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+ }
+ else
+ {
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+ }
+}
+
+// Flush an area of a frame
+// Width, height, x0, y0 in luma pels
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
+{
+ const unsigned int y_offset = frame->linesize[0] * y0;
+ const unsigned int y_size = frame->linesize[0] * height;
+ // Round UV up/down to get everything
+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
+
+#if 0
+ // *** frame->height is cropped height so not good
+ // As all unsigned they will also reject -ve
+ // Test individually as well as added to reject overflow
+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
+ av_assert0(n <= (unsigned int)frame->height);
+ av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
+ if (!gpu_is_buf1(frame))
+ {
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+ }
+ }
+ else if (!av_rpi_is_sand_frame(frame))
+ {
+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+ if (do_luma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+ }
+ if (do_chroma) {
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+ }
+ }
+ else
+ {
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
+ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
+
+ if (do_chroma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
+ b->block_size = uv_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ if (do_luma)
+ {
+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+ b->invalidate_mode = mode;
+ b->block_count = block_count;
+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
+ b->block_size = y_size;
+ b->inter_block_stride = stride1 * stride2;
+ }
+ }
+}
+
+// Call this to clean and invalidate a region of memory
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
+ rpi_cache_buf_t cbuf;
+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+ rpi_cache_flush_finish(rfe);
+}
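+// Example (sketch): batch several ranges into a single flush
+//   rpi_cache_buf_t cbuf;
+//   rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
+//   rpi_cache_flush_add_gm_range(rfe, gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, off0, len0);
+//   rpi_cache_flush_add_gm_range(rfe, gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, off1, len1);
+//   rpi_cache_flush_finish(rfe);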
+
diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h
new file mode 100644
index 0000000000..a451079806
--- /dev/null
+++ b/libavcodec/rpi_mem.h
@@ -0,0 +1,88 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#ifndef RPI_MEM_H
+#define RPI_MEM_H
+
+typedef struct gpu_mem_ptr_s {
+ unsigned char *arm; // Pointer to memory mapped on ARM side
+ int vc_handle; // Videocore handle of relocatable memory
+ int vcsm_handle; // Handle for use by VCSM
+ int vc; // Address for use in GPU code
+ int numbytes; // Size of memory block
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+
+#define GPU_INIT_GPU 1
+#define GPU_INIT_CMA 2
+
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T * const p);
+int rpi_mem_gpu_init(const unsigned int flags);
+void rpi_mem_gpu_uninit(void);
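+// Example (sketch):
+//   GPU_MEM_PTR_T gm;
+//   if (gpu_malloc_uncached(size, &gm) == 0) {
+//       memcpy(gm.arm, src, size);   // write through the ARM mapping
+//       /* ... pass gm.vc to VPU/QPU code ... */
+//       gpu_free(&gm);
+//   }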
+
+// Cache flush stuff
+
+struct rpi_cache_flush_env_s;
+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+
+typedef struct {uint32_t t[33];} rpi_cache_buf_t;
+
+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
+// Free env without flushing
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & clear but do not free the env
+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & free the env
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+
+typedef enum
+{
+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
+} rpi_cache_flush_mode_t;
+
+struct AVFrame;
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+ const unsigned int offset, const unsigned int size);
+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
+
+// init, add, finish for one gm ptr
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
+#endif
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000000..cb7b96119e
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,776 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/avassert.h"
+
+#include "config.h"
+
+#include <pthread.h>
+#include <time.h>
+
+#include <interface/vcsm/user-vcsm.h>
+
+#include "rpi_mailbox.h"
+#include "rpi_mem.h"
+#include "rpi_qpu.h"
+#include "rpi_hevc_shader.h"
+#include "rpi_hevc_transform8.h"
+#include "rpi_hevc_transform10.h"
+#include "libavutil/rpi_sand_fns.h"
+
+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
+// Beware this is expensive and will probably throw off all other timing by >10%
+#define RPI_TRACE_QPU_PROFILE_ALL 0
+
+// QPU "noflush" flags
+// a mixture of flushing & profiling
+
+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
+#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
+#define vcos_verify_ge0(x) ((x)>=0)
+
+// Size in 32bit words
+#define QPU_CODE_SIZE 4098
+#define VPU_CODE_SIZE 16384
+
+static const short rpi_transMatrix2even[32][16] = { // Even rows first
+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
+// Odd rows
+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
+
+// Code/constants on GPU
+struct GPU
+{
+// unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code8[VPU_CODE_SIZE];
+ unsigned int vpu_code10[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
+};
+
+#define WAIT_COUNT_MAX 16
+
+typedef struct trace_time_one_s
+{
+ int count;
+ int64_t start[WAIT_COUNT_MAX];
+ int64_t total[WAIT_COUNT_MAX];
+} trace_time_one_t;
+
+typedef struct trace_time_wait_s
+{
+ unsigned int jcount;
+ int64_t start0;
+ int64_t last_update;
+ trace_time_one_t active;
+ trace_time_one_t wait;
+} trace_time_wait_t;
+
+typedef struct vq_wait_s
+{
+ sem_t sem;
+ struct vq_wait_s * next;
+} vq_wait_t;
+
+#define VQ_WAIT_POOL_SIZE 16
+typedef struct vq_wait_pool_s
+{
+ vq_wait_t * head;
+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
+} vq_wait_pool_t;
+
+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+
+typedef struct gpu_env_s
+{
+ int open_count;
+ int init_count;
+ int vpu_i_cache_flushed;
+ GPU_MEM_PTR_T qpu_code_gm_ptr;
+ GPU_MEM_PTR_T code_gm_ptr;
+ GPU_MEM_PTR_T dummy_gm_ptr;
+ vq_wait_pool_t wait_pool;
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ trace_time_wait_t ttw;
+#endif
+} gpu_env_t;
+
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+static gpu_env_t * gpu = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+
+static int64_t ns_time(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
+}
+
+
+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+
+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
+#define T_ARG(t) T_SEC(t), T_MS(t)
+#define T_FMT "%u.%03u"
+
+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
+{
+ // Update totals for levels that are still pending
+ for (int i = 0; i < tto->count; ++i) {
+ tto->total[i] += now - tto->start[i];
+ tto->start[i] = now;
+ }
+
+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
+ prefix,
+ T_ARG(now - start0 - tto->total[0]),
+ T_ARG(tto->total[0]),
+ T_ARG(tto->total[1]),
+ T_ARG(tto->total[2]),
+ T_ARG(tto->total[3]));
+}
+
+
+static void tto_start(trace_time_one_t * const tto, const int64_t now)
+{
+ av_assert0(tto->count < WAIT_COUNT_MAX);
+ tto->start[tto->count++] = now;
+}
+
+static void tto_end(trace_time_one_t * const tto, const int64_t now)
+{
+ const int n = --tto->count;
+ av_assert0(n >= 0);
+ tto->total[n] += now - tto->start[n];
+}
+
+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
+{
+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
+ tto_print(&ttw->active, now, ttw->start0, "Active");
+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
+}
+
+#endif
+
+// GPU memory alloc fns (internal)
+
+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
+{
+ if (p->arm != NULL)
+ vcsm_unlock_ptr(p->arm);
+ if (p->vcsm_handle != 0)
+ vcsm_free(p->vcsm_handle);
+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
+}
+
+
+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
+ const int numbytes, const unsigned int cache_type, const char * const name)
+{
+ memset(p, 0, sizeof(*p));
+ p->numbytes = (numbytes + 255) & ~255; // Round up
+
+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
+ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
+ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
+ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
+ {
+ gpu_free_internal(p);
+ return AVERROR(ENOMEM);
+ }
+ return 0;
+}
+
+
+// GPU init, free, lock, unlock
+
+static void gpu_term(void)
+{
+ gpu_env_t * const ge = gpu;
+
+    // We have to hope that everything has terminated...
+ gpu = NULL;
+
+ vc_gpuserv_deinit();
+
+ gpu_free_internal(&ge->code_gm_ptr);
+ gpu_free_internal(&ge->qpu_code_gm_ptr);
+ gpu_free_internal(&ge->dummy_gm_ptr);
+
+ vcsm_exit();
+
+ vq_wait_pool_deinit(&ge->wait_pool);
+
+ free(ge);
+}
+
+
+// Connect to QPU, returns 0 on success.
+static int gpu_init(gpu_env_t ** const gpu) {
+ volatile struct GPU* ptr;
+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
+ int rv;
+ *gpu = NULL;
+
+ if (ge == NULL)
+ return -1;
+
+ vq_wait_pool_init(&ge->wait_pool);
+
+ vcsm_init();
+
+ // Now copy over the QPU code into GPU memory
+ if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
+ return rv;
+
+ {
+ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
+ memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
+ }
+
+ // And the VPU code
+ if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
+ return rv;
+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
+ // Zero everything so we have zeros between the code bits
+ memset((void *)ptr, 0, sizeof(*ptr));
+ {
+ int num_bytes = sizeof(rpi_hevc_transform8);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
+ }
+ {
+ int num_bytes = sizeof(rpi_hevc_transform10);
+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
+ }
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
+ // Generate a dummy "frame" & fill with 0x80
+ // * Could reset to 1 <<bit_depth?
+ if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
+ return rv;
+ memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
+
+ *gpu = ge;
+ return 0;
+}
+
+
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
+// Take the lock for exclusive access to the GPU state (must already be initialised).
+static gpu_env_t * gpu_lock(void) {
+ pthread_mutex_lock(&gpu_mutex);
+
+ av_assert1(gpu != NULL);
+ return gpu;
+}
+
+static gpu_env_t * gpu_lock_ref(void)
+{
+ pthread_mutex_lock(&gpu_mutex);
+
+ if (gpu == NULL) {
+ int rv = gpu_init(&gpu);
+ if (rv != 0) {
+ gpu_unlock();
+ return NULL;
+ }
+ }
+
+ ++gpu->open_count;
+ return gpu;
+}
+
+static void gpu_unlock_unref(gpu_env_t * const ge)
+{
+ if (--ge->open_count == 0)
+ gpu_term();
+
+ gpu_unlock();
+}
+
+static inline gpu_env_t * gpu_ptr(void)
+{
+ av_assert1(gpu != NULL);
+ return gpu;
+}
+
+unsigned int vpu_get_fn(const unsigned int bit_depth) {
+ uint32_t a = 0;
+
+ // Make sure that the gpu is initialized
+ av_assert1(gpu != NULL);
+ switch (bit_depth){
+ case 8:
+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
+ break;
+ case 10:
+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
+ break;
+ default:
+ av_assert0(0);
+ }
+ return a;
+}
+
+unsigned int vpu_get_constants(void) {
+ av_assert1(gpu != NULL);
+ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
+}
+
+void gpu_ref(void)
+{
+ gpu_lock_ref();
+ gpu_unlock();
+}
+
+void gpu_unref(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+ gpu_unlock_unref(ge);
+}
+
+// ----------------------------------------------------------------------------
+
+
+// Wait abstractions - mostly so we can easily add profile code
+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_init(&wp->pool[i].sem, 0, 0);
+ wp->pool[i].next = wp->pool + i + 1;
+ }
+ wp->head = wp->pool + 0;
+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+}
+
+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+{
+ unsigned int i;
+ wp->head = NULL;
+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+ sem_destroy(&wp->pool[i].sem);
+ wp->pool[i].next = NULL;
+ }
+}
+
+
+// Wait objects are pooled so we don't pay for sem_init/sem_destroy on every job
+static vq_wait_t * vq_wait_new(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ vq_wait_t * const wait = ge->wait_pool.head;
+ ge->wait_pool.head = wait->next;
+ wait->next = NULL;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ tto_start(&ge->ttw.active, ns_time());
+#endif
+
+ gpu_unlock();
+ return wait;
+}
+
+static void vq_wait_delete(vq_wait_t * const wait)
+{
+ gpu_env_t * const ge = gpu_lock();
+ wait->next = ge->wait_pool.head;
+ ge->wait_pool.head = wait;
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ trace_time_wait_t * const ttw = &ge->ttw;
+ const int64_t now = ns_time();
+ ++ttw->jcount;
+ tto_end(&ttw->wait, now);
+
+ if (ttw->start0 == 0)
+ {
+ ttw->start0 = ttw->active.start[0];
+ ttw->last_update = ttw->start0;
+ }
+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
+ {
+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
+ ttw_print(ttw, now);
+ }
+ }
+#endif
+ gpu_unlock_unref(ge);
+}
+
+static void vq_wait_wait(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ const int64_t now = ns_time();
+ gpu_env_t * const ge = gpu_lock();
+ tto_start(&ge->ttw.wait, now);
+ gpu_unlock();
+ }
+#endif
+
+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
+ /* loop */;
+}
+
+static void vq_wait_post(vq_wait_t * const wait)
+{
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ {
+ gpu_env_t *const ge = gpu_lock();
+ tto_end(&ge->ttw.active, ns_time());
+ gpu_unlock();
+ }
+#endif
+
+ sem_post(&wait->sem);
+}
+
+
+
+// Header comments were wrong for these two
+#define VPU_QPU_MASK_QPU 1
+#define VPU_QPU_MASK_VPU 2
+
+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
+{
+// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
+ vpu_qpu_job_env_t * vqj = buf;
+// memset(vqj, 0, sizeof(*vqj));
+ vqj->n = 0;
+ vqj->mask = 0;
+ return vqj;
+}
+
+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+{
+// memset(vqj, 0, sizeof(*vqj));
+// free(vqj);
+}
+
+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
+{
+ struct gpu_job_s * const j = vqj->j + vqj->n++;
+ av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
+ return j;
+}
+
+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
+{
+ if (vpu_code != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_VPU;
+
+ j->command = EXECUTE_VPU;
+ j->callback.func = 0;
+ j->callback.cookie = NULL;
+ // The bottom two bits of the execute address contain no-flush flags
+ // b0 will flush the VPU I-cache if unset so we nearly always want that set
+ // as we never reload code
+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
+ j->u.v.q[1] = r0;
+ j->u.v.q[2] = r1;
+ j->u.v.q[3] = r2;
+ j->u.v.q[4] = r3;
+ j->u.v.q[5] = r4;
+ j->u.v.q[6] = r5;
+ gpu->vpu_i_cache_flushed = 1;
+ }
+}
+
+// flags are QPU_FLAGS_xxx
+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
+{
+ if (n != 0) {
+ struct gpu_job_s *const j = new_job(vqj);
+ vqj->mask |= VPU_QPU_MASK_QPU;
+
+ j->command = EXECUTE_QPU;
+ j->callback.func = 0;
+ j->callback.cookie = NULL;
+
+ j->u.q.jobs = n;
+#if RPI_TRACE_QPU_PROFILE_ALL
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+#else
+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+#endif
+ j->u.q.timeout = 5000;
+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+ }
+}
+
+// Convert callback to sem post
+static void vpu_qpu_job_callback_wait(void * v)
+{
+ vq_wait_post(v);
+}
+
+// Poke a user-supplied sem
+static void vpu_qpu_job_callback_sem(void * v)
+{
+ sem_post((sem_t *)v);
+}
+
+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+{
+ vq_wait_t * wait;
+
+ if (vqj->mask == 0) {
+ *wait_h = NULL;
+ return;
+ }
+
+ // We are going to want a sync object
+ wait = vq_wait_new();
+
+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
+ // If we only posted one thing or only QPU jobs
+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+ {
+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+ av_assert1(j->callback.func == 0);
+
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+ else
+ {
+ struct gpu_job_s *const j = new_job(vqj);
+
+ j->command = EXECUTE_SYNC;
+ j->u.s.mask = vqj->mask;
+ j->callback.func = vpu_qpu_job_callback_wait;
+ j->callback.cookie = wait;
+ }
+
+ vqj->mask = 0;
+ *wait_h = wait;
+}
+
+// Returns 0 if no sync added ('cos Q empty), 1 if sync added
+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
+{
+ // If nothing on q then just return
+ if (vqj->mask == 0)
+ return 0;
+
+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
+ // If we only posted one thing or only QPU jobs
+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+ {
+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+ av_assert1(j->callback.func == 0);
+
+ j->callback.func = vpu_qpu_job_callback_sem;
+ j->callback.cookie = sem;
+ }
+ else
+ {
+ struct gpu_job_s *const j = new_job(vqj);
+
+ j->command = EXECUTE_SYNC;
+ j->u.s.mask = vqj->mask;
+ j->callback.func = vpu_qpu_job_callback_sem;
+ j->callback.cookie = sem;
+ }
+
+ vqj->mask = 0;
+ return 1;
+}
+
+
+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+{
+ if (vqj->n == 0)
+ return 0;
+
+ return vc_gpuserv_execute_code(vqj->n, vqj->j);
+}
+
+// Simple wrapper of start + delete
+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
+{
+ int rv;
+ rv = vpu_qpu_job_start(vqj);
+ vpu_qpu_job_delete(vqj);
+ return rv;
+}
+
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+{
+ if (wait_h != NULL)
+ {
+ vq_wait_t * const wait = *wait_h;
+ if (wait != NULL) {
+ *wait_h = NULL;
+ vq_wait_wait(wait);
+ vq_wait_delete(wait);
+ }
+ }
+}
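+// Typical job flow (sketch):
+//   vpu_qpu_job_env_t env;
+//   vpu_qpu_job_h vqj = vpu_qpu_job_init(&env);
+//   vpu_qpu_job_add_qpu(vqj, n, mail);       // and/or vpu_qpu_job_add_vpu()
+//   vpu_qpu_wait_h sync;
+//   vpu_qpu_job_add_sync_this(vqj, &sync);
+//   vpu_qpu_job_finish(vqj);                 // submit + delete
+//   vpu_qpu_wait(&sync);                     // block until complete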
+
+int vpu_qpu_init(void)
+{
+ gpu_env_t * const ge = gpu_lock_ref();
+ if (ge == NULL)
+ return -1;
+
+ if (ge->init_count++ == 0)
+ {
+ vc_gpuserv_init();
+ }
+
+ gpu_unlock();
+ return 0;
+}
+
+void vpu_qpu_term(void)
+{
+ gpu_env_t * const ge = gpu_lock();
+
+ if (--ge->init_count == 0) {
+ vc_gpuserv_deinit();
+
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ ttw_print(&ge->ttw, ns_time());
+#endif
+ }
+
+ gpu_unlock_unref(ge);
+}
+
+uint32_t qpu_fn(const int * const mc_fn)
+{
+ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
+}
+
+uint32_t qpu_dummy(void)
+{
+ return gpu->dummy_gm_ptr.vc;
+}
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
+ // Dummy values we can catch with emulation
+ qf->y_pxx = ~1U;
+ qf->y_bxx = ~2U;
+ qf->y_p00 = ~3U;
+ qf->y_b00 = ~4U;
+ qf->c_pxx = ~5U;
+ qf->c_bxx = ~6U;
+
+ switch (bit_depth) {
+ case 8:
+        qf->y_pxx = qpu_fn(mc_filter_y_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
+ qf->c_pxx = qpu_fn(mc_filter_c_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c_b);
+ break;
+ case 10:
+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
new file mode 100644
index 0000000000..8777687021
--- /dev/null
+++ b/libavcodec/rpi_qpu.h
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
+#include "rpi_mem.h"
+#include "rpi_zc_frames.h"
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s
+#pragma GCC diagnostic pop
+
+// QPU specific functions
+
+typedef struct HEVCRpiQpu {
+ uint32_t c_pxx;
+ uint32_t c_pxx_l1;
+ uint32_t c_bxx;
+ uint32_t y_pxx;
+ uint32_t y_bxx;
+ uint32_t y_p00;
+ uint32_t y_b00;
+} HEVCRpiQpu;
+
+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
+
+uint32_t qpu_fn(const int * const mc_fn);
+uint32_t qpu_dummy(void);
+
+#define QPU_N_GRP 4
+#define QPU_N_MAX 12
+
+#define QPU_MAIL_EL_VALS 2
+
+struct vpu_qpu_wait_s;
+typedef struct vq_wait_s * vpu_qpu_wait_h;
+
+// VPU specific functions
+
+struct vpu_qpu_job_env_s;
+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+
+#define VPU_QPU_JOB_MAX 4
+struct vpu_qpu_job_env_s
+{
+ unsigned int n;
+ unsigned int mask;
+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
+};
+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
+
+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
+extern unsigned int vpu_get_constants(void);
+
+// Waits for the previously posted code to complete and will null out *wait_h after use
+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
+int vpu_qpu_init(void);
+void vpu_qpu_term(void);
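+
+// Illustrative sketch only (not part of the API): the intended call
+// sequence for running QPU work and waiting for it. example_run_qpu_jobs is
+// hypothetical; "mail" is assumed to be a mailbox array built elsewhere with
+// QPU_MAIL_EL_VALS entries per QPU.
+#if 0
+static int example_run_qpu_jobs(const uint32_t * const mail, const unsigned int n_qpus)
+{
+    vpu_qpu_job_env_t qvbuf;
+    const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
+    vpu_qpu_wait_h sync_h;
+
+    vpu_qpu_job_add_qpu(vqj, n_qpus, mail);   // Queue the QPU work
+    vpu_qpu_job_add_sync_this(vqj, &sync_h);  // Ask for a wait handle
+    if (vpu_qpu_job_finish(vqj) != 0)         // Submit & delete the job env
+        return -1;
+    vpu_qpu_wait(&sync_h);                    // Block until the work completes
+    return 0;
+}
+#endif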
+
+void gpu_ref(void);
+void gpu_unref(void);
+
+#endif
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
index 0000000000..37be9a0f49
--- /dev/null
+++ b/libavcodec/rpi_zc.c
@@ -0,0 +1,1227 @@
+#include "config.h"
+
+#include "libavcodec/avcodec.h"
+#include "rpi_mem.h"
+#include "rpi_mailbox.h"
+#include "rpi_zc.h"
+#include "libavutil/avassert.h"
+#include <pthread.h>
+#include <stdatomic.h>
+#include <assert.h>
+
+#include "libavutil/buffer_internal.h"
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <interface/vctypes/vc_image_types.h>
+#include <interface/vcsm/user-vcsm.h>
+#pragma GCC diagnostic pop
+
+#define TRACE_ALLOC 0
+#define DEBUG_ALWAYS_KEEP_LOCKED 0
+
+struct ZcPoolEnt;
+
+typedef struct ZcPool
+{
+ size_t numbytes;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
+
+typedef struct ZcPoolEnt
+{
+ size_t numbytes;
+
+ unsigned int vcsm_handle;
+ unsigned int vc_handle;
+ void * map_arm;
+ unsigned int map_vc;
+
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
+typedef struct ZcOldCtxVals
+{
+ int thread_safe_callbacks;
+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+ void * opaque;
+} ZcOldCtxVals;
+
+typedef struct AVZcEnv
+{
+    atomic_int refcount;
+ ZcOldCtxVals old;
+
+ void * pool_env;
+ av_rpi_zc_alloc_buf_fn_t * alloc_buf;
+ av_rpi_zc_free_pool_fn_t * free_pool;
+
+ unsigned int pool_size;
+} ZcEnv;
+
+typedef struct ZcUserBufEnv {
+ void * v;
+ const av_rpi_zc_buf_fn_tab_t * fn;
+ size_t numbytes;
+ int offset;
+} ZcUserBufEnv;
+
+#define ZC_BUF_INVALID 0
+#define ZC_BUF_VALID 1
+#define ZC_BUF_NEVER 2
+
+typedef struct ZcBufEnv {
+ GPU_MEM_PTR_T gmem;
+ AVZcEnvPtr zc;
+ int is_valid;
+ AVBufferRef * user;
+ AVRpiZcFrameGeometry geo;
+ size_t size_y;
+ size_t size_c;
+ size_t size_pic;
+ ssize_t offset;
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+} ZcBufEnv;
+
+
+
+
+
+
+#define ALLOC_PAD 0
+#define ALLOC_ROUND 0x1000
+#define STRIDE_ROUND 64
+#define STRIDE_OR 0
+
+#define DEBUG_ZAP0_BUFFERS 0
+
+static inline int av_rpi_is_sand_format(const int format)
+{
+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) ||
+ (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10);
+}
+
+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+{
+ return av_rpi_is_sand_format(frame->format);
+}
+
+//----------------------------------------------------------------------------
+//
+// Internal pool stuff
+
+// Pool entry functions
+
+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
+{
+ ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
+
+    // Pad (ALLOC_PAD, currently 0) & round up to a 4k boundary
+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
+    // The 0x80 flag maps all pages now rather than waiting for lazy mapping
+ // BEWARE that in GPU land a later unlock/lock pair will put us back into
+ // lazy mode - which will also break cache invalidate calls.
+ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
+ goto fail1;
+ }
+
+#if TRACE_ALLOC
+ printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle);
+#endif
+
+ zp->numbytes = alloc_size;
+ zp->pool = pool;
+ return zp;
+
+fail1:
+ av_free(zp);
+fail0:
+ return NULL;
+}
+
+static void zc_pool_ent_free(ZcPoolEnt * const zp)
+{
+#if TRACE_ALLOC
+ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle);
+#endif
+
+ if (zp->vcsm_handle != 0)
+ {
+ // VC addr & handle need no dealloc
+ if (zp->map_arm != NULL)
+ vcsm_unlock_hdl(zp->vcsm_handle);
+ vcsm_free(zp->vcsm_handle);
+ }
+ av_free(zp);
+}
+
+//----------------------------------------------------------------------------
+//
+// Pool functions
+
+static void zc_pool_free_ent_list(ZcPoolEnt * p)
+{
+ while (p != NULL)
+ {
+ ZcPoolEnt * const zp = p;
+ p = p->next;
+ zc_pool_ent_free(zp);
+ }
+}
+
+static void zc_pool_flush(ZcPool * const pool)
+{
+ ZcPoolEnt * p = pool->head;
+ pool->head = NULL;
+ pool->numbytes = ~0U;
+ zc_pool_free_ent_list(p);
+}
+
+static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
+{
+ ZcPoolEnt * zp = NULL;
+ ZcPoolEnt * flush_list = NULL;
+ size_t numbytes;
+
+ pthread_mutex_lock(&pool->lock);
+
+ numbytes = pool->numbytes;
+
+ // If size isn't close then dump the pool
+ // Close in this context means within 128k
+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
+ {
+ flush_list = pool->head;
+ pool->head = NULL;
+ pool->numbytes = numbytes = req_bytes;
+ }
+ else if (pool->head != NULL)
+ {
+ zp = pool->head;
+ pool->head = zp->next;
+ }
+
+ pthread_mutex_unlock(&pool->lock);
+
+ zc_pool_free_ent_list(flush_list);
+
+ if (zp == NULL)
+ zp = zc_pool_ent_alloc(pool, numbytes);
+
+ return zp;
+}
+
+static void zc_pool_put_ent(ZcPoolEnt * const zp)
+{
+ ZcPool * const pool = zp == NULL ? NULL : zp->pool;
+ if (zp != NULL)
+ {
+ pthread_mutex_lock(&pool->lock);
+#if TRACE_ALLOC
+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes);
+#endif
+
+ if (pool->numbytes == zp->numbytes)
+ {
+ zp->next = pool->head;
+ pool->head = zp;
+ pthread_mutex_unlock(&pool->lock);
+ }
+ else
+ {
+ pthread_mutex_unlock(&pool->lock);
+ zc_pool_ent_free(zp);
+ }
+ }
+}
+
+static ZcPool *
+zc_pool_new(void)
+{
+ ZcPool * const pool = av_mallocz(sizeof(*pool));
+ if (pool == NULL)
+ return NULL;
+
+ pool->numbytes = -1;
+ pool->head = NULL;
+ pthread_mutex_init(&pool->lock, NULL);
+ return pool;
+}
+
+static void
+zc_pool_delete(ZcPool * const pool)
+{
+ if (pool != NULL)
+ {
+ pool->numbytes = -1;
+ zc_pool_flush(pool);
+ pthread_mutex_destroy(&pool->lock);
+ av_free(pool);
+ }
+}
+
+//============================================================================
+//
+// ZC implementation using above pool implementation
+//
+// Fn table fns...
+
+static void zc_pool_free_v(void * v)
+{
+ zc_pool_put_ent(v);
+}
+
+static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
+{
+ ZcPoolEnt * zp = v;
+ return zp->vcsm_handle;
+}
+
+static unsigned int zc_pool_ent_vc_handle_v(void * v)
+{
+ ZcPoolEnt * zp = v;
+ if (zp->vc_handle == 0)
+ {
+ if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0)
+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
+ __func__, zp->vcsm_handle);
+ }
+ return zp->vc_handle;
+}
+
+static void * zc_pool_ent_map_arm_v(void * v)
+{
+ ZcPoolEnt * zp = v;
+ if (zp->map_arm == NULL)
+ {
+ if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL)
+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
+ __func__, zp->vcsm_handle);
+ }
+ return zp->map_arm;
+}
+
+static unsigned int zc_pool_ent_map_vc_v(void * v)
+{
+ ZcPoolEnt * zp = v;
+ if (zp->map_vc == 0)
+ {
+ if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0)
+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
+ __func__, zp->vcsm_handle);
+ }
+ return zp->map_vc;
+}
+
+static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
+ .free = zc_pool_free_v,
+ .vcsm_handle = zc_pool_ent_vcsm_handle_v,
+ .vc_handle = zc_pool_ent_vc_handle_v,
+ .map_arm = zc_pool_ent_map_arm_v,
+ .map_vc = zc_pool_ent_map_vc_v,
+};
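+// (This fn table is the internal instance of the av_rpi_zc_buf_fn_tab_t
+// callback interface declared in rpi_zc.h - external allocators provide
+// their own equivalents.)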
+
+// ZC Env fns
+
+// Delete pool
+// All buffers guaranteed freed by now
+static void
+zc_pool_delete_v(void * v)
+{
+ zc_pool_delete((ZcPool *)v);
+ rpi_mem_gpu_uninit();
+}
+
+// Allocate a new ZC buffer
+static AVBufferRef *
+zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
+{
+ ZcPool * const pool = v;
+ ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
+ AVBufferRef * buf;
+
+ (void)geo; // geo ignored here
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
+ if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
+ goto fail2;
+ }
+
+ return buf;
+
+fail2:
+ zc_pool_put_ent(zp);
+fail0:
+ return NULL;
+}
+
+// Init wrappers - the public fns
+
+AVZcEnvPtr
+av_rpi_zc_int_env_alloc(void * logctx)
+{
+ ZcEnv * zc;
+ ZcPool * pool_env;
+
+ if (rpi_mem_gpu_init(0) < 0)
+ return NULL;
+
+ if ((pool_env = zc_pool_new()) == NULL)
+ goto fail1;
+
+ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
+ goto fail2;
+
+ return zc;
+
+fail2:
+ zc_pool_delete(pool_env);
+fail1:
+ rpi_mem_gpu_uninit();
+ return NULL;
+}
+
+void
+av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
+{
+ const AVZcEnvPtr zc = *zcp;
+ *zcp = NULL;
+ if (zc != NULL)
+ av_rpi_zc_env_release(zc);
+}
+
+//============================================================================
+//
+// Geometry
+//
+// This is a separate chunk from the rest
+
+// Get mailbox fd - should be in a lock when called
+// Rely on process close to close it
+static int mbox_fd(void)
+{
+ static int fd = -1;
+ if (fd != -1)
+ return fd;
+ return (fd = mbox_open());
+}
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format, const unsigned int video_width, const unsigned int video_height)
+{
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+
+ AVRpiZcFrameGeometry geo = {
+ .format = format,
+ .video_width = video_width,
+ .video_height = video_height
+ };
+
+ switch (format)
+ {
+ case AV_PIX_FMT_YUV420P:
+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 1;
+ geo.stripe_is_yc = 1;
+ break;
+
+ case AV_PIX_FMT_YUV420P10:
+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+ geo.stride_c = geo.stride_y / 2;
+ geo.height_y = (video_height + 32 + 31) & ~31;
+ geo.height_c = geo.height_y / 2;
+ geo.planes_c = 2;
+ geo.stripes = 1;
+ geo.bytes_per_pel = 2;
+ geo.stripe_is_yc = 1;
+ break;
+
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ {
+ const unsigned int stripe_w = 128;
+
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox keep a stashed
+ // copy as we will almost certainly just want the same numbers again
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV,
+ .width = video_width,
+ .height = video_height
+ };
+
+ mbox_get_image_params(mbox_fd(), &new_img);
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.stripe_is_yc = 1;
+ if (geo.height_y * stripe_w > img.pitch)
+ {
+ // "tall" sand - all C blocks now follow Y
+ geo.height_y = img.pitch / stripe_w;
+ geo.height_c = geo.height_y;
+ geo.stripe_is_yc = 0;
+ }
+ geo.planes_c = 1;
+ geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 1;
+
+ pthread_mutex_unlock(&sand_lock);
+#if 0
+ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
+ video_width, video_height,
+ geo.stride_y, geo.stride_c,
+ geo.height_y, geo.height_c,
+ geo.stripes, img.pitch);
+#endif
+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+ break;
+ }
+
+ case AV_PIX_FMT_RPI4_10:
+ {
+ const unsigned int stripe_w = 128; // bytes
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox keep a stashed
+ // copy as we will almost certainly just want the same numbers again
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV10COL,
+ .width = video_width,
+ .height = video_height
+ };
+
+ mbox_get_image_params(mbox_fd(), &new_img);
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 1;
+ geo.stripe_is_yc = 1;
+
+ pthread_mutex_unlock(&sand_lock);
+
+#if 0
+ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
+ video_width, video_height,
+ geo.stride_y, geo.stride_c,
+ geo.height_y, geo.height_c,
+ geo.stripes, img.pitch);
+#endif
+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+ break;
+ }
+
+ case AV_PIX_FMT_SAND64_16:
+ case AV_PIX_FMT_SAND64_10:
+ {
+ const unsigned int stripe_w = 128; // bytes
+
+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+ static VC_IMAGE_T img = {0};
+
+ // Given the overhead of calling the mailbox keep a stashed
+ // copy as we will almost certainly just want the same numbers again
+ // but that means we need a lock
+ pthread_mutex_lock(&sand_lock);
+
+ if (img.width != video_width || img.height != video_height)
+ {
+ VC_IMAGE_T new_img = {
+ .type = VC_IMAGE_YUV_UV_16,
+ .width = video_width,
+ .height = video_height
+ };
+
+ mbox_get_image_params(mbox_fd(), &new_img);
+ img = new_img;
+ }
+
+ geo.stride_y = stripe_w;
+ geo.stride_c = stripe_w;
+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+ geo.height_c = img.pitch / stripe_w - geo.height_y;
+ geo.planes_c = 1;
+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
+ geo.bytes_per_pel = 2;
+ geo.stripe_is_yc = 1;
+
+ pthread_mutex_unlock(&sand_lock);
+ break;
+ }
+
+ default:
+ break;
+ }
+ return geo;
+}
+
+//============================================================================
+//
+// ZC Env fns
+//
+// Frame copy fns
+
+static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
+ const AVFrame * const src)
+{
+ AVFrame dest_frame;
+ AVFrame * const dest = &dest_frame;
+ unsigned int i;
+ uint8_t * psrc, * pdest;
+
+ dest->format = src->format;
+ dest->width = src->width;
+ dest->height = src->height;
+
+ if (av_rpi_zc_get_buffer(zc, dest) != 0 ||
+ av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
+ {
+ return NULL;
+ }
+
+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
+ i != dest->height;
+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
+ {
+ memcpy(pdest, psrc, dest->width);
+ }
+ for (i = 0, psrc = src->data[1], pdest = dest->data[1];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+ for (i = 0, psrc = src->data[2], pdest = dest->data[2];
+ i != dest->height / 2;
+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
+ {
+ memcpy(pdest, psrc, dest->width / 2);
+ }
+
+ return dest->buf[0];
+}
+
+
+static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
+ const AVFrame * const src)
+{
+    (void)zc;
+    (void)src;
+    assert(0);  // 420P10 -> SAND128 conversion not implemented
+    return NULL;
+}
+
+
+static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
+ const AVFrame * const src, const unsigned int src_bits)
+{
+    (void)zc;
+    (void)src;
+    (void)src_bits;
+    assert(0);  // SAND64_16 -> SAND128 conversion not implemented
+    return NULL;
+}
+
+//----------------------------------------------------------------------------
+//
+// Public info extraction calls
+
+static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
+
+static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
+{
+    // Kludge: check the free fn to verify that this really is one of our
+    // buffers - can't think of a better way
+ return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL :
+ av_buffer_get_opaque(buf);
+}
+
+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
+{
+ // As gmem is the first el NULL should be preserved
+ return &pic_zbe_ptr(buf)->gmem;
+}
+
+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->vcsm_handle;
+}
+
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? -1 : p->vc_handle;
+}
+
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
+{
+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
+ return zbe == NULL ? 0 : zbe->offset;
+}
+
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
+{
+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
+ return zbe == NULL ? 0 : zbe->size_pic;
+}
+
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+ return p == NULL ? 0 : p->numbytes;
+}
+
+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
+{
+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
+ return zbe == NULL ? NULL : &zbe->geo;
+}
+
+AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
+{
+ av_assert0(!maycopy || zc != NULL);
+
+ if (frame->format != AV_PIX_FMT_YUV420P &&
+ frame->format != AV_PIX_FMT_YUV420P10 &&
+ !av_rpi_is_sand_frame(frame))
+ {
+ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ return NULL;
+ }
+
+ if (frame->buf[1] != NULL || frame->format != expected_format)
+ {
+#if RPI_ZC_SAND_8_IN_10_BUF
+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
+ {
+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
+ }
+#endif
+
+ if (maycopy)
+ {
+ if (frame->buf[1] != NULL)
+ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+ else
+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
+
+ switch (frame->format)
+ {
+ case AV_PIX_FMT_YUV420P10:
+ return zc_420p10_to_sand128(zc, frame);
+
+ case AV_PIX_FMT_SAND64_10:
+ return zc_sand64_16_to_sand128(zc, frame, 10);
+
+ default:
+ return zc_copy(zc, frame);
+ }
+ }
+ else
+ {
+ if (frame->buf[1] != NULL)
+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
+ else
+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
+ return NULL;
+ }
+ }
+
+ if (pic_gm_ptr(frame->buf[0]) == NULL)
+ {
+ if (maycopy)
+ {
+ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
+ return zc_copy(zc, frame);
+ }
+ else
+ {
+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
+ return NULL;
+ }
+ }
+
+ return av_buffer_ref(frame->buf[0]);
+}
+
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
+{
+ if (fr_ref != NULL)
+ {
+ av_buffer_unref(&fr_ref);
+ }
+}
+
+//----------------------------------------------------------------------------
+
+// Extract user environment from an AVBufferRef
+void * av_rpi_zc_buf_v(AVBufferRef * const buf)
+{
+ ZcBufEnv * const zbe = pic_zbe_ptr(buf);
+ if (zbe != NULL && zbe->user != NULL)
+ {
+ const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data;
+ return zub == NULL ? NULL : zub->v;
+ }
+ return NULL;
+}
+
+// AV buffer pre-free callback
+static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
+{
+ if (opaque != NULL)
+ {
+ ZcUserBufEnv * const zub = opaque;
+
+ if (zub->fn->free)
+ zub->fn->free(zub->v);
+
+ av_free(zub);
+ }
+}
+
+static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
+{
+ if (opaque != NULL)
+ {
+ ZcBufEnv * const zbe = opaque;
+
+ av_buffer_unref(&zbe->user);
+
+ if (zbe->zc != NULL)
+ av_rpi_zc_env_release(zbe->zc);
+
+ pthread_cond_destroy(&zbe->cond);
+ pthread_mutex_destroy(&zbe->lock);
+ av_free(zbe);
+ }
+}
+
+
+// Wrap the various ZC bits in an AV Buffer and resolve those things we want
+// resolved now.
+// Currently we resolve everything, but in future we might not
+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
+{
+ AVBufferRef *buf;
+ ZcUserBufEnv * zub;
+
+ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
+ return NULL;
+
+ zub->fn = fn_tab;
+ zub->v = v;
+ zub->numbytes = numbytes;
+ zub->offset = addr_offset;
+
+ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
+ av_free(zub);
+ return NULL;
+ }
+
+ return buf;
+}
+
+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
+{
+ ZcBufEnv * const zbe = pic_zbe_ptr(buf);
+
+ if (zbe == NULL)
+ return AVERROR(EINVAL);
+
+ if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
+ return AVERROR(EAGAIN);
+
+ if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
+ {
+ pthread_mutex_lock(&zbe->lock);
+ while (!zbe->is_valid)
+ pthread_cond_wait(&zbe->cond, &zbe->lock);
+ pthread_mutex_unlock(&zbe->lock);
+ }
+
+ if (zbe->is_valid == ZC_BUF_NEVER)
+ return AVERROR(EINVAL);
+
+ // Do alloc if we need it
+ if (zbe->user == NULL)
+ {
+ ZcEnv * const zc = zbe->zc;
+ const ZcUserBufEnv * zub;
+
+ av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
+
+ if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ goto fail;
+ }
+ zub = (const ZcUserBufEnv *)zbe->user->data;
+
+ // Track
+
+ zbe->offset = zub->offset;
+ zbe->gmem.numbytes = zub->numbytes;
+        // Fetch the vcsm handle before mapping so that error logs which
+        // quote it are meaningful
+        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
+        {
+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
+            goto fail;
+        }
+
+        if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL)
+        {
+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle);
+            goto fail;
+        }
+
+ if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
+ goto fail;
+ }
+ if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
+ {
+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
+ goto fail;
+ }
+
+ buf->buffer->data = zbe->gmem.arm + zbe->offset;
+ buf->buffer->size = zbe->size_pic;
+
+ // In this mode we shouldn't have anyone waiting for us
+ // so no need to signal
+ if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
+ zbe->is_valid = 1;
+ }
+
+ // Just overwrite - no point in testing
+ buf->data = zbe->gmem.arm + zbe->offset;
+ buf->size = zbe->size_pic;
+ return 0;
+
+fail:
+ av_buffer_unref(&zbe->user);
+ return AVERROR(ENOMEM);
+}
+
+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
+{
+ int rv;
+
+ // Do alloc if we need it
+ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
+ return rv;
+
+ // If we are a framebuf copy then the alloc can be done but we haven't
+ // imported its results yet
+ if (frame->data[0] == NULL)
+ {
+ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
+
+ frame->linesize[0] = zbe->geo.stride_y;
+ frame->linesize[1] = zbe->geo.stride_c;
+ frame->linesize[2] = zbe->geo.stride_c;
+ // abuse: linesize[3] = "stripe stride"
+        // stripe_stride is NOT the stride between slices; it is
+        // (that / geo.stride_y). In the general case this makes the
+        // calculation an xor and multiply rather than a divide and multiply
+ if (zbe->geo.stripes > 1)
+ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
+
+ frame->data[0] = frame->buf[0]->data;
+ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
+ if (zbe->geo.planes_c > 1)
+ frame->data[2] = frame->data[1] + zbe->size_c;
+
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+ }
+
+ return 0;
+}
+
+int av_rpi_zc_set_valid_frame(AVFrame * const frame)
+{
+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
+
+ if (zbe == NULL)
+ return AVERROR(EINVAL);
+
+ zbe->is_valid = ZC_BUF_VALID;
+ pthread_cond_broadcast(&zbe->cond);
+
+ return 0;
+}
+
+int av_rpi_zc_set_broken_frame(AVFrame * const frame)
+{
+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
+
+ if (zbe == NULL)
+ return AVERROR(EINVAL);
+
+ zbe->is_valid = ZC_BUF_NEVER;
+ pthread_cond_broadcast(&zbe->cond);
+
+ return 0;
+}
+
+void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
+{
+ zc->pool_size = pool_size;
+}
+
+unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
+{
+ return zc->pool_size;
+}
+
+int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
+{
+#if 1
+ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
+
+ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ if (zbe == NULL)
+ return AVERROR(ENOMEM);
+
+ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
+ {
+ av_free(zbe);
+ return AVERROR(ENOMEM);
+ }
+
+ pthread_mutex_init(&zbe->lock, NULL);
+ pthread_cond_init(&zbe->cond, NULL);
+ zbe->zc = zc;
+ atomic_fetch_add(&zc->refcount, 1);
+
+ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use
+ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
+ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
+ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
+
+#else
+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
+ const unsigned int size_y = geo.stride_y * geo.height_y;
+ const unsigned int size_c = geo.stride_c * geo.height_c;
+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
+ AVBufferRef * buf;
+ unsigned int i;
+
+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
+
+ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ return AVERROR(ENOMEM);
+ }
+
+ // Track
+ atomic_fetch_add(&zc->refcount, 1);
+ pic_zbe_ptr(buf)->zc = zc;
+
+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
+ frame->buf[i] = NULL;
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ }
+
+ frame->buf[0] = buf;
+
+ frame->linesize[0] = geo.stride_y;
+ frame->linesize[1] = geo.stride_c;
+ frame->linesize[2] = geo.stride_c;
+ // abuse: linesize[3] = "stripe stride"
+    // stripe_stride is NOT the stride between slices; it is
+    // (that / geo.stride_y). In the general case this makes the
+    // calculation an xor and multiply rather than a divide and multiply
+ if (geo.stripes > 1)
+ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
+
+ frame->data[0] = buf->data;
+ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
+ if (geo.planes_c > 1)
+ frame->data[2] = frame->data[1] + size_c;
+
+ frame->extended_data = frame->data;
+ // Leave extended buf alone
+
+#if RPI_ZC_SAND_8_IN_10_BUF != 0
+ // *** If we intend to use this for real we will want a 2nd buffer pool
+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
+#endif
+#endif
+
+ return 0;
+}
+
+void av_rpi_zc_env_release(const AVZcEnvPtr zc)
+{
+ const int n = atomic_fetch_add(&zc->refcount, -1);
+ if (n == 1) // was 1, now 0
+ {
+ zc->free_pool(zc->pool_env);
+ av_free(zc);
+ }
+}
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
+ void * pool_env,
+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
+ av_rpi_zc_free_pool_fn_t * free_pool_fn)
+{
+ ZcEnv * zc;
+
+ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
+ {
+ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
+ return NULL;
+ }
+
+ *zc = (ZcEnv){
+ .refcount = ATOMIC_VAR_INIT(1),
+ .pool_env = pool_env,
+ .alloc_buf = alloc_buf_fn,
+ .free_pool = free_pool_fn,
+ .pool_size = 0
+ };
+
+ return zc;
+}
+
+//============================================================================
+//
+// External ZC initialisation
+
+#define RPI_GET_BUFFER2 1
+
+
+static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+{
+#if !RPI_GET_BUFFER2
+ return avcodec_default_get_buffer2(s, frame, flags);
+#else
+ int rv;
+
+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
+ {
+// printf("Do default alloc: format=%#x\n", frame->format);
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+ else if (frame->format == AV_PIX_FMT_YUV420P ||
+ av_rpi_is_sand_frame(frame))
+ {
+ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
+ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
+ }
+ else
+ {
+ rv = avcodec_default_get_buffer2(s, frame, flags);
+ }
+
+#if 0
+ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+ frame->format, frame->width, frame->height,
+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
+ frame->data[0], frame->data[1], frame->data[2],
+ frame->buf[0], frame->buf[1], frame->buf[2],
+ av_buffer_get_opaque(frame->buf[0]));
+#endif
+ return rv;
+#endif
+}
+
+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
+{
+ return s->get_buffer2 == zc_get_buffer2;
+}
+
+int av_rpi_zc_init2(struct AVCodecContext * const s,
+ void * pool_env,
+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
+ av_rpi_zc_free_pool_fn_t * free_pool_fn)
+{
+ ZcEnv * zc;
+
+ av_assert0(!av_rpi_zc_in_use(s));
+
+ if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
+ return AVERROR(ENOMEM);
+
+ zc->old = (ZcOldCtxVals){
+ .opaque = s->opaque,
+ .get_buffer2 = s->get_buffer2,
+ .thread_safe_callbacks = s->thread_safe_callbacks
+ };
+
+ s->opaque = zc;
+ s->get_buffer2 = zc_get_buffer2;
+ s->thread_safe_callbacks = 1;
+ return 0;
+}
+
+void av_rpi_zc_uninit2(struct AVCodecContext * const s)
+{
+ ZcEnv * const zc = s->opaque;
+
+ av_assert0(av_rpi_zc_in_use(s));
+
+ s->get_buffer2 = zc->old.get_buffer2;
+ s->opaque = zc->old.opaque;
+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
+
+ av_rpi_zc_env_release(zc);
+}
+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000000..f00a7c962c
--- /dev/null
+++ b/libavcodec/rpi_zc.h
@@ -0,0 +1,228 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
+// Zero-Copy frame code for RPi
+// RPi needs Y/U/V planes to be contiguous for display. By default
+// ffmpeg will allocate separated planes so a memcpy is needed before
+// display. This code provides a method a making ffmpeg allocate a single
+// bit of memory for the frame when can then be reference counted until
+// display has finished with it.
+
+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
+// 0 disables
+// *** This option still in development
+// Only works if SAO active
+// Allocates buffers that are twice the required size
+#define RPI_ZC_SAND_8_IN_10_BUF 0
+
+struct AVBufferRef;
+struct AVFrame;
+struct AVCodecContext;
+enum AVPixelFormat;
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef struct AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+ unsigned int stride_y; // Luma stride (bytes)
+ unsigned int height_y; // Luma height (lines)
+ unsigned int stride_c; // Chroma stride (bytes)
+    unsigned int height_c;       // Chroma height (lines)
+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
+ unsigned int stripes; // Number of stripes (sand)
+ unsigned int bytes_per_pel;
+ int stripe_is_yc; // A single stripe is Y then C (false for tall sand)
+
+ int format; // Requested format
+ unsigned int video_width; // Requested width
+ unsigned int video_height; // Requested height
+} AVRpiZcFrameGeometry;
+
+// Get expected MMAL geometry for a given format, width & height
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+ const int format,
+ const unsigned int video_width, const unsigned int video_height);
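+
+// Illustrative sketch only: how a total allocation size falls out of a
+// geometry, mirroring the internal allocator's sum (Y + C planes per
+// stripe, repeated geo.stripes times). example_geometry_size is hypothetical.
+#if 0
+static size_t example_geometry_size(const AVRpiZcFrameGeometry * const geo)
+{
+    const size_t size_y = (size_t)geo->stride_y * geo->height_y;
+    const size_t size_c = (size_t)geo->stride_c * geo->height_c;
+    return (size_y + size_c * geo->planes_c) * geo->stripes;
+}
+#endif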
+
+//----------------------------------------------------------------------------
+//
+// Calls that extract info from a ZC frame whether internally or externally
+// allocated
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by ZC
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+// If maycopy == 0 then ZC may be NULL
+AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+// Get the vcsm_handle from the frame ref
+// Returns 0 if ref doesn't look valid
+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
+// Get offset from the start of the memory referenced
+// by the vc_handle to valid data
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+// Length of buffer data
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+// Geometry this frame was allocated with
+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
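+
+// Illustrative sketch only: a typical ref / extract / unref sequence for
+// handing a ZC frame to VC-side code. example_send_frame and its consumer
+// are hypothetical.
+#if 0
+static void example_send_frame(void * const logctx, const AVZcEnvPtr zc,
+                               const struct AVFrame * const frame)
+{
+    const AVRpiZcRefPtr ref = av_rpi_zc_ref(logctx, zc, frame, frame->format, 1);
+    if (ref != NULL)
+    {
+        const int vc_handle = av_rpi_zc_vc_handle(ref);
+        const int offset = av_rpi_zc_offset(ref);
+        // ... pass vc_handle + offset to the consumer, holding ref until done ...
+        av_rpi_zc_unref(ref);
+    }
+}
+#endif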
+
+//----------------------------------------------------------------------------
+//
+// Calls for external frame allocation
+
+// Callbacks registered in av_rpi_zc_init2
+
+// Callback to allocate a buf for a frame
+// The frame itself is generated in the calling code
+//
+// Parameters:
+//   pool_env  value passed to av_rpi_zc_init2
+// size size wanted
+// geo geometry of the frame to be allocated
+// Returns:
+// NULL Alloc failed
+//   ptr       AVBufferRef* of allocated buffer
+// In most cases av_rpi_zc_buf will be called by this function
+// and this will be the buf returned by that.
+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
+ const AVRpiZcFrameGeometry * geo);
+
+// Callback once ffmpeg is completely done with this pool
+// Called once all allocated buffers have been unrefed and ffmpeg's ref to
+// this pool has been dropped
+typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
+
+// Init ZC into a context
+// Sets opaque, get_buffer2, thread_safe_callbacks
+// Use if you want to allocate your own pools and/or create ZC buffers for
+// all decoders
+// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
+// apart by av_rpi_zc_xxx calls without this
+int av_rpi_zc_init2(struct AVCodecContext * const s,
+ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
+ av_rpi_zc_free_pool_fn_t * free_pool_fn);
+
+// Free ZC from a context
+void av_rpi_zc_uninit2(struct AVCodecContext * const s);
+
+// Get minimum pool size in frames - valid by the time the first alloc request
+// occurs. Takes into account thread requests and DPB sizes derived from SPS
+// rather than just adding a worst case DPB size.
+unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
+
+typedef struct av_rpi_zc_buf_fn_tab_s {
+ // This AVBuffer is being freed by ffmpeg - return memory
+ // to external pool. Memory may be, but need not be, unmapped.
+ // v is the ptr passed in av_rpi_zc_buf
+ void (* free)(void * v);
+
+ // Return appropriate handles / mappings
+ // v is the ptr passed in av_rpi_zc_buf
+ unsigned int (* vcsm_handle)(void * v);
+ unsigned int (* vc_handle)(void * v);
+ void * (* map_arm)(void * v);
+ unsigned int (* map_vc)(void * v);
+} av_rpi_zc_buf_fn_tab_t;
+
+// Allocate a ZC AVBufferRef and set its callback table
+// Doesn't take a buffer address directly - relies on callbacks to return
+// addresses as they are required. Mappings need not be generated until
+// the map callbacks are called but they should persist from then until
+// the buffer is freed.
+//
+// Parameters:
+// numbytes Size of the buffer
+// addr_offset Offset to first usable byte of buffer (for alignment)
+// normally 0
+// v Pointer passed to callbacks
+// fn_tab Function table
+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
+
+// Get the v ptr set in av_rpi_zc_buf
+void * av_rpi_zc_buf_v(AVBufferRef * const buf);
+
+//----------------------------------------------------------------------------
+//
+// Mostly internal calls but might possibly be wanted by outside code
+
+void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
+AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
+void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
+
+// Test to see if the context is using zc (checks get_buffer2)
+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+
+// Get buffer generates placeholders for later alloc
+int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
+// Resolve actually does the alloc (noop if already alloced) and sets the
+// data pointers on a buffer/frame that was copied before the alloc
+// occurred
+#define ZC_RESOLVE_FAIL 0 // return error on invalid
+#define ZC_RESOLVE_ALLOC 1 // alloc as invalid
+#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid
+#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid
+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
+
+int av_rpi_zc_set_valid_frame(AVFrame * const frame);
+int av_rpi_zc_set_broken_frame(AVFrame * const frame);
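+
+// Illustrative sketch only: the two-stage placeholder/alloc flow as used by
+// the internal get_buffer2 hook - get_buffer creates placeholders cheaply,
+// resolve then performs the real allocation. example_alloc_now is hypothetical.
+#if 0
+static int example_alloc_now(const AVZcEnvPtr zc, AVFrame * const frame)
+{
+    int rv = av_rpi_zc_get_buffer(zc, frame);  // Placeholders only
+    if (rv == 0)
+        rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // Alloc & mark valid
+    return rv;
+}
+#endif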
+
+
+
+
+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
+ void * pool_env,
+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
+ av_rpi_zc_free_pool_fn_t * free_pool_fn);
+void av_rpi_zc_env_release(const AVZcEnvPtr zc);
+
+
+#endif
+
diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h
new file mode 100644
index 0000000000..9b7b6536a4
--- /dev/null
+++ b/libavcodec/rpi_zc_frames.h
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox, Ben Avison
+*/
+
+#ifndef RPI_ZC_FRAMES_H
+#define RPI_ZC_FRAMES_H
+
+#define RPI_ONE_BUF 1
+
+#include "rpi_mem.h" // for GPU_MEM_PTR_T
+#include "libavutil/frame.h"
+
+#if !RPI_ONE_BUF
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
+ return p->vc;
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
+ return p->vc;
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
+}
+
+#else
+
+static inline int gpu_is_buf1(const AVFrame * const frame)
+{
+ return frame->buf[1] == NULL;
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+{
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+{
+ return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
+}
+
+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
+{
+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
+ return gm->vc + (frame->data[n] - gm->arm);
+}
+
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+ return get_vc_address3(frame, 0);
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+ return get_vc_address3(frame, 1);
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+ return get_vc_address3(frame, 2);
+}
+
+#if 0
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.numbytes = frame->data[1] - frame->data[0];
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 0);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[1] - frame->data[0];
+ g.vc += frame->data[1] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 1);
+}
+
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+ g.arm += frame->data[2] - frame->data[0];
+ g.vc += frame->data[2] - frame->data[0];
+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
+ return g;
+ }
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
+#endif
+#endif
+
+#endif
diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c
new file mode 100644
index 0000000000..85c5b46d75
--- /dev/null
+++ b/libavcodec/rpivid_hevc.c
@@ -0,0 +1,2128 @@
+// FFMPEG HEVC decoder hardware accelerator
+// Andrew Holme, Argon Design Ltd
+// Copyright (c) June 2017 Raspberry Pi Ltd
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdatomic.h>
+#include <sys/mman.h>
+
+#include "fftools/ffmpeg.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "hwconfig.h"
+#include "decode.h"
+
+#include "hevc.h"
+#include "hevcdec.h"
+#include "rpi_zc.h"
+#include "rpi_mem.h"
+#include "rpi_zc_frames.h"
+#include "rpi_mailbox.h"
+
+
+#define OPT_PHASE_TIMING 0 // Generate stats for phase usage
+
+#define OPT_EMU 0
+
+#define TRACE_DEV 0
+#define TRACE_ENTRY 0
+
+#define NUM_SCALING_FACTORS 4064
+
+#define AXI_BASE64 0
+
+#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
+#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
+
+#define RPIVID_COL_PICS 17 // 16 ref & current
+
+#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1)
+#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size
+
+#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2)
+#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Register offsets
+
+#define RPI_SPS0 0
+#define RPI_SPS1 4
+#define RPI_PPS 8
+#define RPI_SLICE 12
+#define RPI_TILESTART 16
+#define RPI_TILEEND 20
+#define RPI_SLICESTART 24
+#define RPI_MODE 28
+#define RPI_LEFT0 32
+#define RPI_LEFT1 36
+#define RPI_LEFT2 40
+#define RPI_LEFT3 44
+#define RPI_QP 48
+#define RPI_CONTROL 52
+#define RPI_STATUS 56
+#define RPI_VERSION 60
+#define RPI_BFBASE 64
+#define RPI_BFNUM 68
+#define RPI_BFCONTROL 72
+#define RPI_BFSTATUS 76
+#define RPI_PUWBASE 80
+#define RPI_PUWSTRIDE 84
+#define RPI_COEFFWBASE 88
+#define RPI_COEFFWSTRIDE 92
+#define RPI_SLICECMDS 96
+#define RPI_BEGINTILEEND 100
+#define RPI_TRANSFER 104
+#define RPI_CFBASE 108
+#define RPI_CFNUM 112
+#define RPI_CFSTATUS 116
+
+#define RPI_PURBASE 0x8000
+#define RPI_PURSTRIDE 0x8004
+#define RPI_COEFFRBASE 0x8008
+#define RPI_COEFFRSTRIDE 0x800C
+#define RPI_NUMROWS 0x8010
+#define RPI_CONFIG2 0x8014
+#define RPI_OUTYBASE 0x8018
+#define RPI_OUTYSTRIDE 0x801C
+#define RPI_OUTCBASE 0x8020
+#define RPI_OUTCSTRIDE 0x8024
+#define RPI_STATUS2 0x8028
+#define RPI_FRAMESIZE 0x802C
+#define RPI_MVBASE 0x8030
+#define RPI_MVSTRIDE 0x8034
+#define RPI_COLBASE 0x8038
+#define RPI_COLSTRIDE 0x803C
+#define RPI_CURRPOC 0x8040
+
+//////////////////////////////////////////////////////////////////////////////
+
+// Unused but left here to illustrate the differences between FFmpeg's prob
+// structure and the rpivid one
+
+struct FFM_PROB {
+ uint8_t sao_merge_flag [ 1];
+ uint8_t sao_type_idx [ 1];
+ uint8_t split_coding_unit_flag [ 3];
+ uint8_t cu_transquant_bypass_flag [ 1];
+ uint8_t skip_flag [ 3];
+ uint8_t cu_qp_delta [ 3];
+ uint8_t pred_mode_flag [ 1];
+ uint8_t part_mode [ 4];
+ uint8_t prev_intra_luma_pred_flag [ 1];
+ uint8_t intra_chroma_pred_mode [ 2];
+ uint8_t merge_flag [ 1];
+ uint8_t merge_idx [ 1];
+ uint8_t inter_pred_idc [ 5];
+ uint8_t ref_idx_l0 [ 2];
+ uint8_t ref_idx_l1 [ 2];
+ uint8_t abs_mvd_greater0_flag [ 2];
+ uint8_t abs_mvd_greater1_flag [ 2];
+ uint8_t mvp_lx_flag [ 1];
+ uint8_t no_residual_data_flag [ 1];
+ uint8_t split_transform_flag [ 3];
+ uint8_t cbf_luma [ 2];
+ uint8_t cbf_cb_cr [ 4];
+ uint8_t transform_skip_flag/*[][]*/ [ 2];
+ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2];
+ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2];
+ uint8_t last_significant_coeff_x_prefix [18];
+ uint8_t last_significant_coeff_y_prefix [18];
+ uint8_t significant_coeff_group_flag [ 4];
+ uint8_t significant_coeff_flag [44];
+ uint8_t coeff_abs_level_greater1_flag [24];
+ uint8_t coeff_abs_level_greater2_flag [ 6];
+ uint8_t log2_res_scale_abs [ 8];
+ uint8_t res_scale_sign_flag [ 2];
+ uint8_t cu_chroma_qp_offset_flag [ 1];
+ uint8_t cu_chroma_qp_offset_idx [ 1];
+} __attribute__((packed));
+
+//////////////////////////////////////////////////////////////////////////////
+
+struct RPI_PROB {
+ uint8_t SAO_MERGE_FLAG [ 1];
+ uint8_t SAO_TYPE_IDX [ 1];
+ uint8_t SPLIT_FLAG [ 3];
+ uint8_t CU_SKIP_FLAG [ 3];
+ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1];
+ uint8_t PRED_MODE [ 1];
+ uint8_t PART_SIZE [ 4];
+ uint8_t INTRA_PRED_MODE [ 1];
+ uint8_t CHROMA_PRED_MODE [ 1];
+ uint8_t MERGE_FLAG_EXT [ 1];
+ uint8_t MERGE_IDX_EXT [ 1];
+ uint8_t INTER_DIR [ 5];
+ uint8_t REF_PIC [ 2];
+ uint8_t MVP_IDX [ 1];
+ uint8_t MVD [ 2];
+ uint8_t QT_ROOT_CBF [ 1];
+ uint8_t TRANS_SUBDIV_FLAG [ 3];
+ uint8_t QT_CBF [ 6];
+ uint8_t DQP [ 2];
+ uint8_t ONE_FLAG [24];
+ uint8_t LASTX [18];
+ uint8_t LASTY [18];
+ uint8_t SIG_CG_FLAG [ 4];
+ uint8_t ABS_FLAG [ 6];
+ uint8_t TRANSFORMSKIP_FLAG [ 2];
+ uint8_t SIG_FLAG [42];
+ uint8_t SIG_FLAG_unused [ 2];
+} __attribute__((packed));
+
+//////////////////////////////////////////////////////////////////////////////
+
+struct RPI_CMD {
+ uint32_t addr;
+ uint32_t data;
+} __attribute__((packed));
+
+struct RPI_BIT {
+ int cmd;
+ const void *ptr;
+ int len;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+struct RPI_T;
+
+// Actual addressability is 38 bits but we can currently only alloc in the
+// bottom 32 - when passed to the rpivid h/w the address is always >> 6 so
+// it will fit in 32 bits there
+// At some point we may want to make this uint64_t
+typedef uint32_t vid_vc_addr_t;
+
+typedef enum rpivid_decode_state_e {
+ RPIVID_DECODE_NEW = 0,
+ RPIVID_DECODE_START,
+ RPIVID_DECODE_SLICE,
+ RPIVID_DECODE_END,
+} rpivid_decode_state_t;
+
+#define RPI_PROB_VALS 154U
+#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
+
+typedef struct dec_env_s {
+ const AVCodecContext * avctx;
+
+ rpivid_decode_state_t state;
+ unsigned int decode_order;
+
+ int phase_no; // Current phase (i.e. the last one we waited for)
+ struct dec_env_s * phase_wait_q_next;
+ sem_t phase_wait;
+
+ struct RPI_BIT *bit_fifo;
+ struct RPI_CMD *cmd_fifo;
+ unsigned int bit_len, bit_max;
+ unsigned int cmd_len, cmd_max;
+ unsigned int num_slice_msgs;
+ unsigned int PicWidthInCtbsY;
+ unsigned int PicHeightInCtbsY;
+ unsigned int dpbno_col;
+ uint32_t reg_slicestart;
+ unsigned int wpp_entry_x;
+ unsigned int wpp_entry_y;
+
+ const uint8_t * nal_buffer;
+ size_t nal_size;
+
+ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3];
+ uint8_t scaling_factors[NUM_SCALING_FACTORS];
+// unsigned int RefPicList[2][HEVC_MAX_REFS];
+} dec_env_t;
+
+#define RPIVID_PHASES 3
+#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have incremented decode_order
+#define RPIVID_PHASE_START (-1)          // Phase after we have incremented decode_order
+
+#if OPT_PHASE_TIMING
+static const unsigned int time_thresholds[8] = {
+ 10, 15, 20, 30, 45, 60, 75, 90
+};
+#endif
+
+typedef struct phase_wait_env_s {
+ unsigned int last_order;
+ dec_env_t * q;
+#if OPT_PHASE_TIMING
+ uint64_t phase_time;
+ uint64_t max_phase_time;
+ uint64_t time_in_phase;
+ uint64_t time_out_phase;
+ unsigned int max_time_decode_order;
+ unsigned int time_bins[9];
+ unsigned int time_bins3[9];
+ unsigned int time_bins5[9];
+ uint64_t time_stash[16];
+ unsigned int i3;
+#endif
+} phase_wait_env_t; // Single linked list of threads waiting for this phase
+
+typedef struct RPI_T {
+ atomic_int ref_count;
+ sem_t ref_zero;
+
+ dec_env_t ** dec_envs;
+ AVZcEnvPtr zc;
+
+ pthread_mutex_t phase_lock;
+ phase_wait_env_t phase_reqs[RPIVID_PHASES];
+
+ volatile uint32_t * regs;
+ volatile uint32_t * ints;
+
+ GPU_MEM_PTR_T gcolbuf;
+ unsigned int col_stride;
+ size_t col_picsize;
+
+ unsigned int bitbuf_no;
+ sem_t bitbuf_sem;
+ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS];
+
+ unsigned int max_pu_msgs;
+ unsigned int coeffbuf_no;
+ sem_t coeffbuf_sem;
+ GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS];
+
+ unsigned int decode_order;
+ int mbox_fd;
+ int gpu_init_type;
+} RPI_T;
+
+#if OPT_PHASE_TIMING
+static uint64_t tus64(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+#endif
+
+static inline unsigned int rnd64(unsigned int x)
+{
+ return (x + 63) & ~63;
+}
+
+static inline int rpi_sem_wait(sem_t * const sem)
+{
+ int rv;
+ while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
+ /* Loop */;
+ return rv;
+}
+
+//============================================================================
+
+#define REGS_NAME "/dev/rpivid-hevcmem"
+#define REGS_SIZE 0x10000
+#define INTS_NAME "/dev/rpivid-intcmem"
+#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway
+
+static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
+{
+ void *gpio_map;
+ int mem_fd;
+
+ /* open /dev/mem */
+ if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
+ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
+ return NULL;
+ }
+
+ // Now map it
+ gpio_map = mmap(
+ NULL,
+ size,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED,
+ mem_fd,
+ 0
+ );
+
+ close(mem_fd); // No longer need the FD
+
+ if (gpio_map == MAP_FAILED) {
+ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed");
+ return NULL;
+ }
+
+ return (volatile uint32_t *)gpio_map;
+}
+
+static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
+{
+ volatile uint32_t * const gpio_map = *p_gpio_map;
+ if (gpio_map != NULL) {
+ *p_gpio_map = NULL;
+ munmap((void *)gpio_map, size);
+ }
+}
+
+#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing!
+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
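+// Worked example (illustrative): a VC bus address 0xC0123440 has its alias
+// bits stripped by MANGLE to 0x00123440; MANGLE64 then shifts right by 6
+// (64-byte units) giving 0x48D1, which fits the 32-bit h/w registers even
+// if the underlying address were to grow past 32 bits.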
+
+static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
+{
+#if TRACE_DEV
+ printf("W %x %08x\n", addr, MANGLE64(data));
+#endif
+
+ rpi->regs[addr >> 2] = MANGLE64(data);
+}
+
+static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
+{
+#if TRACE_DEV
+ printf("W %x %08x\n", addr, data >> 6);
+#endif
+
+ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed
+}
+
+static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
+{
+#if TRACE_DEV
+ printf("W %x %08x\n", addr, data);
+#endif
+
+ rpi->regs[addr >> 2] = data;
+}
+
+static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
+{
+ const uint32_t v = rpi->regs[addr >> 2];
+#if TRACE_DEV
+ printf("R %x (=%x)\n", addr, v);
+#endif
+ return v;
+}
+
+#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001
+#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002
+#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004
+#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008
+#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010
+#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020
+#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040
+#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080
+
+static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
+{
+ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;
+ const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
+ uint32_t ival;
+ while (((ival = rpi->ints[0]) & mask_done) == 0) {
+ usleep(1000);
+ }
+ rpi->ints[0] = ival & mask_reset;
+}
+
+#if TRACE_DEV && 0
+static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
+ int i;
+
+ for (i=0; i<num; i++)
+ {
+ if ((i%4)==0)
+ printf("%08x: ", 0x7eb00000 + addr + 4*i);
+
+ printf("%08x", rpi->regs[(addr>>2)+i]);
+
+ if ((i%4)==3 || i+1 == num)
+ printf("\n");
+ else
+ printf(" ");
+ }
+}
+
+static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
+ int i;
+
+ for (i=0; i<size>>2; i++)
+ {
+ if ((i%4)==0)
+ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
+
+ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
+
+ if ((i%4)==3 || i+1 == size>>2)
+ printf("\n");
+ else
+ printf(" ");
+ }
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+
+static inline size_t round_up_size(const size_t x)
+{
+ /* Admit no size < 256 */
+ const unsigned int n = x < 256 ? 8 : av_log2(x) - 1;
+
+ return x >= (3 << n) ? 4 << n : (3 << n);
+}
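+// Illustrative values: round_up_size() rounds up to the next boundary of the
+// form 3<<n or 4<<n, e.g. 700 -> 768 (3<<8) and 1000 -> 1024 (4<<8), so
+// repeated reallocs grow geometrically rather than a few bytes at a time.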
+
+//////////////////////////////////////////////////////////////////////////////
+// Scaling factors
+
+static void expand_scaling_list(
+ const unsigned int sizeID,
+ const unsigned int matrixID,
+ uint8_t * const dst0,
+ const uint8_t * const src0,
+ uint8_t dc)
+{
+ switch (sizeID) {
+ case 0:
+ memcpy(dst0, src0, 16);
+ break;
+ case 1:
+ memcpy(dst0, src0, 64);
+ break;
+ case 2:
+ {
+ uint8_t * d = dst0;
+ for (unsigned int y=0; y != 16; y++) {
+ const uint8_t * s = src0 + (y >> 1) * 8;
+ for (unsigned int x = 0; x != 8; ++x) {
+ *d++ = *s;
+ *d++ = *s++;
+ }
+ }
+ dst0[0] = dc;
+ break;
+ }
+ default:
+ {
+ uint8_t * d = dst0;
+ for (unsigned int y=0; y != 32; y++) {
+ const uint8_t * s = src0 + (y >> 2) * 8;
+ for (unsigned int x = 0; x != 8; ++x) {
+ *d++ = *s;
+ *d++ = *s;
+ *d++ = *s;
+ *d++ = *s++;
+ }
+ }
+ dst0[0] = dc;
+ break;
+ }
+ }
+}
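+// For illustration: at sizeID 2 each 8x8 source entry is replicated into a
+// 2x2 block of the 16x16 output (4x4 blocks at sizeID 3), after which
+// dst0[0] is overwritten with the signalled DC value - the standard HEVC
+// scaling-list upsampling behaviour.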
+
+static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
+ // Array of constants for scaling factors
+ static const uint32_t scaling_factor_offsets[4][6] = {
+ // MID0 MID1 MID2 MID3 MID4 MID5
+ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4)
+ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8)
+ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16)
+ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32)
+
+ // ffmpeg places SID3,MID1 where matrixID 3 normally is
+ const ScalingList * const sl =
+ s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list
+ : &s->ps.sps->scaling_list;
+ unsigned int mid;
+
+ for (mid=0; mid<6; mid++)
+ expand_scaling_list(0, mid,
+ de->scaling_factors + scaling_factor_offsets[0][mid],
+ sl->sl[0][mid], 0);
+ for (mid=0; mid<6; mid++)
+ expand_scaling_list(1, mid,
+ de->scaling_factors + scaling_factor_offsets[1][mid],
+ sl->sl[1][mid], 0);
+ for (mid=0; mid<6; mid++)
+ expand_scaling_list(2, mid,
+ de->scaling_factors + scaling_factor_offsets[2][mid],
+ sl->sl[2][mid],
+ sl->sl_dc[0][mid]);
+ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg
+ for (mid=0; mid<6; mid += 3)
+ expand_scaling_list(3, mid,
+ de->scaling_factors + scaling_factor_offsets[3][mid],
+ sl->sl[3][mid],
+ sl->sl_dc[1][mid]);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Probabilities
+
+static const uint8_t prob_init[3][156] = {
+ {
+ 153, 200, 139, 141, 157, 154, 154, 154,
+ 154, 154, 184, 154, 154, 154, 184, 63,
+ 154, 154, 154, 154, 154, 154, 154, 154,
+ 154, 154, 154, 154, 154, 153, 138, 138,
+ 111, 141, 94, 138, 182, 154, 154, 154,
+ 140, 92, 137, 138, 140, 152, 138, 139,
+ 153, 74, 149, 92, 139, 107, 122, 152,
+ 140, 179, 166, 182, 140, 227, 122, 197,
+ 110, 110, 124, 125, 140, 153, 125, 127,
+ 140, 109, 111, 143, 127, 111, 79, 108,
+ 123, 63, 110, 110, 124, 125, 140, 153,
+ 125, 127, 140, 109, 111, 143, 127, 111,
+ 79, 108, 123, 63, 91, 171, 134, 141,
+ 138, 153, 136, 167, 152, 152, 139, 139,
+ 111, 111, 125, 110, 110, 94, 124, 108,
+ 124, 107, 125, 141, 179, 153, 125, 107,
+ 125, 141, 179, 153, 125, 107, 125, 141,
+ 179, 153, 125, 140, 139, 182, 182, 152,
+ 136, 152, 136, 153, 136, 139, 111, 136,
+ 139, 111, 0, 0, },
+ {
+ 153, 185, 107, 139, 126, 197, 185, 201,
+ 154, 149, 154, 139, 154, 154, 154, 152,
+ 110, 122, 95, 79, 63, 31, 31, 153,
+ 153, 168, 140, 198, 79, 124, 138, 94,
+ 153, 111, 149, 107, 167, 154, 154, 154,
+ 154, 196, 196, 167, 154, 152, 167, 182,
+ 182, 134, 149, 136, 153, 121, 136, 137,
+ 169, 194, 166, 167, 154, 167, 137, 182,
+ 125, 110, 94, 110, 95, 79, 125, 111,
+ 110, 78, 110, 111, 111, 95, 94, 108,
+ 123, 108, 125, 110, 94, 110, 95, 79,
+ 125, 111, 110, 78, 110, 111, 111, 95,
+ 94, 108, 123, 108, 121, 140, 61, 154,
+ 107, 167, 91, 122, 107, 167, 139, 139,
+ 155, 154, 139, 153, 139, 123, 123, 63,
+ 153, 166, 183, 140, 136, 153, 154, 166,
+ 183, 140, 136, 153, 154, 166, 183, 140,
+ 136, 153, 154, 170, 153, 123, 123, 107,
+ 121, 107, 121, 167, 151, 183, 140, 151,
+ 183, 140, 0, 0, },
+ {
+ 153, 160, 107, 139, 126, 197, 185, 201,
+ 154, 134, 154, 139, 154, 154, 183, 152,
+ 154, 137, 95, 79, 63, 31, 31, 153,
+ 153, 168, 169, 198, 79, 224, 167, 122,
+ 153, 111, 149, 92, 167, 154, 154, 154,
+ 154, 196, 167, 167, 154, 152, 167, 182,
+ 182, 134, 149, 136, 153, 121, 136, 122,
+ 169, 208, 166, 167, 154, 152, 167, 182,
+ 125, 110, 124, 110, 95, 94, 125, 111,
+ 111, 79, 125, 126, 111, 111, 79, 108,
+ 123, 93, 125, 110, 124, 110, 95, 94,
+ 125, 111, 111, 79, 125, 126, 111, 111,
+ 79, 108, 123, 93, 121, 140, 61, 154,
+ 107, 167, 91, 107, 107, 167, 139, 139,
+ 170, 154, 139, 153, 139, 123, 123, 63,
+ 124, 166, 183, 140, 136, 153, 154, 166,
+ 183, 140, 136, 153, 154, 166, 183, 140,
+ 136, 153, 154, 170, 153, 138, 138, 122,
+ 121, 122, 121, 167, 151, 183, 140, 151,
+ 183, 140, 0, 0, },
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Phase 1 command and bit FIFOs
+
+// ???? uint16_t addr - put in uint32_t
+static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
+ if (de->cmd_len==de->cmd_max)
+ av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
+
+#if TRACE_DEV
+ printf("[%02x] %x %x\n", de->cmd_len, addr, data);
+#endif
+
+ de->cmd_fifo[de->cmd_len].addr = addr;
+ de->cmd_fifo[de->cmd_len].data = data;
+ return de->cmd_len++;
+}
+
+static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
+ if (de->bit_len==de->bit_max)
+ av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
+ de->bit_fifo[de->bit_len].cmd = cmd_idx;
+ de->bit_fifo[de->bit_len].ptr = ptr;
+ de->bit_fifo[de->bit_len].len = len;
+ de->bit_len++;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Write probability and scaling factor memories
+
+#if 0
+static void WriteProb(dec_env_t * const de) {
+ int i;
+ const uint8_t *p = (uint8_t *) &de->probabilities;
+ for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
+ p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
+}
+#endif
+
+static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
+ uint8_t dst[RPI_PROB_ARRAY_SIZE];
+
+ const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
+ s->sh.slice_type + 1 : 2 - s->sh.slice_type;
+ const uint8_t * p = prob_init[init_type];
+ const int q = av_clip(s->sh.slice_qp, 0, 51);
+ unsigned int i;
+
+ for (i = 0; i < RPI_PROB_VALS; i++) {
+ int init_value = p[i];
+ int m = (init_value >> 4) * 5 - 45;
+ int n = ((init_value & 15) << 3) - 16;
+ int pre = 2 * (((m * q) >> 4) + n) - 127;
+
+ pre ^= pre >> 31;
+ if (pre > 124)
+ pre = 124 + (pre & 1);
+ dst[i] = pre;
+ }
+ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
+ dst[i] = 0;
+ }
+
+ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
+ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
+
+}
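+// Worked example of the derivation above (it mirrors the HEVC 9.3.2.2 CABAC
+// init, remapped to the h/w probability format): init_value 154, slice QP 26:
+// m = (154>>4)*5 - 45 = 0; n = ((154&15)<<3) - 16 = 64;
+// pre = 2*(((0*26)>>4) + 64) - 127 = 1, i.e. a near-equiprobable context,
+// as expected for the default init value 154.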
+
+
+static void WriteScalingFactors(dec_env_t * const de) {
+ int i;
+ const uint8_t *p = (uint8_t *) de->scaling_factors;
+ for (i=0; i<NUM_SCALING_FACTORS; i+=4, p+=4)
+ p1_apb_write(de, 0x2000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
+ int i;
+ for (i=1; ctb >= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
+ return i-1;
+}
+
+static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
+ if (ctb < bd[num-1]) return ctb_size;
+ else if (width % ctb_size) return width % ctb_size;
+ else return ctb_size;
+}
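+// Example (illustrative): with col_bd = {0, 10, 20}, a CTB in column 15 maps
+// to tile 1 in ctb_to_tile(). ctb_to_slice_w_h() returns a full ctb_size
+// except in the final partial column/row, where it returns width % ctb_size.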
+
+//////////////////////////////////////////////////////////////////////////////
+// Handle PU and COEFF stream overflow
+
+
+// Returns:
+// -2 Other error
+// -1 Out of coeff space
+// 0 OK
+// 1 Out of PU space
+
+static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
+ uint32_t status;
+
+ // this is the definition of successful completion of phase 1
+ // it assures that status register is zero and all blocks in each tile have completed
+ if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
+ return 0;
+
+ status = apb_read(rpi, RPI_STATUS);
+
+ if ((status & 8) != 0)
+ return -1;
+
+ if ((status & 0x10) != 0)
+ return 1;
+
+ return -2;
+}
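+// Usage sketch: the phase 1 loop in rpi_hevc_end_frame() below retries with
+// more PU space on 1, reallocates a larger coeff buffer and retries on -1,
+// and gives up (invalid data) on -2.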
+
+//////////////////////////////////////////////////////////////////////////////
+// Write STATUS register with expected end CTU address of previous slice
+
+static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
+ const HEVCPPS * const pps = s->ps.pps;
+ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
+ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
+}
+
+static void wpp_pause(dec_env_t * const de, int ctb_row) {
+ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
+ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000);
+ p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
+}
+
+static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
+ const HEVCPPS *pps = s->ps.pps;
+ int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
+ int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
+ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
+ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
+ if (de->wpp_entry_x<2 && (de->wpp_entry_y<new_y || new_x>2) && de->PicWidthInCtbsY>2)
+ wpp_pause(de, last_y);
+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
+ if (new_x==2 || (de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y))
+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s)
+{
+ const HEVCSPS *sps = s->ps.sps;
+ const HEVCPPS *pps = s->ps.pps;
+
+ p1_apb_write(de, RPI_SPS0,
+ (sps->log2_min_cb_size << 0) +
+ (sps->log2_ctb_size << 4) +
+ (sps->log2_min_tb_size << 8) +
+ (sps->log2_max_trafo_size << 12) +
+ (sps->bit_depth << 16) +
+ (sps->bit_depth << 20) +
+ (sps->max_transform_hierarchy_depth_intra << 24) +
+ (sps->max_transform_hierarchy_depth_inter << 28));
+
+ p1_apb_write(de, RPI_SPS1,
+ (sps->pcm.bit_depth << 0) +
+ (sps->pcm.bit_depth_chroma << 4) +
+ (sps->pcm.log2_min_pcm_cb_size << 8) +
+ (sps->pcm.log2_max_pcm_cb_size << 12) +
+ (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
+ (sps->amp_enabled_flag << 18) +
+ (sps->pcm_enabled_flag << 19) +
+ (sps->scaling_list_enable_flag << 20) +
+ (sps->sps_strong_intra_smoothing_enable_flag << 21));
+
+ p1_apb_write(de, RPI_PPS,
+ ((sps->log2_ctb_size - pps->diff_cu_qp_delta_depth) << 0) +
+ (pps->cu_qp_delta_enabled_flag << 4) +
+ (pps->transquant_bypass_enable_flag << 5) +
+ (pps->transform_skip_enabled_flag << 6) +
+ (pps->sign_data_hiding_flag << 7) +
+ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) +
+ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
+ (pps->constrained_intra_pred_flag << 24));
+
+ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
+
+ if (!s->sh.dependent_slice_segment_flag) {
+ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
+ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
+ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
+ }
+
+ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static void write_slice(dec_env_t * const de, const HEVCContext * const s,
+ const unsigned int slice_w, const unsigned int slice_h) {
+ uint32_t u32 =
+ (s->sh.slice_type << 12)
+ + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
+ + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
+ + (slice_w << 17)
+ + (slice_h << 24);
+
+ if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |=
+ (s->sh.max_num_merge_cand << 0)
+ + (s->sh.nb_refs[L0] << 4)
+ + (s->sh.nb_refs[L1] << 8);
+
+ if (s->sh.slice_type==HEVC_SLICE_B)
+ u32 |= s->sh.mvd_l1_zero_flag<<16;
+ p1_apb_write(de, RPI_SLICE, u32);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Wavefront mode
+
+static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
+ const int do_bte, const int resetQPY, const int ctb_addr_ts) {
+ const HEVCSPS * const sps = s->ps.sps;
+ const HEVCPPS * const pps = s->ps.pps;
+
+ int ctb_size = 1<<sps->log2_ctb_size;
+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+
+ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
+ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
+
+ int endx = de->PicWidthInCtbsY-1;
+ int endy = ctb_row;
+
+ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
+ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
+
+ p1_apb_write(de, RPI_TILESTART, 0);
+ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
+
+ if (do_bte)
+ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
+
+ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
+
+ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
+
+ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
+ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Tiles mode
+
+static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
+ const int do_bte, const int resetQPY, const int ctb_addr_ts) {
+ const HEVCSPS * const sps = s->ps.sps;
+ const HEVCPPS * const pps = s->ps.pps;
+
+ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
+ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
+
+ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
+ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
+
+ int endx = pps->col_bd[tile_x+1] - 1;
+ int endy = pps->row_bd[tile_y+1] - 1;
+
+ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
+ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
+
+ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
+ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
+
+ if (do_bte)
+ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
+
+ write_slice(de, s, slice_w, slice_h);
+
+ if (resetQPY)
+ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
+
+ p1_apb_write(de, RPI_MODE, (0xFFFF << 0)
+ + (0x0 << 16)
+ + ((tile_x==pps->num_tile_columns-1) << 17)
+ + ((tile_y==pps->num_tile_rows-1) << 18));
+
+ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+// Doesn't attempt to remove itself from the context as we should only do
+// this at final teardown or on create error
+static void
+dec_env_delete(dec_env_t * const de)
+{
+// gpu_free(&de->gbuf);
+
+ av_freep(&de->cmd_fifo);
+ av_freep(&de->bit_fifo);
+
+ sem_destroy(&de->phase_wait);
+ av_free(de);
+}
+
+static dec_env_t *
+dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
+{
+ dec_env_t * const de = av_mallocz(sizeof(*de));
+ int i;
+
+ if (de == NULL)
+ return NULL;
+
+ de->avctx = avctx;
+ de->phase_no = RPIVID_PHASE_NEW;
+
+ sem_init(&de->phase_wait, 0, 0);
+
+ if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL)
+ goto fail;
+
+ if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL)
+ goto fail;
+
+ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
+ for (i = 0; i != avctx->thread_count; ++i) {
+ if (rpi->dec_envs[i] == NULL)
+ {
+ rpi->dec_envs[i] = de;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&rpi->phase_lock);
+
+ if (i == avctx->thread_count) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
+ goto fail;
+ }
+
+ return de;
+
+fail:
+ dec_env_delete(de);
+ return NULL;
+}
+
+
+static dec_env_t *
+dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
+{
+ dec_env_t * de = NULL;
+ const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
+
+ if (ref_count <= 0) {
+ // Already dead
+ av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");;
+ return NULL;
+ }
+
+ for (int i = 0; i != avctx->thread_count; ++i) {
+ if (rpi->dec_envs[i] == NULL)
+ {
+ de = dec_env_new(avctx, rpi);
+ break;
+ }
+ if (rpi->dec_envs[i]->avctx == avctx)
+ {
+ de = rpi->dec_envs[i];
+ break;
+ }
+ }
+ return de;
+}
+
+// Call at end of fn
+// Used to ensure we aren't in a worker thread when killed
+static void
+dec_env_release(RPI_T * const rpi, dec_env_t * const de)
+{
+ const int n = atomic_fetch_sub(&rpi->ref_count, 1);
+ if (n == 1) {
+ sem_post(&rpi->ref_zero);
+ }
+}
+
+//----------------------------------------------------------------------------
+
+// Wait for a slot in the given phase
+// Any error return is probably fatal
+static int
+wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
+{
+ int needs_wait = 0;
+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
+
+ pthread_mutex_lock(&rpi->phase_lock);
+ if (p->last_order + 1 != de->decode_order) {
+ de->phase_wait_q_next = p->q;
+ p->q = de;
+ needs_wait = 1;
+ }
+ pthread_mutex_unlock(&rpi->phase_lock);
+
+ if (needs_wait) {
+ while (sem_wait(&de->phase_wait) == -1)
+ {
+ int err;
+ if ((err = errno) != EINTR)
+ return AVERROR(err);
+ }
+ }
+
+ de->phase_no = phase_no;
+ return 0;
+}
+
+static void
+post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
+{
+ dec_env_t * next_de = NULL;
+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
+ dec_env_t ** q = &p->q;
+
+ pthread_mutex_lock(&rpi->phase_lock);
+
+ p->last_order = de->decode_order;
+ while (*q != NULL) {
+ dec_env_t * const t_de = *q;
+
+ if (t_de->decode_order == p->last_order + 1) {
+ // This is us - remove from Q
+ *q = t_de->phase_wait_q_next;
+ t_de->phase_wait_q_next = NULL; // Tidy
+ next_de = t_de;
+ break;
+ }
+ q = &t_de->phase_wait_q_next;
+ }
+
+ pthread_mutex_unlock(&rpi->phase_lock);
+
+ if (next_de != NULL)
+ sem_post(&next_de->phase_wait);
+}
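+// Ordering sketch: decode jobs pass through each phase strictly in
+// decode_order. If job N+1 reaches wait_phase() before job N has posted,
+// it parks itself on p->q; post_phase() then walks the queue and wakes
+// exactly the job whose decode_order is last_order + 1.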
+
+// Wait & signal stuff s.t. threads in other phases can continue
+static void
+abort_phases(RPI_T * const rpi, dec_env_t * const de)
+{
+ for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) {
+ wait_phase(rpi, de, i);
+ post_phase(rpi, de, i);
+ }
+ de->phase_no = RPIVID_PHASE_NEW;
+}
+
+// Start timing for phase
+// Stats only - no actual effect
+static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
+{
+#if OPT_PHASE_TIMING
+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
+ const int64_t now = tus64();
+ if (p->phase_time != 0)
+ p->time_out_phase += now - p->phase_time;
+ p->phase_time = now;
+#endif
+}
+
+#if OPT_PHASE_TIMING
+static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
+{
+ uint64_t tsum = 0;
+ unsigned int i;
+ for (i = 0; i != avg_n; ++i)
+ tsum += p->time_stash[(p->i3 - i) & 15];
+ for (i = 0; i != 9; ++i) {
+ if (time_thresholds[i] * 1000 * avg_n > tsum)
+ break;
+ }
+ return i;
+}
+#endif
+
+// End timing for phase
+// Stats only - no actual effect
+static inline void tend_phase(RPI_T * const rpi, const int phase_no)
+{
+#if OPT_PHASE_TIMING
+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
+ const uint64_t now = tus64();
+ const uint64_t in_time = now - p->phase_time;
+
+ p->time_in_phase += in_time;
+ p->phase_time = now;
+ p->time_stash[p->i3] = in_time;
+ if (in_time > p->max_phase_time) {
+ p->max_phase_time = in_time;
+ p->max_time_decode_order = p->last_order;
+ }
+ ++p->time_bins[tavg_bin_phase(p, 1)];
+ ++p->time_bins3[tavg_bin_phase(p, 3)];
+ ++p->time_bins5[tavg_bin_phase(p, 5)];
+
+ p->i3 = (p->i3 + 1) & 15;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Start frame
+
+static int rpi_hevc_start_frame(
+ AVCodecContext * avctx,
+ const uint8_t *buffer,
+ uint32_t size) {
+
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+ dec_env_t * const de = dec_env_get(avctx, rpi);
+ const HEVCContext * const s = avctx->priv_data;
+ const HEVCSPS * const sps = s->ps.sps;
+ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
+
+#if TRACE_ENTRY
+ printf("<<< %s[%p]\n", __func__, de);
+#endif
+
+ if (de == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
+ return -1;
+ }
+
+ de->phase_no = RPIVID_PHASE_START;
+ de->decode_order = ++rpi->decode_order; // *** atomic?
+
+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
+
+ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
+ return -1;
+ }
+ de->state = RPIVID_DECODE_START;
+
+ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15
+ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17
+ de->bit_len = 0;
+ de->cmd_len = 0;
+
+#if TRACE_ENTRY
+ printf(">>> %s[%p]\n", __func__, de);
+#endif
+
+ dec_env_release(rpi, de);
+ return 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Slice messages
+
+static void msg_slice(dec_env_t * const de, const uint16_t msg) {
+ de->slice_msgs[de->num_slice_msgs++] = msg;
+}
+
+static void program_slicecmds(dec_env_t * const de, const int sliceid) {
+ int i;
+ p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
+ for(i=0; i < de->num_slice_msgs; i++) {
+ p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff);
+ }
+}
+
+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
+ const HEVCSPS * const sps = s->ps.sps;
+ const HEVCPPS * const pps = s->ps.pps;
+ const SliceHeader *sh = &s->sh;
+
+ int weightedPredFlag, i, rIdx;
+ uint16_t cmd_slice;
+ unsigned int collocated_from_l0_flag;
+
+ de->num_slice_msgs=0;
+ de->dpbno_col = 0;
+ cmd_slice = 0;
+ if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
+ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
+ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
+
+ if (sh->slice_type!=HEVC_SLICE_I) {
+ cmd_slice += sh->nb_refs[L0]<<2;
+ cmd_slice += sh->nb_refs[L1]<<6;
+ }
+
+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B)
+ cmd_slice |= sh->max_num_merge_cand<<11;
+
+ collocated_from_l0_flag =
+ !sh->slice_temporal_mvp_enabled_flag ?
+ 0 :
+ sh->slice_type == HEVC_SLICE_B ?
+ (sh->collocated_list == L0) :
+ (sh->slice_type==HEVC_SLICE_P);
+ cmd_slice |= collocated_from_l0_flag<<14;
+
+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
+
+ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
+ for(i=L0; i<=L1; i++) {
+ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
+ HEVCFrame *c = s->ref; // CurrentPicture
+ if (c->poc < f->poc) NoBackwardPredFlag = 0;
+ }
+ }
+
+ if (sps->sps_temporal_mvp_enabled_flag)
+ {
+ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
+ s->ref->refPicList + 0 :
+ s->ref->refPicList + 1;
+ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB;
+ }
+
+ cmd_slice += NoBackwardPredFlag<<10;
+ msg_slice(de, cmd_slice);
+
+ // Write reference picture descriptions
+ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
+
+ for(i=L0; i<=L1; i++)
+ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
+ HEVCFrame *c = s->ref; // CurrentPicture
+ int pic = f - s->DPB;
+ // Make sure pictures are in range 0 to 15
+ int adjusted_pic = f<c? pic : pic-1;
+ int lt = s->ref->refPicList[i].isLongTerm[rIdx];
+ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
+ msg_slice(de, f->poc);
+ if (weightedPredFlag) {
+ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s->sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3));
+ msg_slice(de, (i?s->sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff);
+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
+ }
+ }
+ }
+ else
+ msg_slice(de, cmd_slice);
+
+ msg_slice(de, ((sh->beta_offset/2)&15)
+ + (((sh->tc_offset/2)&15) << 4)
+ + (sh->disable_deblocking_filter_flag << 8)
+ + (sh->slice_loop_filter_across_slices_enabled_flag << 9)
+ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK
+
+ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
+}
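+// Message layout produced above (illustrative): one CMD_SLICE word, then for
+// P/B slices up to 8 words per active reference (2 always, plus 6 weight
+// words when weighted prediction is enabled), then CMD_DEBLOCK and CMD_QPOFF;
+// hence the 2*HEVC_MAX_REFS*8+3 sizing of slice_msgs[].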
+
+
+//////////////////////////////////////////////////////////////////////////////
+
+static void rpi_hevc_abort_frame(AVCodecContext * const avctx) {
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+ dec_env_t * const de = dec_env_get(avctx, rpi);
+
+#if TRACE_ENTRY
+ printf("<<< %s[%p]\n", __func__, de);
+#endif
+
+ if (de == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
+ return;
+ }
+
+ switch (de->state) {
+ case RPIVID_DECODE_NEW:
+ case RPIVID_DECODE_END:
+ // Expected transition
+ break;
+
+ case RPIVID_DECODE_SLICE:
+ // Error transition
+ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
+ break;
+
+ case RPIVID_DECODE_START:
+ default:
+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
+ break;
+ }
+
+ abort_phases(rpi, de);
+ de->state = RPIVID_DECODE_NEW;
+
+ dec_env_release(rpi, de);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// End frame
+
+static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+ const HEVCContext * const s = avctx->priv_data;
+ const HEVCPPS * const pps = s->ps.pps;
+ const HEVCSPS * const sps = s->ps.sps;
+ dec_env_t * const de = dec_env_get(avctx, rpi);
+ AVFrame * const f = s->ref->frame;
+ const unsigned int dpbno_cur = s->ref - s->DPB;
+ vid_vc_addr_t cmds_vc;
+ vid_vc_addr_t pu_base_vc;
+ unsigned int pu_stride;
+ vid_vc_addr_t coeff_base_vc;
+ unsigned int coeff_stride;
+ unsigned int i;
+ int rv = 0;
+ int status = 0;
+ int coeffbuf_sem_claimed = 0;
+
+#if TRACE_ENTRY
+ printf("<<< %s[%p]\n", __func__, de);
+#endif
+
+ if (de == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
+ return AVERROR_BUG; // Should never happen
+ }
+
+ if (de->state != RPIVID_DECODE_SLICE) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
+ rv = AVERROR_UNKNOWN;
+ goto fail;
+ }
+ de->state = RPIVID_DECODE_END;
+
+ // End of command compilation
+ {
+ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
+ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
+ if (pps->entropy_coding_sync_enabled_flag) {
+ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
+ wpp_pause(de, last_y);
+ }
+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
+ }
+
+ // Phase 0 ---------------------------------------------------------------
+
+ wait_phase(rpi, de, 0);
+ rpi_sem_wait(&rpi->bitbuf_sem);
+ tstart_phase(rpi, 0);
+
+ // Copy cmds & bits into gpu side buffer
+ // Layout: CMDS, BITS
+ {
+ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
+ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
+ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
+
+ uint8_t * p = armbase + rnd64(cmd_bytes);
+ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
+
+ cmds_vc = vcbase;
+
+ // Copy all the bits & update bitstream cmds to point at the right bits
+ for (i = 0; i < de->bit_len; ++i)
+ {
+ const unsigned int seg_len = de->bit_fifo[i].len;
+
+ if (p + seg_len > eobits) {
+ status = -1;
+ break;
+ }
+
+ memcpy(p, de->bit_fifo[i].ptr, seg_len);
+ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
+
+ p += rnd64(seg_len);
+ }
+
+ memcpy(armbase, de->cmd_fifo, cmd_bytes);
+ }
+
+ if (status == 0)
+ {
+ if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
+ rpi->bitbuf_no = 0;
+ }
+ else
+ {
+ sem_post(&rpi->bitbuf_sem);
+ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
+ rv = AVERROR_BUFFER_TOO_SMALL;
+ }
+
+ tend_phase(rpi, 0);
+ post_phase(rpi, de, 0);
+
+ if (status < 0)
+ goto fail;
+
+ // Phase 1 ---------------------------------------------------------------
+
+ wait_phase(rpi, de, 1);
+ rpi_sem_wait(&rpi->coeffbuf_sem);
+ coeffbuf_sem_claimed = 1;
+ tstart_phase(rpi, 1);
+
+ status = 0;
+ for (;;)
+ {
+ // (Re-)allocate PU/COEFF stream space
+ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
+ unsigned int pu_size;
+
+ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
+ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
+ pu_size = pu_stride * de->PicHeightInCtbsY;
+
+ if (pu_size >= total_size || status == -1) {
+ GPU_MEM_PTR_T newbuf;
+
+ if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
+ status = -1;
+ break;
+ }
+ gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
+ rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
+ status = 0;
+ continue;
+ }
+
+ // Allocate all remaining space to coeff
+ coeff_base_vc = pu_base_vc + pu_size;
+ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64
+
+ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
+ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
+ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
+ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
+
+ // Trigger command FIFO
+ apb_write(rpi, RPI_CFNUM, de->cmd_len);
+#if TRACE_DEV && 0
+ apb_dump_regs(rpi, 0x0, 32);
+ apb_dump_regs(rpi, 0x8000, 24);
+ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
+#endif
+ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
+
+ int_wait(rpi, 1);
+
+ status = check_status(rpi, de);
+
+ if (status == -1)
+ continue;
+ else if (status != 1)
+ break;
+
+ // Status 1 means out of PU space so try again with more
+ // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
+ rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
+ }
+
+ // Increment inside the phase 1 lock, but only if we succeeded, otherwise
+ // we may reuse a live buffer when we kick the coeff sem
+ if (status == 0)
+ {
+ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
+ rpi->coeffbuf_no = 0;
+ }
+ else
+ {
+ if (status == -1)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
+ rv = AVERROR_BUFFER_TOO_SMALL;
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
+ rv = AVERROR_INVALIDDATA;
+ }
+ }
+
+ tend_phase(rpi, 1);
+ sem_post(&rpi->bitbuf_sem);
+ post_phase(rpi, de, 1);
+
+ if (status != 0)
+ goto fail;
+
+ // Phase 2 ---------------------------------------------------------------
+
+ wait_phase(rpi, de, 2);
+
+ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
+ {
+ // As we are in phase 2 already here we don't need to worry about
+ // coeffbuf_no despite the early exit
+ post_phase(rpi, de, 2);
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
+ goto fail;
+ }
+
+ tstart_phase(rpi, 2);
+
+ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
+ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
+ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
+ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
+
+ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
+ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
+ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
+ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
+
+ // Keep the last thing we resolved as fallback for any ref we fail to
+ // resolve. As a final fallback use our current frame. The pels might
+ // not be there yet but at least the memory is valid.
+ //
+ // Attempt to resolve the entire DPB - we could note what we have used
+ // in ref lists but probably simpler and more reliable to set the whole thing
+ {
+ AVFrame * fallback_frame = f;
+ for (i = 0; i != 16; ++i) {
+ // Avoid current frame
+ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
+ AVFrame * fr = hevc_fr->frame;
+
+ if (fr != NULL &&
+ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
+ {
+ fallback_frame = fr;
+ }
+ else
+ {
+ fr = fallback_frame;
+ }
+
+ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
+ apb_write(rpi, 0x9004+16*i, 0);
+ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
+ apb_write(rpi, 0x900C+16*i, 0);
+ }
+ }
+
+ apb_write(rpi, RPI_CONFIG2,
+ (sps->bit_depth << 0) // BitDepthY
+ + (sps->bit_depth << 4) // BitDepthC
+ + ((sps->bit_depth>8) << 8) // BitDepthY
+ + ((sps->bit_depth>8) << 9) // BitDepthC
+ + (sps->log2_ctb_size <<10)
+ + (pps->constrained_intra_pred_flag <<13)
+ + (sps->sps_strong_intra_smoothing_enable_flag<<14)
+ + (sps->sps_temporal_mvp_enabled_flag <<15)
+ + (pps->log2_parallel_merge_level <<16)
+ + (s->sh.slice_temporal_mvp_enabled_flag <<19)
+ + (sps->pcm.loop_filter_disable_flag <<20)
+ + ((pps->cb_qp_offset&31) <<21)
+ + ((pps->cr_qp_offset&31) <<26));
+
+ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
+ apb_write(rpi, RPI_CURRPOC, s->poc);
+
+ // collocated reads/writes
+ if (sps->sps_temporal_mvp_enabled_flag) {
+ av_assert0(de->dpbno_col < RPIVID_COL_PICS);
+ av_assert0(dpbno_cur < RPIVID_COL_PICS);
+
+ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
+ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride);
+ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
+ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
+ }
+
+#if TRACE_DEV && 0
+ apb_dump_regs(rpi, 0x0, 32);
+ apb_dump_regs(rpi, 0x8000, 24);
+#endif
+
+ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
+ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
+
+ int_wait(rpi, 2);
+
+ tend_phase(rpi, 2);
+ coeffbuf_sem_claimed = 0;
+ sem_post(&rpi->coeffbuf_sem);
+ // Set valid here to avoid race in resolving in any pending phase 2
+ av_rpi_zc_set_valid_frame(f);
+
+ post_phase(rpi, de, 2);
+
+ // Flush frame for CPU access
+ // Arguably the best place would be at the start of phase 2, but here it
+ // will overlap with the wait
+ //
+ // * Even better would be to have better lock/unlock control in ZC for external access
+ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached
+ {
+ rpi_cache_buf_t cbuf;
+ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
+ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
+ rpi_cache_flush_finish(fe);
+ }
+
+#if TRACE_ENTRY
+ printf(">>> %s[%p] OK\n", __func__, de);
+#endif
+
+ dec_env_release(rpi, de);
+ return 0;
+
+fail:
+ av_rpi_zc_set_broken_frame(f);
+ if (coeffbuf_sem_claimed)
+ sem_post(&rpi->coeffbuf_sem);
+ abort_phases(rpi, de); // Dummy any unresolved phases
+
+#if TRACE_ENTRY
+ printf(">>> %s[%p] FAIL\n", __func__, de);
+#endif
+
+ dec_env_release(rpi, de);
+ return rv;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+
+#if TRACE_DEV
+static void dump_data(const uint8_t * p, size_t len)
+{
+ size_t i;
+ for (i = 0; i < len; i += 16) {
+ size_t j;
+ printf("%04x", i);
+ for (j = 0; j != 16; ++j) {
+ printf("%c%02x", i == 8 ? '-' : ' ', p[i+j]);
+ }
+ printf("\n");
+ }
+}
+#endif
+
+#if OPT_EMU
+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
+{
+ unsigned int z = 0;
+ while (idx--) {
+ if (*b++ == 0) {
+ ++z;
+ if (z >= 2 && *b == 3) {
+ ++b;
+ z = 0;
+ }
+ }
+ else {
+ z = 0;
+ }
+ }
+ return b;
+}
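+
+// For illustration: in the byte sequence 00 00 03 01 the 03 is an emulation
+// prevention byte, so ptr_from_index() steps over it without counting it.
+// This keeps the GetBitContext RBSP bit index in step with the raw NAL bytes
+// handed to the hardware when OPT_EMU is set.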
+#endif
+
+static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) {
+ const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
+ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
+ const GetBitContext *gb = &s->HEVClc->gb;
+
+#if OPT_EMU
+ const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
+ const int len = de->nal_size - (ptr - de->nal_buffer);
+#else
+ const int len = 1 + gb->size_in_bits/8 - gb->index/8;
+ const void *ptr = &gb->buffer[gb->index/8];
+#endif
+
+#if TRACE_DEV
+ printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
+ dump_data(de->nal_buffer, 128);
+#endif
+
+ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
+ p1_apb_write(de, RPI_BFNUM, len);
+ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
+ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Wavefront mode
+
+static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
+{
+ const HEVCPPS * const pps = s->ps.pps;
+
+ int i, resetQPY=1;
+ int indep = !s->sh.dependent_slice_segment_flag;
+ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
+
+ if (ctb_addr_ts)
+ wpp_end_previous_slice(de, s, ctb_addr_ts);
+ pre_slice_decode(de, s);
+ WriteBitstream(de, s);
+ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
+ WriteProb(de, s);
+ else if (ctb_col==0)
+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
+ else
+ resetQPY=0;
+ program_slicecmds(de, s->slice_idx);
+ new_slice_segment(de, s);
+ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
+ for (i=0; i<s->sh.num_entry_point_offsets; i++) {
+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
+ int last_x = de->PicWidthInCtbsY-1;
+ if (de->PicWidthInCtbsY>2)
+ wpp_pause(de, ctb_row);
+ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
+ if (de->PicWidthInCtbsY==2)
+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
+ if (de->PicWidthInCtbsY==1)
+ WriteProb(de, s);
+ else
+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
+ ctb_addr_ts += pps->column_width[0];
+ wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Tiles mode
+
+static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
+ const HEVCPPS * const pps = s->ps.pps;
+ int i, resetQPY;
+
+ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
+ pre_slice_decode(de, s);
+ WriteBitstream(de, s);
+ resetQPY = ctb_addr_ts==0
+ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
+ || !s->sh.dependent_slice_segment_flag;
+ if (resetQPY) WriteProb(de, s);
+ program_slicecmds(de, s->slice_idx);
+ new_slice_segment(de, s);
+ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
+ for (i=0; i<s->sh.num_entry_point_offsets; i++) {
+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
+ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
+ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
+ int last_x = pps->col_bd[tile_x+1]-1;
+ int last_y = pps->row_bd[tile_y+1]-1;
+ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
+ WriteProb(de, s);
+ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y];
+ new_entry_point(de, s, 0, 1, ctb_addr_ts);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static int cabac_start_align(HEVCContext *s)
+{
+ GetBitContext *gb = &s->HEVClc->gb;
+ skip_bits(gb, 1);
+ align_get_bits(gb);
+ // Should look at getting rid of this
+ return ff_init_cabac_decoder(&s->HEVClc->cc,
+ gb->buffer + get_bits_count(gb) / 8,
+ (get_bits_left(gb) + 7) / 8);
+}
+
+static int rpi_hevc_decode_slice(
+ AVCodecContext *avctx,
+ const uint8_t *buffer,
+ uint32_t size)
+{
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+ HEVCContext * const s = avctx->priv_data;
+ dec_env_t * const de = dec_env_get(avctx, rpi);
+ const HEVCPPS *pps = s->ps.pps;
+ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+
+#if TRACE_ENTRY
+ printf("<<< %s[%p]\n", __func__, de);
+#endif
+ if (de == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
+ return -1;
+ }
+
+ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
+ return -1;
+ }
+ de->state = RPIVID_DECODE_SLICE;
+
+ de->nal_buffer = buffer;
+ de->nal_size = size;
+
+#if !OPT_EMU
+// ff_hevc_cabac_init(s, ctb_addr_ts);
+ cabac_start_align(s);
+#endif
+ if (s->ps.sps->scaling_list_enable_flag)
+ populate_scaling_factors(de, s);
+ if (pps->entropy_coding_sync_enabled_flag)
+ wpp_decode_slice(de, s, ctb_addr_ts);
+ else
+ decode_slice(de, s, ctb_addr_ts);
+#if TRACE_ENTRY
+ printf(">>> %s[%p]\n", __func__, de);
+#endif
+ dec_env_release(rpi, de);
+ return 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
+{
+ int rv;
+ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0)
+ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
+ return rv;
+}
+
+static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
+{
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+ HEVCContext * const s = avctx->priv_data;
+ // Frame buffering + 1 output. Otherwise we would need thread_count extra
+ // buffers, but we now alloc at the start of phase 2, so that is the only
+ // thread that needs the extra buffer.
+ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
+ int rv;
+
+ if (av_rpi_zc_in_use(avctx))
+ {
+ const AVZcEnvPtr zc = avctx->opaque;
+ av_rpi_zc_set_decoder_pool_size(zc, pool_req);
+ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc
+ }
+ else
+ {
+ if (rpi->zc == NULL) {
+ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
+ // Alloc inside lock to make sure we only ever alloc one
+ if (rpi->zc == NULL) {
+ rpi->zc = av_rpi_zc_int_env_alloc(s);
+ }
+ pthread_mutex_unlock(&rpi->phase_lock);
+ }
+ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
+ rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) :
+ av_rpi_zc_get_buffer(rpi->zc, frame);
+ }
+
+ if (rv == 0 &&
+ (rv = ff_attach_decode_data(frame)) < 0)
+ {
+ av_frame_unref(frame);
+ }
+
+ if (rv == 0)
+ {
+ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
+ fdd->post_process = rpivid_retrieve_data;
+ }
+
+ return rv;
+}
+
+#if OPT_PHASE_TIMING
+static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins)
+{
+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n",
+ bins[0], bins[1], bins[2], bins[3],
+ bins[4], bins[5], bins[6], bins[7], bins[8]);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+
+static int rpi_hevc_free(AVCodecContext *avctx) {
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+
+#if TRACE_ENTRY
+ printf("<<< %s\n", __func__);
+#endif
+
+ dec_env_release(rpi, NULL);
+
+ // Wait for everything else to stop
+ {
+ struct timespec tt;
+ clock_gettime(CLOCK_REALTIME, &tt);
+ tt.tv_sec += 2;
+ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
+ const int err = errno;
+ if (err == ETIMEDOUT) {
+ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
+ return -1;
+ }
+ if (err != EINTR) {
+ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
+ break;
+ }
+ }
+ }
+
+#if OPT_PHASE_TIMING
+ {
+ unsigned int i;
+ for (i = 0; i != RPIVID_PHASES; ++i) {
+ const phase_wait_env_t * const p = rpi->phase_reqs + i;
+ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
+ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
+ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n",
+ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
+ time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
+ log_bin_phase(avctx, p->time_bins);
+ log_bin_phase(avctx, p->time_bins3);
+ log_bin_phase(avctx, p->time_bins5);
+ av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
+ (unsigned int)(p->max_phase_time / 1000),
+ p->max_time_decode_order);
+ }
+ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
+ }
+#endif
+
+ if (rpi->dec_envs != NULL)
+ {
+ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) {
+ dec_env_delete(rpi->dec_envs[i]);
+ }
+ av_freep(&rpi->dec_envs);
+ }
+
+ av_rpi_zc_int_env_freep(&rpi->zc);
+
+ gpu_free(&rpi->gcolbuf);
+
+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
+ gpu_free(rpi->gbitbufs + i);
+ }
+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
+ gpu_free(rpi->gcoeffbufs + i);
+ }
+
+ unmap_devp(&rpi->regs, REGS_SIZE);
+ unmap_devp(&rpi->ints, INTS_SIZE);
+
+ if (rpi->gpu_init_type > 0)
+ rpi_mem_gpu_uninit();
+
+ if (rpi->mbox_fd >= 0) {
+ mbox_release_clock(rpi->mbox_fd);
+ mbox_close(rpi->mbox_fd);
+ }
+
+ sem_destroy(&rpi->ref_zero);
+ sem_destroy(&rpi->coeffbuf_sem);
+ sem_destroy(&rpi->bitbuf_sem);
+
+#if TRACE_ENTRY
+ printf(">>> %s\n", __func__);
+#endif
+ return 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+static int rpi_hevc_init(AVCodecContext *avctx) {
+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
+// const char *err;
+
+#if TRACE_ENTRY
+ printf("<<< %s\n", __func__);
+#endif
+
+ if (avctx->width>4096 || avctx->height>4096) {
+ av_log(avctx, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
+ return AVERROR(ENOTSUP);
+ }
+
+ memset(rpi, 0, sizeof(*rpi));
+
+ rpi->mbox_fd = -1;
+ rpi->decode_order = 0;
+
+ // Initial PU/COEFF stream buffer split chosen as worst case seen so far
+ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
+
+
+ atomic_store(&rpi->ref_count, 1);
+ sem_init(&rpi->ref_zero, 0, 0);
+
+ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS);
+ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
+
+ pthread_mutex_init(&rpi->phase_lock, NULL);
+
+ if ((rpi->mbox_fd = mbox_open()) < 0)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
+ goto fail;
+ }
+ mbox_request_clock(rpi->mbox_fd);
+
+ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
+ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
+ goto fail;
+ }
+
+ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
+ goto fail;
+ }
+
+ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
+ goto fail;
+ }
+
+ rpi->col_stride = rnd64(avctx->width);
+ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4);
+ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
+ goto fail;
+ }
+
+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
+ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i);
+ goto fail;
+ }
+ }
+
+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
+ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
+ {
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i);
+ goto fail;
+ }
+ }
+
+ av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
+
+ return 0;
+
+fail:
+ rpi_hevc_free(avctx);
+ return AVERROR_EXTERNAL;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
+ .name = "hevc_rpi4_8",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .id = AV_CODEC_ID_HEVC,
+ .pix_fmt = AV_PIX_FMT_RPI4_8,
+ .alloc_frame = rpivid_hevc_alloc_frame,
+ .start_frame = rpi_hevc_start_frame,
+ .end_frame = rpi_hevc_end_frame,
+ .abort_frame = rpi_hevc_abort_frame,
+ .decode_slice = rpi_hevc_decode_slice,
+ .init = rpi_hevc_init,
+ .uninit = rpi_hevc_free,
+ .priv_data_size = sizeof(RPI_T),
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
+};
+
+const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
+ .name = "hevc_rpi4_10",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .id = AV_CODEC_ID_HEVC,
+ .pix_fmt = AV_PIX_FMT_RPI4_10,
+ .alloc_frame = rpivid_hevc_alloc_frame,
+ .start_frame = rpi_hevc_start_frame,
+ .end_frame = rpi_hevc_end_frame,
+ .abort_frame = rpi_hevc_abort_frame,
+ .decode_slice = rpi_hevc_decode_slice,
+ .init = rpi_hevc_init,
+ .uninit = rpi_hevc_free,
+ .priv_data_size = sizeof(RPI_T),
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
+};
+
diff --git a/libavcodec/rpzaenc.c b/libavcodec/rpzaenc.c
index b208753e2b..baf067c205 100644
--- a/libavcodec/rpzaenc.c
+++ b/libavcodec/rpzaenc.c
@@ -204,7 +204,7 @@ static void get_max_component_diff(BlockInfo *bi, uint16_t *block_ptr,
// loop thru and compare pixels
for (y = 0; y < bi->block_height; y++) {
- for (x = 0; x < bi->block_width; x++) {
+ for (x = 0; x < bi->block_width; x++){
// TODO: optimize
min_r = FFMIN(R(block_ptr[x]), min_r);
min_g = FFMIN(G(block_ptr[x]), min_g);
@@ -276,7 +276,7 @@ static int leastsquares(uint16_t *block_ptr, BlockInfo *bi,
return -1;
for (i = 0; i < bi->block_height; i++) {
- for (j = 0; j < bi->block_width; j++) {
+ for (j = 0; j < bi->block_width; j++){
x = GET_CHAN(block_ptr[j], xchannel);
y = GET_CHAN(block_ptr[j], ychannel);
sumx += x;
@@ -323,7 +323,7 @@ static int calc_lsq_max_fit_error(uint16_t *block_ptr, BlockInfo *bi,
int max_err = 0;
for (i = 0; i < bi->block_height; i++) {
- for (j = 0; j < bi->block_width; j++) {
+ for (j = 0; j < bi->block_width; j++){
int x_inc, lin_y, lin_x;
x = GET_CHAN(block_ptr[j], xchannel);
y = GET_CHAN(block_ptr[j], ychannel);
@@ -418,9 +418,7 @@ static void update_block_in_prev_frame(const uint16_t *src_pixels,
uint16_t *dest_pixels,
const BlockInfo *bi, int block_counter)
{
- const int y_size = FFMIN(4, bi->image_height - bi->row * 4);
-
- for (int y = 0; y < y_size; y++) {
+ for (int y = 0; y < 4; y++) {
memcpy(dest_pixels, src_pixels, 8);
dest_pixels += bi->rowstride;
src_pixels += bi->rowstride;
@@ -730,15 +728,14 @@ post_skip :
if (err > s->sixteen_color_thresh) { // DO SIXTEEN COLOR BLOCK
uint16_t *row_ptr;
- int y_size, rgb555;
+ int rgb555;
block_offset = get_block_info(&bi, block_counter);
row_ptr = &src_pixels[block_offset];
- y_size = FFMIN(4, bi.image_height - bi.row * 4);
- for (int y = 0; y < y_size; y++) {
- for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++){
rgb555 = row_ptr[x] & ~0x8000;
put_bits(&s->pb, 16, rgb555);
@@ -746,11 +743,6 @@ post_skip :
row_ptr += bi.rowstride;
}
- for (int y = y_size; y < 4; y++) {
- for (int x = 0; x < 4; x++)
- put_bits(&s->pb, 16, 0);
- }
-
block_counter++;
} else { // FOUR COLOR BLOCK
block_counter += encode_four_color_block(min_color, max_color,
diff --git a/libavcodec/sbrdsp_fixed.c b/libavcodec/sbrdsp_fixed.c
index 0d34a2a710..43fcc90ae5 100644
--- a/libavcodec/sbrdsp_fixed.c
+++ b/libavcodec/sbrdsp_fixed.c
@@ -114,8 +114,8 @@ static void sbr_qmf_deint_neg_c(int *v, const int *src)
{
int i;
for (i = 0; i < 32; i++) {
- v[ i] = (int)(0x10U + src[63 - 2*i ]) >> 5;
- v[63 - i] = (int)(0x10U - src[63 - 2*i - 1]) >> 5;
+ v[ i] = ( src[63 - 2*i ] + 0x10) >> 5;
+ v[63 - i] = (-src[63 - 2*i - 1] + 0x10) >> 5;
}
}
diff --git a/libavcodec/scpr.c b/libavcodec/scpr.c
index f78f43b5cd..2a0ebcecfc 100644
--- a/libavcodec/scpr.c
+++ b/libavcodec/scpr.c
@@ -459,9 +459,6 @@ static int decompress_p(AVCodecContext *avctx,
int run, bx = x * 16 + sx1, by = y * 16 + sy1;
uint32_t r, g, b, clr, ptype = 0;
- if (bx >= avctx->width)
- return AVERROR_INVALIDDATA;
-
for (; by < y * 16 + sy2 && by < avctx->height;) {
ret = decode_value(s, s->op_model[ptype], 6, 1000, &ptype);
if (ret < 0)
diff --git a/libavcodec/scpr3.c b/libavcodec/scpr3.c
index 274f99ce71..78c58889cb 100644
--- a/libavcodec/scpr3.c
+++ b/libavcodec/scpr3.c
@@ -1168,9 +1168,6 @@ static int decompress_p3(AVCodecContext *avctx,
int run, bx = x * 16 + sx1, by = y * 16 + sy1;
uint32_t clr, ptype = 0, r, g, b;
- if (bx >= avctx->width)
- return AVERROR_INVALIDDATA;
-
for (; by < y * 16 + sy2 && by < avctx->height;) {
ret = decode_value3(s, 5, &s->op_model3[ptype].cntsum,
s->op_model3[ptype].freqs[0],
diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
index a0e8745ce3..16d2b7c302 100644
--- a/libavcodec/snowenc.c
+++ b/libavcodec/snowenc.c
@@ -1544,10 +1544,10 @@ static void calculate_visual_weight(SnowContext *s, Plane *p){
int level, orientation, x, y;
for(level=0; level<s->spatial_decomposition_count; level++){
- int64_t error=0;
for(orientation=level ? 1 : 0; orientation<4; orientation++){
SubBand *b= &p->band[level][orientation];
IDWTELEM *ibuf= b->ibuf;
+ int64_t error=0;
memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
@@ -1558,13 +1558,9 @@ static void calculate_visual_weight(SnowContext *s, Plane *p){
error += d*d;
}
}
- if (orientation == 2)
- error /= 2;
+
b->qlog= (int)(QROOT * log2(352256.0/sqrt(error)) + 0.5);
- if (orientation != 1)
- error = 0;
}
- p->band[level][1].qlog = p->band[level][2].qlog;
}
}
diff --git a/libavcodec/sonic.c b/libavcodec/sonic.c
index 8662737837..c049f6aedc 100644
--- a/libavcodec/sonic.c
+++ b/libavcodec/sonic.c
@@ -1004,7 +1004,7 @@ static int sonic_decode_frame(AVCodecContext *avctx,
// dequantize
for (i = 0; i < s->num_taps; i++)
- s->predictor_k[i] *= (unsigned) s->tap_quant[i];
+ s->predictor_k[i] *= s->tap_quant[i];
if (s->lossless)
quant = 1;
diff --git a/libavcodec/speedhq.c b/libavcodec/speedhq.c
index 5bf03a35e6..711bcd66d7 100644
--- a/libavcodec/speedhq.c
+++ b/libavcodec/speedhq.c
@@ -498,9 +498,7 @@ static int speedhq_decode_frame(AVCodecContext *avctx,
uint32_t second_field_offset;
int ret;
- if (buf_size < 4 || avctx->width < 8 || avctx->width % 8 != 0)
- return AVERROR_INVALIDDATA;
- if (buf_size < avctx->width*avctx->height / 64 / 4)
+ if (buf_size < 4 || avctx->width < 8)
return AVERROR_INVALIDDATA;
quality = buf[0];
diff --git a/libavcodec/sunrast.c b/libavcodec/sunrast.c
index 991915fa62..e1ec8a0832 100644
--- a/libavcodec/sunrast.c
+++ b/libavcodec/sunrast.c
@@ -19,7 +19,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
@@ -76,12 +75,6 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
return AVERROR_PATCHWELCOME;
}
- if (maplength > 768) {
- av_log(avctx, AV_LOG_WARNING, "invalid colormap length\n");
- return AVERROR_INVALIDDATA;
- }
-
- // This also checks depth to be valid
switch (depth) {
case 1:
avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_MONOWHITE;
@@ -103,23 +96,15 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
return AVERROR_INVALIDDATA;
}
- // This checks w and h to be valid in the sense that bytes of a padded bitmap are addressable with 32bit int
ret = ff_set_dimensions(avctx, w, h);
if (ret < 0)
return ret;
- // ensured by ff_set_dimensions()
- av_assert0(w <= (INT32_MAX - 7) / depth);
-
/* scanlines are aligned on 16 bit boundaries */
len = (depth * w + 7) >> 3;
alen = len + (len & 1);
- // ensured by ff_set_dimensions()
- av_assert0(h <= INT32_MAX / (3 * len));
-
- // maplength is limited to 768 and the right term is limited to INT32_MAX / 256 so the add needs no check
- if (buf_end - buf < (uint64_t)maplength + (len * h) * 3 / 256)
+ if (buf_end - buf < maplength + (len * h) * 3 / 256)
return AVERROR_INVALIDDATA;
if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
@@ -133,7 +118,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
} else if (maplength) {
unsigned int len = maplength / 3;
- if (maplength % 3) {
+ if (maplength % 3 || maplength > 768) {
av_log(avctx, AV_LOG_WARNING, "invalid colormap length\n");
return AVERROR_INVALIDDATA;
}
diff --git a/libavcodec/takdsp.c b/libavcodec/takdsp.c
index a8f9dba342..9cb8052596 100644
--- a/libavcodec/takdsp.c
+++ b/libavcodec/takdsp.c
@@ -65,7 +65,7 @@ static void decorrelate_sf(int32_t *p1, int32_t *p2, int length, int dshift, int
for (i = 0; i < length; i++) {
int32_t a = p1[i];
int32_t b = p2[i];
- b = (unsigned)((int)(dfactor * (unsigned)(b >> dshift) + 128) >> 8) << dshift;
+ b = (unsigned)(dfactor * (b >> dshift) + 128 >> 8) << dshift;
p1[i] = b - a;
}
}
diff --git a/libavcodec/tests/snowenc.c b/libavcodec/tests/snowenc.c
index 65699158ca..d5f94e8a61 100644
--- a/libavcodec/tests/snowenc.c
+++ b/libavcodec/tests/snowenc.c
@@ -31,13 +31,11 @@ int main(void){
#define width 256
#define height 256
int buffer[2][width*height];
- short obuffer[width*height];
SnowContext s;
int i;
AVLFG prng;
s.spatial_decomposition_count=6;
s.spatial_decomposition_type=1;
- int ret = 0;
s.temp_dwt_buffer = av_mallocz_array(width, sizeof(DWTELEM));
s.temp_idwt_buffer = av_mallocz_array(width, sizeof(IDWTELEM));
@@ -51,34 +49,24 @@ int main(void){
printf("testing 5/3 DWT\n");
for(i=0; i<width*height; i++)
- buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 19000 - 9000;
+ buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
- for(i=0; i<width*height; i++)
- obuffer[i] = buffer[0][i];
- ff_spatial_idwt(obuffer, s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+ ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
for(i=0; i<width*height; i++)
- if(buffer[1][i]!= obuffer[i]) {
- printf("fsck: %4dx%4dx %12d %7d\n",i%width, i/width, buffer[1][i], obuffer[i]);
- ret = 1;
- }
+ if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
printf("testing 9/7 DWT\n");
s.spatial_decomposition_type=0;
for(i=0; i<width*height; i++)
- buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 11000 - 5000;
+ buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
- for(i=0; i<width*height; i++)
- obuffer[i] = buffer[0][i];
- ff_spatial_idwt(obuffer, s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+ ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
for(i=0; i<width*height; i++)
- if(FFABS(buffer[1][i] - obuffer[i])>20) {
- printf("fsck: %4dx%4d %12d %7d\n",i%width, i/width, buffer[1][i], obuffer[i]);
- ret = 1;
- }
+ if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
{
int level, orientation, x, y;
@@ -93,18 +81,18 @@ int main(void){
int w= width >> (s.spatial_decomposition_count-level);
int h= height >> (s.spatial_decomposition_count-level);
int stride= width << (s.spatial_decomposition_count-level);
- IDWTELEM *buf= obuffer;
+ DWTELEM *buf= buffer[0];
int64_t error=0;
if(orientation&1) buf+=w;
if(orientation>1) buf+=stride>>1;
- memset(obuffer, 0, sizeof(short)*width*height);
- buf[w/2 + h/2*stride]= 8*256;
- ff_spatial_idwt(obuffer, s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+ memset(buffer[0], 0, sizeof(int)*width*height);
+ buf[w/2 + h/2*stride]= 256*256;
+ ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
for(y=0; y<height; y++){
for(x=0; x<width; x++){
- int64_t d= obuffer[x + y*width];
+ int64_t d= buffer[0][x + y*width];
error += d*d;
if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
}
@@ -155,5 +143,5 @@ int main(void){
}
}
- return ret;
+ return 0;
}
diff --git a/libavcodec/texturedspenc.c b/libavcodec/texturedspenc.c
index 5ce72cbd1e..3d68e0cf39 100644
--- a/libavcodec/texturedspenc.c
+++ b/libavcodec/texturedspenc.c
@@ -255,11 +255,11 @@ static void optimize_colors(const uint8_t *block, ptrdiff_t stride,
muv = minv = maxv = bp[0];
for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
+ for (x = 4; x < 4; x += 4) {
muv += bp[x * 4 + y * stride];
- if (bp[x * 4 + y * stride] < minv)
+ if (bp[x] < minv)
minv = bp[x * 4 + y * stride];
- else if (bp[x * 4 + y * stride] > maxv)
+ else if (bp[x] > maxv)
maxv = bp[x * 4 + y * stride];
}
}
diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 05187dce51..f8c68f1e7d 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -100,6 +100,7 @@ typedef struct TiffContext {
int is_tiled;
int tile_byte_counts_offset, tile_offsets_offset;
int tile_width, tile_length;
+ int tile_count;
int is_jpeg;
@@ -734,6 +735,19 @@ static int dng_decode_jpeg(AVCodecContext *avctx, AVFrame *frame,
return 0;
}
+static int dng_decode_strip(AVCodecContext *avctx, AVFrame *frame)
+{
+ TiffContext *s = avctx->priv_data;
+
+ s->jpgframe->width = s->width;
+ s->jpgframe->height = s->height;
+
+ s->avctx_mjpeg->width = s->width;
+ s->avctx_mjpeg->height = s->height;
+
+ return dng_decode_jpeg(avctx, frame, s->stripsize, 0, 0, s->width, s->height);
+}
+
static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
const uint8_t *src, int size, int strip_start, int lines)
{
@@ -772,7 +786,6 @@ static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int strid
if (s->is_bayer) {
av_assert0(width == (s->bpp * s->width + 7) >> 3);
}
- av_assert0(!(s->is_bayer && is_yuv));
if (p->format == AV_PIX_FMT_GRAY12) {
av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, width);
if (s->yuv_line == NULL) {
@@ -856,9 +869,7 @@ static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int strid
av_log(s->avctx, AV_LOG_ERROR, "More than one DNG JPEG strips unsupported\n");
return AVERROR_PATCHWELCOME;
}
- if (!s->is_bayer)
- return AVERROR_PATCHWELCOME;
- if ((ret = dng_decode_jpeg(s->avctx, p, s->stripsize, 0, 0, s->width, s->height)) < 0)
+ if ((ret = dng_decode_strip(s->avctx, p)) < 0)
return ret;
return 0;
}
@@ -976,8 +987,11 @@ static int dng_decode_tiles(AVCodecContext *avctx, AVFrame *frame,
int pos_x = 0, pos_y = 0;
int ret;
- if (s->tile_width <= 0 || s->tile_length <= 0)
- return AVERROR_INVALIDDATA;
+ s->jpgframe->width = s->tile_width;
+ s->jpgframe->height = s->tile_length;
+
+ s->avctx_mjpeg->width = s->tile_width;
+ s->avctx_mjpeg->height = s->tile_length;
has_width_leftover = (s->width % s->tile_width != 0);
has_height_leftover = (s->height % s->tile_length != 0);
@@ -987,7 +1001,7 @@ static int dng_decode_tiles(AVCodecContext *avctx, AVFrame *frame,
tile_count_y = (s->height + s->tile_length - 1) / s->tile_length;
/* Iterate over the number of tiles */
- for (tile_idx = 0; tile_idx < tile_count_x * tile_count_y; tile_idx++) {
+ for (tile_idx = 0; tile_idx < s->tile_count; tile_idx++) {
tile_x = tile_idx % tile_count_x;
tile_y = tile_idx / tile_count_x;
@@ -1407,6 +1421,7 @@ static int tiff_decode_tag(TiffContext *s, AVFrame *frame)
break;
case TIFF_TILE_OFFSETS:
s->tile_offsets_offset = off;
+ s->tile_count = count;
s->is_tiled = 1;
break;
case TIFF_TILE_BYTE_COUNTS:
@@ -1762,7 +1777,7 @@ static int decode_frame(AVCodecContext *avctx,
TiffContext *const s = avctx->priv_data;
AVFrame *const p = data;
ThreadFrame frame = { .f = data };
- unsigned off, last_off = 0;
+ unsigned off, last_off;
int le, ret, plane, planes;
int i, j, entries, stride;
unsigned soff, ssize;
@@ -1827,6 +1842,7 @@ again:
/** whether we should process this multi-page IFD's next page */
retry_for_page = s->get_page && s->cur_page + 1 < s->get_page; // get_page is 1-indexed
+ last_off = off;
if (retry_for_page) {
// set offset to the next IFD
off = ff_tget_long(&s->gb, le);
@@ -1844,7 +1860,6 @@ again:
avpriv_request_sample(s->avctx, "non increasing IFD offset");
return AVERROR_INVALIDDATA;
}
- last_off = off;
if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
return AVERROR_INVALIDDATA;
@@ -1901,7 +1916,7 @@ again:
return AVERROR_INVALIDDATA;
}
- has_tile_bits = s->is_tiled || s->tile_byte_counts_offset || s->tile_offsets_offset || s->tile_width || s->tile_length;
+ has_tile_bits = s->is_tiled || s->tile_byte_counts_offset || s->tile_offsets_offset || s->tile_width || s->tile_length || s->tile_count;
has_strip_bits = s->strippos || s->strips || s->stripoff || s->rps || s->sot || s->sstype || s->stripsize || s->stripsizesoff;
if (has_tile_bits && has_strip_bits) {
@@ -2154,7 +2169,6 @@ static av_cold int tiff_init(AVCodecContext *avctx)
s->avctx_mjpeg->flags2 = avctx->flags2;
s->avctx_mjpeg->dct_algo = avctx->dct_algo;
s->avctx_mjpeg->idct_algo = avctx->idct_algo;
- s->avctx_mjpeg->max_pixels = avctx->max_pixels;
ret = avcodec_open2(s->avctx_mjpeg, codec, NULL);
if (ret < 0) {
return ret;
diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index 3630afcfae..f1e159b03d 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -371,15 +371,8 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
case 3: {
// shift samples for 24-bit sample format
int32_t *samples = (int32_t *)frame->data[0];
- int overflow = 0;
-
- for (i = 0; i < framelen * s->channels; i++) {
- int scaled = *samples * 256U;
- overflow += (scaled >> 8 != *samples);
- *samples++ = scaled;
- }
- if (overflow)
- av_log(avctx, AV_LOG_WARNING, "%d overflows occurred on 24bit upscale\n", overflow);
+ for (i = 0; i < framelen * s->channels; i++)
+ *samples++ *= 256;
// reset decode buffer
s->decode_buffer = NULL;
break;
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index fdc3de1b1d..825094d2f3 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -236,8 +236,6 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
case AV_PIX_FMT_GBRAP16BE:
w_align = 16; //FIXME assume 16 pixel per macroblock
h_align = 16 * 2; // interlaced needs 2 macroblocks height
- if (s->codec_id == AV_CODEC_ID_BINKVIDEO)
- w_align = 16*2;
break;
case AV_PIX_FMT_YUV411P:
case AV_PIX_FMT_YUVJ411P:
@@ -316,7 +314,6 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
*width = FFALIGN(*width, w_align);
*height = FFALIGN(*height, h_align);
if (s->codec_id == AV_CODEC_ID_H264 || s->lowres ||
- s->codec_id == AV_CODEC_ID_VC1 || s->codec_id == AV_CODEC_ID_WMV3 ||
s->codec_id == AV_CODEC_ID_VP5 || s->codec_id == AV_CODEC_ID_VP6 ||
s->codec_id == AV_CODEC_ID_VP6F || s->codec_id == AV_CODEC_ID_VP6A
) {
@@ -330,9 +327,6 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
// the next rounded up width is 32
*width = FFMAX(*width, 32);
}
- if (s->codec_id == AV_CODEC_ID_SVQ3) {
- *width = FFMAX(*width, 32);
- }
for (i = 0; i < 4; i++)
linesize_align[i] = STRIDE_ALIGN;
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 4b2679eb38..8d80d19788 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -21,6 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <drm_fourcc.h>
#include <linux/videodev2.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
@@ -29,57 +30,88 @@
#include <poll.h>
#include "libavcodec/avcodec.h"
#include "libavcodec/internal.h"
+#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext.h"
#include "v4l2_context.h"
#include "v4l2_buffers.h"
#include "v4l2_m2m.h"
+#include "v4l2_req_dmabufs.h"
+#include "weak_link.h"
#define USEC_PER_SEC 1000000
-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
{
- return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
- container_of(buf->context, V4L2m2mContext, output) :
- container_of(buf->context, V4L2m2mContext, capture);
+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+ container_of(ctx, V4L2m2mContext, output) :
+ container_of(ctx, V4L2m2mContext, capture);
}
-static inline AVCodecContext *logger(V4L2Buffer *buf)
+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
{
- return buf_to_m2mctx(buf)->avctx;
+ return ctx_to_m2mctx(buf->context);
}
-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
+static inline AVCodecContext *logger(const V4L2Buffer * const buf)
{
- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ return buf_to_m2mctx(buf)->avctx;
+}
- if (s->avctx->pkt_timebase.num)
- return s->avctx->pkt_timebase;
- return s->avctx->time_base;
+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
+{
+ const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ const AVRational tb = s->avctx->pkt_timebase.num ?
+ s->avctx->pkt_timebase :
+ s->avctx->time_base;
+ return tb.num && tb.den ? tb : v4l2_timebase;
}
-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
+static inline struct timeval tv_from_int(const int64_t t)
{
- int64_t v4l2_pts;
+ return (struct timeval){
+ .tv_usec = t % USEC_PER_SEC,
+ .tv_sec = t / USEC_PER_SEC
+ };
+}
- if (pts == AV_NOPTS_VALUE)
- pts = 0;
+static inline int64_t int_from_tv(const struct timeval t)
+{
+ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
+}
+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
+{
/* convert pts to v4l2 timebase */
- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
+ const int64_t v4l2_pts =
+ pts == AV_NOPTS_VALUE ? 0 :
+ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+ out->buf.timestamp = tv_from_int(v4l2_pts);
}
-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
{
- int64_t v4l2_pts;
-
+ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
+ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
+#if 0
/* convert pts back to encoder timebase */
- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
- avbuf->buf.timestamp.tv_usec;
+ return
+ avbuf->context->no_pts_rescale ? v4l2_pts :
+ v4l2_pts == 0 ? AV_NOPTS_VALUE :
+ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+#endif
+}
- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
+{
+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+ out->planes[plane].bytesused = bytesused;
+ out->planes[plane].length = length;
+ } else {
+ out->buf.bytesused = bytesused;
+ out->buf.length = length;
+ }
}
static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
@@ -116,6 +148,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
return AVCOL_PRI_UNSPECIFIED;
}
+static void v4l2_set_color(V4L2Buffer *buf,
+ const enum AVColorPrimaries avcp,
+ const enum AVColorSpace avcs,
+ const enum AVColorTransferCharacteristic avxc)
+{
+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
+
+ switch (avcp) {
+ case AVCOL_PRI_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ ycbcr = V4L2_YCBCR_ENC_709;
+ break;
+ case AVCOL_PRI_BT470M:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ ycbcr = V4L2_YCBCR_ENC_601;
+ break;
+ case AVCOL_PRI_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_PRI_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_PRI_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_PRI_BT2020:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ case AVCOL_PRI_SMPTE428:
+ case AVCOL_PRI_SMPTE431:
+ case AVCOL_PRI_SMPTE432:
+ case AVCOL_PRI_EBU3213:
+ case AVCOL_PRI_RESERVED:
+ case AVCOL_PRI_FILM:
+ case AVCOL_PRI_UNSPECIFIED:
+ default:
+ break;
+ }
+
+ switch (avcs) {
+ case AVCOL_SPC_RGB:
+ cs = V4L2_COLORSPACE_SRGB;
+ break;
+ case AVCOL_SPC_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ break;
+ case AVCOL_SPC_FCC:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ break;
+ case AVCOL_SPC_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_SPC_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_SPC_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_SPC_BT2020_CL:
+ cs = V4L2_COLORSPACE_BT2020;
+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
+ break;
+ case AVCOL_SPC_BT2020_NCL:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ default:
+ break;
+ }
+
+    switch (avxc) {
+ case AVCOL_TRC_BT709:
+ xfer = V4L2_XFER_FUNC_709;
+ break;
+ case AVCOL_TRC_IEC61966_2_1:
+ xfer = V4L2_XFER_FUNC_SRGB;
+ break;
+ case AVCOL_TRC_SMPTE240M:
+ xfer = V4L2_XFER_FUNC_SMPTE240M;
+ break;
+ case AVCOL_TRC_SMPTE2084:
+ xfer = V4L2_XFER_FUNC_SMPTE2084;
+ break;
+ default:
+ break;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
+ buf->context->format.fmt.pix_mp.colorspace = cs;
+ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
+ buf->context->format.fmt.pix_mp.xfer_func = xfer;
+ } else {
+ buf->context->format.fmt.pix.colorspace = cs;
+ buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
+ buf->context->format.fmt.pix.xfer_func = xfer;
+ }
+}
+
static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
{
enum v4l2_quantization qt;
@@ -134,6 +265,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
return AVCOL_RANGE_UNSPECIFIED;
}
+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
+{
+ const enum v4l2_quantization q =
+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
+ V4L2_QUANTIZATION_DEFAULT;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
+ buf->context->format.fmt.pix_mp.quantization = q;
+ } else {
+ buf->context->format.fmt.pix.quantization = q;
+ }
+}
+
static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
{
enum v4l2_ycbcr_encoding ycbcr;
@@ -210,73 +355,178 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
return AVCOL_TRC_UNSPECIFIED;
}
-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
{
- V4L2Buffer* avbuf = opaque;
- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
+}
- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
+{
+ return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
+}
- if (s->reinit) {
- if (!atomic_load(&s->refcount))
- sem_post(&s->refsync);
- } else {
- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
- /* no need to queue more buffers to the driver */
- avbuf->status = V4L2BUF_AVAILABLE;
- }
- else if (avbuf->context->streamon)
- ff_v4l2_buffer_enqueue(avbuf);
- }
+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
+{
+ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
+ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
+}
- av_buffer_unref(&avbuf->context_ref);
+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
+{
+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
+ AVDRMLayerDescriptor *layer;
+
+ /* fill the DRM frame descriptor */
+ drm_desc->nb_objects = avbuf->num_planes;
+ drm_desc->nb_layers = 1;
+
+ layer = &drm_desc->layers[0];
+ layer->nb_planes = avbuf->num_planes;
+
+ for (int i = 0; i < avbuf->num_planes; i++) {
+ layer->planes[i].object_index = i;
+ layer->planes[i].offset = avbuf->plane_info[i].offset;
+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
}
+
+ switch (avbuf->context->av_pix_fmt) {
+ case AV_PIX_FMT_YUYV422:
+
+ layer->format = DRM_FORMAT_YUYV;
+ layer->nb_planes = 1;
+
+ break;
+
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV21:
+
+ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+
+ layer->format = DRM_FORMAT_YUV420;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 3;
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
+
+ layer->planes[2].object_index = 0;
+ layer->planes[2].offset = layer->planes[1].offset +
+ ((avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height) >> 2);
+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
+ break;
+
+ default:
+ drm_desc->nb_layers = 0;
+ break;
+ }
+
+ return (uint8_t *) drm_desc;
}
-static int v4l2_buf_increase_ref(V4L2Buffer *in)
+static void v4l2_free_bufref(void *opaque, uint8_t *data)
{
- V4L2m2mContext *s = buf_to_m2mctx(in);
+ AVBufferRef * bufref = (AVBufferRef *)data;
+ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
+ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
- if (in->context_ref)
- atomic_fetch_add(&in->context_refcount, 1);
- else {
- in->context_ref = av_buffer_ref(s->self_ref);
- if (!in->context_ref)
- return AVERROR(ENOMEM);
+ if (ctx != NULL) {
+ // Buffer still attached to context
+ V4L2m2mContext *s = buf_to_m2mctx(avbuf);
- in->context_refcount = 1;
- }
+ ff_mutex_lock(&ctx->lock);
- in->status = V4L2BUF_RET_USER;
- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
+ ff_v4l2_buffer_set_avail(avbuf);
- return 0;
+ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
+ /* no need to queue more buffers to the driver */
+ }
+ else if (ctx->streamon) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
+ avbuf->buf.timestamp.tv_sec = 0;
+ avbuf->buf.timestamp.tv_usec = 0;
+ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER
+ }
+ else {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
+ }
+
+ ff_mutex_unlock(&ctx->lock);
+ }
+
+ ff_weak_link_unlock(avbuf->context_wl);
+ av_buffer_unref(&bufref);
}
-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
+static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i)
{
- int ret;
+ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length;
+}
- if (plane >= in->num_planes)
- return AVERROR(EINVAL);
+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
+{
+ int i, ret;
+ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf);
- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
- in->plane_info[plane].length, v4l2_free_buffer, in, 0);
- if (!*buf)
- return AVERROR(ENOMEM);
+ for (i = 0; i < avbuf->num_planes; i++) {
+ int dma_fd = -1;
+ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i);
+
+ if (s->db_ctl != NULL) {
+ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL)
+ return AVERROR(ENOMEM);
+ dma_fd = dmabuf_fd(avbuf->dmabuf[i]);
+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type))
+ avbuf->buf.m.planes[i].m.fd = dma_fd;
+ else
+ avbuf->buf.m.fd = dma_fd;
+ }
+ else {
+ struct v4l2_exportbuffer expbuf;
+ memset(&expbuf, 0, sizeof(expbuf));
+
+ expbuf.index = avbuf->buf.index;
+ expbuf.type = avbuf->buf.type;
+ expbuf.plane = i;
+
+ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf);
+ if (ret < 0)
+ return AVERROR(errno);
+ dma_fd = expbuf.fd;
+ }
- ret = v4l2_buf_increase_ref(in);
- if (ret)
- av_buffer_unref(buf);
+ avbuf->drm_frame.objects[i].size = blen;
+ avbuf->drm_frame.objects[i].fd = dma_fd;
+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ }
- return ret;
+ return 0;
}
static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
{
unsigned int bytesused, length;
+ int rv = 0;
if (plane >= out->num_planes)
return AVERROR(EINVAL);
@@ -284,32 +534,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
length = out->plane_info[plane].length;
bytesused = FFMIN(size+offset, length);
- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
-
- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
- out->planes[plane].bytesused = bytesused;
- out->planes[plane].length = length;
- } else {
- out->buf.bytesused = bytesused;
- out->buf.length = length;
+ if (size > length - offset) {
+ size = length - offset;
+ rv = AVERROR(ENOMEM);
}
- return 0;
+ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
+
+ set_buf_length(out, plane, bytesused, length);
+
+ return rv;
+}
+
+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
+{
+ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
+ AVBufferRef * newbuf;
+
+ if (!bufref)
+ return NULL;
+
+ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
+ if (newbuf == NULL)
+ av_buffer_unref(&bufref);
+
+ avbuf->status = V4L2BUF_RET_USER;
+ return newbuf;
}
static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
{
- int i, ret;
+ int i;
frame->format = avbuf->context->av_pix_fmt;
- for (i = 0; i < avbuf->num_planes; i++) {
- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
- if (ret)
- return ret;
+ frame->buf[0] = wrap_avbuf(avbuf);
+ if (frame->buf[0] == NULL)
+ return AVERROR(ENOMEM);
+ if (buf_to_m2mctx(avbuf)->output_drm) {
+ /* 1. get references to the actual data */
+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
+ return 0;
+ }
+
+
+ /* 1. get references to the actual data */
+ for (i = 0; i < avbuf->num_planes; i++) {
+ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
frame->linesize[i] = avbuf->plane_info[i].bytesperline;
- frame->data[i] = frame->buf[i]->data;
}
/* fixup special cases */
@@ -318,17 +593,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
case AV_PIX_FMT_NV21:
if (avbuf->num_planes > 1)
break;
- frame->linesize[1] = avbuf->plane_info[0].bytesperline;
- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+ frame->linesize[1] = frame->linesize[0];
+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
break;
case AV_PIX_FMT_YUV420P:
if (avbuf->num_planes > 1)
break;
- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
+ frame->linesize[1] = frame->linesize[0] / 2;
+ frame->linesize[2] = frame->linesize[1];
+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
break;
default:
@@ -338,68 +613,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
return 0;
}
+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
+{
+ if (dst_stride == src_stride && w + 32 >= dst_stride) {
+ memcpy(dst, src, dst_stride * h);
+ }
+ else {
+ while (--h >= 0) {
+ memcpy(dst, src, w);
+ dst += dst_stride;
+ src += src_stride;
+ }
+ }
+}
+
+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
+{
+ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
+}
+
+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+{
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
+ return AVERROR(EINVAL);
+
+ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+ // Only currently cope with single buffer types
+ if (out->buf.length != 1)
+ return AVERROR_PATCHWELCOME;
+ if (src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ out->planes[0].m.fd = src->objects[0].fd;
+ }
+ else {
+ if (src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ out->buf.m.fd = src->objects[0].fd;
+ }
+
+ // No need to copy src AVDescriptor and if we did then we may confuse
+ // fd close on free
+ out->ref_buf = av_buffer_ref(frame->buf[0]);
+
+ return 0;
+}
+
static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
- int i, ret;
- struct v4l2_format fmt = out->context->format;
- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
- fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
- int is_planar_format = 0;
-
- switch (pixel_format) {
- case V4L2_PIX_FMT_YUV420M:
- case V4L2_PIX_FMT_YVU420M:
-#ifdef V4L2_PIX_FMT_YUV422M
- case V4L2_PIX_FMT_YUV422M:
-#endif
-#ifdef V4L2_PIX_FMT_YVU422M
- case V4L2_PIX_FMT_YVU422M:
-#endif
-#ifdef V4L2_PIX_FMT_YUV444M
- case V4L2_PIX_FMT_YUV444M:
-#endif
-#ifdef V4L2_PIX_FMT_YVU444M
- case V4L2_PIX_FMT_YVU444M:
-#endif
- case V4L2_PIX_FMT_NV12M:
- case V4L2_PIX_FMT_NV21M:
- case V4L2_PIX_FMT_NV12MT_16X16:
- case V4L2_PIX_FMT_NV12MT:
- case V4L2_PIX_FMT_NV16M:
- case V4L2_PIX_FMT_NV61M:
- is_planar_format = 1;
- }
-
- if (!is_planar_format) {
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
- int planes_nb = 0;
- int offset = 0;
-
- for (i = 0; i < desc->nb_components; i++)
- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
-
- for (i = 0; i < planes_nb; i++) {
- int size, h = height;
- if (i == 1 || i == 2) {
+ int i;
+ int num_planes = 0;
+ int pel_strides[4] = {0};
+
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+
+ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
+ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
+ return -1;
+ }
+
+ for (i = 0; i != desc->nb_components; ++i) {
+ if (desc->comp[i].plane >= num_planes)
+ num_planes = desc->comp[i].plane + 1;
+ pel_strides[desc->comp[i].plane] = desc->comp[i].step;
+ }
+
+ if (out->num_planes > 1) {
+ if (num_planes != out->num_planes) {
+ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
+ return -1;
+ }
+ for (i = 0; i != num_planes; ++i) {
+ int w = frame->width;
+ int h = frame->height;
+ if (is_chroma(desc, i, num_planes)) {
+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
}
- size = frame->linesize[i] * h;
- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset);
- if (ret)
- return ret;
- offset += size;
+
+ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
+ frame->data[i], frame->linesize[i],
+ w * pel_strides[i], h);
+ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
}
- return 0;
}
+ else
+ {
+ unsigned int offset = 0;
+
+ for (i = 0; i != num_planes; ++i) {
+ int w = frame->width;
+ int h = frame->height;
+ int dst_stride = out->plane_info[0].bytesperline;
+ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
+
+ if (is_chroma(desc, i, num_planes)) {
+ // Is chroma
+ dst_stride >>= desc->log2_chroma_w;
+ offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+ }
+ else {
+ // Is luma or alpha
+ offset += dst_stride * out->context->height;
+ }
+ if (offset > out->plane_info[0].length) {
+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
+ return -1;
+ }
- for (i = 0; i < out->num_planes; i++) {
- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0);
- if (ret)
- return ret;
+ cpy_2d(dst, dst_stride,
+ frame->data[i], frame->linesize[i],
+ w * pel_strides[i], h);
+ }
+ set_buf_length(out, 0, offset, out->plane_info[0].length);
}
-
return 0;
}
@@ -409,16 +743,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
*
******************************************************************************/
-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
{
- v4l2_set_pts(out, frame->pts);
-
- return v4l2_buffer_swframe_to_buf(frame, out);
+ out->buf.flags = frame->key_frame ?
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+ // Beware that colour info is held in format rather than the actual
+ // v4l2 buffer struct so this may not be as useful as you might hope
+ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
+ v4l2_set_color_range(out, frame->color_range);
+ // PTS & interlace are buffer vars
+ if (track_ts)
+ out->buf.timestamp = tv_from_int(track_ts);
+ else
+ v4l2_set_pts(out, frame->pts);
+ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
+
+ return frame->format == AV_PIX_FMT_DRM_PRIME ?
+ v4l2_buffer_primeframe_to_buf(frame, out) :
+ v4l2_buffer_swframe_to_buf(frame, out);
}
int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
{
int ret;
+ V4L2Context * const ctx = avbuf->context;
av_frame_unref(frame);
@@ -429,17 +778,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
/* 2. get frame information */
frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
+ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
+ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
+ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
+ AV_PICTURE_TYPE_NONE;
frame->color_primaries = v4l2_get_color_primaries(avbuf);
frame->colorspace = v4l2_get_color_space(avbuf);
frame->color_range = v4l2_get_color_range(avbuf);
frame->color_trc = v4l2_get_color_trc(avbuf);
frame->pts = v4l2_get_pts(avbuf);
frame->pkt_dts = AV_NOPTS_VALUE;
+ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
+ frame->top_field_first = v4l2_buf_is_top_first(avbuf);
/* these values are updated also during re-init in v4l2_process_driver_event */
- frame->height = avbuf->context->height;
- frame->width = avbuf->context->width;
- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
+ frame->height = ctx->height;
+ frame->width = ctx->width;
+ frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
+
+ if (ctx->selection.height && ctx->selection.width) {
+ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
+ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0;
+ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
+ frame->width - (ctx->selection.left + ctx->selection.width) : 0;
+ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
+ frame->height - (ctx->selection.top + ctx->selection.height) : 0;
+ }
/* 3. report errors upstream */
if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
@@ -452,15 +816,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
{
- int ret;
-
av_packet_unref(pkt);
- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
- if (ret)
- return ret;
+
+ pkt->buf = wrap_avbuf(avbuf);
+ if (pkt->buf == NULL)
+ return AVERROR(ENOMEM);
pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
- pkt->data = pkt->buf->data;
+ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
+ pkt->flags = 0;
if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
pkt->flags |= AV_PKT_FLAG_KEY;
@@ -475,39 +839,107 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
return 0;
}
-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
+ const void *extdata, size_t extlen,
+ const int64_t timestamp)
{
int ret;
- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0);
- if (ret)
+ if (extlen) {
+ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
+ if (ret)
+ return ret;
+ }
+
+ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
+ if (ret && ret != AVERROR(ENOMEM))
return ret;
- v4l2_set_pts(out, pkt->pts);
+ if (timestamp)
+ out->buf.timestamp = tv_from_int(timestamp);
+ else
+ v4l2_set_pts(out, pkt->pts);
- if (pkt->flags & AV_PKT_FLAG_KEY)
- out->flags = V4L2_BUF_FLAG_KEYFRAME;
+ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
- return 0;
+ return ret;
+}
+
+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
+{
+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
}
-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+
+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
+{
+ V4L2Buffer * const avbuf = (V4L2Buffer *)data;
+ int i;
+
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
+ struct V4L2Plane_info *p = avbuf->plane_info + i;
+ if (p->mm_addr != NULL)
+ munmap(p->mm_addr, p->length);
+ }
+
+ if (avbuf->dmabuf[0] == NULL) {
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
+ if (avbuf->drm_frame.objects[i].fd != -1)
+ close(avbuf->drm_frame.objects[i].fd);
+ }
+ }
+ else {
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) {
+ dmabuf_free(avbuf->dmabuf[i]);
+ }
+ }
+
+ av_buffer_unref(&avbuf->ref_buf);
+
+ ff_weak_link_unref(&avbuf->context_wl);
+
+ av_free(avbuf);
+}
+
+
+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
{
- V4L2Context *ctx = avbuf->context;
int ret, i;
+ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
+ AVBufferRef * bufref;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
- avbuf->buf.memory = V4L2_MEMORY_MMAP;
+ *pbufref = NULL;
+ if (avbuf == NULL)
+ return AVERROR(ENOMEM);
+
+ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
+ if (bufref == NULL) {
+ av_free(avbuf);
+ return AVERROR(ENOMEM);
+ }
+
+ avbuf->context = ctx;
+ avbuf->buf.memory = mem;
avbuf->buf.type = ctx->type;
avbuf->buf.index = index;
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
+ avbuf->drm_frame.objects[i].fd = -1;
+ }
+
+ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
+
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->buf.length = VIDEO_MAX_PLANES;
avbuf->buf.m.planes = avbuf->planes;
}
- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf);
if (ret < 0)
- return AVERROR(errno);
+ goto fail;
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->num_planes = 0;
@@ -520,6 +952,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
avbuf->num_planes = 1;
for (i = 0; i < avbuf->num_planes; i++) {
+ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
+ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
@@ -527,25 +961,31 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset;
+
+ if (want_mmap)
+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
} else {
avbuf->plane_info[i].length = avbuf->buf.length;
- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+ avbuf->plane_info[i].offset = 0;
+
+ if (want_mmap)
+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
}
- if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
- return AVERROR(ENOMEM);
+ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
+ avbuf->plane_info[i].mm_addr = NULL;
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
}
avbuf->status = V4L2BUF_AVAILABLE;
- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
- return 0;
-
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->buf.m.planes = avbuf->planes;
avbuf->buf.length = avbuf->num_planes;
@@ -555,20 +995,53 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
avbuf->buf.length = avbuf->planes[0].length;
}
- return ff_v4l2_buffer_enqueue(avbuf);
+ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ if (s->output_drm) {
+ ret = v4l2_buffer_export_drm(avbuf);
+ if (ret) {
+ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n");
+ goto fail;
+ }
+ }
+ }
+
+ *pbufref = bufref;
+ return 0;
+
+fail:
+ av_buffer_unref(&bufref);
+ return ret;
}
int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
{
int ret;
+ int qc;
- avbuf->buf.flags = avbuf->flags;
+ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
+ avbuf->context->name, avbuf->buf.index,
+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
+ avbuf->context->q_count);
+ }
ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
- if (ret < 0)
- return AVERROR(errno);
+ if (ret < 0) {
+ int err = errno;
+ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
+ avbuf->context->name, avbuf->buf.index,
+ err, strerror(err));
+ return AVERROR(err);
+ }
+ // Lock not wanted - if called from buffer free then lock already obtained
+ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
avbuf->status = V4L2BUF_IN_DRIVER;
+ pthread_cond_broadcast(&avbuf->context->cond);
+
+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
+ avbuf->context->name, avbuf->buf.index,
+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
return 0;
}
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 8dbc7fc104..0bda4dd06b 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -27,29 +27,44 @@
#include <stdatomic.h>
#include <linux/videodev2.h>
+#include "libavutil/hwcontext_drm.h"
#include "avcodec.h"
enum V4L2Buffer_status {
V4L2BUF_AVAILABLE,
V4L2BUF_IN_DRIVER,
+ V4L2BUF_IN_USE,
V4L2BUF_RET_USER,
};
/**
* V4L2Buffer (wrapper for v4l2_buffer management)
*/
+struct V4L2Context;
+struct ff_weak_link_client;
+struct dmabuf_h;
+
typedef struct V4L2Buffer {
- /* each buffer needs to have a reference to its context */
+ /* each buffer needs to have a reference to its context
+ * The pointer is good enough for most operation but once the buffer has
+ * been passed to the user the buffer may become orphaned so for free ops
+ * the weak link must be used to ensure that the context is actually
+ * there
+ */
struct V4L2Context *context;
+ struct ff_weak_link_client *context_wl;
- /* This object is refcounted per-plane, so we need to keep track
- * of how many context-refs we are holding. */
- AVBufferRef *context_ref;
- atomic_uint context_refcount;
+ /* DRM descriptor */
+ AVDRMFrameDescriptor drm_frame;
+ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
+ * are done
+ */
+ AVBufferRef * ref_buf;
/* keep track of the mmap address and mmap length */
struct V4L2Plane_info {
- int bytesperline;
+ size_t bytesperline;
+ size_t offset;
void * mm_addr;
size_t length;
} plane_info[VIDEO_MAX_PLANES];
@@ -60,9 +75,9 @@ typedef struct V4L2Buffer {
struct v4l2_buffer buf;
struct v4l2_plane planes[VIDEO_MAX_PLANES];
- int flags;
enum V4L2Buffer_status status;
+ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here
} V4L2Buffer;
/**
@@ -98,6 +113,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
*/
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
+ const void *extdata, size_t extlen,
+ const int64_t timestamp);
+
/**
* Extracts the data from an AVFrame to a V4L2Buffer
*
@@ -106,7 +125,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
*
* @returns 0 in case of success, a negative AVERROR code otherwise
*/
-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
/**
* Initializes a V4L2Buffer
@@ -116,7 +135,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
*
* @returns 0 in case of success, a negative AVERROR code otherwise
*/
-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
/**
* Enqueues a V4L2Buffer
@@ -127,5 +146,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
*/
int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+static inline void
+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
+{
+ avbuf->status = V4L2BUF_AVAILABLE;
+ av_buffer_unref(&avbuf->ref_buf);
+}
+
#endif // AVCODEC_V4L2_BUFFERS_H
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index ff1ea8e57b..fcd5fdf359 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -27,11 +27,13 @@
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
+#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/internal.h"
#include "v4l2_buffers.h"
#include "v4l2_fmt.h"
#include "v4l2_m2m.h"
+#include "weak_link.h"
struct v4l2_format_update {
uint32_t v4l2_fmt;
@@ -41,26 +43,168 @@ struct v4l2_format_update {
int update_avfmt;
};
-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
+
+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
{
- return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
- container_of(ctx, V4L2m2mContext, output) :
- container_of(ctx, V4L2m2mContext, capture);
+ return (int64_t)n;
}
-static inline AVCodecContext *logger(V4L2Context *ctx)
+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
{
- return ctx_to_m2mctx(ctx)->avctx;
+ return (unsigned int)pts;
}
-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
+// FFmpeg requires us to propagate a number of vars from the coded pkt into
+// the decoded frame. The only thing that tracks like that in V4L2 stateful
+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
+// guarantees about PTS being unique or specified for every frame so replace
+// the supplied PTS with a simple incrementing number and keep a circular
+// buffer of all the things we want preserved (including the original PTS)
+// indexed by the tracking no.
+static int64_t
+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
{
- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+ int64_t track_pts;
+
+ // Avoid 0
+ if (++x->track_no == 0)
+ x->track_no = 1;
+
+ track_pts = track_to_pts(avctx, x->track_no);
+
+ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ .discard = 0,
+ .pending = 1,
+ .pkt_size = avpkt->size,
+ .pts = avpkt->pts,
+ .dts = avpkt->dts,
+ .reordered_opaque = avctx->reordered_opaque,
+ .pkt_pos = avpkt->pos,
+ .pkt_duration = avpkt->duration,
+ .track_pts = track_pts
+ };
+ return track_pts;
+}
+
+static int64_t
+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
+{
+ int64_t track_pts;
+
+ // Avoid 0
+ if (++x->track_no == 0)
+ x->track_no = 1;
+
+ track_pts = track_to_pts(avctx, x->track_no);
+
+ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ .discard = 0,
+ .pending = 1,
+ .pkt_size = 0,
+ .pts = frame->pts,
+ .dts = AV_NOPTS_VALUE,
+ .reordered_opaque = frame->reordered_opaque,
+ .pkt_pos = frame->pkt_pos,
+ .pkt_duration = frame->pkt_duration,
+ .track_pts = track_pts
+ };
+ return track_pts;
}
-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
+
+// Returns -1 if we should discard the frame
+static int
+xlat_pts_frame_out(AVCodecContext *const avctx,
+ xlat_track_t * const x,
+ AVFrame *const frame)
{
- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
+ V4L2m2mTrackEl *const t = x->track_els + n;
+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
+ {
+ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
+ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ frame->pts = AV_NOPTS_VALUE;
+ frame->pkt_dts = AV_NOPTS_VALUE;
+ frame->reordered_opaque = x->last_opaque;
+ frame->pkt_pos = -1;
+ frame->pkt_duration = 0;
+ frame->pkt_size = -1;
+ }
+ else if (!t->discard)
+ {
+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
+ frame->pkt_dts = t->dts;
+ frame->reordered_opaque = t->reordered_opaque;
+ frame->pkt_pos = t->pkt_pos;
+ frame->pkt_duration = t->pkt_duration;
+ frame->pkt_size = t->pkt_size;
+
+ x->last_opaque = x->track_els[n].reordered_opaque;
+ if (frame->pts != AV_NOPTS_VALUE)
+ x->last_pts = frame->pts;
+ t->pending = 0;
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ return -1;
+ }
+
+ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
+ return 0;
+}
+
+// Returns -1 if we should discard the frame
+static int
+xlat_pts_pkt_out(AVCodecContext *const avctx,
+ xlat_track_t * const x,
+ AVPacket *const pkt)
+{
+ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
+ V4L2m2mTrackEl *const t = x->track_els + n;
+ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
+ {
+ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
+ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
+ pkt->pts = AV_NOPTS_VALUE;
+ }
+ else if (!t->discard)
+ {
+ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
+
+ x->last_opaque = x->track_els[n].reordered_opaque;
+ if (pkt->pts != AV_NOPTS_VALUE)
+ x->last_pts = pkt->pts;
+ t->pending = 0;
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
+ return -1;
+ }
+
+ // * Would like something much better than this...xlat(offset + out_count)?
+ pkt->dts = pkt->pts;
+ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
+ pkt->pts, t->track_pts, n);
+ return 0;
+}
+
+
+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
+{
+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+ container_of(ctx, V4L2m2mContext, output) :
+ container_of(ctx, V4L2m2mContext, capture);
+}
+
+static inline AVCodecContext *logger(const V4L2Context *ctx)
+{
+ return ctx_to_m2mctx(ctx)->avctx;
}
static AVRational v4l2_get_sar(V4L2Context *ctx)
@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx)
return sar;
}
-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
+static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
{
- struct v4l2_format *fmt1 = &ctx->format;
- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
- :
- fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
- fmt1->fmt.pix.height != fmt2->fmt.pix.height;
+ return ctx->bufrefs != NULL;
+}
+
+// Width/Height changed or we don't have an alloc in the first place?
+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+{
+ const struct v4l2_format *fmt1 = &ctx->format;
+ int ret = !ctx_buffers_alloced(ctx) ||
+ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+ :
+ fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+ fmt1->fmt.pix.height != fmt2->fmt.pix.height);
if (ret)
- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
+ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
ctx->name,
- v4l2_get_width(fmt1), v4l2_get_height(fmt1),
- v4l2_get_width(fmt2), v4l2_get_height(fmt2));
+ ctx_buffers_alloced(ctx),
+ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
+ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
return ret;
}
@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd
}
}
-/**
- * handle resolution change event and end of stream event
- * returns 1 if reinit was successful, negative if it failed
- * returns 0 if reinit was not executed
- */
-static int v4l2_handle_event(V4L2Context *ctx)
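+// Read the driver's default COMPOSE (crop/display) rectangle for the capture
+// queue; *r is zeroed on failure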
+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
- struct v4l2_format cap_fmt = s->capture.format;
- struct v4l2_format out_fmt = s->output.format;
- struct v4l2_event evt = { 0 };
- int full_reinit, reinit, ret;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ struct v4l2_selection selection = {
+ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
+ .target = V4L2_SEL_TGT_COMPOSE
+ };
- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
- if (ret < 0) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
- return 0;
- }
+ memset(r, 0, sizeof(*r));
+ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
+ return AVERROR(errno);
- if (evt.type == V4L2_EVENT_EOS) {
- ctx->done = 1;
- return 0;
- }
+ *r = selection.r;
+ return 0;
+}
- if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
- return 0;
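+// Handle a source change event: re-read the capture format and selection,
+// then reinit the capture buffers if the resolution changed (or the reinit
+// quirk demands it), otherwise just restart streaming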
+static int do_source_change(V4L2m2mContext * const s)
+{
+ AVCodecContext *const avctx = s->avctx;
- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
- if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
- return 0;
- }
+ int ret;
+ int reinit;
+ struct v4l2_format cap_fmt = s->capture.format;
+
+ s->capture.done = 0;
ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
return 0;
}
- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
- if (full_reinit) {
- s->output.height = v4l2_get_height(&out_fmt);
- s->output.width = v4l2_get_width(&out_fmt);
- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
- }
+ get_default_selection(&s->capture, &s->capture.selection);
- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+ reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
+ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
+ reinit = 1;
+
+ s->capture.format = cap_fmt;
if (reinit) {
- s->capture.height = v4l2_get_height(&cap_fmt);
- s->capture.width = v4l2_get_width(&cap_fmt);
- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+ s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
+ s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
}
- if (full_reinit || reinit)
- s->reinit = 1;
-
- if (full_reinit) {
- ret = ff_v4l2_m2m_codec_full_reinit(s);
- if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
- return AVERROR(EINVAL);
- }
- goto reinit_run;
+ // If we don't support selection (or it is bust) and we obviously have HD then kludge
+ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
+ (s->capture.height == 1088 && s->capture.width == 1920)) {
+ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
}
+ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+
+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
+ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
+ s->capture.width, s->capture.height,
+ s->capture.selection.width, s->capture.selection.height,
+ s->capture.selection.left, s->capture.selection.top, reinit);
+
if (reinit) {
- if (s->avctx)
- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
+ if (avctx)
+ ret = ff_set_dimensions(s->avctx,
+ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
+ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
if (ret < 0)
- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
+ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
ret = ff_v4l2_m2m_codec_reinit(s);
if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
+ return AVERROR(EINVAL);
+ }
+
+ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
+ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
+ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
+ s->capture.width, s->capture.height,
+ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
return AVERROR(EINVAL);
}
+
+ // Update pixel format - should only actually do something on initial change
+ s->capture.av_pix_fmt =
+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
+ if (s->output_drm) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ avctx->sw_pix_fmt = s->capture.av_pix_fmt;
+ }
+ else
+ avctx->pix_fmt = s->capture.av_pix_fmt;
+
goto reinit_run;
}
- /* dummy event received */
- return 0;
+ /* Buffers are OK so just stream off to ack */
+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
+
+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+ if (ret)
+ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
+ s->draining = 0;
/* reinit executed */
reinit_run:
+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
return 1;
}
@@ -280,171 +452,293 @@ static int v4l2_stop_encode(V4L2Context *ctx)
return 0;
}
-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
+// DQ a buffer
+// Amalgamates all the various ways there are of signalling EOS/Event to
+// generate a consistent EPIPE.
+//
+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
+//
+// Returns:
+// 0 Success
+// AVERROR(EPIPE) Nothing more to read
+// AVERROR(ENOSPC) No buffers in Q to put result in
+// * AVERROR(..)
+
+static int
+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
{
- struct v4l2_plane planes[VIDEO_MAX_PLANES];
- struct v4l2_buffer buf = { 0 };
- V4L2Buffer *avbuf;
- struct pollfd pfd = {
- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
- .fd = ctx_to_m2mctx(ctx)->fd,
+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
+ AVCodecContext * const avctx = m->avctx;
+ V4L2Buffer * avbuf;
+ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
+
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
+
+ struct v4l2_buffer buf = {
+ .type = ctx->type,
+ .memory = V4L2_MEMORY_MMAP,
};
- int i, ret;
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
- for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
- break;
- }
- if (i == ctx->num_buffers)
- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
- "userspace. Increase num_capture_buffers "
- "to prevent device deadlock or dropped "
- "packets/frames.\n");
- }
-
- /* if we are draining and there are no more capture buffers queued in the driver we are done */
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
- for (i = 0; i < ctx->num_buffers; i++) {
- /* capture buffer initialization happens during decode hence
- * detection happens at runtime
- */
- if (!ctx->buffers)
- break;
-
- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
- goto start;
+ *ppavbuf = NULL;
+
+ if (ctx->flag_last)
+ return AVERROR(EPIPE);
+
+ if (is_mp) {
+ buf.length = VIDEO_MAX_PLANES;
+ buf.m.planes = planes;
+ }
+
+ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
+ const int err = errno;
+ av_assert0(AVERROR(err) < 0);
+ if (err != EINTR) {
+ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+ ctx->name, av_err2str(AVERROR(err)));
+
+ if (err == EPIPE)
+ ctx->flag_last = 1;
+
+ return AVERROR(err);
}
- ctx->done = 1;
- return NULL;
}
+ atomic_fetch_sub(&ctx->q_count, 1);
+
+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
+ ff_v4l2_buffer_set_avail(avbuf);
+ avbuf->buf = buf;
+ if (is_mp) {
+ memcpy(avbuf->planes, planes, sizeof(planes));
+ avbuf->buf.m.planes = avbuf->planes;
+ }
+ // Done with any attached buffer
+ av_buffer_unref(&avbuf->ref_buf);
-start:
- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
- pfd.events = POLLOUT | POLLWRNORM;
- else {
- /* no need to listen to requests for more input while draining */
- if (ctx_to_m2mctx(ctx)->draining)
- pfd.events = POLLIN | POLLRDNORM | POLLPRI;
+ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
+ // Zero length cap buffer return == EOS
+ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
+
+ // Must reQ so we don't leak
+ // May not matter if the next thing we do is release all the
+ // buffers but better to be tidy.
+ ff_v4l2_buffer_enqueue(avbuf);
+
+ ctx->flag_last = 1;
+ return AVERROR(EPIPE);
+ }
+
+#ifdef V4L2_BUF_FLAG_LAST
+ // If flag_last set then this contains data but is the last frame
+ // so remember that but return OK
+ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
+ ctx->flag_last = 1;
+#endif
}
- for (;;) {
- ret = poll(&pfd, 1, timeout);
- if (ret > 0)
- break;
- if (errno == EINTR)
+ *ppavbuf = avbuf;
+ return 0;
+}
+
+/**
+ * Handle a resolution change event or an end of stream event.
+ * Expects to be called after the stream has stopped.
+ *
+ * Returns AVERROR_EOF on end of stream, 1 if reinit was successful,
+ * 0 if reinit was not executed and a negative error code on failure.
+ */
+static int
+get_event(V4L2m2mContext * const m)
+{
+ AVCodecContext * const avctx = m->avctx;
+ struct v4l2_event evt = { 0 };
+
+ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
+ const int rv = AVERROR(errno);
+ if (rv == AVERROR(EINTR))
continue;
- return NULL;
+ if (rv == AVERROR(EAGAIN)) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
+ return AVERROR_EOF;
+ }
+ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
+ return rv;
}
- /* 0. handle errors */
- if (pfd.revents & POLLERR) {
- /* if we are trying to get free buffers but none have been queued yet
- no need to raise a warning */
- if (timeout == 0) {
- for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
- }
- }
- else
- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
- return NULL;
+ if (evt.type == V4L2_EVENT_EOS) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2_EVENT_EOS\n");
+ return AVERROR_EOF;
}
- /* 1. handle resolution changes */
- if (pfd.revents & POLLPRI) {
- ret = v4l2_handle_event(ctx);
- if (ret < 0) {
- /* if re-init failed, abort */
- ctx->done = 1;
- return NULL;
- }
- if (ret) {
- /* if re-init was successful drop the buffer (if there was one)
- * since we had to reconfigure capture (unmap all buffers)
- */
- return NULL;
+ if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
+ return do_source_change(m);
+
+ return 0;
+}
+
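+// A DQBUF can only succeed if the queue is streaming and the driver still
+// holds at least one buffer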
+static inline int
+dq_ok(const V4L2Context * const c)
+{
+ return c->streamon && atomic_load(&c->q_count) != 0;
+}
+
+// Get a buffer
+// If output then just gets the buffer in the expected way
+// If capture then runs the capture state m/c to deal with res change etc.
+// If return value == 0 then *ppavbuf != NULL
+
+static int
+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
+{
+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
+ AVCodecContext * const avctx = m->avctx;
+ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
+
+ const unsigned int poll_cap = (POLLIN | POLLRDNORM);
+ const unsigned int poll_out = (POLLOUT | POLLWRNORM);
+ const unsigned int poll_event = POLLPRI;
+
+ *ppavbuf = NULL;
+
+ for (;;) {
+ struct pollfd pfd = {
+ .fd = m->fd,
+ // If capture && stream not started then assume we are waiting for the initial event
+ .events = !is_cap ? poll_out :
+ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
+ poll_event,
+ };
+ int ret;
+
+ if (ctx->done) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
+ return AVERROR_EOF;
}
- }
- /* 2. dequeue the buffer */
- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
+ // If capture && timeout == -1 then also wait for rx buffer free
+ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining)
+ pfd.events |= poll_out;
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- /* there is a capture buffer ready */
- if (pfd.revents & (POLLIN | POLLRDNORM))
- goto dequeue;
+ // If nothing Qed all we will get is POLLERR - avoid that
+ if ((pfd.events == poll_out && !dq_ok(&m->output)) ||
+ (pfd.events == poll_cap && !dq_ok(&m->capture)) ||
+ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
+ return AVERROR(ENOSPC);
+ }
- /* the driver is ready to accept more input; instead of waiting for the capture
- * buffer to complete we return NULL so input can proceed (we are single threaded)
- */
- if (pfd.revents & (POLLOUT | POLLWRNORM))
- return NULL;
+ // Timeout kludged s.t. "forever" eventually gives up & produces logging
+ // If waiting for an event when we have seen a last_frame then we expect
+ // it to be ready already so force a short timeout
+ ret = poll(&pfd, 1,
+ ff_v4l2_ctx_eos(ctx) ? 10 :
+ timeout == -1 ? 3000 : timeout);
+ if (ret < 0) {
+ ret = AVERROR(errno); // Remember errno before logging etc.
+ av_assert0(ret < 0);
}
-dequeue:
- memset(&buf, 0, sizeof(buf));
- buf.memory = V4L2_MEMORY_MMAP;
- buf.type = ctx->type;
- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
- memset(planes, 0, sizeof(planes));
- buf.length = VIDEO_MAX_PLANES;
- buf.m.planes = planes;
+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
+ ctx->name, ret, timeout, pfd.events, pfd.revents);
+
+ if (ret < 0) {
+ if (ret == AVERROR(EINTR))
+ continue;
+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
+ return ret;
}
- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
- if (ret) {
- if (errno != EAGAIN) {
- ctx->done = 1;
- if (errno != EPIPE)
- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
- ctx->name, av_err2str(AVERROR(errno)));
+ if (ret == 0) {
+ if (timeout == -1)
+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
+ if (ff_v4l2_ctx_eos(ctx)) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
+ ret = get_event(m);
+ if (ret < 0) {
+ ctx->done = 1;
+ return ret;
+ }
}
- return NULL;
+ return AVERROR(EAGAIN);
+ }
+
+ if ((pfd.revents & POLLERR) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
+ return AVERROR_UNKNOWN;
}
- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
- buf.m.planes[0].bytesused : buf.bytesused;
- if (bytesused == 0) {
+ if ((pfd.revents & poll_event) != 0) {
+ ret = get_event(m);
+ if (ret < 0) {
ctx->done = 1;
- return NULL;
+ return ret;
}
-#ifdef V4L2_BUF_FLAG_LAST
- if (buf.flags & V4L2_BUF_FLAG_LAST)
- ctx->done = 1;
-#endif
+ continue;
+ }
+
+ if ((pfd.revents & poll_cap) != 0) {
+ ret = dq_buf(ctx, ppavbuf);
+ if (ret == AVERROR(EPIPE))
+ continue;
+ return ret;
}
- avbuf = &ctx->buffers[buf.index];
- avbuf->status = V4L2BUF_AVAILABLE;
- avbuf->buf = buf;
- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
- memcpy(avbuf->planes, planes, sizeof(planes));
- avbuf->buf.m.planes = avbuf->planes;
+ if ((pfd.revents & poll_out) != 0) {
+ if (is_cap)
+ return AVERROR(EAGAIN);
+ return dq_buf(ctx, ppavbuf);
}
- return avbuf;
+
+ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
+ return AVERROR_UNKNOWN;
}
+}
- return NULL;
+// Clear out flags and timestamps that should be set by the user
+// Returns the passed avbuf
+static V4L2Buffer *
+clean_v4l2_buffer(V4L2Buffer * const avbuf)
+{
+ struct v4l2_buffer *const buf = &avbuf->buf;
+
+ buf->flags = 0;
+ buf->field = V4L2_FIELD_ANY;
+ buf->timestamp = (struct timeval){0};
+ buf->timecode = (struct v4l2_timecode){0};
+ buf->sequence = 0;
+
+ return avbuf;
+}
+
+int
+ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1)
+{
+ V4L2Buffer * avbuf;
+ if (timeout1 != 0) {
+ int rv = get_qbuf(ctx, &avbuf, timeout1);
+ if (rv != 0)
+ return rv;
+ }
+ do {
+ get_qbuf(ctx, &avbuf, 0);
+ } while (avbuf);
+ return 0;
}
static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
{
- int timeout = 0; /* return when no more buffers to dequeue */
int i;
/* get back as many output buffers as possible */
- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- do {
- } while (v4l2_dequeue_v4l2buf(ctx, timeout));
- }
+ if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+ ff_v4l2_dq_all(ctx, 0);
for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
- return &ctx->buffers[i];
+ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status == V4L2BUF_AVAILABLE)
+ return clean_v4l2_buffer(avbuf);
}
return NULL;
@@ -452,25 +746,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
static int v4l2_release_buffers(V4L2Context* ctx)
{
- struct v4l2_requestbuffers req = {
- .memory = V4L2_MEMORY_MMAP,
- .type = ctx->type,
- .count = 0, /* 0 -> unmaps buffers from the driver */
- };
- int i, j;
+ int i;
+ int ret = 0;
+ const int fd = ctx_to_m2mctx(ctx)->fd;
- for (i = 0; i < ctx->num_buffers; i++) {
- V4L2Buffer *buffer = &ctx->buffers[i];
+ // Orphan any buffers in the wild
+ ff_weak_link_break(&ctx->wl_master);
+
+ if (ctx->bufrefs) {
+ for (i = 0; i < ctx->num_buffers; i++)
+ av_buffer_unref(ctx->bufrefs + i);
+ }
+
+ if (fd != -1) {
+ struct v4l2_requestbuffers req = {
+ .memory = V4L2_MEMORY_MMAP,
+ .type = ctx->type,
+ .count = 0, /* 0 -> unmap all buffers from the driver */
+ };
+
+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
+ if (errno == EINTR)
+ continue;
+
+ ret = AVERROR(errno);
- for (j = 0; j < buffer->num_planes; j++) {
- struct V4L2Plane_info *p = &buffer->plane_info[j];
- if (p->mm_addr && p->length)
- if (munmap(p->mm_addr, p->length) < 0)
- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
+ ctx->name, av_err2str(AVERROR(errno)));
+
+ if (ctx_to_m2mctx(ctx)->output_drm)
+ av_log(logger(ctx), AV_LOG_ERROR,
+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
+ "for all buffers: \n"
+ " 1. drmModeRmFB(..)\n"
+ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
}
}
+ atomic_store(&ctx->q_count, 0);
- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
+ return ret;
}
static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
@@ -499,6 +813,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm
static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
{
+ V4L2m2mContext* s = ctx_to_m2mctx(ctx);
+ V4L2m2mPriv *priv = s->avctx->priv_data;
enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
struct v4l2_fmtdesc fdesc;
int ret;
@@ -512,21 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
return 0;
}
- for (;;) {
+ for (;; ++fdesc.index) {
ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
if (ret)
return AVERROR(EINVAL);
+ if (priv->pix_fmt != AV_PIX_FMT_NONE) {
+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt))
+ continue;
+ }
+
pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
ret = v4l2_try_raw_format(ctx, pixfmt);
- if (ret){
- fdesc.index++;
- continue;
+ if (ret == 0) {
+ *p = pixfmt;
+ return 0;
}
-
- *p = pixfmt;
-
- return 0;
}
return AVERROR(EINVAL);
@@ -569,30 +886,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
*
*****************************************************************************/
+
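+// After STREAMOFF the driver has implicitly dequeued everything it held,
+// so mark those buffers as available again and zero the queued count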
+static void flush_all_buffers_status(V4L2Context* const ctx)
+{
+ int i;
+
+ if (!ctx->bufrefs)
+ return;
+
+ for (i = 0; i < ctx->num_buffers; ++i) {
+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (buf->status == V4L2BUF_IN_DRIVER)
+ ff_v4l2_buffer_set_avail(buf);
+ }
+ atomic_store(&ctx->q_count, 0);
+}
+
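+// Queue every available buffer to the driver, initializing the context
+// buffers first if they have not been allocated yet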
+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
+{
+ int i;
+ int rv;
+
+ if (!ctx->bufrefs) {
+ rv = ff_v4l2_context_init(ctx);
+ if (rv) {
+ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+ return rv;
+ }
+ }
+
+ for (i = 0; i < ctx->num_buffers; ++i) {
+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (buf->status == V4L2BUF_AVAILABLE) {
+ rv = ff_v4l2_buffer_enqueue(buf);
+ if (rv < 0)
+ return rv;
+ }
+ }
+ return 0;
+}
+
int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
{
int type = ctx->type;
- int ret;
+ int ret = 0;
+ AVCodecContext * const avctx = logger(ctx);
- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
- if (ret < 0)
- return AVERROR(errno);
+ // Avoid doing anything if there is nothing we can do
+ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
+ return 0;
- ctx->streamon = (cmd == VIDIOC_STREAMON);
+ ff_mutex_lock(&ctx->lock);
- return 0;
+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
+ stuff_all_buffers(avctx, ctx);
+
+ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
+ const int err = errno;
+ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
+ ret = AVERROR(err);
+ }
+ else
+ {
+ if (cmd == VIDIOC_STREAMOFF)
+ flush_all_buffers_status(ctx);
+ else
+ ctx->first_buf = 1;
+
+ ctx->streamon = (cmd == VIDIOC_STREAMON);
+ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
+ }
+
+ // Both stream off & on effectively clear flag_last
+ ctx->flag_last = 0;
+
+ ff_mutex_unlock(&ctx->lock);
+
+ return ret;
}
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
+ int64_t track_ts;
V4L2Buffer* avbuf;
int ret;
if (!frame) {
ret = v4l2_stop_encode(ctx);
if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
s->draining= 1;
return 0;
}
@@ -601,23 +987,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
+ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
+
+ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
if (ret)
return ret;
return ff_v4l2_buffer_enqueue(avbuf);
}
-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
+ const void * extdata, size_t extlen)
{
V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer* avbuf;
int ret;
+ int64_t track_ts;
if (!pkt->size) {
ret = v4l2_stop_decode(ctx);
+ // Log but otherwise ignore stop failure
if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
+ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
s->draining = 1;
return 0;
}
@@ -626,8 +1018,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
- if (ret)
+ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
+
+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
+ if (ret == AVERROR(ENOMEM))
+ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
+ __func__, pkt->size, avbuf->planes[0].length);
+ else if (ret)
return ret;
return ff_v4l2_buffer_enqueue(avbuf);
@@ -635,42 +1032,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
{
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer *avbuf;
+ int rv;
- /*
- * timeout=-1 blocks until:
- * 1. decoded frame available
- * 2. an input buffer is ready to be dequeued
- */
- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
- if (!avbuf) {
- if (ctx->done)
- return AVERROR_EOF;
-
- return AVERROR(EAGAIN);
- }
+ do {
+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
+ return rv;
+ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
+ return rv;
+ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
+ return 0;
}
-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
{
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer *avbuf;
+ int rv;
- /*
- * blocks until:
- * 1. encoded packet available
- * 2. an input buffer ready to be dequeued
- */
- avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
- if (!avbuf) {
- if (ctx->done)
- return AVERROR_EOF;
+ do {
+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
+ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
+ return rv;
+ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
- return AVERROR(EAGAIN);
- }
-
- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
+ return 0;
}
int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
@@ -702,78 +1093,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
int ff_v4l2_context_set_format(V4L2Context* ctx)
{
- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ int ret;
+
+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ if (ret != 0)
+ return ret;
+
+ // Check returned size against min size and if smaller have another go
+ // Only worry about plane[0] as this is meant to enforce limits for
+ // encoded streams where we might know a bit more about the shape
+ // than the driver
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
+ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
+ return 0;
+ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
+ }
+ else {
+ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
+ return 0;
+ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
+ }
+
+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ return ret;
}
void ff_v4l2_context_release(V4L2Context* ctx)
{
int ret;
- if (!ctx->buffers)
+ if (!ctx->bufrefs)
return;
ret = v4l2_release_buffers(ctx);
if (ret)
av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
- av_freep(&ctx->buffers);
+ av_freep(&ctx->bufrefs);
+ av_buffer_unref(&ctx->frames_ref);
+
+ ff_mutex_destroy(&ctx->lock);
+ pthread_cond_destroy(&ctx->cond);
}
-int ff_v4l2_context_init(V4L2Context* ctx)
+
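+// REQBUFS the requested number of buffers (the driver may grant a different
+// count) and wrap each one in an AVBufferRef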
+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
struct v4l2_requestbuffers req;
- int ret, i;
-
- if (!v4l2_type_supported(ctx)) {
- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
- return AVERROR_PATCHWELCOME;
- }
+ int ret;
+ int i;
- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
- if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
+ av_assert0(ctx->bufrefs == NULL);
memset(&req, 0, sizeof(req));
- req.count = ctx->num_buffers;
- req.memory = V4L2_MEMORY_MMAP;
+ req.count = req_buffers;
+ req.memory = mem;
req.type = ctx->type;
- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
- if (ret < 0) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
- return AVERROR(errno);
+ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
+ if (errno != EINTR) {
+ ret = AVERROR(errno);
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
+ return ret;
+ }
}
ctx->num_buffers = req.count;
- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
- if (!ctx->buffers) {
+ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
+ if (!ctx->bufrefs) {
av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
- return AVERROR(ENOMEM);
+ goto fail_release;
}
- for (i = 0; i < req.count; i++) {
- ctx->buffers[i].context = ctx;
- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
- if (ret < 0) {
+ ctx->wl_master = ff_weak_link_new(ctx);
+ if (!ctx->wl_master) {
+ ret = AVERROR(ENOMEM);
+ goto fail_release;
+ }
+
+ for (i = 0; i < ctx->num_buffers; i++) {
+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
+ if (ret) {
av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
- goto error;
+ goto fail_release;
}
}
av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
req.count,
- v4l2_get_width(&ctx->format),
- v4l2_get_height(&ctx->format),
+ ff_v4l2_get_format_width(&ctx->format),
+ ff_v4l2_get_format_height(&ctx->format),
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
return 0;
-error:
+fail_release:
v4l2_release_buffers(ctx);
+ av_freep(&ctx->bufrefs);
+ return ret;
+}
+
+int ff_v4l2_context_init(V4L2Context* ctx)
+{
+ struct v4l2_queryctrl qctrl;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ int ret;
+
+ // It is not valid to reinit a context without a previous release
+ av_assert0(ctx->bufrefs == NULL);
+
+ if (!v4l2_type_supported(ctx)) {
+ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+ return AVERROR_PATCHWELCOME;
+ }
- av_freep(&ctx->buffers);
+ ff_mutex_init(&ctx->lock, NULL);
+ pthread_cond_init(&ctx->cond, NULL);
+ atomic_init(&ctx->q_count, 0);
+
+ if (s->output_drm) {
+ AVHWFramesContext *hwframes;
+
+ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
+ if (!ctx->frames_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail_unlock;
+ }
+
+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
+ hwframes->format = AV_PIX_FMT_DRM_PRIME;
+ hwframes->sw_format = ctx->av_pix_fmt;
+ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
+ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
+ ret = av_hwframe_ctx_init(ctx->frames_ref);
+ if (ret < 0)
+ goto fail_unref_hwframes;
+ }
+
+ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
+ goto fail_unref_hwframes;
+ }
+
+ memset(&qctrl, 0, sizeof(qctrl));
+ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
+ ret = AVERROR(errno);
+ if (ret != AVERROR(EINVAL)) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERYCTRL failed: %s\n", ctx->name, av_err2str(ret));
+ goto fail_unref_hwframes;
+ }
+ // Control unsupported - set default if wanted
+ if (ctx->num_buffers < 2)
+ ctx->num_buffers = 4;
+ }
+ else {
+ if (ctx->num_buffers < 2)
+ ctx->num_buffers = qctrl.minimum + 2;
+ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
+ }
+
+ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
+ if (ret < 0)
+ goto fail_unref_hwframes;
+
+ return 0;
+fail_unref_hwframes:
+ av_buffer_unref(&ctx->frames_ref);
+fail_unlock:
+ ff_mutex_destroy(&ctx->lock);
return ret;
}
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 22a9532444..108fc05a6f 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -31,6 +31,7 @@
#include "libavutil/pixfmt.h"
#include "libavutil/frame.h"
#include "libavutil/buffer.h"
+#include "libavutil/thread.h"
#include "v4l2_buffers.h"
typedef struct V4L2Context {
@@ -70,28 +71,57 @@ typedef struct V4L2Context {
*/
int width, height;
AVRational sample_aspect_ratio;
+ struct v4l2_rect selection;
/**
- * Indexed array of V4L2Buffers
+ * If the default size of buffer is less than this then try to
+ * set to this.
*/
- V4L2Buffer *buffers;
+ uint32_t min_buf_size;
+
+ /**
+ * Indexed array of pointers to V4L2Buffers
+ */
+ AVBufferRef **bufrefs;
/**
* Readonly after init.
*/
int num_buffers;
+ /**
+ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
+ */
+ enum v4l2_memory buf_mem;
+
/**
* Whether the stream has been started (VIDIOC_STREAMON has been sent).
*/
int streamon;
+ /* 1st buffer after stream on */
+ int first_buf;
+
/**
* Either no more buffers available or an unrecoverable error was notified
* by the V4L2 kernel driver: once set the context has to be exited.
*/
int done;
+ int flag_last;
+
+ /**
+ * If non-zero then use this rather than the "real" PTS when
+ * queueing a frame/packet
+ */
+ uint64_t track_ts;
+
+ AVBufferRef *frames_ref;
+ atomic_int q_count;
+ struct ff_weak_link_master *wl_master;
+
+ AVMutex lock;
+ pthread_cond_t cond;
} V4L2Context;
/**
@@ -147,7 +177,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
* @param[inout] pkt The AVPacket to dequeue to.
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
*/
-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout);
/**
* Dequeues a buffer from a V4L2Context to an AVFrame.
@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
* @param[in] ctx The V4L2Context to dequeue from.
* @param[inout] f The AVFrame to dequeue to.
* @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
+ *
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ * AVERROR(ENOSPC) if no buffer is available to put
+ * the frame in
*/
int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
* @param[in] pkt A pointer to an AVPacket.
* @return 0 in case of success, a negative error otherwise.
*/
-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
/**
* Enqueues a buffer to a V4L2Context from an AVFrame
@@ -183,4 +216,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
*/
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
+/**
+ * Dequeue all buffers on this queue
+ *
+ * Used to recycle output buffers
+ *
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[in] timeout1 A timeout on dequeuing the 1st buffer,
+ * all others have a timeout of zero
+ * @return if timeout1 is non-zero, the result of the first dequeue
+ * operation (e.g. AVERROR(EAGAIN)); 0 otherwise.
+ */
+int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1);
+
+/**
+ * Returns the number of buffers currently queued
+ *
+ * @param[in] ctx The V4L2Context to evaluate
+ */
+static inline int
+ff_v4l2_context_q_count(const V4L2Context* const ctx)
+{
+ return atomic_load(&ctx->q_count);
+}
+
#endif // AVCODEC_V4L2_CONTEXT_H
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index cdfd579810..a919bdc030 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -35,6 +35,15 @@
#include "v4l2_context.h"
#include "v4l2_fmt.h"
#include "v4l2_m2m.h"
+#include "v4l2_req_dmabufs.h"
+
+static void
+xlat_init(xlat_track_t * const x)
+{
+ memset(x, 0, sizeof(*x));
+ x->last_pts = AV_NOPTS_VALUE;
+}
+
static inline int v4l2_splane_video(struct v4l2_capability *cap)
{
@@ -68,7 +77,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
s->capture.done = s->output.done = 0;
s->capture.name = "capture";
+ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
s->output.name = "output";
+ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
atomic_init(&s->refcount, 0);
sem_init(&s->refsync, 0, 0);
@@ -85,12 +96,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
if (v4l2_mplane_video(&cap)) {
s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+ s->output.format.type = s->output.type;
return 0;
}
if (v4l2_splane_video(&cap)) {
s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+ s->output.format.type = s->output.type;
return 0;
}
@@ -215,13 +228,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
/* 2. unmap the capture buffers (v4l2 and ffmpeg):
- * we must wait for all references to be released before being allowed
- * to queue new buffers.
*/
- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
- if (atomic_load(&s->refcount))
- while(sem_wait(&s->refsync) == -1 && errno == EINTR);
-
ff_v4l2_context_release(&s->capture);
/* 3. get the new capture format */
@@ -240,7 +247,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
/* 5. complete reinit */
s->draining = 0;
- s->reinit = 0;
return 0;
}
@@ -274,7 +280,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s)
/* start again now that we know the stream dimensions */
s->draining = 0;
- s->reinit = 0;
ret = ff_v4l2_context_get_format(&s->output, 0);
if (ret) {
@@ -328,10 +333,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
ff_v4l2_context_release(&s->capture);
sem_destroy(&s->refsync);
- close(s->fd);
+ if (s->fd != -1)
+ close(s->fd);
av_frame_unref(s->frame);
av_frame_free(&s->frame);
av_packet_unref(&s->buf_pkt);
+ av_freep(&s->extdata_data);
+
+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
av_free(s);
}
@@ -344,6 +353,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
if (!s)
return 0;
+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
+
+ if (s->avctx && av_codec_is_decoder(s->avctx->codec))
+ av_packet_unref(&s->buf_pkt);
+
if (s->fd >= 0) {
ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
if (ret)
@@ -356,7 +370,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
ff_v4l2_context_release(&s->output);
+ dmabufs_ctl_unref(&s->db_ctl);
+ close(s->fd);
+ s->fd = -1;
+
s->self_ref = NULL;
+ // This is only called on avctx close so after this point we no longer have an avctx
+ // Crash sooner if we find we are using it (can still log with avctx = NULL)
+ s->avctx = NULL;
+ priv->context = NULL;
av_buffer_unref(&priv->context_ref);
return 0;
@@ -400,35 +422,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv)
return v4l2_configure_contexts(s);
}
-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
{
- *s = av_mallocz(sizeof(V4L2m2mContext));
- if (!*s)
+ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
+
+ *pps = NULL;
+ if (!s)
return AVERROR(ENOMEM);
- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
+ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
&v4l2_m2m_destroy_context, NULL, 0);
if (!priv->context_ref) {
- av_freep(s);
+ av_free(s);
return AVERROR(ENOMEM);
}
/* assign the context */
- priv->context = *s;
- (*s)->priv = priv;
+ priv->context = s;
+ s->priv = priv;
/* populate it */
- priv->context->capture.num_buffers = priv->num_capture_buffers;
- priv->context->output.num_buffers = priv->num_output_buffers;
- priv->context->self_ref = priv->context_ref;
- priv->context->fd = -1;
+ s->capture.num_buffers = priv->num_capture_buffers;
+ s->output.num_buffers = priv->num_output_buffers;
+ s->self_ref = priv->context_ref;
+ s->fd = -1;
+ xlat_init(&s->xlat);
priv->context->frame = av_frame_alloc();
if (!priv->context->frame) {
av_buffer_unref(&priv->context_ref);
- *s = NULL; /* freed when unreferencing context_ref */
return AVERROR(ENOMEM);
}
+ *pps = s;
return 0;
}
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index b67b216331..ded1478a49 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -30,6 +30,7 @@
#include <linux/videodev2.h>
#include "libavcodec/avcodec.h"
+#include "libavutil/pixfmt.h"
#include "v4l2_context.h"
#define container_of(ptr, type, member) ({ \
@@ -38,7 +39,39 @@
#define V4L_M2M_DEFAULT_OPTS \
{ "num_output_buffers", "Number of buffers in the output context",\
- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
+
+#define FF_V4L2_M2M_TRACK_SIZE 128
+typedef struct V4L2m2mTrackEl {
+ int discard; // If we see this buffer it has been flushed, so discard
+ int pending;
+ int pkt_size;
+ int64_t pts;
+ int64_t dts;
+ int64_t reordered_opaque;
+ int64_t pkt_pos;
+ int64_t pkt_duration;
+ int64_t track_pts;
+} V4L2m2mTrackEl;
+
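+// Rolling PTS statistics used to guess best-effort timestamps on output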
+typedef struct pts_stats_s
+{
+ void * logctx;
+ const char * name; // For debug
+ unsigned int last_count;
+ unsigned int last_interval;
+ int64_t last_pts;
+ int64_t guess;
+} pts_stats_t;
+
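+// Ring of per-packet metadata, keyed by a synthesised tracking PTS, used to
+// restore PTS/DTS and packet fields on the decoded frames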
+typedef struct xlat_track_s {
+ unsigned int track_no;
+ int64_t last_pts; // Last valid PTS decoded
+ int64_t last_opaque;
+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
+} xlat_track_t;
+
+struct dmabufs_ctl;
typedef struct V4L2m2mContext {
char devname[PATH_MAX];
@@ -52,10 +85,10 @@ typedef struct V4L2m2mContext {
AVCodecContext *avctx;
sem_t refsync;
atomic_uint refcount;
- int reinit;
/* null frame/packet received */
int draining;
+ int running;
AVPacket buf_pkt;
/* Reference to a frame. Only used during encoding */
@@ -66,6 +99,35 @@ typedef struct V4L2m2mContext {
/* reference back to V4L2m2mPriv */
void *priv;
+
+ AVBufferRef *device_ref;
+
+ /* generate DRM frames */
+ int output_drm;
+
+ /* input frames are drmprime */
+ int input_drm;
+
+ /* Frame tracking */
+ xlat_track_t xlat;
+
+ pts_stats_t pts_stat;
+
+ /* req pkt */
+ int req_pkt;
+
+ /* Ext data sent */
+ int extdata_sent;
+ /* Ext data sent in packet - overrides ctx */
+ void * extdata_data;
+ size_t extdata_size;
+
+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
+ /* Quirks */
+ unsigned int quirks;
+
+ struct dmabufs_ctl * db_ctl;
} V4L2m2mContext;
typedef struct V4L2m2mPriv {
@@ -76,6 +138,8 @@ typedef struct V4L2m2mPriv {
int num_output_buffers;
int num_capture_buffers;
+ const char * dmabuf_alloc;
+ enum AVPixelFormat pix_fmt;
} V4L2m2mPriv;
/**
@@ -129,4 +193,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
*/
int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+}
+
+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+}
+
+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
+}
+
+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
+{
+ return ctx->flag_last;
+}
+
+
#endif /* AVCODEC_V4L2_M2M_H */
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index ab07c0a24a..b25779fd3e 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -21,8 +21,14 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
#include <linux/videodev2.h>
#include <sys/ioctl.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_drm.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/opt.h"
@@ -30,75 +36,274 @@
#include "libavcodec/decode.h"
#include "libavcodec/internal.h"
+#include "libavcodec/hwaccels.h"
+#include "libavcodec/internal.h"
+#include "libavcodec/hwconfig.h"
+
#include "v4l2_context.h"
#include "v4l2_m2m.h"
#include "v4l2_fmt.h"
+#include "v4l2_req_dmabufs.h"
-static int v4l2_try_start(AVCodecContext *avctx)
+#if CONFIG_H264_DECODER
+#include "h264_parse.h"
+#endif
+#if CONFIG_HEVC_DECODER
+#include "hevc_parse.h"
+#endif
+
+// Pick 64 for max last count - that is >1sec at 60fps
+#define STATS_LAST_COUNT_MAX 64
+#define STATS_INTERVAL_MAX (1 << 30)
+
+#ifndef FF_API_BUFFER_SIZE_T
+#define FF_API_BUFFER_SIZE_T 1
+#endif
+
+#define DUMP_FAILED_EXTRADATA 0
+
+#if DUMP_FAILED_EXTRADATA
+static inline char hex1(unsigned int x)
{
- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- V4L2Context *const capture = &s->capture;
- V4L2Context *const output = &s->output;
- struct v4l2_selection selection = { 0 };
- int ret;
+ x &= 0xf;
+ return x <= 9 ? '0' + x : 'a' + x - 10;
+}
- /* 1. start the output process */
- if (!output->streamon) {
- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
- if (ret < 0) {
- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
- return ret;
- }
+static inline char * hex2(char * s, unsigned int x)
+{
+ *s++ = hex1(x >> 4);
+ *s++ = hex1(x);
+ return s;
+}
+
+static inline char * hex4(char * s, unsigned int x)
+{
+ s = hex2(s, x >> 8);
+ s = hex2(s, x);
+ return s;
+}
+
+static inline char * dash2(char * s)
+{
+ *s++ = '-';
+ *s++ = '-';
+ return s;
+}
+
+static void
+data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len)
+{
+ size_t i;
+ s = hex4(s, offset);
+ m += offset;
+ for (i = 0; i != 8; ++i) {
+ *s++ = ' ';
+ s = len > i + offset ? hex2(s, *m++) : dash2(s);
+ }
+ *s++ = ' ';
+ *s++ = ':';
+ for (; i != 16; ++i) {
+ *s++ = ' ';
+ s = len > i + offset ? hex2(s, *m++) : dash2(s);
}
+ *s++ = 0;
+}
- if (capture->streamon)
- return 0;
+static void
+log_dump(void * logctx, int lvl, const void * const data, const size_t len)
+{
+ size_t i;
+ for (i = 0; i < len; i += 16) {
+ char buf[80];
+ data16(buf, i, data, len);
+ av_log(logctx, lvl, "%s\n", buf);
+ }
+}
+#endif
- /* 2. get the capture format */
- capture->format.type = capture->type;
- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
- if (ret) {
- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
- return ret;
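+// Extrapolate the expected PTS: last seen PTS plus (count - 1) frame
+// intervals; AV_NOPTS_VALUE if there is nothing usable to extrapolate from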
+static int64_t pts_stats_guess(const pts_stats_t * const stats)
+{
+ if (stats->last_count <= 1)
+ return stats->last_pts;
+ if (stats->last_pts == AV_NOPTS_VALUE ||
+ stats->last_interval == 0 ||
+ stats->last_count >= STATS_LAST_COUNT_MAX)
+ return AV_NOPTS_VALUE;
+ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
+}
+
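+// Feed a new PTS into the stats: a repeated or missing PTS just bumps the
+// count, otherwise the gap since the last PTS refines the interval estimate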
+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
+{
+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
+ if (stats->last_count < STATS_LAST_COUNT_MAX)
+ ++stats->last_count;
+ return;
}
- /* 2.1 update the AVCodecContext */
- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
- capture->av_pix_fmt = avctx->pix_fmt;
+ if (stats->last_pts != AV_NOPTS_VALUE) {
+ const int64_t interval = pts - stats->last_pts;
- /* 3. set the crop parameters */
- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
- selection.r.height = avctx->coded_height;
- selection.r.width = avctx->coded_width;
- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
- if (!ret) {
- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
- if (ret) {
- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
- } else {
- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
- /* update the size of the resulting frame */
- capture->height = selection.r.height;
- capture->width = selection.r.width;
+ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
+ stats->last_count >= STATS_LAST_COUNT_MAX) {
+ if (stats->last_interval != 0)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
+ __func__, stats->name, interval, stats->last_count);
+ stats->last_interval = 0;
+ }
+ else {
+ const int64_t frame_time = interval / (int64_t)stats->last_count;
+
+ if (frame_time != stats->last_interval)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
+ stats->last_interval = frame_time;
}
}
- /* 4. init the capture context now that we have the capture format */
- if (!capture->buffers) {
- ret = ff_v4l2_context_init(capture);
- if (ret) {
- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
- return AVERROR(ENOMEM);
+ stats->last_pts = pts;
+ stats->last_count = 1;
+}
+
+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
+{
+ *stats = (pts_stats_t){
+ .logctx = logctx,
+ .name = name,
+ .last_count = 1,
+ .last_interval = 0,
+ .last_pts = AV_NOPTS_VALUE
+ };
+}
+
+// If abdata == NULL then this just counts space required
+// Unpacks avcC if detected
+static int
+h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata)
+{
+ const uint8_t * const xdend = extradata + extrasize;
+ const uint8_t * p = extradata;
+ uint8_t * d = abdata;
+ unsigned int n;
+ unsigned int len;
+ const unsigned int hdrlen = 4;
+ unsigned int need_pps = 1;
+
+ if (extrasize < 8)
+ return AVERROR(EINVAL);
+
+ if (p[0] == 0 && p[1] == 0) {
+ // Assume a couple of leading zeros are good enough to indicate NAL
+ if (abdata)
+ memcpy(d, p, extrasize);
+ return extrasize;
+ }
+
+ // avcC starts with a 1
+ if (p[0] != 1)
+ return AVERROR(EINVAL);
+
+ p += 5;
+ n = *p++ & 0x1f;
+
+doxps:
+ while (n--) {
+ if (xdend - p < 2)
+ return AVERROR(EINVAL);
+ len = (p[0] << 8) | p[1];
+ p += 2;
+ if (xdend - p < (ptrdiff_t)len)
+ return AVERROR(EINVAL);
+ if (abdata) {
+ d[0] = 0;
+ d[1] = 0;
+ d[2] = 0;
+ d[3] = 1;
+ memcpy(d + 4, p, len);
}
+ d += len + hdrlen;
+ p += len;
+ }
+ if (need_pps) {
+ need_pps = 0;
+ if (p >= xdend)
+ return AVERROR(EINVAL);
+ n = *p++;
+ goto doxps;
}
- /* 5. start the capture process */
- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
- if (ret) {
- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
+ return d - abdata;
+}
+
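+// Take a padded private copy of new extradata, converting H.264 avcC to
+// Annex-B start-code form where that is detected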
+static int
+copy_extradata(AVCodecContext * const avctx,
+ const void * const src_data, const int src_len,
+ void ** const pdst_data, size_t * const pdst_len)
+{
+ int len;
+
+ *pdst_len = 0;
+ av_freep(pdst_data);
+
+ if (avctx->codec_id == AV_CODEC_ID_H264)
+ len = h264_xd_copy(src_data, src_len, NULL);
+ else
+ len = src_len < 0 ? AVERROR(EINVAL) : src_len;
+
+ // Zero length is OK but we want to stop - negative is an error value
+ if (len <= 0)
+ return len;
+
+ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
+ return AVERROR(ENOMEM);
+
+ if (avctx->codec_id == AV_CODEC_ID_H264)
+ h264_xd_copy(src_data, src_len, *pdst_data);
+ else
+ memcpy(*pdst_data, src_data, len);
+ *pdst_len = len;
+
+ return 0;
+}
+
+
+
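+// Ensure the output (bitstream) queue is streaming; also issue the optional
+// V4L2_DEC_CMD_START for drivers that want an explicit start command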
+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
+{
+ int ret;
+ struct v4l2_decoder_cmd cmd = {
+ .cmd = V4L2_DEC_CMD_START,
+ .flags = 0,
+ };
+
+ if (s->output.streamon)
+ return 0;
+
+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
+ if (ret != 0) {
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
return ret;
}
+ // STREAMON should do an implicit START so this is just for those that don't.
+ // It is optional so don't worry if it fails
+ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
+ ret = AVERROR(errno);
+ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
+ }
+ else {
+ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
+ }
+ return 0;
+}
+
+static int v4l2_try_start(AVCodecContext *avctx)
+{
+ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ int ret;
+
+ /* 1. start the output process */
+ if ((ret = check_output_streamon(avctx, s)) != 0)
+ return ret;
return 0;
}
@@ -133,58 +338,742 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
return 0;
}
-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
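+// Fill in best_effort_timestamp from the PTS interval stats (falling back to
+// pkt_dts) and mirror the PTS into pkt_dts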
+static void
+set_best_effort_pts(AVCodecContext *const avctx,
+ pts_stats_t * const ps,
+ AVFrame *const frame)
+{
+ pts_stats_add(ps, frame->pts);
+
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+ frame->pkt_pts = frame->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+ frame->best_effort_timestamp = pts_stats_guess(ps);
+ // If we can't guess from just PTS - try DTS
+ if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
+ frame->best_effort_timestamp = frame->pkt_dts;
+
+ // We can't emulate what s/w does in a useful manner and using the
+ // "correct" answer seems to just confuse things.
+ frame->pkt_dts = frame->pts;
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
+}
+
+static void
+xlat_flush(xlat_track_t * const x)
+{
+ unsigned int i;
+ // Do not reset track_no - this ensures that any frames left in the decoder
+ // that turn up later get discarded.
+
+ x->last_pts = AV_NOPTS_VALUE;
+ x->last_opaque = 0;
+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
+ x->track_els[i].pending = 0;
+ x->track_els[i].discard = 1;
+ }
+}
+
+static void
+xlat_init(xlat_track_t * const x)
+{
+ memset(x, 0, sizeof(*x));
+ xlat_flush(x);
+}
+
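+// Estimate how many queued packets are still inside the decoder by walking
+// the tracking ring backwards from the most recently submitted entry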
+static int
+xlat_pending(const xlat_track_t * const x)
+{
+ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
+ int i;
+ const int64_t now = x->last_pts;
+
+ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
+ const V4L2m2mTrackEl * const t = x->track_els + n;
+
+ // Discard only set on never-set or flushed entries
+ // So if we get here we've never successfully decoded a frame so allow
+ // more frames into the buffer before stalling
+ if (t->discard)
+ return i - 16;
+
+ // If we've got this frame out then everything before this point
+ // must have entered the decoder
+ if (!t->pending)
+ break;
+
+ // If we've never seen a pts all we can do is count frames
+ if (now == AV_NOPTS_VALUE)
+ continue;
+
+ if (t->dts != AV_NOPTS_VALUE && now >= t->dts)
+ break;
+ }
+
+ return i;
+}
+
+static inline int stream_started(const V4L2m2mContext * const s) {
+ return s->output.streamon;
+}
+
+#define NQ_OK 0
+#define NQ_Q_FULL 1
+#define NQ_SRC_EMPTY 2
+#define NQ_NONE 3
+#define NQ_DRAINING 4
+#define NQ_DEAD 5
+
+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
+
+// do_not_get If true then no new packet will be fetched but status will
+// still be set appropriately
+
+// AVERROR_EOF Flushing an already flushed stream
+// -ve Error (all errors except EOF are unexpected)
+// NQ_OK (0) OK
+// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
+// NQ_SRC_EMPTY Src empty (do not retry)
+// NQ_NONE Enqueue not attempted
+// NQ_DRAINING At EOS, dQ dest until EOS there too
+// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
+
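+// How the loop in v4l2_receive_frame() below consumes these statuses
+// (a summary derived from the macros above):
+//   TRY_DQ(rv): a capture-side dequeue is attempted for NQ_OK, NQ_Q_FULL,
+//               NQ_SRC_EMPTY, NQ_NONE and NQ_DRAINING
+//   RETRY_NQ(rv): the enqueue is retried only for NQ_Q_FULL and NQ_NONE
+//   NQ_DEAD and -ve errors end the loop with no capture dequeue attempted
+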
+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
{
- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- V4L2Context *const capture = &s->capture;
- V4L2Context *const output = &s->output;
int ret;
- if (!s->buf_pkt.size) {
- ret = ff_decode_get_packet(avctx, &s->buf_pkt);
- if (ret < 0 && ret != AVERROR_EOF)
+ // If we don't already have a coded packet - get a new one
+ // We will already have a coded pkt if the output Q was full last time we
+ // tried to Q it
+ if (!s->buf_pkt.size && !do_not_get) {
+ unsigned int i;
+
+ for (i = 0; i < 256; ++i) {
+ uint8_t * side_data;
+#if FF_API_BUFFER_SIZE_T
+ int side_size;
+#else
+ size_t side_size;
+#endif
+ ret = ff_decode_get_packet(avctx, &s->buf_pkt);
+ if (ret != 0)
+ break;
+
+ // New extradata is the only side-data we understand
+ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+ if (side_data) {
+ av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
+ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0)
+ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret));
+ s->extdata_sent = 0;
+ }
+
+ if (s->buf_pkt.size != 0)
+ break;
+
+ if (s->buf_pkt.side_data_elems == 0) {
+ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
+ ret = AVERROR_EOF;
+ break;
+ }
+
+ // Retry a side-data only pkt
+ }
+ // If i >= 256 something has gone wrong
+ if (i >= 256) {
+ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
+ return AVERROR(EIO);
+ }
+
+ if (ret == AVERROR(EAGAIN)) {
+ if (!stream_started(s)) {
+ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
+ return NQ_DEAD;
+ }
+ return NQ_SRC_EMPTY;
+ }
+
+ if (ret == AVERROR_EOF) {
+ // EOF - enter drain mode
+ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
+ ret, s->buf_pkt.size, stream_started(s), s->draining);
+ if (!stream_started(s)) {
+ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
+ s->draining = 1;
+ s->capture.done = 1;
+ return AVERROR_EOF;
+ }
+
+ if (!s->draining) {
+ // Calling enqueue with an empty pkt starts drain
+ av_assert0(s->buf_pkt.size == 0);
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
+ return ret;
+ }
+ }
+ return NQ_DRAINING;
+ }
+
+ if (ret < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
return ret;
+ }
}
- if (s->draining)
- goto dequeue;
+ if (s->draining) {
+ if (s->buf_pkt.size) {
+ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
+ av_packet_unref(&s->buf_pkt);
+ }
+ return NQ_DRAINING;
+ }
+
+ if (!s->buf_pkt.size)
+ return NQ_NONE;
+
+ if ((ret = check_output_streamon(avctx, s)) != 0)
+ return ret;
- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt);
- if (ret < 0 && ret != AVERROR(EAGAIN))
- goto fail;
+ if (s->extdata_sent)
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
+ else
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */
- if (ret != AVERROR(EAGAIN))
+ if (ret == AVERROR(EAGAIN)) {
+ // Out of input buffers - keep packet
+ ret = NQ_Q_FULL;
+ }
+ else {
+ // In all other cases we are done with this packet
av_packet_unref(&s->buf_pkt);
+ s->extdata_sent = 1;
- if (!s->draining) {
- ret = v4l2_try_start(avctx);
if (ret) {
- /* cant recover */
- if (ret != AVERROR(ENOMEM))
- ret = 0;
- goto fail;
+ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
+ return ret;
}
}
-dequeue:
- return ff_v4l2_context_dequeue_frame(capture, frame, -1);
-fail:
- av_packet_unref(&s->buf_pkt);
+ // Start if we haven't
+ {
+ const int ret2 = v4l2_try_start(avctx);
+ if (ret2) {
+ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
+ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
+ }
+ }
+
+ return ret;
+}
+
+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
+{
+ int rv = 0;
+
+ ff_mutex_lock(&ctx->lock);
+
+ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
+ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
+ rv = AVERROR(errno);
+ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
+ break;
+ }
+ }
+
+ ff_mutex_unlock(&ctx->lock);
+ return rv;
+}
+
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ int src_rv = NQ_OK;
+ int dst_rv = 1; // Initially non-zero (no frame yet) and non-negative (not an error)
+ unsigned int i = 0;
+
+ do {
+ const int pending = xlat_pending(&s->xlat);
+ const int prefer_dq = (pending > 4);
+ const int last_src_rv = src_rv;
+
+ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
+
+ // Enqueue another pkt for decode if
+ // (a) We don't have a lot of stuff in the buffer already OR
+ // (b) ... we (think we) do but we've failed to get a frame already OR
+ // (c) We've dequeued a lot of frames without asking for input
+ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
+
+ // If we got a frame last time or we've already tried to get a frame and
+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
+ // indicating that we want more input.
+ // This should mean that once decode starts we enter a stable state where
+ // we alternately ask for input and produce output
+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
+ break;
+
+ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
+ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
+ break;
+ }
+
+ // Try to get a new frame if
+ // (a) we haven't already got one AND
+ // (b) enqueue returned a status indicating that decode should be attempted
+ if (dst_rv != 0 && TRY_DQ(src_rv)) {
+ // Pick a timeout depending on state
+ // The pending count isn't completely reliable so it is a good enough
+ // hint that we want a frame but not good enough to require one in
+ // all cases; however if it has got > 31 that exceeds its margin of
+ // error, so require a frame to prevent ridiculous levels of latency
+ const int t =
+ src_rv == NQ_Q_FULL ? -1 :
+ src_rv == NQ_DRAINING ? 300 :
+ prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0;
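+ // Summary of the choice above (ms; -1 = wait forever):
+ // src Q full -> -1; draining -> 300; backlog > 31 (while running) -> 100;
+ // backlog > 4 -> 5; otherwise poll (0)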
+
+ // Dequeue frame will unref any previous contents of frame
+ // if it returns success so we don't need an explicit unref
+ // when discarding
+ // This returns AVERROR(EAGAIN) on timeout or if
+ // there is room in the input Q and timeout == -1
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
+
+ // Failure due to no buffer in Q?
+ if (dst_rv == AVERROR(ENOSPC)) {
+ // Wait & retry
+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
+ }
+ }
+
+ if (dst_rv == 0) {
+ set_best_effort_pts(avctx, &s->pts_stat, frame);
+ if (!s->running) {
+ s->running = 1;
+ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n");
+ }
+ }
+
+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
+ dst_rv = AVERROR_EOF;
+ s->capture.done = 1;
+ }
+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
+ s->draining, s->capture.done);
+ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
+ s->draining, s->capture.done, dst_rv);
+ }
+
+ ++i;
+ if (i >= 256) {
+ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
+ src_rv = AVERROR(EIO);
+ }
+
+ // Continue trying to enqueue packets if either
+ // (a) we succeeded last time OR
+ // (b) we didn't return a frame and we can retry the input
+ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
+
+ // Ensure that the frame contains nothing if we aren't returning a frame
+ // (might happen when discarding)
+ if (dst_rv)
+ av_frame_unref(frame);
+
+ // If we got a frame this time ask for a pkt next time
+ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
+
+#if 0
+ if (dst_rv == 0)
+ {
+ static int z = 0;
+ if (++z > 50) {
+ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
+ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+ return -1;
+ }
+ }
+#endif
+
+ return dst_rv == 0 ? 0 :
+ src_rv < 0 ? src_rv :
+ dst_rv < 0 ? dst_rv :
+ AVERROR(EAGAIN);
+}
+
+#if 0
+#include <time.h>
+static int64_t us_time(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+ int ret;
+ const int64_t now = us_time();
+ int64_t done;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ ret = v4l2_receive_frame2(avctx, frame);
+ done = us_time();
+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
return ret;
}
+#endif
+
+static uint32_t
+avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile)
+{
+ switch (codec_id) {
+ case AV_CODEC_ID_H264:
+ switch (avprofile) {
+ case FF_PROFILE_H264_BASELINE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
+ case FF_PROFILE_H264_CONSTRAINED_BASELINE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE;
+ case FF_PROFILE_H264_MAIN:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN;
+ case FF_PROFILE_H264_EXTENDED:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED;
+ case FF_PROFILE_H264_HIGH:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH;
+ case FF_PROFILE_H264_HIGH_10:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10;
+ case FF_PROFILE_H264_HIGH_10_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA;
+ case FF_PROFILE_H264_MULTIVIEW_HIGH:
+ case FF_PROFILE_H264_HIGH_422:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422;
+ case FF_PROFILE_H264_HIGH_422_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA;
+ case FF_PROFILE_H264_STEREO_HIGH:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH;
+ case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE;
+ case FF_PROFILE_H264_HIGH_444_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA;
+ case FF_PROFILE_H264_CAVLC_444:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA;
+ case FF_PROFILE_H264_HIGH_444:
+ default:
+ break;
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12,
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13,
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14,
+// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16,
+// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17,
+ }
+ break;
+ case AV_CODEC_ID_MPEG2VIDEO:
+ case AV_CODEC_ID_MPEG4:
+ case AV_CODEC_ID_VC1:
+ case AV_CODEC_ID_VP8:
+ case AV_CODEC_ID_VP9:
+ case AV_CODEC_ID_AV1:
+ // Most profiles are a simple number that matches the V4L2 enum
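+ // (e.g. FF_PROFILE_VP9_0..3 are defined as 0..3, matching
+ // V4L2_MPEG_VIDEO_VP9_PROFILE_0..3)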
+ return avprofile;
+ default:
+ break;
+ }
+ return ~(uint32_t)0;
+}
+
+// This check mirrors Chrome's profile check by testing to see if the profile
+// exists as a possible value for the V4L2 profile control
+static int
+check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
+{
+ struct v4l2_queryctrl query_ctrl;
+ struct v4l2_querymenu query_menu;
+ uint32_t profile_id;
+
+ // An unset profile is almost certainly zero or -99 - do not reject
+ if (avctx->profile <= 0) {
+ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile);
+ return 0;
+ }
+
+ memset(&query_ctrl, 0, sizeof(query_ctrl));
+ switch (avctx->codec_id) {
+ case AV_CODEC_ID_MPEG2VIDEO:
+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE;
+ break;
+ case AV_CODEC_ID_MPEG4:
+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE;
+ break;
+ case AV_CODEC_ID_H264:
+ profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
+ break;
+ case AV_CODEC_ID_VP8:
+ profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE;
+ break;
+ case AV_CODEC_ID_VP9:
+ profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE;
+ break;
+#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE
+ case AV_CODEC_ID_AV1:
+ profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE;
+ break;
+#endif
+ default:
+ av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id);
+ return 0;
+ }
+
+ query_ctrl = (struct v4l2_queryctrl){.id = profile_id};
+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) {
+ av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id);
+ }
+ else {
+ av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id);
+
+ query_menu = (struct v4l2_querymenu){
+ .id = query_ctrl.id,
+ .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile),
+ };
+
+ if (query_menu.index > query_ctrl.maximum ||
+ query_menu.index < query_ctrl.minimum ||
+ ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) {
+ return AVERROR(ENOENT);
+ }
+ }
+
+ return 0;
+}
+
+static int
+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ unsigned int i;
+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
+ const uint32_t w = avctx->coded_width;
+ const uint32_t h = avctx->coded_height;
+
+ if (w == 0 || h == 0 || fcc == 0) {
+ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
+ return 0;
+ }
+ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
+ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
+ return 0;
+ }
+
+ for (i = 0;; ++i) {
+ struct v4l2_frmsizeenum fs = {
+ .index = i,
+ .pixel_format = fcc,
+ };
+
+ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
+ const int err = AVERROR(errno);
+ if (err == AVERROR(EINTR))
+ continue;
+ if (i == 0 && err == AVERROR(ENOTTY)) {
+ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
+ return 0;
+ }
+ if (err != AVERROR(EINVAL)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
+ return err;
+ }
+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
+ w, h, av_fourcc2str(fcc), i);
+ return err;
+ }
+
+ switch (fs.type) {
+ case V4L2_FRMSIZE_TYPE_DISCRETE:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
+ fs.discrete.width,fs.discrete.height);
+ if (w == fs.discrete.width && h == fs.discrete.height)
+ return 0;
+ break;
+ case V4L2_FRMSIZE_TYPE_STEPWISE:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
+ fs.stepwise.min_width, fs.stepwise.min_height,
+ fs.stepwise.max_width, fs.stepwise.max_height,
+ fs.stepwise.step_width,fs.stepwise.step_height);
+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
+ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
+ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
+ return 0;
+ break;
+ case V4L2_FRMSIZE_TYPE_CONTINUOUS:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
+ fs.stepwise.min_width, fs.stepwise.min_height,
+ fs.stepwise.max_width, fs.stepwise.max_height,
+ fs.stepwise.step_width,fs.stepwise.step_height);
+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
+ return 0;
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
+ return AVERROR(EINVAL);
+ }
+ }
+}
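+
+// Illustrative stepwise check (driver values assumed): a range of
+// 32x32..1920x1088 with step 16x16 accepts 1280x720 since
+// (1280-32)%16 == 0 and (720-32)%16 == 0, but rejects 1283x720.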
+
+static int
+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ struct v4l2_capability cap;
+
+ memset(&cap, 0, sizeof(cap));
+ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
+ int err = errno;
+ if (err == EINTR)
+ continue;
+ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
+ return AVERROR(err);
+ }
+
+ // Could be made table-driven if we get a few more entries, but right
+ // now there seems little point
+
+ // Meson (amlogic) always gives a resolution changed event after output
+ // streamon and userspace must (re)allocate capture buffers and streamon
+ // capture to clear the event even if the capture buffers were the right
+ // size in the first place.
+ if (strcmp(cap.driver, "meson-vdec") == 0)
+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
+
+ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
+ return 0;
+}
+
+// This heuristic is for H264 but use for everything
+static uint32_t max_coded_size(const AVCodecContext * const avctx)
+{
+ uint32_t wxh = avctx->coded_width * avctx->coded_height;
+ uint32_t size;
+
+ size = wxh * 3 / 2;
+ // H.264 Annex A table A-1 gives minCR, which is either 2 or 4.
+ // Unfortunately that doesn't yield an actually useful limit, and
+ // frame 0 is special-cased to allow a bigger number, which really
+ // isn't helpful for us. So just pick frame_size / 2
+ size /= 2;
+ // Add 64k to allow for any overheads and/or encoder hopefulness
+ // with small WxH
+ return size + (1 << 16);
+}
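+
+// Worked example: 1920x1080 -> wxh = 2073600, size = 3110400 (a raw
+// 4:2:0 frame), halved to 1555200, plus 64k = 1620736 bytes per
+// coded buffer.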
+
+static void
+parse_extradata(AVCodecContext *avctx)
+{
+ if (!avctx->extradata || !avctx->extradata_size)
+ return;
+
+ switch (avctx->codec_id) {
+#if CONFIG_H264_DECODER
+ case AV_CODEC_ID_H264:
+ {
+ H264ParamSets ps = {{NULL}};
+ int is_avc = 0;
+ int nal_length_size = 0;
+ int ret;
+
+ ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+ &ps, &is_avc, &nal_length_size,
+ avctx->err_recognition, avctx);
+ if (ret > 0) {
+ const SPS * sps = NULL;
+ unsigned int i;
+ for (i = 0; i != MAX_SPS_COUNT; ++i) {
+ if (ps.sps_list[i]) {
+ sps = (const SPS *)ps.sps_list[i]->data;
+ break;
+ }
+ }
+ if (sps) {
+ avctx->profile = ff_h264_get_profile(sps);
+ avctx->level = sps->level_idc;
+ }
+ }
+ ff_h264_ps_uninit(&ps);
+ break;
+ }
+#endif
+#if CONFIG_HEVC_DECODER
+ case AV_CODEC_ID_HEVC:
+ {
+ HEVCParamSets ps = {{NULL}};
+ HEVCSEI sei = {{{{0}}}};
+ int is_nalff = 0;
+ int nal_length_size = 0;
+ int ret;
+
+ ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
+ &ps, &sei, &is_nalff, &nal_length_size,
+ avctx->err_recognition, 0, avctx);
+ if (ret > 0) {
+ const HEVCSPS * sps = NULL;
+ unsigned int i;
+ for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) {
+ if (ps.sps_list[i]) {
+ sps = (const HEVCSPS *)ps.sps_list[i]->data;
+ break;
+ }
+ }
+ if (sps) {
+ avctx->profile = sps->ptl.general_ptl.profile_idc;
+ avctx->level = sps->ptl.general_ptl.level_idc;
+ }
+ }
+ ff_hevc_ps_uninit(&ps);
+ ff_hevc_reset_sei(&sei);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+}
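+
+// Note: the profile/level recovered above are what check_profile()
+// tests during init below.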
static av_cold int v4l2_decode_init(AVCodecContext *avctx)
{
V4L2Context *capture, *output;
V4L2m2mContext *s;
V4L2m2mPriv *priv = avctx->priv_data;
+ int gf_pix_fmt;
int ret;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ if (avctx->codec_id == AV_CODEC_ID_H264) {
+ if (avctx->ticks_per_frame == 1) {
+ if (avctx->time_base.den < INT_MAX/2) {
+ avctx->time_base.den *= 2;
+ } else
+ avctx->time_base.num /= 2;
+ }
+ avctx->ticks_per_frame = 2;
+ }
+
+ parse_extradata(avctx);
+
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
+ xlat_init(&s->xlat);
+ pts_stats_init(&s->pts_stat, avctx, "decoder");
+
capture = &s->capture;
output = &s->output;
@@ -192,14 +1081,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
* by the v4l2 driver; this event will trigger a full pipeline reconfig and
* the proper values will be retrieved from the kernel driver.
*/
- output->height = capture->height = avctx->coded_height;
- output->width = capture->width = avctx->coded_width;
+// output->height = capture->height = avctx->coded_height;
+// output->width = capture->width = avctx->coded_width;
+ output->height = capture->height = 0;
+ output->width = capture->width = 0;
output->av_codec_id = avctx->codec_id;
output->av_pix_fmt = AV_PIX_FMT_NONE;
+ output->min_buf_size = max_coded_size(avctx);
capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
capture->av_pix_fmt = avctx->pix_fmt;
+ capture->min_buf_size = 0;
+
+ /* the client requests the codec to generate DRM frames:
+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor
+ * check the ff_v4l2_buffer_to_avframe conversion function.
+ * - the DRM frame format is passed in the DRM frame descriptor layer.
+ * check the v4l2_get_drm_frame function.
+ */
+
+ avctx->sw_pix_fmt = avctx->pix_fmt;
+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
+ avctx->coded_width, avctx->coded_height,
+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
+
+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ s->output_drm = 1;
+ }
+ else {
+ capture->av_pix_fmt = gf_pix_fmt;
+ s->output_drm = 0;
+ }
+
+ s->db_ctl = NULL;
+ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
+ if (strcmp(priv->dmabuf_alloc, "cma") == 0)
+ s->db_ctl = dmabufs_ctl_new();
+ else {
+ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc);
+ return AVERROR(EINVAL);
+ }
+ if (!s->db_ctl) {
+ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc);
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
+ if (!s->device_ref) {
+ ret = AVERROR(ENOMEM);
+ return ret;
+ }
+
+ ret = av_hwdevice_ctx_init(s->device_ref);
+ if (ret < 0)
+ return ret;
s->avctx = avctx;
ret = ff_v4l2_m2m_codec_init(priv);
@@ -208,12 +1148,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
return ret;
}
- return v4l2_prepare_decoder(s);
+ if (avctx->extradata &&
+ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret));
+#if DUMP_FAILED_EXTRADATA
+ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size);
+#endif
+ return ret;
+ }
+
+ if ((ret = v4l2_prepare_decoder(s)) < 0)
+ return ret;
+
+ if ((ret = get_quirks(avctx, s)) != 0)
+ return ret;
+
+ if ((ret = check_size(avctx, s)) != 0)
+ return ret;
+
+ if ((ret = check_profile(avctx, s)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
+ return ret;
+ }
+ return 0;
}
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
{
- return ff_v4l2_m2m_codec_end(avctx->priv_data);
+ int rv;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
+ return rv;
+}
+
+static void v4l2_decode_flush(AVCodecContext *avctx)
+{
+ // An alternative and more drastic form of flush is to simply do this:
+ // v4l2_decode_close(avctx);
+ // v4l2_decode_init(avctx);
+ // The downside is that this keeps a decoder open until all the frames
+ // associated with it have been returned. This is a bit wasteful on
+ // possibly limited h/w resources and fails on a Pi for this reason unless
+ // more GPU mem is allocated than the default.
+
+ V4L2m2mPriv * const priv = avctx->priv_data;
+ V4L2m2mContext * const s = priv->context;
+ V4L2Context * const output = &s->output;
+ V4L2Context * const capture = &s->capture;
+
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
+
+ // Reflushing everything is benign and quick, and avoids having to worry
+ // about states like EOS processing, so don't try to optimize it out
+ // (having got it wrong once)
+
+ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
+
+ // Clear any buffered input packet
+ av_packet_unref(&s->buf_pkt);
+
+ // Clear a pending EOS
+ if (ff_v4l2_ctx_eos(capture)) {
+ // Arguably we could delay this but this is easy and doesn't require
+ // thought or extra vars
+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+ }
+
+ // V4L2 makes no guarantees about whether decoded frames are flushed or not
+ // so mark all frames we are tracking to be discarded if they appear
+ xlat_flush(&s->xlat);
+
+ // resend extradata
+ s->extdata_sent = 0;
+ // clear status vars
+ s->running = 0;
+ s->draining = 0;
+ output->done = 0;
+ capture->done = 0;
+
+ // Stream on will occur when we actually submit a new frame
+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
}
#define OFFSET(x) offsetof(V4L2m2mPriv, x)
@@ -222,10 +1238,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
static const AVOption options[] = {
V4L_M2M_DEFAULT_OPTS,
{ "num_capture_buffers", "Number of buffers in the capture context",
- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
+ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
+ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
{ NULL},
};
+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
+ HW_CONFIG_INTERNAL(DRM_PRIME),
+ NULL
+};
+
#define M2MDEC_CLASS(NAME) \
static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
.class_name = #NAME "_v4l2m2m_decoder", \
@@ -246,9 +1269,15 @@ static const AVOption options[] = {
.init = v4l2_decode_init, \
.receive_frame = v4l2_receive_frame, \
.close = v4l2_decode_close, \
+ .flush = v4l2_decode_flush, \
.bsfs = bsf_name, \
.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
.caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
+ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
+ AV_PIX_FMT_NV12, \
+ AV_PIX_FMT_YUV420P, \
+ AV_PIX_FMT_NONE}, \
+ .hw_configs = v4l2_m2m_hw_configs, \
.wrapper_name = "v4l2m2m", \
}
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index f644b50133..6472b56030 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -24,6 +24,8 @@
#include <linux/videodev2.h>
#include <sys/ioctl.h>
#include <search.h>
+#include <drm_fourcc.h>
+
#include "encode.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/internal.h"
@@ -38,6 +40,34 @@
#define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
#define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
+// in the future but until then...
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
+#endif
+
+#ifndef DRM_FORMAT_NV15
+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
+#endif
+
+#ifndef DRM_FORMAT_NV20
+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
+#endif
+
+#ifndef V4L2_CID_CODEC_BASE
+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
+#endif
+
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in videodev2.h and hopefully will be sometime in the future, but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
{
struct v4l2_streamparm parm = { 0 };
@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p)
static int v4l2_check_b_frame_support(V4L2m2mContext *s)
{
if (s->avctx->max_b_frames)
- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
+ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
+ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
if (s->avctx->max_b_frames == 0)
return 0;
avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
-
return AVERROR_PATCHWELCOME;
}
@@ -271,17 +300,208 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s)
return 0;
}
+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
+{
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ const uint32_t drm_fmt = src->layers[0].format;
+ // Treat INVALID as LINEAR
+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
+ uint32_t pix_fmt = 0;
+ uint32_t w = 0;
+ uint32_t h = 0;
+ uint32_t bpl = src->layers[0].planes[0].pitch;
+
+ // We really don't expect multiple layers
+ // All formats that we currently cope with are single object
+
+ if (src->nb_layers != 1 || src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ switch (drm_fmt) {
+ case DRM_FORMAT_YUV420:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 3)
+ break;
+ pix_fmt = V4L2_PIX_FMT_YUV420;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ break;
+
+ case DRM_FORMAT_NV12:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
+ w = bpl;
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ case DRM_FORMAT_P030:
+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
+ w = bpl / 2; // Matching lie to how we construct this
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (!pix_fmt)
+ return AVERROR(EINVAL);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->plane_fmt[0].bytesperline = bpl;
+ pix->num_planes = 1;
+ }
+ else {
+ struct v4l2_pix_format *const pix = &format->fmt.pix;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->bytesperline = bpl;
+ }
+
+ return 0;
+}
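+
+// Illustrative SAND128 case (values assumed): h is derived as
+// planes[1].offset / 128 (e.g. 139264/128 = 1088), bytesperline carries
+// the SAND column stride taken from the Broadcom modifier parameter,
+// and the original pitch is reused as the width.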
+
+// Do we have similar enough formats to be usable?
+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
+{
+ if (a->type != b->type)
+ return 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
+ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
+ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
+ unsigned int i;
+ if (pa->pixelformat != pb->pixelformat ||
+ pa->num_planes != pb->num_planes)
+ return 0;
+ for (i = 0; i != pa->num_planes; ++i) {
+ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
+ return 0;
+ }
+ }
+ else {
+ const struct v4l2_pix_format *const pa = &a->fmt.pix;
+ const struct v4l2_pix_format *const pb = &b->fmt.pix;
+ if (pa->pixelformat != pb->pixelformat ||
+ pa->bytesperline != pb->bytesperline)
+ return 0;
+ }
+ return 1;
+}
+
+static inline int q_full(const V4L2Context *const output)
+{
+ return ff_v4l2_context_q_count(output) == output->num_buffers;
+}
+
static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
{
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const output = &s->output;
+ int rv;
+ const int needs_slot = q_full(output);
+
+ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
+
+ // Signal EOF if needed (doesn't need q slot)
+ if (!frame) {
+ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__);
+ return ff_v4l2_context_enqueue_frame(output, frame);
+ }
+
+ if ((rv = ff_v4l2_dq_all(output, needs_slot ? 500 : 0)) != 0) {
+ // We should be able to return AVERROR(EAGAIN) to indicate buffer
+ // exhaustion, but ffmpeg currently treats that as fatal.
+ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv));
+ return rv;
+ }
+
+ if (s->input_drm && !output->streamon) {
+ struct v4l2_format req_format = {.type = output->format.type};
+
+ // Set format when we first get a buffer
+ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
+ return rv;
+ }
+
+ ff_v4l2_context_release(output);
+
+ output->format = req_format;
+
+ if ((rv = ff_v4l2_context_set_format(output)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
+ return rv;
+ }
+
+ if (!fmt_eq(&req_format, &output->format)) {
+ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
+ return AVERROR(EINVAL);
+ }
+
+ output->selection.top = frame->crop_top;
+ output->selection.left = frame->crop_left;
+ output->selection.width = av_frame_cropped_width(frame);
+ output->selection.height = av_frame_cropped_height(frame);
+
+ if ((rv = ff_v4l2_context_init(output)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
+ return rv;
+ }
+
+ {
+ struct v4l2_selection selection = {
+ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
+ .target = V4L2_SEL_TGT_CROP,
+ .r = output->selection
+ };
+ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
+ selection.r.width, selection.r.height, selection.r.left, selection.r.top,
+ av_err2str(AVERROR(errno)));
+ }
+ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
+ selection.r.width, selection.r.height, selection.r.left, selection.r.top);
+ }
+ }
#ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
- if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
+ if (frame->pict_type == AV_PICTURE_TYPE_I)
v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
#endif
- return ff_v4l2_context_enqueue_frame(output, frame);
+ rv = ff_v4l2_context_enqueue_frame(output, frame);
+ if (rv) {
+ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv));
+ }
+
+ return rv;
}
static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
@@ -292,6 +512,11 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
AVFrame *frame = s->frame;
int ret;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__,
+ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture));
+
+ ff_v4l2_dq_all(output, 0);
+
if (s->draining)
goto dequeue;
@@ -328,7 +553,115 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
}
dequeue:
- return ff_v4l2_context_dequeue_packet(capture, avpkt);
+ // Dequeue a frame
+ for (;;) {
+ int t = q_full(output) ? -1 : s->draining ? 300 : 0;
+ int rv2;
+
+ // If output is full wait for either a packet or output to become not full
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
+
+ // If output was full retry packet dequeue
+ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
+ rv2 = ff_v4l2_dq_all(output, t);
+ if (t == 0 || rv2 != 0)
+ break;
+ }
+ if (ret)
+ return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
+
+ if (capture->first_buf == 1) {
+ uint8_t * data;
+ const int len = avpkt->size;
+
+ // 1st buffer after streamon should be SPS/PPS
+ capture->first_buf = 2;
+
+ // Clear both possible stores so there is no chance of confusion
+ av_freep(&s->extdata_data);
+ s->extdata_size = 0;
+ av_freep(&avctx->extradata);
+ avctx->extradata_size = 0;
+
+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
+ goto fail_no_mem;
+
+ memcpy(data, avpkt->data, len);
+ av_packet_unref(avpkt);
+
+ // We need to copy the header, but keep it local if not global
+ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
+ avctx->extradata = data;
+ avctx->extradata_size = len;
+ }
+ else {
+ s->extdata_data = data;
+ s->extdata_size = len;
+ }
+
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0);
+ ff_v4l2_dq_all(output, 0);
+ if (ret)
+ return ret;
+ }
+
+ // First frame must be key so mark as such even if encoder forgot
+ if (capture->first_buf == 2) {
+ avpkt->flags |= AV_PKT_FLAG_KEY;
+
+ // Add any extradata to the 1st packet we emit as we cannot create it at init
+ if (avctx->extradata_size > 0 && avctx->extradata) {
+ void * const side = av_packet_new_side_data(avpkt,
+ AV_PKT_DATA_NEW_EXTRADATA,
+ avctx->extradata_size);
+ if (!side)
+ goto fail_no_mem;
+
+ memcpy(side, avctx->extradata, avctx->extradata_size);
+ }
+ }
+
+ // Add SPS/PPS to the start of every key frame if non-global headers
+ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
+ const size_t newlen = s->extdata_size + avpkt->size;
+ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
+
+ if (buf == NULL)
+ goto fail_no_mem;
+
+ memcpy(buf->data, s->extdata_data, s->extdata_size);
+ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
+
+ av_buffer_unref(&avpkt->buf);
+ avpkt->buf = buf;
+ avpkt->data = buf->data;
+ avpkt->size = newlen;
+ }
+ else if (ff_v4l2_context_q_count(capture) < 2) {
+ // Avoid running out of capture buffers
+ // In most cases the buffers will be returned quickly in which case
+ // we don't copy and can use the v4l2 buffers directly but sometimes
+ // ffmpeg seems to hold onto all of them for a long time (.mkv
+ // creation?) so avoid deadlock in those cases.
+ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
+ if (buf == NULL)
+ goto fail_no_mem;
+
+ memcpy(buf->data, avpkt->data, avpkt->size);
+ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer
+
+ avpkt->buf = buf;
+ avpkt->data = buf->data;
+ }
+
+ capture->first_buf = 0;
+ return 0;
+
+fail_no_mem:
+ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n");
+ ret = AVERROR(ENOMEM);
+ av_packet_unref(avpkt);
+ return ret;
}
static av_cold int v4l2_encode_init(AVCodecContext *avctx)
@@ -340,6 +673,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
uint32_t v4l2_fmt_output;
int ret;
+ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
+
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
@@ -347,13 +682,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
capture = &s->capture;
output = &s->output;
+ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
+
/* common settings output/capture */
output->height = capture->height = avctx->height;
output->width = capture->width = avctx->width;
/* output context */
output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
- output->av_pix_fmt = avctx->pix_fmt;
+ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
+ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
+ AV_PIX_FMT_YUV420P;
/* capture context */
capture->av_codec_id = avctx->codec_id;
@@ -372,7 +711,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
v4l2_fmt_output = output->format.fmt.pix.pixelformat;
pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
- if (pix_fmt_output != avctx->pix_fmt) {
+ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
return AVERROR(EINVAL);
@@ -390,9 +729,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
#define V4L_M2M_CAPTURE_OPTS \
- V4L_M2M_DEFAULT_OPTS,\
+ { "num_output_buffers", "Number of buffers in the output context",\
+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
{ "num_capture_buffers", "Number of buffers in the capture context", \
- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
static const AVOption mpeg4_options[] = {
V4L_M2M_CAPTURE_OPTS,
diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c
new file mode 100644
index 0000000000..5b3fb958fa
--- /dev/null
+++ b/libavcodec/v4l2_req_decode_q.c
@@ -0,0 +1,84 @@
+#include <memory.h>
+#include <semaphore.h>
+#include <pthread.h>
+
+#include "v4l2_req_decode_q.h"
+
+int decode_q_in_q(const req_decode_ent * const d)
+{
+ return d->in_q;
+}
+
+void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
+{
+ pthread_mutex_lock(&q->q_lock);
+ if (!q->head) {
+ q->head = d;
+ q->tail = d;
+ d->prev = NULL;
+ }
+ else {
+ q->tail->next = d;
+ d->prev = q->tail;
+ q->tail = d;
+ }
+ d->next = NULL;
+ d->in_q = 1;
+ pthread_mutex_unlock(&q->q_lock);
+}
+
+// Remove entry from Q - if head wake-up anything that was waiting
+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
+{
+ int try_signal = 0;
+
+ if (!d->in_q)
+ return;
+
+ pthread_mutex_lock(&q->q_lock);
+ if (d->prev)
+ d->prev->next = d->next;
+ else {
+ try_signal = 1; // Only need to signal if we were head
+ q->head = d->next;
+ }
+
+ if (d->next)
+ d->next->prev = d->prev;
+ else
+ q->tail = d->prev;
+
+ // Not strictly needed but makes debug easier
+ d->next = NULL;
+ d->prev = NULL;
+ d->in_q = 0;
+ pthread_mutex_unlock(&q->q_lock);
+
+ if (try_signal)
+ pthread_cond_broadcast(&q->q_cond);
+}
+
+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
+{
+ pthread_mutex_lock(&q->q_lock);
+
+ while (q->head != d)
+ pthread_cond_wait(&q->q_cond, &q->q_lock);
+
+ pthread_mutex_unlock(&q->q_lock);
+}
+
+void decode_q_uninit(req_decode_q * const q)
+{
+ pthread_mutex_destroy(&q->q_lock);
+ pthread_cond_destroy(&q->q_cond);
+}
+
+void decode_q_init(req_decode_q * const q)
+{
+ memset(q, 0, sizeof(*q));
+ pthread_mutex_init(&q->q_lock, NULL);
+ pthread_cond_init(&q->q_cond, NULL);
+}
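+
+// Typical usage (a sketch of the assumed calling pattern):
+//
+// req_decode_ent ent;
+// decode_q_add(q, &ent); // join the queue
+// decode_q_wait(q, &ent); // block until we reach the head
+// ... do the work that must be ordered ...
+// decode_q_remove(q, &ent); // leave and wake the next waiter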
+
+
diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h
new file mode 100644
index 0000000000..af7bbe1de4
--- /dev/null
+++ b/libavcodec/v4l2_req_decode_q.h
@@ -0,0 +1,25 @@
+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
+#define AVCODEC_V4L2_REQ_DECODE_Q_H
+
+#include <pthread.h>
+
+typedef struct req_decode_ent {
+ struct req_decode_ent * next;
+ struct req_decode_ent * prev;
+ int in_q;
+} req_decode_ent;
+
+typedef struct req_decode_q {
+ pthread_mutex_t q_lock;
+ pthread_cond_t q_cond;
+ req_decode_ent * head;
+ req_decode_ent * tail;
+} req_decode_q;
+
+int decode_q_in_q(const req_decode_ent * const d);
+void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_uninit(req_decode_q * const q);
+void decode_q_init(req_decode_q * const q);
+
+#endif
+
diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
new file mode 100644
index 0000000000..cfa94d55c4
--- /dev/null
+++ b/libavcodec/v4l2_req_devscan.c
@@ -0,0 +1,449 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <libudev.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/sysmacros.h>
+
+#include <linux/media.h>
+#include <linux/videodev2.h>
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_utils.h"
+
+struct decdev {
+ enum v4l2_buf_type src_type;
+ uint32_t src_fmt_v4l2;
+ const char * vname;
+ const char * mname;
+};
+
+struct devscan {
+ struct decdev env;
+ unsigned int dev_size;
+ unsigned int dev_count;
+ struct decdev *devs;
+};
+
+static int video_src_pixfmt_supported(uint32_t fmt)
+{
+ return 1;
+}
+
+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
+ unsigned int width, unsigned int height,
+ unsigned int pixelformat)
+{
+ unsigned int sizeimage;
+
+ memset(format, 0, sizeof(*format));
+ format->type = type;
+
+ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
+ format->fmt.pix_mp.width = width;
+ format->fmt.pix_mp.height = height;
+ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
+ format->fmt.pix_mp.pixelformat = pixelformat;
+ } else {
+ format->fmt.pix.width = width;
+ format->fmt.pix.height = height;
+ format->fmt.pix.sizeimage = sizeimage;
+ format->fmt.pix.pixelformat = pixelformat;
+ }
+}
+
+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
+ unsigned int width, unsigned int height)
+{
+ struct v4l2_format format;
+
+ v4l2_setup_format(&format, type, width, height, pixelformat);
+
+ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
+}
+
+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
+{
+ struct v4l2_capability capability = { 0 };
+ int rc;
+
+ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
+ if (rc < 0)
+ return -errno;
+
+ if (capabilities != NULL) {
+ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
+ *capabilities = capability.device_caps;
+ else
+ *capabilities = capability.capabilities;
+ }
+
+ return 0;
+}
+
+static int devscan_add(struct devscan *const scan,
+ enum v4l2_buf_type src_type,
+ uint32_t src_fmt_v4l2,
+ const char * vname,
+ const char * mname)
+{
+ struct decdev *d;
+
+ if (scan->dev_size <= scan->dev_count) {
+ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
+ d = realloc(scan->devs, n * sizeof(*d));
+ if (!d)
+ return -ENOMEM;
+ scan->devs = d;
+ scan->dev_size = n;
+ }
+
+ d = scan->devs + scan->dev_count;
+ d->src_type = src_type;
+ d->src_fmt_v4l2 = src_fmt_v4l2;
+ d->vname = strdup(vname);
+ if (!d->vname)
+ return -ENOMEM;
+ d->mname = strdup(mname);
+ if (!d->mname) {
+ free((char *)d->vname);
+ return -ENOMEM;
+ }
+ ++scan->dev_count;
+ return 0;
+}
+
+void devscan_delete(struct devscan **const pScan)
+{
+ unsigned int i;
+ struct devscan * const scan = *pScan;
+
+ if (!scan)
+ return;
+ *pScan = NULL;
+
+ for (i = 0; i < scan->dev_count; ++i) {
+ free((char*)scan->devs[i].mname);
+ free((char*)scan->devs[i].vname);
+ }
+ free(scan->devs);
+ free(scan);
+}
+
+#define REQ_BUF_CAPS (\
+ V4L2_BUF_CAP_SUPPORTS_DMABUF |\
+ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
+ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
+
+static void probe_formats(void * const dc,
+ struct devscan *const scan,
+ const int fd,
+ const unsigned int type_v4l2,
+ const char *const mpath,
+ const char *const vpath)
+{
+ unsigned int i;
+ for (i = 0;; ++i) {
+ struct v4l2_fmtdesc fmtdesc = {
+ .index = i,
+ .type = type_v4l2
+ };
+ struct v4l2_requestbuffers rbufs = {
+ .count = 0,
+ .type = type_v4l2,
+ .memory = V4L2_MEMORY_MMAP
+ };
+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
+ if (errno == EINTR)
+ continue;
+ if (errno != EINVAL)
+ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
+ return;
+ }
+ if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
+ continue;
+
+ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
+ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
+ continue;
+ }
+
+ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
+ if (errno == EINTR)
+ continue;
+ // A hard failure would otherwise retry forever; give up on this
+ // format instead (the caps test below then skips it)
+ request_debug(dc, "%s: Reqbufs failed\n", vpath);
+ break;
+ }
+
+ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
+ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
+ continue;
+ }
+
+ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
+ mpath, vpath, fmtdesc.pixelformat, type_v4l2);
+ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
+ }
+}
+
+
+static int probe_video_device(void * const dc,
+ struct udev_device *const device,
+ struct devscan *const scan,
+ const char *const mpath)
+{
+ int ret;
+ unsigned int capabilities = 0;
+ int video_fd = -1;
+
+ const char *path = udev_device_get_devnode(device);
+ if (!path) {
+ request_err(dc, "%s: get video device devnode failed\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ video_fd = open(path, O_RDWR, 0);
+ if (video_fd == -1) {
+ ret = -errno;
+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
+ goto fail;
+ }
+
+ ret = v4l2_query_capabilities(video_fd, &capabilities);
+ if (ret < 0) {
+ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
+
+ if (!(capabilities & V4L2_CAP_STREAMING)) {
+ request_debug(dc, "%s: missing required streaming capability\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
+ request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Should check capture formats too... */
+ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
+
+ close(video_fd);
+ return 0;
+
+fail:
+ if (video_fd >= 0)
+ close(video_fd);
+ return ret;
+}
+
+static int probe_media_device(void * const dc,
+ struct udev_device *const device,
+ struct devscan *const scan)
+{
+ int ret;
+ int rv;
+ struct media_device_info device_info = { 0 };
+ struct media_v2_topology topology = { 0 };
+ struct media_v2_interface *interfaces = NULL;
+ struct udev *udev = udev_device_get_udev(device);
+ struct udev_device *video_device;
+ dev_t devnum;
+ int media_fd = -1;
+
+ const char *path = udev_device_get_devnode(device);
+ if (!path) {
+ request_err(dc, "%s: get media device devnode failed\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ media_fd = open(path, O_RDWR, 0);
+ if (media_fd < 0) {
+ ret = -errno;
+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ if (topology.num_interfaces <= 0) {
+ request_err(dc, "%s: media device has no interfaces\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
+ if (!interfaces) {
+ request_err(dc, "%s: allocating media interface struct failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ for (int i = 0; i < topology.num_interfaces; i++) {
+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
+ continue;
+
+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
+ video_device = udev_device_new_from_devnum(udev, 'c', devnum);
+ if (!video_device) {
+ ret = -errno;
+ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
+ continue;
+ }
+
+ ret = probe_video_device(dc, video_device, scan, path);
+ udev_device_unref(video_device);
+
+ if (ret != 0)
+ goto fail;
+ }
+
+fail:
+ free(interfaces);
+ if (media_fd != -1)
+ close(media_fd);
+ return ret;
+}
+
+const char *decdev_media_path(const struct decdev *const dev)
+{
+ return !dev ? NULL : dev->mname;
+}
+
+const char *decdev_video_path(const struct decdev *const dev)
+{
+ return !dev ? NULL : dev->vname;
+}
+
+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
+{
+ return !dev ? 0 : dev->src_type;
+}
+
+uint32_t decdev_src_pixelformat(const struct decdev *const dev)
+{
+ return !dev ? 0 : dev->src_fmt_v4l2;
+}
+
+
+const struct decdev *devscan_find(struct devscan *const scan,
+ const uint32_t src_fmt_v4l2)
+{
+ unsigned int i;
+
+ if (scan->env.mname && scan->env.vname)
+ return &scan->env;
+
+ if (!src_fmt_v4l2)
+ return scan->dev_count ? scan->devs + 0 : NULL;
+
+ for (i = 0; i != scan->dev_count; ++i) {
+ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
+ return scan->devs + i;
+ }
+ return NULL;
+}
+
+int devscan_build(void * const dc, struct devscan **pscan)
+{
+ int ret;
+ struct udev *udev;
+ struct udev_enumerate *enumerate;
+ struct udev_list_entry *devices;
+ struct udev_list_entry *entry;
+ struct udev_device *device;
+ struct devscan * scan;
+
+ *pscan = NULL;
+
+ scan = calloc(1, sizeof(*scan));
+ if (!scan) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
+ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
+ if (scan->env.mname && scan->env.vname) {
+ request_info(dc, "Media/video device env overrides found: %s,%s\n",
+ scan->env.mname, scan->env.vname);
+ *pscan = scan;
+ return 0;
+ }
+
+ udev = udev_new();
+ if (!udev) {
+ request_err(dc, "%s: allocating udev context failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ enumerate = udev_enumerate_new(udev);
+ if (!enumerate) {
+ request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ udev_enumerate_add_match_subsystem(enumerate, "media");
+ udev_enumerate_scan_devices(enumerate);
+
+ devices = udev_enumerate_get_list_entry(enumerate);
+ udev_list_entry_foreach(entry, devices) {
+ const char *path = udev_list_entry_get_name(entry);
+ if (!path)
+ continue;
+
+ device = udev_device_new_from_syspath(udev, path);
+ if (!device)
+ continue;
+
+ probe_media_device(dc, device, scan);
+ udev_device_unref(device);
+ }
+
+ udev_enumerate_unref(enumerate);
+
+ *pscan = scan;
+ return 0;
+
+fail:
+ udev_unref(udev);
+ devscan_delete(&scan);
+ return ret;
+}
+
diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
new file mode 100644
index 0000000000..956d9234f1
--- /dev/null
+++ b/libavcodec/v4l2_req_devscan.h
@@ -0,0 +1,23 @@
+#ifndef _DEVSCAN_H_
+#define _DEVSCAN_H_
+
+#include <stdint.h>
+
+struct devscan;
+struct decdev;
+enum v4l2_buf_type;
+
+/* These return pointers to data in the devscan structure and so are valid
+ * for the lifetime of that structure
+ */
+const char *decdev_media_path(const struct decdev *const dev);
+const char *decdev_video_path(const struct decdev *const dev);
+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+
+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
+
+int devscan_build(void * const dc, struct devscan **pscan);
+void devscan_delete(struct devscan **const pScan);
+
+#endif
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
new file mode 100644
index 0000000000..acc0366e76
--- /dev/null
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -0,0 +1,369 @@
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-heap.h>
+
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_utils.h"
+
+#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
+#define DMABUF_NAME2 "/dev/dma_heap/reserved"
+
+#define TRACE_ALLOC 0
+
+struct dmabufs_ctl;
+struct dmabuf_h;
+
+struct dmabuf_fns {
+ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);
+ void (*buf_free)(struct dmabuf_h * dh);
+ int (*ctl_new)(struct dmabufs_ctl * dbsc);
+ void (*ctl_free)(struct dmabufs_ctl * dbsc);
+};
+
+struct dmabufs_ctl {
+ atomic_int ref_count;
+ int fd;
+ size_t page_size;
+ void * v;
+ const struct dmabuf_fns * fns;
+};
+
+struct dmabuf_h {
+ int fd;
+ size_t size;
+ size_t len;
+ void * mapptr;
+ void * v;
+ const struct dmabuf_fns * fns;
+};
+
+#if TRACE_ALLOC
+static unsigned int total_bufs = 0;
+static size_t total_size = 0;
+#endif
+
+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size)
+{
+ struct dmabuf_h *dh;
+
+ if (mapptr == MAP_FAILED)
+ return NULL;
+
+ dh = malloc(sizeof(*dh));
+ if (!dh)
+ return NULL;
+
+ *dh = (struct dmabuf_h) {
+ .fd = -1,
+ .size = size,
+ .mapptr = mapptr
+ };
+
+ return dh;
+}
+
+struct dmabuf_h * dmabuf_import(int fd, size_t size)
+{
+ struct dmabuf_h *dh;
+
+    // Check the size first so a zero-size request doesn't leak the dup'd fd
+    if (size == 0)
+        return NULL;
+
+    fd = dup(fd);
+    if (fd < 0)
+        return NULL;
+
+ dh = malloc(sizeof(*dh));
+ if (!dh) {
+ close(fd);
+ return NULL;
+ }
+
+ *dh = (struct dmabuf_h) {
+ .fd = fd,
+ .size = size,
+ .mapptr = MAP_FAILED
+ };
+
+#if TRACE_ALLOC
+ ++total_bufs;
+ total_size += dh->size;
+ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ return dh;
+}
+
+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
+{
+ struct dmabuf_h * dh;
+ if (old != NULL) {
+ if (old->size >= size) {
+ return old;
+ }
+ dmabuf_free(old);
+ }
+
+ if (size == 0 ||
+ (dh = malloc(sizeof(*dh))) == NULL)
+ return NULL;
+
+ *dh = (struct dmabuf_h){
+ .fd = -1,
+ .mapptr = MAP_FAILED,
+ .fns = dbsc->fns
+ };
+
+ if (dh->fns->buf_alloc(dbsc, dh, size) != 0)
+ goto fail;
+
+#if TRACE_ALLOC
+ ++total_bufs;
+ total_size += dh->size;
+ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ return dh;
+
+fail:
+ free(dh);
+ return NULL;
+}
+
+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
+{
+ struct dma_buf_sync sync = {
+ .flags = flags
+ };
+ if (dh->fd == -1)
+ return 0;
+ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
+ const int err = errno;
+        if (err == EINTR)
+ continue;
+ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
+ return -err;
+ }
+ return 0;
+}
+
+int dmabuf_write_start(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
+}
+
+int dmabuf_write_end(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
+}
+
+int dmabuf_read_start(struct dmabuf_h * const dh)
+{
+ if (!dmabuf_map(dh))
+ return -1;
+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
+}
+
+int dmabuf_read_end(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
+}
+
+
+void * dmabuf_map(struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return NULL;
+ if (dh->mapptr != MAP_FAILED)
+ return dh->mapptr;
+ dh->mapptr = mmap(NULL, dh->size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ dh->fd, 0);
+ if (dh->mapptr == MAP_FAILED) {
+ request_log("%s: Map failed\n", __func__);
+ return NULL;
+ }
+ return dh->mapptr;
+}
+
+int dmabuf_fd(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return -1;
+ return dh->fd;
+}
+
+size_t dmabuf_size(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return 0;
+ return dh->size;
+}
+
+size_t dmabuf_len(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return 0;
+ return dh->len;
+}
+
+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
+{
+ dh->len = len;
+}
+
+void dmabuf_free(struct dmabuf_h * dh)
+{
+ if (!dh)
+ return;
+
+#if TRACE_ALLOC
+ --total_bufs;
+ total_size -= dh->size;
+ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ dh->fns->buf_free(dh);
+
+ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
+ munmap(dh->mapptr, dh->size);
+ if (dh->fd != -1)
+ while (close(dh->fd) == -1 && errno == EINTR)
+ /* loop */;
+ free(dh);
+}
+
+static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
+{
+ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc));
+
+ if (!dbsc)
+ return NULL;
+
+ dbsc->fd = -1;
+ dbsc->fns = fns;
+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
+
+ if (fns->ctl_new(dbsc) != 0)
+ goto fail;
+
+ return dbsc;
+
+fail:
+ free(dbsc);
+ return NULL;
+}
+
+static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
+{
+ request_debug(NULL, "Free dmabuf ctl\n");
+
+ dbsc->fns->ctl_free(dbsc);
+
+ free(dbsc);
+}
+
+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc)
+{
+ struct dmabufs_ctl * const dbsc = *pDbsc;
+
+ if (!dbsc)
+ return;
+ *pDbsc = NULL;
+
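+    // Note: ref_count is zero based (a freshly created ctl has ref_count 0
+    // and one owner) so seeing 0 here means this was the last reference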
+ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0)
+ return;
+
+ dmabufs_ctl_free(dbsc);
+}
+
+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc)
+{
+ atomic_fetch_add(&dbsc->ref_count, 1);
+ return dbsc;
+}
+
+//-----------------------------------------------------------------------------
+//
+// Alloc dmabuf via CMA
+
+static int ctl_cma_new(struct dmabufs_ctl * dbsc)
+{
+ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+
+ if (dbsc->fd == -1) {
+ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+ if (dbsc->fd == -1) {
+ request_log("Unable to open either %s or %s\n",
+ DMABUF_NAME1, DMABUF_NAME2);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void ctl_cma_free(struct dmabufs_ctl * dbsc)
+{
+ if (dbsc->fd != -1)
+ while (close(dbsc->fd) == -1 && errno == EINTR)
+ /* loop */;
+
+}
+
+static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size)
+{
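+    // Round the requested size up to a whole number of pages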
+ struct dma_heap_allocation_data data = {
+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
+ .fd = 0,
+ .fd_flags = O_RDWR,
+ .heap_flags = 0
+ };
+
+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
+ int err = errno;
+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
+ (uint64_t)data.len,
+ dbsc->fd,
+ err,
+ strerror(err));
+ if (err == EINTR)
+ continue;
+ return -err;
+ }
+
+ dh->fd = data.fd;
+ dh->size = (size_t)data.len;
+ return 0;
+}
+
+static void buf_cma_free(struct dmabuf_h * dh)
+{
+ // Nothing needed
+}
+
+static const struct dmabuf_fns dmabuf_cma_fns = {
+ .buf_alloc = buf_cma_alloc,
+ .buf_free = buf_cma_free,
+ .ctl_new = ctl_cma_new,
+ .ctl_free = ctl_cma_free,
+};
+
+struct dmabufs_ctl * dmabufs_ctl_new(void)
+{
+ request_debug(NULL, "Dmabufs using CMA\n");;
+ return dmabufs_ctl_new2(&dmabuf_cma_fns);
+}
+
diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
new file mode 100644
index 0000000000..381ba2708d
--- /dev/null
+++ b/libavcodec/v4l2_req_dmabufs.h
@@ -0,0 +1,44 @@
+#ifndef DMABUFS_H
+#define DMABUFS_H
+
+#include <stddef.h>
+
+struct dmabufs_ctl;
+struct dmabuf_h;
+
+struct dmabufs_ctl * dmabufs_ctl_new(void);
+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
+
+// Need not preserve old contents
+// On NULL return old buffer is freed
+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
+
+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
+ return dmabuf_realloc(dbsc, NULL, size);
+}
+/* Create from existing fd - dups(fd) */
+struct dmabuf_h * dmabuf_import(int fd, size_t size);
+/* Import an mmap - returns NULL if mapptr == MAP_FAILED */
+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size);
+
+void * dmabuf_map(struct dmabuf_h * const dh);
+
+/* flags from linux/dma-buf.h DMA_BUF_SYNC_xxx */
+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
+
+int dmabuf_write_start(struct dmabuf_h * const dh);
+int dmabuf_write_end(struct dmabuf_h * const dh);
+int dmabuf_read_start(struct dmabuf_h * const dh);
+int dmabuf_read_end(struct dmabuf_h * const dh);
+
+int dmabuf_fd(const struct dmabuf_h * const dh);
+/* Allocated size */
+size_t dmabuf_size(const struct dmabuf_h * const dh);
+/* Bytes in use */
+size_t dmabuf_len(const struct dmabuf_h * const dh);
+/* Set bytes in use */
+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
+void dmabuf_free(struct dmabuf_h * dh);
+
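+/* Illustrative usage (a sketch only; error handling omitted):
+ *
+ *   struct dmabufs_ctl *dbsc = dmabufs_ctl_new();
+ *   struct dmabuf_h *dh = dmabuf_alloc(dbsc, bytes);
+ *   uint8_t *p = dmabuf_map(dh);
+ *   dmabuf_write_start(dh);
+ *   memcpy(p, src, bytes);
+ *   dmabuf_len_set(dh, bytes);
+ *   dmabuf_write_end(dh);
+ *   // ... hand dmabuf_fd(dh) to the consumer ...
+ *   dmabuf_free(dh);
+ *   dmabufs_ctl_unref(&dbsc);
+ */
+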
+#endif
diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c
new file mode 100644
index 0000000000..169b532832
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v1.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 1
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c
new file mode 100644
index 0000000000..42af98e156
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v2.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 2
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c
new file mode 100644
index 0000000000..dcc8d95632
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v3.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 3
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c
new file mode 100644
index 0000000000..c35579d8e0
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v4.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 4
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
new file mode 100644
index 0000000000..b98d8464ca
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -0,0 +1,1360 @@
+// File included by v4l2_req_hevc_v* - not compiled on its own
+
+#include "decode.h"
+#include "hevcdec.h"
+#include "hwconfig.h"
+
+#if HEVC_CTRLS_VERSION == 1
+#include "hevc-ctrls-v1.h"
+
+// Fixup renamed entries
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
+
+#elif HEVC_CTRLS_VERSION == 2
+#include "hevc-ctrls-v2.h"
+#elif HEVC_CTRLS_VERSION == 3
+#include "hevc-ctrls-v3.h"
+#elif HEVC_CTRLS_VERSION == 4
+#include <linux/v4l2-controls.h>
+#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
+#include "hevc-ctrls-v4.h"
+#endif
+#else
+#error Unknown HEVC_CTRLS_VERSION
+#endif
+
+#ifndef V4L2_CID_STATELESS_HEVC_SPS
+#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS
+#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
+#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
+
+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
+#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
+#endif
+
+#include "v4l2_request_hevc.h"
+
+#include "libavutil/hwcontext_drm.h"
+
+#include <semaphore.h>
+#include <pthread.h>
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_utils.h"
+
+// Attached to buf[0] in frame
+// Pooled in hwcontext so generally created once - 1/frame
+typedef struct V4L2MediaReqDescriptor {
+ AVDRMFrameDescriptor drm;
+
+ // Media
+ uint64_t timestamp;
+ struct qent_dst * qe_dst;
+
+ // Decode only - should be NULL by the time we emit the frame
+ struct req_decode_ent decode_ent;
+
+ struct media_request *req;
+ struct qent_src *qe_src;
+
+#if HEVC_CTRLS_VERSION >= 2
+ struct v4l2_ctrl_hevc_decode_params dec;
+#endif
+
+ size_t num_slices;
+ size_t alloced_slices;
+ struct v4l2_ctrl_hevc_slice_params * slice_params;
+ struct slice_info * slices;
+
+ size_t num_offsets;
+ size_t alloced_offsets;
+ uint32_t *offsets;
+
+} V4L2MediaReqDescriptor;
+
+struct slice_info {
+ const uint8_t * ptr;
+ size_t len; // bytes
+ size_t n_offsets;
+};
+
+// Handy container for accumulating controls before setting
+struct req_controls {
+ int has_scaling;
+ struct timeval tv;
+ struct v4l2_ctrl_hevc_sps sps;
+ struct v4l2_ctrl_hevc_pps pps;
+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
+};
+
+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
+
+
+// Get an FFmpeg format from the v4l2 format
+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
+{
+ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
+ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
+ case V4L2_PIX_FMT_YUV420:
+ return AV_PIX_FMT_YUV420P;
+ case V4L2_PIX_FMT_NV12:
+ return AV_PIX_FMT_NV12;
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ return AV_PIX_FMT_RPI4_8;
+ case V4L2_PIX_FMT_NV12_10_COL128:
+ return AV_PIX_FMT_RPI4_10;
+#endif
+ default:
+ break;
+ }
+ return AV_PIX_FMT_NONE;
+}
+
+static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
+{
+ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
+ return rd->timestamp;
+}
+
+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
+{
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
+ rd->timestamp = dpb_stamp;
+}
+
+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
+{
+ int32_t luma_weight_denom, chroma_weight_denom;
+ const SliceHeader *sh = &h->sh;
+
+ if (sh->slice_type == HEVC_SLICE_I ||
+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
+ return;
+
+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
+
+ if (h->ps.sps->chroma_format_idc)
+ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
+
+ luma_weight_denom = (1 << sh->luma_log2_weight_denom);
+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
+
+ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
+ table->luma_offset_l0[i] = sh->luma_offset_l0[i];
+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
+ }
+
+ if (sh->slice_type != HEVC_SLICE_B)
+ return;
+
+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
+ table->luma_offset_l1[i] = sh->luma_offset_l1[i];
+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
+ }
+}
+
+#if HEVC_CTRLS_VERSION <= 2
+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
+{
+ const HEVCFrame *frame;
+ int i;
+
+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
+ frame = h->rps[ST_CURR_BEF].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
+ }
+
+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
+ frame = h->rps[ST_CURR_AFT].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
+ }
+
+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
+ frame = h->rps[LT_CURR].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
+ }
+
+ return 0;
+}
+#endif
+
+static unsigned int
+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
+ const struct v4l2_hevc_dpb_entry * const entries,
+ const unsigned int num_entries)
+{
+ uint64_t timestamp;
+
+ if (!frame)
+ return 0;
+
+ timestamp = frame_capture_dpb(frame->frame);
+
+ for (unsigned int i = 0; i < num_entries; i++) {
+ if (entries[i].timestamp == timestamp)
+ return i;
+ }
+
+ return 0;
+}
+
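+// Map a byte index in the unescaped (RBSP) view of a NAL onto a pointer into
+// the raw buffer, stepping over any 00 00 03 emulation prevention bytes met
+// on the way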
+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
+{
+ unsigned int z = 0;
+ while (idx--) {
+ if (*b++ == 0) {
+ ++z;
+ if (z >= 2 && *b == 3) {
+ ++b;
+ z = 0;
+ }
+ }
+ else {
+ z = 0;
+ }
+ }
+ return b;
+}
+
+static int slice_add(V4L2MediaReqDescriptor * const rd)
+{
+ if (rd->num_slices >= rd->alloced_slices) {
+ struct v4l2_ctrl_hevc_slice_params * p2;
+ struct slice_info * s2;
+ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
+
+ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
+ if (p2 == NULL)
+ return AVERROR(ENOMEM);
+ rd->slice_params = p2;
+
+ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
+ if (s2 == NULL)
+ return AVERROR(ENOMEM);
+ rd->slices = s2;
+
+ rd->alloced_slices = n2;
+ }
+ ++rd->num_slices;
+ return 0;
+}
+
+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
+{
+ if (rd->num_offsets + n > rd->alloced_offsets) {
+        size_t n2 = rd->alloced_offsets == 0 ? 128 : rd->alloced_offsets * 2;
+ void * p2;
+ while (rd->num_offsets + n > n2)
+ n2 *= 2;
+ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
+ return AVERROR(ENOMEM);
+ rd->offsets = p2;
+ rd->alloced_offsets = n2;
+ }
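+    // Stored as the minus1 form expected by the V4L2 entry point controls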
+ for (size_t i = 0; i != n; ++i)
+ rd->offsets[rd->num_offsets++] = offsets[i] - 1;
+ return 0;
+}
+
+static unsigned int
+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
+{
+ unsigned int i;
+ unsigned int n = 0;
+ const HEVCFrame * const pic = h->ref;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
+ const HEVCFrame * const frame = &h->DPB[i];
+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
+ struct v4l2_hevc_dpb_entry * const entry = entries + n++;
+
+ entry->timestamp = frame_capture_dpb(frame->frame);
+#if HEVC_CTRLS_VERSION <= 2
+ entry->rps = find_frame_rps_type(h, entry->timestamp);
+#else
+ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
+ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
+#endif
+ entry->field_pic = frame->frame->interlaced_frame;
+
+#if HEVC_CTRLS_VERSION <= 3
+ /* TODO: Interleaved: Get the POC for each field. */
+ entry->pic_order_cnt[0] = frame->poc;
+ entry->pic_order_cnt[1] = frame->poc;
+#else
+ entry->pic_order_cnt_val = frame->poc;
+#endif
+ }
+ }
+ return n;
+}
+
+static void fill_slice_params(const HEVCContext * const h,
+#if HEVC_CTRLS_VERSION >= 2
+ const struct v4l2_ctrl_hevc_decode_params * const dec,
+#endif
+ struct v4l2_ctrl_hevc_slice_params *slice_params,
+ uint32_t bit_size, uint32_t bit_offset)
+{
+ const SliceHeader * const sh = &h->sh;
+#if HEVC_CTRLS_VERSION >= 2
+ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
+ const unsigned int dpb_n = dec->num_active_dpb_entries;
+#else
+ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
+ unsigned int dpb_n;
+#endif
+ unsigned int i;
+ RefPicList *rpl;
+
+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
+ .bit_size = bit_size,
+#if HEVC_CTRLS_VERSION <= 3
+ .data_bit_offset = bit_offset,
+#else
+ .data_byte_offset = bit_offset / 8 + 1,
+#endif
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .slice_segment_addr = sh->slice_segment_addr,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ .nal_unit_type = h->nal_unit_type,
+ .nuh_temporal_id_plus1 = h->temporal_id + 1,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .slice_type = sh->slice_type,
+ .colour_plane_id = sh->colour_plane_id,
+ .slice_pic_order_cnt = h->ref->poc,
+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
+ .slice_qp_delta = sh->slice_qp_delta,
+ .slice_cb_qp_offset = sh->slice_cb_qp_offset,
+ .slice_cr_qp_offset = sh->slice_cr_qp_offset,
+ .slice_act_y_qp_offset = 0,
+ .slice_act_cb_qp_offset = 0,
+ .slice_act_cr_qp_offset = 0,
+ .slice_beta_offset_div2 = sh->beta_offset / 2,
+ .slice_tc_offset_div2 = sh->tc_offset / 2,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ .pic_struct = h->sei.picture_timing.picture_struct,
+
+#if HEVC_CTRLS_VERSION < 2
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
+#endif
+ };
+
+ if (sh->slice_sample_adaptive_offset_flag[0])
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
+
+ if (sh->slice_sample_adaptive_offset_flag[1])
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
+
+ if (sh->slice_temporal_mvp_enabled_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
+
+ if (sh->mvd_l1_zero_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
+
+ if (sh->cabac_init_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
+
+ if (sh->collocated_list == L0)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
+
+ if (sh->disable_deblocking_filter_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
+
+ if (sh->slice_loop_filter_across_slices_enabled_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
+
+ if (sh->dependent_slice_segment_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
+
+#if HEVC_CTRLS_VERSION < 2
+ dpb_n = fill_dpb_entries(h, dpb);
+ slice_params->num_active_dpb_entries = dpb_n;
+#endif
+
+ if (sh->slice_type != HEVC_SLICE_I) {
+ rpl = &h->ref->refPicList[0];
+ for (i = 0; i < rpl->nb_refs; i++)
+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
+ }
+
+ if (sh->slice_type == HEVC_SLICE_B) {
+ rpl = &h->ref->refPicList[1];
+ for (i = 0; i < rpl->nb_refs; i++)
+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
+ }
+
+ fill_pred_table(h, &slice_params->pred_weight_table);
+
+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
+#if HEVC_CTRLS_VERSION <= 3
+ if (slice_params->num_entry_point_offsets > 256) {
+ slice_params->num_entry_point_offsets = 256;
+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
+ }
+
+ for (i = 0; i < slice_params->num_entry_point_offsets; i++)
+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
+#endif
+}
+
+#if HEVC_CTRLS_VERSION >= 2
+static void
+fill_decode_params(const HEVCContext * const h,
+ struct v4l2_ctrl_hevc_decode_params * const dec)
+{
+ unsigned int i;
+
+ *dec = (struct v4l2_ctrl_hevc_decode_params){
+ .pic_order_cnt_val = h->poc,
+ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
+ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
+ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
+ };
+
+ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
+
+    // The documentation does seem to ask that we fit our 32 bit signed POC
+    // into a u8 so... (to be fair, 16 bits would be enough)
+    // Luckily we (Pi) don't use these fields
+ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
+ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
+ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
+ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
+ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
+ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
+
+ if (IS_IRAP(h))
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
+ if (IS_IDR(h))
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
+ if (h->sh.no_output_of_prior_pics_flag)
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
+
+}
+#endif
+
+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
+{
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ *ctrl = (struct v4l2_ctrl_hevc_sps) {
+ .chroma_format_idc = sps->chroma_format_idc,
+ .pic_width_in_luma_samples = sps->width,
+ .pic_height_in_luma_samples = sps->height,
+ .bit_depth_luma_minus8 = sps->bit_depth - 8,
+ .bit_depth_chroma_minus8 = sps->bit_depth - 8,
+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
+ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
+ .num_short_term_ref_pic_sets = sps->nb_st_rps,
+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
+ .chroma_format_idc = sps->chroma_format_idc,
+ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
+ };
+
+ if (sps->separate_colour_plane_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
+
+ if (sps->scaling_list_enable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
+
+ if (sps->amp_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
+
+ if (sps->sao_enabled)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
+
+ if (sps->pcm_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
+
+ if (sps->pcm.loop_filter_disable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
+
+ if (sps->long_term_ref_pics_present_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
+
+ if (sps->sps_temporal_mvp_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
+
+ if (sps->sps_strong_intra_smoothing_enable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
+}
+
+static void fill_scaling_matrix(const ScalingList * const sl,
+ struct v4l2_ctrl_hevc_scaling_matrix * const sm)
+{
+ unsigned int i;
+
+ for (i = 0; i < 6; i++) {
+ unsigned int j;
+
+ for (j = 0; j < 16; j++)
+ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
+ for (j = 0; j < 64; j++) {
+ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j];
+ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
+ if (i < 2)
+ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
+ }
+ sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
+ if (i < 2)
+ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
+ }
+}
+
+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
+{
+ uint64_t flags = 0;
+
+ if (pps->dependent_slice_segments_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
+
+ if (pps->output_flag_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
+
+ if (pps->sign_data_hiding_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
+
+ if (pps->cabac_init_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
+
+ if (pps->constrained_intra_pred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
+
+ if (pps->transform_skip_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
+
+ if (pps->cu_qp_delta_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
+
+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
+
+ if (pps->weighted_pred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
+
+ if (pps->weighted_bipred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
+
+ if (pps->transquant_bypass_enable_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
+
+ if (pps->tiles_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
+
+ if (pps->entropy_coding_sync_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
+
+ if (pps->loop_filter_across_tiles_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
+
+ if (pps->seq_loop_filter_across_slices_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
+
+ if (pps->deblocking_filter_override_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
+
+ if (pps->disable_dbf)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
+
+ if (pps->lists_modification_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
+
+ if (pps->slice_header_extension_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ *ctrl = (struct v4l2_ctrl_hevc_pps) {
+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
+ .init_qp_minus26 = pps->pic_init_qp_minus26,
+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
+ .pps_cb_qp_offset = pps->cb_qp_offset,
+ .pps_cr_qp_offset = pps->cr_qp_offset,
+ .pps_beta_offset_div2 = pps->beta_offset / 2,
+ .pps_tc_offset_div2 = pps->tc_offset / 2,
+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
+ .flags = flags
+ };
+
+ if (pps->tiles_enabled_flag) {
+ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
+ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
+
+ for (int i = 0; i < pps->num_tile_columns; i++)
+ ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
+
+ for (int i = 0; i < pps->num_tile_rows; i++)
+ ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
+ }
+}
+
+// Called before finally returning the frame to the user
+// Set corrupt flag here as this is actually the frame structure that
+// is going to the user (in MT land each thread has its own pool)
+static int frame_post_process(void *logctx, AVFrame *frame)
+{
+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
+
+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
+ frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
+ if (rd->qe_dst) {
+ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst);
+ if (stat != MEDIABUFS_STATUS_SUCCESS) {
+ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
+ frame->flags |= AV_FRAME_FLAG_CORRUPT;
+ }
+ }
+
+ return 0;
+}
+
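+// V4L2 stateless decode matches DPB entries by capture buffer timestamp and
+// the kernel holds that struct timeval as ns (usec * 1000), so dpb stamps
+// are kept in ns and converted to/from struct timeval here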
+static inline struct timeval cvt_dpb_to_tv(uint64_t t)
+{
+ t /= 1000;
+ return (struct timeval){
+ .tv_usec = t % 1000000,
+ .tv_sec = t / 1000000
+ };
+}
+
+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
+{
+ return (uint64_t)t * 1000;
+}
+
+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
+ av_unused const uint8_t *buffer,
+ av_unused uint32_t size)
+{
+ const HEVCContext *h = avctx->priv_data;
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
+ decode_q_add(&ctx->decode_q, &rd->decode_ent);
+
+ rd->num_slices = 0;
+ ctx->timestamp++;
+ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
+
+ {
+ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
+ fdd->post_process = frame_post_process;
+ }
+
+    // qe_dst needs to be bound to the data buffer and only returned when that is freed
+ if (!rd->qe_dst)
+ {
+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
+
+ return 0;
+}
+
+// Object fd & size will be zapped by this & need setting later
+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
+{
+ AVDRMLayerDescriptor *layer = &desc->layers[0];
+ unsigned int width;
+ unsigned int height;
+ unsigned int bpl;
+ uint32_t pixelformat;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ width = format->fmt.pix_mp.width;
+ height = format->fmt.pix_mp.height;
+ pixelformat = format->fmt.pix_mp.pixelformat;
+ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline;
+ }
+ else {
+ width = format->fmt.pix.width;
+ height = format->fmt.pix.height;
+ pixelformat = format->fmt.pix.pixelformat;
+ bpl = format->fmt.pix.bytesperline;
+ }
+
+ switch (pixelformat) {
+ case V4L2_PIX_FMT_NV12:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
+ break;
+ case V4L2_PIX_FMT_NV12_10_COL128:
+ layer->format = DRM_FORMAT_P030;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
+ break;
+#endif
+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
+ case V4L2_PIX_FMT_SUNXI_TILED_NV12:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
+ break;
+#endif
+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
+ case V4L2_PIX_FMT_NV15:
+ layer->format = DRM_FORMAT_NV15;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#endif
+ case V4L2_PIX_FMT_NV16:
+ layer->format = DRM_FORMAT_NV16;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
+ case V4L2_PIX_FMT_NV20:
+ layer->format = DRM_FORMAT_NV20;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ desc->nb_objects = 1;
+ desc->objects[0].fd = -1;
+ desc->objects[0].size = 0;
+
+ desc->nb_layers = 1;
+ layer->nb_planes = 2;
+
+ layer->planes[0].object_index = 0;
+ layer->planes[0].offset = 0;
+ layer->planes[0].pitch = bpl;
+#if CONFIG_SAND
+ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = width;
+ layer->planes[1].pitch = width;
+ }
+ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
+ layer->planes[1].pitch = width * 2;
+ }
+ else
+#endif
+ {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = layer->planes[0].pitch * height;
+ layer->planes[1].pitch = layer->planes[0].pitch;
+ }
+
+ return 0;
+}
+
+static int
+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
+ struct req_controls *const controls,
+#if HEVC_CTRLS_VERSION >= 2
+ struct v4l2_ctrl_hevc_decode_params * const dec,
+#endif
+ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
+ void * const offsets, const size_t offset_count)
+{
+ int rv;
+#if HEVC_CTRLS_VERSION >= 2
+ unsigned int n = 3;
+#else
+ unsigned int n = 2;
+#endif
+
+ struct v4l2_ext_control control[6] = {
+ {
+ .id = V4L2_CID_STATELESS_HEVC_SPS,
+ .ptr = &controls->sps,
+ .size = sizeof(controls->sps),
+ },
+ {
+ .id = V4L2_CID_STATELESS_HEVC_PPS,
+ .ptr = &controls->pps,
+ .size = sizeof(controls->pps),
+ },
+#if HEVC_CTRLS_VERSION >= 2
+ {
+ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
+ .ptr = dec,
+ .size = sizeof(*dec),
+ },
+#endif
+ };
+
+ if (slices)
+ control[n++] = (struct v4l2_ext_control) {
+ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
+ .ptr = slices,
+ .size = sizeof(*slices) * slice_count,
+ };
+
+ if (controls->has_scaling)
+ control[n++] = (struct v4l2_ext_control) {
+ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
+ .ptr = &controls->scaling_matrix,
+ .size = sizeof(controls->scaling_matrix),
+ };
+
+#if HEVC_CTRLS_VERSION >= 4
+ if (offsets)
+ control[n++] = (struct v4l2_ext_control) {
+ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
+ .ptr = offsets,
+ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
+ };
+#endif
+
+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
+
+ return rv;
+}
+
+// This only works because we started out from a single coded frame buffer
+// that will remain intact until after end_frame
+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
+ int bcount = get_bits_count(&h->HEVClc->gb);
+ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
+
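+    // Slices are sent to the driver in batches of at most max_slices.
+    // block_start is the first slice of the current batch; any later slice
+    // in the batch gets merged into that slice's data range below.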
+ const unsigned int n = rd->num_slices;
+ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
+
+ int rv;
+ struct slice_info * si;
+
+ // This looks dodgy but we know that FFmpeg has parsed this from a buffer
+ // that contains the entire frame including the start code
+ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
+ buffer -= 3;
+ size += 3;
+ boff += 24;
+ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
+ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
+ buffer[0], buffer[1], buffer[2]);
+ }
+ }
+
+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
+ if (rd->slices == NULL) {
+ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
+ return AVERROR(ENOMEM);
+ rd->slices->ptr = buffer;
+ rd->num_slices = 1;
+ }
+ rd->slices->len = buffer - rd->slices->ptr + size;
+ return 0;
+ }
+
+ if ((rv = slice_add(rd)) != 0)
+ return rv;
+
+ si = rd->slices + n;
+ si->ptr = buffer;
+ si->len = size;
+ si->n_offsets = rd->num_offsets;
+
+ if (n != block_start) {
+ struct slice_info *const si0 = rd->slices + block_start;
+ const size_t offset = (buffer - si0->ptr);
+ boff += offset * 8;
+ size += offset;
+ si0->len = si->len + offset;
+ }
+
+#if HEVC_CTRLS_VERSION >= 2
+ if (n == 0)
+ fill_decode_params(h, &rd->dec);
+ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
+#else
+ fill_slice_params(h, rd->slice_params + n, size * 8, boff);
+#endif
+ if (ctx->max_offsets != 0 &&
+ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
+ return rv;
+
+ return 0;
+}
+
+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ if (h->ref != NULL) {
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ media_request_abort(&rd->req);
+ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
+
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ }
+}
+
+static int send_slice(AVCodecContext * const avctx,
+ V4L2MediaReqDescriptor * const rd,
+ struct req_controls *const controls,
+ const unsigned int i, const unsigned int j)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ const int is_last = (j == rd->num_slices);
+ struct slice_info *const si = rd->slices + i;
+ struct media_request * req = NULL;
+ struct qent_src * src = NULL;
+ MediaBufsStatus stat;
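+    // Entry point offsets belonging to slices [i, j) - each slice_info
+    // records the index of its first offset in n_offsets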
+ void * offsets = rd->offsets + rd->slices[i].n_offsets;
+ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
+
+ if ((req = media_request_get(ctx->mpool)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+
+ if (set_req_ctls(ctx, req,
+ controls,
+#if HEVC_CTRLS_VERSION >= 2
+ &rd->dec,
+#endif
+ rd->slice_params + i, j - i,
+ offsets, n_offsets)) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
+ goto fail1;
+ }
+
+ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
+ goto fail1;
+ }
+
+ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
+ goto fail2;
+ }
+
+ if (qent_src_params_set(src, &controls->tv)) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
+ goto fail2;
+ }
+
+ stat = mediabufs_start_request(ctx->mbufs, &req, &src,
+ i == 0 ? rd->qe_dst : NULL,
+ is_last);
+
+ if (stat != MEDIABUFS_STATUS_SUCCESS) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
+ return AVERROR_UNKNOWN;
+ }
+ return 0;
+
+fail2:
+ mediabufs_src_qent_abort(ctx->mbufs, &src);
+fail1:
+ media_request_abort(&req);
+ return AVERROR_UNKNOWN;
+}
+
+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ struct req_controls rc;
+ unsigned int i;
+ int rv;
+
+ // It is possible, though maybe a bug, to get an end_frame without
+ // a previous start_frame. If we do then give up.
+ if (!decode_q_in_q(&rd->decode_ent)) {
+ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
+ return AVERROR_INVALIDDATA;
+ }
+
+ {
+ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
+ &h->ps.pps->scaling_list :
+ h->ps.sps->scaling_list_enable_flag ?
+ &h->ps.sps->scaling_list : NULL;
+
+ memset(&rc, 0, sizeof(rc));
+ rc.tv = cvt_dpb_to_tv(rd->timestamp);
+ fill_sps(&rc.sps, h->ps.sps);
+ fill_pps(&rc.pps, h->ps.pps);
+ if (sl) {
+ rc.has_scaling = 1;
+ fill_scaling_matrix(sl, &rc.scaling_matrix);
+ }
+ }
+
+ decode_q_wait(&ctx->decode_q, &rd->decode_ent);
+
+    // qe_dst needs to be bound to the data buffer and only returned when that is freed
+ // Alloc almost certainly wants to be serialised if there is any chance of blocking
+ // so we get the next frame to be free in the thread that needs it for decode first.
+ //
+ // In our current world this probably isn't a concern but put it here anyway
+ if (!rd->qe_dst)
+ {
+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
+ rv = AVERROR(ENOMEM);
+ goto fail;
+ }
+ }
+
+ // Send as slices
+ for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
+ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
+ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
+ goto fail;
+ }
+
+    // Set the drm_prime descriptor
+ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
+ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
+ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
+
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ return 0;
+
+fail:
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ return rv;
+}
+
+static inline int
+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
+{
+ return v >= c->minimum && v <= c->maximum;
+}
+
+// Initial check & init
+static int
+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
+{
+ const HEVCContext *h = avctx->priv_data;
+ const HEVCSPS * const sps = h->ps.sps;
+ struct v4l2_ctrl_hevc_sps ctrl_sps;
+ unsigned int i;
+
+ // Check for var slice array
+ struct v4l2_query_ext_ctrl qc[] = {
+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_SPS },
+ { .id = V4L2_CID_STATELESS_HEVC_PPS },
+ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
+#if HEVC_CTRLS_VERSION >= 2
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
+#endif
+ };
+ // Order & size must match!
+ static const size_t ctrl_sizes[] = {
+ sizeof(struct v4l2_ctrl_hevc_slice_params),
+ sizeof(int32_t),
+ sizeof(struct v4l2_ctrl_hevc_sps),
+ sizeof(struct v4l2_ctrl_hevc_pps),
+ sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
+#if HEVC_CTRLS_VERSION >= 2
+ sizeof(struct v4l2_ctrl_hevc_decode_params),
+#endif
+ };
+ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
+
+#if HEVC_CTRLS_VERSION == 2
+ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
+ return AVERROR(EINVAL);
+#elif HEVC_CTRLS_VERSION == 3
+ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
+ return AVERROR(EINVAL);
+#endif
+
+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
+ i = 0;
+#if HEVC_CTRLS_VERSION >= 4
+ // Skip slice check if no slice mode
+ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
+ i = 1;
+#else
+ // Fail frame mode silently for anything prior to V4
+ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
+ return AVERROR(EINVAL);
+#endif
+ for (; i != noof_ctrls; ++i) {
+ if (qc[i].type == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
+ return AVERROR(EINVAL);
+ }
+ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
+ return AVERROR(EINVAL);
+ }
+ }
+
+ fill_sps(&ctrl_sps, sps);
+
+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+// Final init
+static int
+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
+{
+ int ret;
+
+ struct v4l2_query_ext_ctrl querys[] = {
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
+#if HEVC_CTRLS_VERSION >= 4
+ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
+#endif
+ };
+
+ struct v4l2_ext_control ctrls[] = {
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
+ };
+
+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
+
+ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
+ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
+ 1 : querys[2].dims[0];
+ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
+
+#if HEVC_CTRLS_VERSION >= 4
+ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
+ 0 : querys[3].dims[0];
+ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
+#else
+ ctx->max_offsets = 0;
+#endif
+
+ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
+ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
+ ctx->decode_mode = querys[0].default_value;
+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
+ else {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
+ return AVERROR(EINVAL);
+ }
+
+ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
+ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
+ ctx->start_code = querys[1].default_value;
+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
+ else {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
+ return AVERROR(EINVAL);
+ }
+
+ // If we are in slice mode & START_CODE_NONE supported then pick that
+ // as it doesn't require the slightly dodgy look backwards in our raw buffer
+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
+ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
+
+ ctrls[0].value = ctx->decode_mode;
+ ctrls[1].value = ctx->start_code;
+
+ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
+ return !ret ? 0 : AVERROR(-ret);
+}
+
+static void v4l2_req_frame_free(void *opaque, uint8_t *data)
+{
+ AVCodecContext *avctx = opaque;
+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
+
+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
+
+ qent_dst_unref(&rd->qe_dst);
+
+ // We don't expect req or qe_src to be set
+ if (rd->req || rd->qe_src)
+ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src);
+
+ av_freep(&rd->slices);
+ av_freep(&rd->slice_params);
+ av_freep(&rd->offsets);
+
+ av_free(rd);
+}
+
+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
+{
+ AVCodecContext *avctx = opaque;
+// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+// V4L2MediaReqDescriptor *req;
+ AVBufferRef *ref;
+ uint8_t *data;
+// int ret;
+
+ data = av_mallocz(size);
+ if (!data)
+ return NULL;
+
+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
+ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
+ if (!ref) {
+ av_freep(&data);
+ return NULL;
+ }
+ return ref;
+}
+
+#if 0
+static void v4l2_req_pool_free(void *opaque)
+{
+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
+}
+
+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
+{
+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
+
+ av_buffer_pool_uninit(&hwfc->pool);
+}
+#endif
+
+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
+{
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
+ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
+
+ hwfc->format = AV_PIX_FMT_DRM_PRIME;
+ hwfc->sw_format = pixel_format_from_format(vfmt);
+ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
+ hwfc->width = vfmt->fmt.pix_mp.width;
+ hwfc->height = vfmt->fmt.pix_mp.height;
+ } else {
+ hwfc->width = vfmt->fmt.pix.width;
+ hwfc->height = vfmt->fmt.pix.height;
+ }
+#if 0
+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
+ if (!hwfc->pool)
+ return AVERROR(ENOMEM);
+
+ hwfc->free = v4l2_req_hwframe_ctx_free;
+
+ hwfc->initial_pool_size = 1;
+
+ switch (avctx->codec_id) {
+ case AV_CODEC_ID_VP9:
+ hwfc->initial_pool_size += 8;
+ break;
+ case AV_CODEC_ID_VP8:
+ hwfc->initial_pool_size += 3;
+ break;
+ default:
+ hwfc->initial_pool_size += 2;
+ }
+#endif
+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
+
+ return 0;
+}
+
+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
+{
+ int rv;
+
+ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
+ if (!frame->buf[0])
+ return AVERROR(ENOMEM);
+
+ frame->data[0] = frame->buf[0]->data;
+
+ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
+
+ if ((rv = ff_attach_decode_data(frame)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
+ av_frame_unref(frame);
+ return rv;
+ }
+
+ return 0;
+}
+
+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
+ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
+ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
+ .probe = probe,
+ .set_controls = set_controls,
+
+ .start_frame = v4l2_request_hevc_start_frame,
+ .decode_slice = v4l2_request_hevc_decode_slice,
+ .end_frame = v4l2_request_hevc_end_frame,
+ .abort_frame = v4l2_request_hevc_abort_frame,
+ .frame_params = frame_params,
+ .alloc_frame = alloc_frame,
+};
+
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
new file mode 100644
index 0000000000..1a9944774a
--- /dev/null
+++ b/libavcodec/v4l2_req_media.c
@@ -0,0 +1,1802 @@
+/*
+ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/media.h>
+#include <linux/mman.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/videodev2.h>
+
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_utils.h"
+#include "weak_link.h"
+
+
+/* floor(log2(x)) */
+static unsigned int log2_size(size_t x)
+{
+ unsigned int n = 0;
+
+ if (x & ~0xffff) {
+ n += 16;
+ x >>= 16;
+ }
+ if (x & ~0xff) {
+ n += 8;
+ x >>= 8;
+ }
+ if (x & ~0xf) {
+ n += 4;
+ x >>= 4;
+ }
+ if (x & ~3) {
+ n += 2;
+ x >>= 2;
+ }
+ return (x & ~1) ? n + 1 : n;
+}
+
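+/* Round up to the next size of the form 3 << n or 4 << n so that
+ * successive buffer sizes grow geometrically in ~1.33x steps */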
+static size_t round_up_size(const size_t x)
+{
+ /* Admit no size < 256 */
+ const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
+
+ return x >= (3 << n) ? 4 << n : (3 << n);
+}
+
+struct media_request;
+
+struct media_pool {
+ int fd;
+ sem_t sem;
+ pthread_mutex_t lock;
+ struct media_request * free_reqs;
+ struct pollqueue * pq;
+};
+
+struct media_request {
+ struct media_request * next;
+ struct media_pool * mp;
+ int fd;
+ struct polltask * pt;
+};
+
+static inline enum v4l2_memory
+mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
+{
+ return (enum v4l2_memory)m;
+}
+
+const char *
+mediabufs_memory_name(const enum mediabufs_memory m)
+{
+ switch (m) {
+ case MEDIABUFS_MEMORY_UNSET:
+ return "Unset";
+ case MEDIABUFS_MEMORY_MMAP:
+ return "MMap";
+ case MEDIABUFS_MEMORY_USERPTR:
+ return "UserPtr";
+ case MEDIABUFS_MEMORY_OVERLAY:
+ return "Overlay";
+ case MEDIABUFS_MEMORY_DMABUF:
+ return "DMABuf";
+ default:
+ break;
+ }
+ return "Unknown";
+}
+
+
+static inline int do_trywait(sem_t *const sem)
+{
+ while (sem_trywait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static inline int do_wait(sem_t *const sem)
+{
+ while (sem_wait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static int request_buffers(int video_fd, unsigned int type,
+ enum mediabufs_memory memory, unsigned int buffers_count)
+{
+ struct v4l2_requestbuffers buffers;
+ int rc;
+
+ memset(&buffers, 0, sizeof(buffers));
+ buffers.type = type;
+ buffers.memory = mediabufs_memory_to_v4l2(memory);
+ buffers.count = buffers_count;
+
+ rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
+ if (rc < 0) {
+ rc = -errno;
+ request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+
+static int set_stream(int video_fd, unsigned int type, bool enable)
+{
+ enum v4l2_buf_type buf_type = type;
+ int rc;
+
+ rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
+ &buf_type);
+ if (rc < 0) {
+ rc = -errno;
+ request_log("Unable to %sable stream: %s\n",
+ enable ? "en" : "dis", strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+
+
+struct media_request * media_request_get(struct media_pool * const mp)
+{
+ struct media_request *req = NULL;
+
+ /* Timeout handled by poll code */
+ if (do_wait(&mp->sem))
+ return NULL;
+
+ pthread_mutex_lock(&mp->lock);
+ req = mp->free_reqs;
+ if (req) {
+ mp->free_reqs = req->next;
+ req->next = NULL;
+ }
+ pthread_mutex_unlock(&mp->lock);
+ return req;
+}
+
+int media_request_fd(const struct media_request * const req)
+{
+ return req->fd;
+}
+
+int media_request_start(struct media_request * const req)
+{
+ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1)
+ {
+ const int err = errno;
+ if (err == EINTR)
+ continue;
+ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
+ return -err;
+ }
+
+ pollqueue_add_task(req->pt, 2000);
+ return 0;
+}
+
+static void media_request_done(void *v, short revents)
+{
+ struct media_request *const req = v;
+ struct media_pool *const mp = req->mp;
+
+ /* ** Not sure what to do about timeout */
+
+ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
+ request_log("Unable to reinit media request: %s\n",
+ strerror(errno));
+
+ pthread_mutex_lock(&mp->lock);
+ req->next = mp->free_reqs;
+ mp->free_reqs = req;
+ pthread_mutex_unlock(&mp->lock);
+ sem_post(&mp->sem);
+}
+
+int media_request_abort(struct media_request ** const preq)
+{
+ struct media_request * const req = *preq;
+
+ if (req == NULL)
+ return 0;
+ *preq = NULL;
+
+ media_request_done(req, 0);
+ return 0;
+}
+
+static void delete_req_chain(struct media_request * const chain)
+{
+ struct media_request * next = chain;
+ while (next) {
+ struct media_request * const req = next;
+ next = req->next;
+ if (req->pt)
+ polltask_delete(&req->pt);
+ if (req->fd != -1)
+ close(req->fd);
+ free(req);
+ }
+}
+
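+/* Open the media device and pre-allocate n requests. Each request gets a
+ * polltask watching its fd for POLLPRI so that, once complete, it is
+ * reinitialised and returned to the free list by media_request_done(). */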
+struct media_pool * media_pool_new(const char * const media_path,
+ struct pollqueue * const pq,
+ const unsigned int n)
+{
+ struct media_pool * const mp = calloc(1, sizeof(*mp));
+ unsigned int i;
+
+ if (!mp)
+ goto fail0;
+
+ mp->pq = pq;
+ pthread_mutex_init(&mp->lock, NULL);
+ mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
+ if (mp->fd == -1) {
+ request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
+ goto fail1;
+ }
+
+ for (i = 0; i != n; ++i) {
+ struct media_request * req = malloc(sizeof(*req));
+ if (!req)
+ goto fail4;
+
+ *req = (struct media_request){
+ .next = mp->free_reqs,
+ .mp = mp,
+ .fd = -1
+ };
+ mp->free_reqs = req;
+
+ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
+ request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
+ goto fail4;
+ }
+
+ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
+ if (!req->pt)
+ goto fail4;
+ }
+
+ sem_init(&mp->sem, 0, n);
+
+ return mp;
+
+fail4:
+ delete_req_chain(mp->free_reqs);
+ close(mp->fd);
+ pthread_mutex_destroy(&mp->lock);
+fail1:
+ free(mp);
+fail0:
+ return NULL;
+}
+
+void media_pool_delete(struct media_pool ** pMp)
+{
+ struct media_pool * const mp = *pMp;
+
+ if (!mp)
+ return;
+ *pMp = NULL;
+
+ delete_req_chain(mp->free_reqs);
+ close(mp->fd);
+ sem_destroy(&mp->sem);
+ pthread_mutex_destroy(&mp->lock);
+ free(mp);
+}
+
+
+#define INDEX_UNSET (~(uint32_t)0)
+
+enum qent_status {
+ QENT_NEW = 0, // Initial state - shouldn't last
+ QENT_FREE, // On free chain
+ QENT_PENDING, // User has ent
+ QENT_WAITING, // On inuse
+ QENT_DONE, // Frame rx
+ QENT_ERROR, // Error
+ QENT_IMPORT
+};
+
+struct qent_base {
+ atomic_int ref_count;
+ struct qent_base *next;
+ struct qent_base *prev;
+ enum qent_status status;
+ enum mediabufs_memory memtype;
+ uint32_t index;
+ struct dmabuf_h *dh[VIDEO_MAX_PLANES];
+ struct timeval timestamp;
+};
+
+struct qent_src {
+ struct qent_base base;
+ int fixed_size;
+};
+
+struct qent_dst {
+ struct qent_base base;
+ bool waiting;
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ struct ff_weak_link_client * mbc_wl;
+};
+
+struct qe_list_head {
+ struct qent_base *head;
+ struct qent_base *tail;
+};
+
+struct buf_pool {
+ enum mediabufs_memory memtype;
+ pthread_mutex_t lock;
+ sem_t free_sem;
+ struct qe_list_head free;
+ struct qe_list_head inuse;
+};
+
+
+static inline struct qent_dst *base_to_dst(struct qent_base *be)
+{
+ return (struct qent_dst *)be;
+}
+
+static inline struct qent_src *base_to_src(struct qent_base *be)
+{
+ return (struct qent_src *)be;
+}
+
+
+#define QENT_BASE_INITIALIZER(mtype) {\
+ .ref_count = ATOMIC_VAR_INIT(0),\
+ .status = QENT_NEW,\
+ .memtype = (mtype),\
+ .index = INDEX_UNSET\
+}
+
+static void qe_base_uninit(struct qent_base *const be)
+{
+ unsigned int i;
+ for (i = 0; i != VIDEO_MAX_PLANES; ++i) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+}
+
+static void qe_src_free(struct qent_src *const be_src)
+{
+ if (!be_src)
+ return;
+ qe_base_uninit(&be_src->base);
+ free(be_src);
+}
+
+static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
+{
+ struct qent_src *const be_src = malloc(sizeof(*be_src));
+ if (!be_src)
+ return NULL;
+ *be_src = (struct qent_src){
+ .base = QENT_BASE_INITIALIZER(mtype)
+ };
+ return be_src;
+}
+
+static void qe_dst_free(struct qent_dst *const be_dst)
+{
+ if (!be_dst)
+ return;
+
+ ff_weak_link_unref(&be_dst->mbc_wl);
+ pthread_cond_destroy(&be_dst->cond);
+ pthread_mutex_destroy(&be_dst->lock);
+ qe_base_uninit(&be_dst->base);
+ free(be_dst);
+}
+
+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype)
+{
+ struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
+ if (!be_dst)
+ return NULL;
+ *be_dst = (struct qent_dst){
+ .base = QENT_BASE_INITIALIZER(memtype),
+ .lock = PTHREAD_MUTEX_INITIALIZER,
+ .cond = PTHREAD_COND_INITIALIZER,
+ .mbc_wl = ff_weak_link_ref(wl)
+ };
+ return be_dst;
+}
+
+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
+{
+ if (ql->tail)
+ ql->tail->next = be;
+ else
+ ql->head = be;
+ be->prev = ql->tail;
+ be->next = NULL;
+ ql->tail = be;
+}
+
+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
+{
+ if (!be)
+ return NULL;
+
+ if (be->next)
+ be->next->prev = be->prev;
+ else
+ ql->tail = be->prev;
+ if (be->prev)
+ be->prev->next = be->next;
+ else
+ ql->head = be->next;
+ be->next = NULL;
+ be->prev = NULL;
+ return be;
+}
+
+
+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
+{
+ ql_add_tail(&bp->free, be);
+}
+
+static struct qent_base * bq_get_free(struct buf_pool *const bp)
+{
+ return ql_extract(&bp->free, bp->free.head);
+}
+
+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
+{
+ return ql_extract(&bp->inuse, be);
+}
+
+static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
+{
+ return ql_extract(&bp->inuse, bp->inuse.head);
+}
+
+static void bq_free_all_free_src(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_free(bp)) != NULL)
+ qe_src_free(base_to_src(be));
+}
+
+static void bq_free_all_inuse_src(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_inuse(bp)) != NULL)
+ qe_src_free(base_to_src(be));
+}
+
+static void bq_free_all_free_dst(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_free(bp)) != NULL)
+ qe_dst_free(base_to_dst(be));
+}
+
+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
+{
+ unsigned int i;
+
+ pthread_mutex_lock(&bp->lock);
+ /* Clear out state vars */
+ be->timestamp.tv_sec = 0;
+ be->timestamp.tv_usec = 0;
+ be->status = QENT_FREE;
+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)
+ dmabuf_len_set(be->dh[i], 0);
+ bq_put_free(bp, be);
+ pthread_mutex_unlock(&bp->lock);
+ sem_post(&bp->free_sem);
+}
+
+static bool queue_is_inuse(const struct buf_pool *const bp)
+{
+ return bp->inuse.tail != NULL;
+}
+
+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
+{
+ if (!be)
+ return;
+ pthread_mutex_lock(&bp->lock);
+ ql_add_tail(&bp->inuse, be);
+ be->status = QENT_WAITING;
+ pthread_mutex_unlock(&bp->lock);
+}
+
+static struct qent_base *queue_get_free(struct buf_pool *const bp)
+{
+ struct qent_base *buf;
+
+ if (do_wait(&bp->free_sem))
+ return NULL;
+ pthread_mutex_lock(&bp->lock);
+ buf = bq_get_free(bp);
+ pthread_mutex_unlock(&bp->lock);
+ return buf;
+}
+
+static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
+{
+ struct qent_base *buf;
+
+ if (do_trywait(&bp->free_sem))
+ return NULL;
+ pthread_mutex_lock(&bp->lock);
+ buf = bq_get_free(bp);
+ pthread_mutex_unlock(&bp->lock);
+ return buf;
+}
+
+static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
+{
+ struct qent_base *be;
+
+ pthread_mutex_lock(&bp->lock);
+ /* Expect 1st in Q, but allow anywhere */
+ for (be = bp->inuse.head; be; be = be->next) {
+ if (be->index == index) {
+ bq_extract_inuse(bp, be);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&bp->lock);
+
+ return be;
+}
+
+static void queue_delete(struct buf_pool *const bp)
+{
+ sem_destroy(&bp->free_sem);
+ pthread_mutex_destroy(&bp->lock);
+ free(bp);
+}
+
+static struct buf_pool* queue_new(const int vfd)
+{
+ struct buf_pool *bp = calloc(1, sizeof(*bp));
+ if (!bp)
+ return NULL;
+ pthread_mutex_init(&bp->lock, NULL);
+ sem_init(&bp->free_sem, 0, 0);
+ return bp;
+}
+
+
+struct mediabufs_ctl {
+ atomic_int ref_count; /* 0 is single ref for easier atomics */
+ void * dc;
+ int vfd;
+ bool stream_on;
+ bool polling;
+ bool dst_fixed; // Dst Q is fixed size
+ pthread_mutex_t lock;
+ struct buf_pool * src;
+ struct buf_pool * dst;
+ struct polltask * pt;
+ struct pollqueue * pq;
+ struct ff_weak_link_master * this_wlm;
+
+ enum mediabufs_memory src_memtype;
+ enum mediabufs_memory dst_memtype;
+ struct v4l2_format src_fmt;
+ struct v4l2_format dst_fmt;
+ struct v4l2_capability capability;
+};
+
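+/* Queue a buffer to V4L2. For src (OUTPUT) buffers the media request fd is
+ * attached so the request's controls and this bitstream buffer are processed
+ * together; hold_flag sets V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF to keep the
+ * capture buffer held across the slices of a multi-slice frame. */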
+static int qe_v4l2_queue(struct qent_base *const be,
+ const int vfd, struct media_request *const mreq,
+ const struct v4l2_format *const fmt,
+ const bool is_dst, const bool hold_flag)
+{
+ struct v4l2_buffer buffer = {
+ .type = fmt->type,
+ .memory = mediabufs_memory_to_v4l2(be->memtype),
+ .index = be->index
+ };
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ unsigned int i;
+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (is_dst)
+ dmabuf_len_set(be->dh[i], 0);
+
+ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
+ planes[i].length = dmabuf_size(be->dh[i]);
+ planes[i].bytesused = dmabuf_len(be->dh[i]);
+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
+ planes[i].m.fd = dmabuf_fd(be->dh[i]);
+ else
+ planes[i].m.mem_offset = 0;
+ }
+ buffer.m.planes = planes;
+ buffer.length = i;
+ }
+ else {
+ if (is_dst)
+ dmabuf_len_set(be->dh[0], 0);
+
+ buffer.bytesused = dmabuf_len(be->dh[0]);
+ buffer.length = dmabuf_size(be->dh[0]);
+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
+ buffer.m.fd = dmabuf_fd(be->dh[0]);
+ else
+ buffer.m.offset = 0;
+ }
+
+ if (!is_dst && mreq) {
+ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
+ buffer.request_fd = media_request_fd(mreq);
+ if (hold_flag)
+ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
+ }
+
+ if (is_dst)
+ be->timestamp = (struct timeval){0,0};
+
+ buffer.timestamp = be->timestamp;
+
+ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
+ const int err = errno;
+ if (err != EINTR) {
+ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
+ return -err;
+ }
+ }
+ return 0;
+}
+
+static struct qent_base * qe_dequeue(struct buf_pool *const bp,
+ const int vfd,
+ const struct v4l2_format * const f)
+{
+ struct qent_base *be;
+ int rc;
+ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
+ struct v4l2_buffer buffer = {
+ .type = f->type,
+ .memory = mediabufs_memory_to_v4l2(bp->memtype)
+ };
+ if (mp) {
+ buffer.length = f->fmt.pix_mp.num_planes;
+ buffer.m.planes = planes;
+ }
+
+ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
+ errno == EINTR)
+ /* Loop */;
+ if (rc) {
+ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
+ return NULL;
+ }
+
+ be = queue_find_extract_index(bp, buffer.index);
+ if (!be) {
+ request_log("Failed to find index %d in Q\n", buffer.index);
+ return NULL;
+ }
+
+ if (mp) {
+ unsigned int i;
+ for (i = 0; i != buffer.length; ++i)
+ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
+ }
+ else
+        dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.bytesused : 0);
+
+ be->timestamp = buffer.timestamp;
+ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
+ return be;
+}
+
+static void qe_dst_done(struct qent_dst * dst_be)
+{
+ pthread_mutex_lock(&dst_be->lock);
+ dst_be->waiting = false;
+ pthread_cond_broadcast(&dst_be->cond);
+ pthread_mutex_unlock(&dst_be->lock);
+
+ qent_dst_unref(&dst_be);
+}
+
+static bool qe_dst_waiting(struct qent_dst *const dst_be)
+{
+ bool waiting;
+ pthread_mutex_lock(&dst_be->lock);
+ waiting = dst_be->waiting;
+ dst_be->waiting = true;
+ pthread_mutex_unlock(&dst_be->lock);
+ return waiting;
+}
+
+
+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
+{
+ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
+}
+
+static void mediabufs_poll_cb(void * v, short revents)
+{
+ struct mediabufs_ctl *mbc = v;
+ struct qent_src *src_be = NULL;
+ struct qent_dst *dst_be = NULL;
+
+ if (!revents)
+ request_err(mbc->dc, "%s: Timeout\n", __func__);
+
+ pthread_mutex_lock(&mbc->lock);
+ mbc->polling = false;
+
+ if ((revents & POLLOUT) != 0)
+ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
+ if ((revents & POLLIN) != 0)
+ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
+
+ /* Reschedule */
+ if (mediabufs_wants_poll(mbc)) {
+ mbc->polling = true;
+ pollqueue_add_task(mbc->pt, 2000);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+
+ if (src_be)
+ queue_put_free(mbc->src, &src_be->base);
+ if (dst_be)
+ qe_dst_done(dst_be);
+}
+
+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
+{
+ struct qent_base *const be = &be_src->base;
+
+ be->timestamp = *timestamp;
+ return 0;
+}
+
+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
+{
+ return be_dst->base.timestamp;
+}
+
+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
+ size_t newsize = round_up_size(len);
+ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
+ if (!dbsc) {
+ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
+ return -ENOMEM;
+ }
+ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
+ request_log("%s: Realloc %zd failed\n", __func__, newsize);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ struct qent_base *const be = &be_src->base;
+ return qent_base_realloc(be, len, dbsc);
+}
+
+
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ void * dst;
+ struct qent_base *const be = &be_src->base;
+ int rv;
+
+ // Realloc doesn't copy so don't alloc if offset != 0
+ if ((rv = qent_base_realloc(be, offset + len,
+ be_src->fixed_size || offset ? NULL : dbsc)) != 0)
+ return rv;
+
+ dmabuf_write_start(be->dh[0]);
+ dst = dmabuf_map(be->dh[0]);
+ if (!dst)
+ return -1;
+ memcpy((char*)dst + offset, src, len);
+ dmabuf_len_set(be->dh[0], len);
+ dmabuf_write_end(be->dh[0]);
+ return 0;
+}
+
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
+{
+ const struct qent_base *const be = &be_dst->base;
+
+ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
+}
+
+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
+{
+ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
+}
+
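+/* Queue one src buffer (and optionally a dst buffer) then start the request.
+ * mreq and *psrc_be are consumed whether or not this succeeds; on failure a
+ * supplied dst buffer is marked QENT_ERROR so that waiters are released. */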
+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
+ struct media_request **const pmreq,
+ struct qent_src **const psrc_be,
+ struct qent_dst *const dst_be,
+ const bool is_final)
+{
+ struct media_request * mreq = *pmreq;
+ struct qent_src *const src_be = *psrc_be;
+
+ // Req & src are always both "consumed"
+ *pmreq = NULL;
+ *psrc_be = NULL;
+
+ pthread_mutex_lock(&mbc->lock);
+
+ if (!src_be)
+ goto fail1;
+
+ if (dst_be) {
+ if (qe_dst_waiting(dst_be)) {
+ request_info(mbc->dc, "Request buffer already waiting on start\n");
+ goto fail1;
+ }
+ dst_be->base.timestamp = (struct timeval){0,0};
+ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
+ goto fail1;
+
+ qent_dst_ref(dst_be);
+ queue_put_inuse(mbc->dst, &dst_be->base);
+ }
+
+ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
+ goto fail1;
+ queue_put_inuse(mbc->src, &src_be->base);
+
+ if (!mbc->polling && mediabufs_wants_poll(mbc)) {
+ mbc->polling = true;
+ pollqueue_add_task(mbc->pt, 2000);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+
+ if (media_request_start(mreq))
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail1:
+ media_request_abort(&mreq);
+ if (src_be)
+ queue_put_free(mbc->src, &src_be->base);
+
+// *** TODO: If src Q fails this doesn't unwind properly - separate dst Q from src Q
+ if (dst_be) {
+ dst_be->base.status = QENT_ERROR;
+ qe_dst_done(dst_be);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+
+static int qe_alloc_from_fmt(struct qent_base *const be,
+ struct dmabufs_ctl *const dbsc,
+ const struct v4l2_format *const fmt)
+{
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ unsigned int i;
+ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
+ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
+ fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
+ /* On failure tidy up and die */
+ if (!be->dh[i]) {
+ while (i--) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+ return -1;
+ }
+ }
+ }
+ else {
+// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
+ size_t size = fmt->fmt.pix.sizeimage;
+ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
+ if (!be->dh[0])
+ return -1;
+ }
+ return 0;
+}
+
+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
+ const enum v4l2_buf_type buftype,
+ uint32_t pixfmt,
+ const unsigned int width, const unsigned int height,
+ const size_t bufsize)
+{
+ *fmt = (struct v4l2_format){.type = buftype};
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
+ fmt->fmt.pix_mp.width = width;
+ fmt->fmt.pix_mp.height = height;
+ fmt->fmt.pix_mp.pixelformat = pixfmt;
+ if (bufsize) {
+ fmt->fmt.pix_mp.num_planes = 1;
+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
+ }
+ }
+ else {
+ fmt->fmt.pix.width = width;
+ fmt->fmt.pix.height = height;
+ fmt->fmt.pix.pixelformat = pixfmt;
+ fmt->fmt.pix.sizeimage = bufsize;
+ }
+
+ while (ioctl(fd, VIDIOC_S_FMT, fmt))
+ if (errno != EINTR)
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ // Treat anything where we don't get at least what we asked for as a fail
+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
+ if (fmt->fmt.pix_mp.width < width ||
+ fmt->fmt.pix_mp.height < height ||
+ fmt->fmt.pix_mp.pixelformat != pixfmt) {
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ }
+ else {
+ if (fmt->fmt.pix.width < width ||
+ fmt->fmt.pix.height < height ||
+ fmt->fmt.pix.pixelformat != pixfmt) {
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ }
+
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
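+/* Enumerate the driver's formats, filtering on flags, and take the first one
+ * that the caller's accept_fn approves of and that S_FMT actually accepts. */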
+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
+ const int fd,
+ const unsigned int type_v4l2,
+ const uint32_t flags_must,
+ const uint32_t flags_not,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v)
+{
+ unsigned int i;
+
+ for (i = 0;; ++i) {
+ struct v4l2_fmtdesc fmtdesc = {
+ .index = i,
+ .type = type_v4l2
+ };
+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
+ if (errno != EINTR)
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ if ((fmtdesc.flags & flags_must) != flags_must ||
+ (fmtdesc.flags & flags_not))
+ continue;
+ if (!accept_fn(accept_v, &fmtdesc))
+ continue;
+
+ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
+ width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
+ return MEDIABUFS_STATUS_SUCCESS;
+ }
+ return 0;
+}
+
+
+/* Wait for qent done */
+
+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ enum qent_status estat;
+
+ pthread_mutex_lock(&be_dst->lock);
+ while (be_dst->waiting &&
+ !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
+ /* Loop */;
+ estat = be->status;
+ pthread_mutex_unlock(&be_dst->lock);
+
+ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
+ estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
+ MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
+{
+ struct qent_base *const be = &be_dst->base;
+ return dmabuf_map(be->dh[buf_no]);
+}
+
+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ unsigned int i;
+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (dmabuf_read_start(be->dh[i])) {
+ while (i--)
+ dmabuf_read_end(be->dh[i]);
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+ }
+ }
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ unsigned int i;
+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
+
+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (dmabuf_read_end(be->dh[i]))
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ return status;
+}
+
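+/* Ref counts are biased: 0 means one reference, so the final unref is the
+ * fetch_sub that returns 0. */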
+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
+{
+ if (be_dst)
+ atomic_fetch_add(&be_dst->base.ref_count, 1);
+ return be_dst;
+}
+
+void qent_dst_unref(struct qent_dst ** const pbe_dst)
+{
+ struct qent_dst * const be_dst = *pbe_dst;
+ struct mediabufs_ctl * mbc;
+ if (!be_dst)
+ return;
+ *pbe_dst = NULL;
+
+ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
+ return;
+
+ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
+ queue_put_free(mbc->dst, &be_dst->base);
+ ff_weak_link_unlock(be_dst->mbc_wl);
+ }
+ else {
+ qe_dst_free(be_dst);
+ }
+}
+
+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
+ unsigned int plane,
+ int fd, size_t size)
+{
+ struct qent_base *const be = &be_dst->base;
+ struct dmabuf_h * dh;
+
+ if (be->status != QENT_IMPORT || be->dh[plane])
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ dh = dmabuf_import(fd, size);
+ if (!dh)
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+
+ be->dh[plane] = dh;
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+// Returns the number of buffers created, -ve for error
+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
+{
+ unsigned int i;
+
+ struct v4l2_create_buffers cbuf = {
+ .count = n,
+ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
+ .format = mbc->dst_fmt,
+ };
+
+ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
+ const int err = -errno;
+ if (err != EINTR) {
+ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
+ return -err;
+ }
+ }
+
+ if (cbuf.count != n)
+ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
+
+ for (i = 0; i != cbuf.count; ++i)
+ qes[i]->base.index = cbuf.index + i;
+
+ return cbuf.count;
+}
+
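+/* Wrap an existing V4L2 buffer (found via QUERYBUF) in dmabuf handles:
+ * either export it with VIDIOC_EXPBUF or mmap() it directly, per plane. */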
+static MediaBufsStatus
+qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
+ const unsigned int n, const bool x_dmabuf)
+{
+ struct v4l2_buffer buf = {
+ .index = n,
+ .type = fmt->type,
+ };
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ int ret;
+
+ if (be->dh[0])
+ return 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ memset(planes, 0, sizeof(planes));
+ buf.m.planes = planes;
+ buf.length = VIDEO_MAX_PLANES;
+ }
+
+ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
+ request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
+ {
+ unsigned int i;
+ for (i = 0; i != buf.length; ++i) {
+ if (x_dmabuf) {
+ struct v4l2_exportbuffer xbuf = {
+ .type = buf.type,
+ .index = buf.index,
+ .plane = i,
+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
+ };
+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
+ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
+ }
+ else {
+ be->dh[i] = dmabuf_import_mmap(
+ mmap(NULL, planes[i].length,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ mbc->vfd, planes[i].m.mem_offset),
+ planes[i].length);
+ }
+ /* On failure tidy up and die */
+ if (!be->dh[i]) {
+ while (i--) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+ }
+ else
+ {
+ if (x_dmabuf) {
+ struct v4l2_exportbuffer xbuf = {
+ .type = buf.type,
+ .index = buf.index,
+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
+ };
+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
+ be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
+ }
+ else {
+ be->dh[0] = dmabuf_import_mmap(
+ mmap(NULL, buf.length,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ mbc->vfd, buf.m.offset),
+ buf.length);
+ }
+ /* On failure tidy up and die */
+ if (!be->dh[0]) {
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+
+ return 0;
+}
+
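+/* Get a dst qent. With no mbc a bare import-only ent is created; with a
+ * fixed-size Q we block for a free slot; otherwise a new V4L2 buffer is
+ * created on demand when the free Q is empty. */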
+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
+{
+ struct qent_dst * be_dst;
+
+ if (mbc == NULL) {
+ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
+ if (be_dst)
+ be_dst->base.status = QENT_IMPORT;
+ return be_dst;
+ }
+
+ if (mbc->dst_fixed) {
+ be_dst = base_to_dst(queue_get_free(mbc->dst));
+ if (!be_dst)
+ return NULL;
+ }
+ else {
+ be_dst = base_to_dst(queue_tryget_free(mbc->dst));
+ if (!be_dst) {
+ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
+ if (!be_dst)
+ return NULL;
+
+ if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
+ qe_dst_free(be_dst);
+ return NULL;
+ }
+ }
+ }
+
+ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
+ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {
+ request_err(mbc->dc, "Failed to export as dmabuf\n");
+ queue_put_free(mbc->dst, &be_dst->base);
+ return NULL;
+ }
+ }
+ else {
+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
+            /* Given how create buf works we can't uncreate it on alloc
+             * failure; all we can do is put it on the free Q
+             */
+ queue_put_free(mbc->dst, &be_dst->base);
+ return NULL;
+ }
+ }
+
+ be_dst->base.status = QENT_PENDING;
+ atomic_store(&be_dst->base.ref_count, 0);
+ return be_dst;
+}
+
+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
+{
+ return &mbc->dst_fmt;
+}
+
+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v)
+{
+ MediaBufsStatus status;
+ unsigned int i;
+ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
+ static const struct {
+ unsigned int flags_must;
+ unsigned int flags_not;
+ } trys[] = {
+ {0, V4L2_FMT_FLAG_EMULATED},
+ {V4L2_FMT_FLAG_EMULATED, 0},
+ };
+ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
+ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
+ buf_type,
+ trys[i].flags_must,
+ trys[i].flags_not,
+ width, height, accept_fn, accept_v);
+ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
+ return status;
+ }
+
+    /* Try to create a buffer - don't alloc */
+    return status;
+}
+
+// ** This is a mess if we get a partial alloc but, without any way to remove
+// individual V4L2 Q members, we are somewhat stuffed
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype)
+{
+ unsigned int i;
+ int a = 0;
+ unsigned int qc;
+ struct qent_dst * qes[32];
+
+ if (n > 32)
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+
+ mbc->dst->memtype = memtype;
+
+ // Create qents first as it is hard to get rid of the V4L2 buffers on error
+ for (qc = 0; qc != n; ++qc)
+ {
+ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL)
+ goto fail;
+ }
+
+ if ((a = create_dst_bufs(mbc, n, qes)) < 0)
+ goto fail;
+
+ for (i = 0; i != a; ++i)
+ queue_put_free(mbc->dst, &qes[i]->base);
+
+ if (a != n)
+ goto fail;
+
+ mbc->dst_fixed = fixed;
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail:
+ for (i = (a < 0 ? 0 : a); i != qc; ++i)
+ qe_dst_free(qes[i]);
+
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+}
+
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
+{
+    struct qent_base * buf = queue_get_free(mbc->src);
+    if (!buf)
+        return NULL;
+    buf->status = QENT_PENDING;
+    return base_to_src(buf);
+}
+
+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
+{
+ struct qent_src *const qe_src = *pqe_src;
+ if (!qe_src)
+ return;
+ *pqe_src = NULL;
+ queue_put_free(mbc->src, &qe_src->base);
+}
+
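+/* Probe the driver's supported memory types: a zero-count
+ * VIDIOC_CREATE_BUFS acts as a query, filling in .capabilities without
+ * creating any buffers. */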
+static MediaBufsStatus
+chk_memory_type(struct mediabufs_ctl *const mbc,
+ const struct v4l2_format * const f,
+ const enum mediabufs_memory m)
+{
+ struct v4l2_create_buffers cbuf = {
+ .count = 0,
+ .memory = V4L2_MEMORY_MMAP,
+ .format = *f
+ };
+
+ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ switch (m) {
+ case MEDIABUFS_MEMORY_DMABUF:
+        // 0 = unknown; assume unsupported in that case
+ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
+ break;
+ case MEDIABUFS_MEMORY_MMAP:
+ break;
+ default:
+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
+ }
+
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus
+mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
+{
+ return chk_memory_type(mbc, &mbc->src_fmt, memtype);
+}
+
+MediaBufsStatus
+mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
+{
+ return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
+}
+
+/* src format must have been set up before this */
+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
+ struct dmabufs_ctl * const dbsc,
+ unsigned int n, const enum mediabufs_memory memtype)
+{
+ unsigned int i;
+ struct v4l2_requestbuffers req = {
+ .count = n,
+ .type = mbc->src_fmt.type,
+ .memory = mediabufs_memory_to_v4l2(memtype)
+ };
+
+ bq_free_all_free_src(mbc->src);
+
+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
+ if (errno != EINTR) {
+ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+
+ if (n > req.count) {
+ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
+ n = req.count;
+ }
+
+ for (i = 0; i != n; ++i) {
+ struct qent_src *const be_src = qe_src_new(memtype);
+ if (!be_src) {
+ request_err(mbc->dc, "Failed to create src be %d\n", i);
+ goto fail;
+ }
+ switch (memtype) {
+ case MEDIABUFS_MEMORY_MMAP:
+ if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
+ qe_src_free(be_src);
+ goto fail;
+ }
+ be_src->fixed_size = 1;
+ break;
+ case MEDIABUFS_MEMORY_DMABUF:
+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
+ qe_src_free(be_src);
+ goto fail;
+ }
+ be_src->fixed_size = !mediabufs_src_resizable(mbc);
+ break;
+ default:
+ request_err(mbc->dc, "Unexpected memorty type\n");
+ goto fail;
+ }
+ be_src->base.index = i;
+
+ queue_put_free(mbc->src, &be_src->base);
+ }
+
+ mbc->src->memtype = memtype;
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail:
+ bq_free_all_free_src(mbc->src);
+ req.count = 0;
+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+
+
+/*
+ * Setup call order:
+ * Set src fmt
+ * Set parameters (sps) on vfd
+ * Negotiate dst format (dst_fmt_set)
+ * Create src buffers
+ * Alloc a dst buffer or Create dst slots
+ */
+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
+{
+ if (mbc->stream_on)
+ return MEDIABUFS_STATUS_SUCCESS;
+
+ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
+ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
+ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
+ set_stream(mbc->vfd, mbc->src_fmt.type, false);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ mbc->stream_on = true;
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
+{
+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
+
+ if (!mbc->stream_on)
+ return MEDIABUFS_STATUS_SUCCESS;
+
+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
+ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
+ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ mbc->stream_on = false;
+ return status;
+}
+
+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
+{
+ struct v4l2_ext_controls controls = {
+ .controls = control_array,
+ .count = n
+ };
+
+ if (mreq) {
+ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
+ controls.request_fd = media_request_fd(mreq);
+ }
+
+ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
+ {
+ const int err = errno;
+ if (err != EINTR) {
+ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
+ return -err;
+ }
+ }
+
+ return 0;
+}
+
+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
+ struct media_request * const mreq,
+ unsigned int id, void *data,
+ unsigned int size)
+{
+ struct v4l2_ext_control control = {
+ .id = id,
+ .ptr = data,
+ .size = size
+ };
+
+ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
+ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
+ enum v4l2_buf_type buf_type,
+ const uint32_t pixfmt,
+ const uint32_t width, const uint32_t height,
+ const size_t bufsize)
+{
+ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
+ if (rv != MEDIABUFS_STATUS_SUCCESS)
+ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
+
+ return rv;
+}
+
+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
+{
+ int rv = 0;
+ while (n--) {
+ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
+ const int err = errno;
+ if (err != EINTR) {
+ // Often used for probing - errors are to be expected
+ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
+ ctrls->type = 0; // 0 is invalid
+ rv = -err;
+ break;
+ }
+ }
+ ++ctrls;
+ }
+ return rv;
+}
+
+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
+{
+#if 1
+ return 0;
+#else
+ // Single planar OUTPUT can only take exact size buffers
+ // Multiplanar will take larger than negotiated
+ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
+#endif
+}
+
+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
+{
+ if (!mbc)
+ return;
+
+ // Break the weak link first
+ ff_weak_link_break(&mbc->this_wlm);
+
+ polltask_delete(&mbc->pt);
+
+ mediabufs_stream_off(mbc);
+
+ // Empty v4l2 buffer stash
+ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
+ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
+
+ bq_free_all_free_src(mbc->src);
+ bq_free_all_inuse_src(mbc->src);
+ bq_free_all_free_dst(mbc->dst);
+
+ {
+ struct qent_dst *dst_be;
+ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
+ dst_be->base.timestamp = (struct timeval){0};
+ dst_be->base.status = QENT_ERROR;
+ qe_dst_done(dst_be);
+ }
+ }
+
+ queue_delete(mbc->dst);
+ queue_delete(mbc->src);
+ close(mbc->vfd);
+ pthread_mutex_destroy(&mbc->lock);
+
+ free(mbc);
+}
+
+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
+{
+ atomic_fetch_add(&mbc->ref_count, 1);
+ return mbc;
+}
+
+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
+{
+ struct mediabufs_ctl *const mbc = *pmbc;
+ int n;
+
+ if (!mbc)
+ return;
+ *pmbc = NULL;
+ n = atomic_fetch_sub(&mbc->ref_count, 1);
+ if (n)
+ return;
+ mediabufs_ctl_delete(mbc);
+}
+
+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
+{
+ return mbc->capability.version;
+}
+
+static int set_capabilities(struct mediabufs_ctl *const mbc)
+{
+ uint32_t caps;
+
+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
+ int err = errno;
+ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
+ return -err;
+ }
+
+ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
+ mbc->capability.device_caps :
+ mbc->capability.capabilities;
+
+ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+ }
+ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ }
+ else {
+ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* One of these per context */
+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
+{
+ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
+
+ if (!mbc)
+ return NULL;
+
+ mbc->dc = dc;
+ // Default mono planar
+ mbc->pq = pq;
+ pthread_mutex_init(&mbc->lock, NULL);
+
+ /* Pick a default - could we scan for this? */
+ if (vpath == NULL)
+ vpath = "/dev/media0";
+
+ while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
+ {
+ const int err = errno;
+ if (err != EINTR) {
+ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
+ goto fail0;
+ }
+ }
+
+ if (set_capabilities(mbc)) {
+ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
+ goto fail1;
+ }
+
+ mbc->src = queue_new(mbc->vfd);
+ if (!mbc->src)
+ goto fail1;
+ mbc->dst = queue_new(mbc->vfd);
+ if (!mbc->dst)
+ goto fail2;
+ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
+ if (!mbc->pt)
+ goto fail3;
+ mbc->this_wlm = ff_weak_link_new(mbc);
+ if (!mbc->this_wlm)
+ goto fail4;
+
+ /* Cannot add polltask now - polling with nothing pending
+ * generates infinite error polls
+ */
+ return mbc;
+
+fail4:
+ polltask_delete(&mbc->pt);
+fail3:
+ queue_delete(mbc->dst);
+fail2:
+ queue_delete(mbc->src);
+fail1:
+ close(mbc->vfd);
+fail0:
+ free(mbc);
+ request_info(dc, "%s: FAILED\n", __func__);
+ return NULL;
+}
+
+
+
diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
new file mode 100644
index 0000000000..890947b2e2
--- /dev/null
+++ b/libavcodec/v4l2_req_media.h
@@ -0,0 +1,171 @@
+/*
+ * media.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _MEDIA_H_
+#define _MEDIA_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+struct v4l2_format;
+struct v4l2_fmtdesc;
+struct v4l2_query_ext_ctrl;
+
+struct pollqueue;
+struct media_request;
+struct media_pool;
+
+typedef enum media_buf_status {
+ MEDIABUFS_STATUS_SUCCESS = 0,
+ MEDIABUFS_ERROR_OPERATION_FAILED,
+ MEDIABUFS_ERROR_DECODING_ERROR,
+ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
+ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
+ MEDIABUFS_ERROR_ALLOCATION_FAILED,
+ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
+} MediaBufsStatus;
+
+struct media_pool * media_pool_new(const char * const media_path,
+ struct pollqueue * const pq,
+ const unsigned int n);
+void media_pool_delete(struct media_pool ** pmp);
+
+// Obtain a media request
+// Will block if none available - has a 2 sec timeout
+struct media_request * media_request_get(struct media_pool * const mp);
+int media_request_fd(const struct media_request * const req);
+
+// Start this request
+// Request structure is returned to pool once done
+int media_request_start(struct media_request * const req);
+
+// Return an *unstarted* media_request to the pool
+// May later be upgraded to allow for aborting a started req
+int media_request_abort(struct media_request ** const preq);
+
+
+struct mediabufs_ctl;
+struct qent_src;
+struct qent_dst;
+struct dmabuf_h;
+struct dmabufs_ctl;
+
+// 1-1 mapping to V4L2 type - just defined separately to avoid some include versioning difficulties
+enum mediabufs_memory {
+ MEDIABUFS_MEMORY_UNSET = 0,
+ MEDIABUFS_MEMORY_MMAP = 1,
+ MEDIABUFS_MEMORY_USERPTR = 2,
+ MEDIABUFS_MEMORY_OVERLAY = 3,
+ MEDIABUFS_MEMORY_DMABUF = 4,
+};
+
+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
+
+// prealloc
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
+// dbsc may be NULL if realloc not required
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
+void qent_dst_delete(struct qent_dst *const be);
+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
+void qent_dst_unref(struct qent_dst ** const pbe_dst);
+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
+
+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
+/* Import an fd unattached to any mediabuf */
+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
+ unsigned int plane,
+ int fd, size_t size);
+
+const char * mediabufs_memory_name(const enum mediabufs_memory m);
+
+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
+ struct media_request **const pmreq,
+ struct qent_src **const psrc_be,
+ struct qent_dst *const dst_be,
+ const bool is_final);
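+// Typical per-frame flow (sketch):
+//   struct media_request *req = media_request_get(mp);
+//   mediabufs_ctl_set_ext_ctrls(mbc, req, ctrls, n);
+//   src = mediabufs_src_qent_get(mbc);
+//   qent_src_data_copy(src, 0, bitstream, size, dbsc);
+//   mediabufs_start_request(mbc, &req, &src, dst, true);
+//   qent_dst_wait(dst);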
+// Get / alloc a dst buffer & associate with a slot
+// If the dst pool is empty then behaviour depends on the fixed flag passed to
+// dst_slots_create. Default is !fixed = unlimited alloc
+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
+ struct dmabufs_ctl *const dbsc);
+// Create dst slots without alloc
+// If fixed true then qent_alloc will only get slots from this pool and will
+// block until a qent has been unrefed
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype);
+
+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
+
+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
+
+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v);
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
+
+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
+ struct v4l2_ext_control control_array[], unsigned int n);
+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
+ struct media_request * const mreq,
+ unsigned int id, void *data,
+ unsigned int size);
+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
+
+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
+
+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
+ enum v4l2_buf_type buf_type,
+ const uint32_t pixfmt,
+ const uint32_t width, const uint32_t height,
+ const size_t bufsize);
+
+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
+ struct dmabufs_ctl * const dbsc,
+ unsigned int n,
+ const enum mediabufs_memory memtype);
+
+// Want to have appropriate formats set first
+MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
+MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
+
+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
+
+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
+ const char *vpath, struct pollqueue *const pq);
+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
+
+
+#endif
diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
new file mode 100644
index 0000000000..cc8a5d4001
--- /dev/null
+++ b/libavcodec/v4l2_req_pollqueue.c
@@ -0,0 +1,361 @@
+#include <errno.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_utils.h"
+
+
+struct pollqueue;
+
+enum polltask_state {
+ POLLTASK_UNQUEUED = 0,
+ POLLTASK_QUEUED,
+ POLLTASK_RUNNING,
+ POLLTASK_Q_KILL,
+ POLLTASK_RUN_KILL,
+};
+
+struct polltask {
+ struct polltask *next;
+ struct polltask *prev;
+ struct pollqueue *q;
+ enum polltask_state state;
+
+ int fd;
+ short events;
+
+ void (*fn)(void *v, short revents);
+ void * v;
+
+ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
+ sem_t kill_sem;
+};
+
+struct pollqueue {
+ atomic_int ref_count;
+ pthread_mutex_t lock;
+
+ struct polltask *head;
+ struct polltask *tail;
+
+ bool kill;
+ bool no_prod;
+ int prod_fd;
+ struct polltask *prod_pt;
+ pthread_t worker;
+};
+
+struct polltask *polltask_new(struct pollqueue *const pq,
+ const int fd, const short events,
+ void (*const fn)(void *v, short revents),
+ void *const v)
+{
+ struct polltask *pt;
+
+ if (!events)
+ return NULL;
+
+ pt = malloc(sizeof(*pt));
+ if (!pt)
+ return NULL;
+
+ *pt = (struct polltask){
+ .next = NULL,
+ .prev = NULL,
+ .q = pollqueue_ref(pq),
+ .fd = fd,
+ .events = events,
+ .fn = fn,
+ .v = v
+ };
+
+ sem_init(&pt->kill_sem, 0, 0);
+
+ return pt;
+}
+
+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
+{
+ if (pt->prev)
+ pt->prev->next = pt->next;
+ else
+ pq->head = pt->next;
+ if (pt->next)
+ pt->next->prev = pt->prev;
+ else
+ pq->tail = pt->prev;
+ pt->next = NULL;
+ pt->prev = NULL;
+}
+
+static void polltask_free(struct polltask * const pt)
+{
+ sem_destroy(&pt->kill_sem);
+ free(pt);
+}
+
+static int pollqueue_prod(const struct pollqueue *const pq)
+{
+ static const uint64_t one = 1;
+ return write(pq->prod_fd, &one, sizeof(one));
+}
+
+void polltask_delete(struct polltask **const ppt)
+{
+ struct polltask *const pt = *ppt;
+ struct pollqueue * pq;
+ enum polltask_state state;
+ bool prodme;
+
+ if (!pt)
+ return;
+
+ pq = pt->q;
+ pthread_mutex_lock(&pq->lock);
+ state = pt->state;
+ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
+ prodme = !pq->no_prod;
+ pthread_mutex_unlock(&pq->lock);
+
+ if (state != POLLTASK_UNQUEUED) {
+ if (prodme)
+ pollqueue_prod(pq);
+ while (sem_wait(&pt->kill_sem) && errno == EINTR)
+ /* loop */;
+ }
+
+    // Leave zapping the ref until we have DQed the PT, as the ref may well
+    // still be in legitimate use until then
+ *ppt = NULL;
+ polltask_free(pt);
+ pollqueue_unref(&pq);
+}
+
+static uint64_t pollqueue_now(int timeout)
+{
+ struct timespec now;
+ uint64_t now_ms;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &now))
+ return 0;
+ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
+ return now_ms ? now_ms : (uint64_t)1;
+}
+
+void pollqueue_add_task(struct polltask *const pt, const int timeout)
+{
+ bool prodme = false;
+ struct pollqueue * const pq = pt->q;
+
+ pthread_mutex_lock(&pq->lock);
+ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
+ if (pq->tail)
+ pq->tail->next = pt;
+ else
+ pq->head = pt;
+ pt->prev = pq->tail;
+ pt->next = NULL;
+ pt->state = POLLTASK_QUEUED;
+ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
+ pq->tail = pt;
+ prodme = !pq->no_prod;
+ }
+ pthread_mutex_unlock(&pq->lock);
+ if (prodme)
+ pollqueue_prod(pq);
+}
+
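+/* Worker thread: snapshot the task list into a pollfd array, poll() with the
+ * nearest task timeout, then run the callback of every task whose fd fired
+ * or whose timeout expired. Callbacks run with the queue unlocked so they
+ * may add further tasks. */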
+static void *poll_thread(void *v)
+{
+ struct pollqueue *const pq = v;
+ struct pollfd *a = NULL;
+ size_t asize = 0;
+
+ pthread_mutex_lock(&pq->lock);
+ do {
+ unsigned int i;
+ unsigned int n = 0;
+ struct polltask *pt;
+ struct polltask *pt_next;
+ uint64_t now = pollqueue_now(0);
+ int timeout = -1;
+ int rv;
+
+ for (pt = pq->head; pt; pt = pt_next) {
+ int64_t t;
+
+ pt_next = pt->next;
+
+ if (pt->state == POLLTASK_Q_KILL) {
+ pollqueue_rem_task(pq, pt);
+ sem_post(&pt->kill_sem);
+ continue;
+ }
+
+            if (n >= asize) {
+                const size_t asize_new = asize ? asize * 2 : 4;
+                struct pollfd * const a_new = realloc(a, asize_new * sizeof(*a));
+                if (!a_new) {
+                    /* Keep the old array so fail_locked can free it */
+                    request_log("Failed to realloc poll array to %zd\n", asize_new);
+                    goto fail_locked;
+                }
+                a = a_new;
+                asize = asize_new;
+            }
+
+ a[n++] = (struct pollfd){
+ .fd = pt->fd,
+ .events = pt->events
+ };
+
+ t = (int64_t)(pt->timeout - now);
+ if (pt->timeout && t < INT_MAX &&
+ (timeout < 0 || (int)t < timeout))
+ timeout = (t < 0) ? 0 : (int)t;
+ }
+ pthread_mutex_unlock(&pq->lock);
+
+ if ((rv = poll(a, n, timeout)) == -1) {
+ if (errno != EINTR) {
+ request_log("Poll error: %s\n", strerror(errno));
+ goto fail_unlocked;
+ }
+ }
+
+ pthread_mutex_lock(&pq->lock);
+ now = pollqueue_now(0);
+
+ /* Prodding in this loop is pointless and might lead to
+ * infinite looping
+ */
+ pq->no_prod = true;
+ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
+ pt_next = pt->next;
+
+ /* Pending? */
+ if (a[i].revents ||
+ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
+ pollqueue_rem_task(pq, pt);
+ if (pt->state == POLLTASK_QUEUED)
+ pt->state = POLLTASK_RUNNING;
+ if (pt->state == POLLTASK_Q_KILL)
+ pt->state = POLLTASK_RUN_KILL;
+ pthread_mutex_unlock(&pq->lock);
+
+ /* This can add new entries to the Q but as
+ * those are added to the tail our existing
+ * chain remains intact
+ */
+ pt->fn(pt->v, a[i].revents);
+
+ pthread_mutex_lock(&pq->lock);
+ if (pt->state == POLLTASK_RUNNING)
+ pt->state = POLLTASK_UNQUEUED;
+ if (pt->state == POLLTASK_RUN_KILL)
+ sem_post(&pt->kill_sem);
+ }
+ }
+ pq->no_prod = false;
+
+ } while (!pq->kill);
+
+fail_locked:
+ pthread_mutex_unlock(&pq->lock);
+fail_unlocked:
+ free(a);
+ return NULL;
+}
+
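+/* The prod eventfd wakes poll_thread whenever the task list changes; prod_fn
+ * just drains it and requeues itself so the queue is always armed. */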
+static void prod_fn(void *v, short revents)
+{
+ struct pollqueue *const pq = v;
+ char buf[8];
+ if (revents)
+ read(pq->prod_fd, buf, 8);
+ if (!pq->kill)
+ pollqueue_add_task(pq->prod_pt, -1);
+}
+
+struct pollqueue * pollqueue_new(void)
+{
+ struct pollqueue *pq = malloc(sizeof(*pq));
+ if (!pq)
+ return NULL;
+ *pq = (struct pollqueue){
+ .ref_count = ATOMIC_VAR_INIT(0),
+ .lock = PTHREAD_MUTEX_INITIALIZER,
+ .head = NULL,
+ .tail = NULL,
+ .kill = false,
+ .prod_fd = -1
+ };
+
+ pq->prod_fd = eventfd(0, EFD_NONBLOCK);
+    if (pq->prod_fd == -1)
+ goto fail1;
+ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
+ if (!pq->prod_pt)
+ goto fail2;
+ pollqueue_add_task(pq->prod_pt, -1);
+ if (pthread_create(&pq->worker, NULL, poll_thread, pq))
+ goto fail3;
+    // Reset ref count, which will have been incremented by the add_task
+ atomic_store(&pq->ref_count, 0);
+ return pq;
+
+fail3:
+ polltask_free(pq->prod_pt);
+fail2:
+ close(pq->prod_fd);
+fail1:
+ free(pq);
+ return NULL;
+}
+
+static void pollqueue_free(struct pollqueue *const pq)
+{
+ void *rv;
+
+ pthread_mutex_lock(&pq->lock);
+ pq->kill = true;
+ pollqueue_prod(pq);
+ pthread_mutex_unlock(&pq->lock);
+
+ pthread_join(pq->worker, &rv);
+ polltask_free(pq->prod_pt);
+ pthread_mutex_destroy(&pq->lock);
+ close(pq->prod_fd);
+ free(pq);
+}
+
+struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
+{
+ atomic_fetch_add(&pq->ref_count, 1);
+ return pq;
+}
+
+void pollqueue_unref(struct pollqueue **const ppq)
+{
+ struct pollqueue * const pq = *ppq;
+
+ if (!pq)
+ return;
+ *ppq = NULL;
+
+ if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
+ return;
+
+ pollqueue_free(pq);
+}
+
+
+
diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h
new file mode 100644
index 0000000000..e1182cb2fc
--- /dev/null
+++ b/libavcodec/v4l2_req_pollqueue.h
@@ -0,0 +1,18 @@
+#ifndef POLLQUEUE_H_
+#define POLLQUEUE_H_
+
+struct polltask;
+struct pollqueue;
+
+struct polltask *polltask_new(struct pollqueue *const pq,
+ const int fd, const short events,
+ void (*const fn)(void *v, short revents),
+ void *const v);
+void polltask_delete(struct polltask **const ppt);
+
+void pollqueue_add_task(struct polltask *const pt, const int timeout);
+struct pollqueue * pollqueue_new(void);
+void pollqueue_unref(struct pollqueue **const ppq);
+struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
+
+#endif /* POLLQUEUE_H_ */
diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h
new file mode 100644
index 0000000000..a31cc1f4ec
--- /dev/null
+++ b/libavcodec/v4l2_req_utils.h
@@ -0,0 +1,27 @@
+#ifndef AVCODEC_V4L2_REQ_UTILS_H
+#define AVCODEC_V4L2_REQ_UTILS_H
+
+#include <stdint.h>
+#include "libavutil/log.h"
+
+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
+
+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
+
+static inline char safechar(char c) {
+ return c > 0x20 && c < 0x7f ? c : '.';
+}
+
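+// Render a fourcc as printable text,
+// e.g. strfourcc(buf, V4L2_PIX_FMT_NV12) gives "NV12".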
+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
+ tbuf[0] = safechar((fcc >> 0) & 0xff);
+ tbuf[1] = safechar((fcc >> 8) & 0xff);
+ tbuf[2] = safechar((fcc >> 16) & 0xff);
+ tbuf[3] = safechar((fcc >> 24) & 0xff);
+ tbuf[4] = '\0';
+ return tbuf;
+}
+
+#endif
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
new file mode 100644
index 0000000000..fbec16a93e
--- /dev/null
+++ b/libavcodec/v4l2_request_hevc.c
@@ -0,0 +1,347 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "decode.h"
+#include "hevcdec.h"
+#include "hwconfig.h"
+
+#include "v4l2_request_hevc.h"
+
+#include "libavutil/hwcontext_drm.h"
+#include "libavutil/pixdesc.h"
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_utils.h"
+
+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
+{
+ const size_t wxh = w * h;
+ size_t bits_alloc;
+
+    /* Annex A gives a min compression of 2 @ lvl 3.1
+     * (wxh <= 983040) and min 4 thereafter, but avoid
+     * the oddity of 983041 having a lower limit than
+     * 983040.
+     * Multiply by 3/2 for 4:2:0
+     */
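+    /* Worked example (illustrative): 1920x1080 @ 8 bit has
+     * wxh = 2073600 >= 983040 * 2, so bits_alloc = 2073600 * 3 / 8
+     * = 777600 before the 16k overhead is added below. */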
+ bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
+ wxh < 983040 * 2 ? 983040 * 3 / 4 :
+ wxh * 3 / 8;
+ /* Allow for bit depth */
+ bits_alloc += (bits_alloc * bits_minus8) / 8;
+ /* Add a few bytes (16k) for overhead */
+ bits_alloc += 0x4000;
+ return bits_alloc;
+}
+
+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
+ av_unused const uint8_t *buffer,
+ av_unused uint32_t size)
+{
+ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->start_frame(avctx, buffer, size);
+}
+
+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->decode_slice(avctx, buffer, size);
+}
+
+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
+{
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->end_frame(avctx);
+}
+
+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ ctx->fns->abort_frame(avctx);
+}
+
+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->frame_params(avctx, hw_frames_ctx);
+}
+
+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->alloc_frame(avctx, frame);
+}
+
+
+static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode
+
+ mediabufs_ctl_unref(&ctx->mbufs);
+ media_pool_delete(&ctx->mpool);
+ pollqueue_unref(&ctx->pq);
+ dmabufs_ctl_unref(&ctx->dbufs);
+ devscan_delete(&ctx->devscan);
+
+ decode_q_uninit(&ctx->decode_q);
+
+// if (avctx->hw_frames_ctx) {
+// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+// av_buffer_pool_flush(hwfc->pool);
+// }
+ return 0;
+}
+
+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
+{
+ AVCodecContext *const avctx = v;
+ const HEVCContext *const h = avctx->priv_data;
+
+ if (h->ps.sps->bit_depth == 8) {
+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
+ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
+ return 1;
+ }
+ }
+ else if (h->ps.sps->bit_depth == 10) {
+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int v4l2_request_hevc_init(AVCodecContext *avctx)
+{
+ const HEVCContext *h = avctx->priv_data;
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ const HEVCSPS * const sps = h->ps.sps;
+ int ret;
+ const struct decdev * decdev;
+ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
+ size_t src_size;
+ enum mediabufs_memory src_memtype;
+ enum mediabufs_memory dst_memtype;
+
+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ // Give up immediately if this is something that we have no code to deal with
+ if (h->ps.sps->chroma_format_idc != 1) {
+ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
+ return AVERROR_PATCHWELCOME;
+ }
+ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
+ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
+ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
+ return AVERROR_PATCHWELCOME;
+ }
+
+ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
+ return (AVERROR(-ret));
+ }
+ ret = AVERROR(ENOMEM); // Assume mem fail by default for these
+
+ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
+ {
+ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
+ ret = AVERROR(ENODEV);
+ goto fail0;
+ }
+ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
+ decdev_media_path(decdev), decdev_video_path(decdev));
+
+ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
+ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
+ src_memtype = MEDIABUFS_MEMORY_MMAP;
+ dst_memtype = MEDIABUFS_MEMORY_MMAP;
+ }
+ else {
+ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
+ src_memtype = MEDIABUFS_MEMORY_DMABUF;
+ dst_memtype = MEDIABUFS_MEMORY_DMABUF;
+ }
+
+ if ((ctx->pq = pollqueue_new()) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
+ goto fail1;
+ }
+
+ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
+ goto fail2;
+ }
+
+ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
+ goto fail3;
+ }
+
+ // Ask for an initial bitbuf size of max size / 4
+ // We will realloc if we need more
+ // Must use sps->h/w as avctx contains cropped size
+retry_src_memtype:
+ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs))
+ src_size /= 4;
+ // Kludge for conformance tests which break Annex A limits
+ else if (src_size < 0x40000)
+ src_size = 0x40000;
+
+ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
+ sps->width, sps->height, src_size)) {
+ char tbuf1[5];
+ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
+ goto fail4;
+ }
+
+ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) {
+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) {
+ src_memtype = MEDIABUFS_MEMORY_MMAP;
+ goto retry_src_memtype;
+ }
+ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n");
+ goto fail4;
+ }
+
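+    // Probe control-API variants newest first; the first probe that
+    // succeeds selects the fns table used for the rest of the session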
+ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 4);
+ }
+#if CONFIG_V4L2_REQ_HEVC_VX
+ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 3);
+ }
+ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 2);
+ }
+ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 1);
+ }
+#endif
+ else {
+ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
+ ret = AVERROR(EINVAL);
+ goto fail4;
+ }
+
+ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
+ char tbuf1[5];
+ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
+ goto fail4;
+ }
+
+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
+ goto fail4;
+ }
+
+ {
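+        // Slot count: stream DPB depth + one slot per decode thread +
+        // any requested extra hw frames (defaults to 6 when unset)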
+ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
+ avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
+ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
+ avctx->thread_count, avctx->extra_hw_frames);
+
+ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) {
+ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n");
+ goto fail4;
+ }
+ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n");
+ dst_memtype = MEDIABUFS_MEMORY_MMAP;
+ }
+
+ // extra_hw_frames is -1 if unset
+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
+ goto fail4;
+ }
+ }
+
+ if (mediabufs_stream_on(ctx->mbufs)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
+ goto fail4;
+ }
+
+ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
+ goto fail4;
+ }
+
+ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
+ goto fail5;
+ }
+
+ decode_q_init(&ctx->decode_q);
+
+ // Set our s/w format
+ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
+
+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n",
+ ctx->fns->name,
+ decdev_media_path(decdev), decdev_video_path(decdev),
+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype),
+ av_get_pix_fmt_name(avctx->sw_pix_fmt));
+
+ return 0;
+
+fail5:
+ av_buffer_unref(&avctx->hw_frames_ctx);
+fail4:
+ mediabufs_ctl_unref(&ctx->mbufs);
+fail3:
+ media_pool_delete(&ctx->mpool);
+fail2:
+ pollqueue_unref(&ctx->pq);
+fail1:
+ dmabufs_ctl_unref(&ctx->dbufs);
+fail0:
+ devscan_delete(&ctx->devscan);
+ return ret;
+}
+
+const AVHWAccel ff_hevc_v4l2request_hwaccel = {
+ .name = "hevc_v4l2request",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .id = AV_CODEC_ID_HEVC,
+ .pix_fmt = AV_PIX_FMT_DRM_PRIME,
+ .alloc_frame = v4l2_req_hevc_alloc_frame,
+ .start_frame = v4l2_req_hevc_start_frame,
+ .decode_slice = v4l2_req_hevc_decode_slice,
+ .end_frame = v4l2_req_hevc_end_frame,
+ .abort_frame = v4l2_req_hevc_abort_frame,
+ .init = v4l2_request_hevc_init,
+ .uninit = v4l2_request_hevc_uninit,
+ .priv_data_size = sizeof(V4L2RequestContextHEVC),
+ .frame_params = v4l2_req_hevc_frame_params,
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
+};
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
new file mode 100644
index 0000000000..99c90064ea
--- /dev/null
+++ b/libavcodec/v4l2_request_hevc.h
@@ -0,0 +1,102 @@
+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
+#define AVCODEC_V4L2_REQUEST_HEVC_H
+
+#include <stdint.h>
+#include <drm_fourcc.h>
+#include "v4l2_req_decode_q.h"
+
+#ifndef DRM_FORMAT_NV15
+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
+#endif
+
+#ifndef DRM_FORMAT_NV20
+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
+#endif
+
+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
+// in the future but until then...
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
+#endif
+
+#include <linux/videodev2.h>
+#ifndef V4L2_CID_CODEC_BASE
+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
+#endif
+
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in videodev2.h and hopefully will be sometime in the future but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
+#endif
+
+#define VCAT(name, version) name##_v##version
+#define V2(n,v) VCAT(n, v)
+#define V(n) V2(n, HEVC_CTRLS_VERSION)
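+// e.g. V2(ff_v4l2_req_hevc, 4) expands to ff_v4l2_req_hevc_v4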
+
+#define S2(x) #x
+#define STR(x) S2(x)
+
+// 1 per decoder
+struct v4l2_req_decode_fns;
+
+typedef struct V4L2RequestContextHEVC {
+// V4L2RequestContext base;
+ const struct v4l2_req_decode_fns * fns;
+
+ unsigned int timestamp; // ?? maybe uint64_t
+
+ int decode_mode;
+ int start_code;
+ unsigned int max_slices; // 0 => not wanted (frame mode)
+ unsigned int max_offsets; // 0 => not wanted
+
+ req_decode_q decode_q;
+
+ struct devscan *devscan;
+ struct dmabufs_ctl *dbufs;
+ struct pollqueue *pq;
+ struct media_pool * mpool;
+ struct mediabufs_ctl *mbufs;
+} V4L2RequestContextHEVC;
+
+typedef struct v4l2_req_decode_fns {
+ int src_pix_fmt_v4l2;
+ const char * name;
+
+ // Init setup
+ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
+ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
+
+ // Passthrough of hwaccel fns
+ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+ int (*end_frame)(AVCodecContext *avctx);
+ void (*abort_frame)(AVCodecContext *avctx);
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
+ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
+} v4l2_req_decode_fns;
+
+
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
+
+#endif
diff --git a/libavcodec/vaapi_av1.c b/libavcodec/vaapi_av1.c
index 5985493b8d..16b7e35747 100644
--- a/libavcodec/vaapi_av1.c
+++ b/libavcodec/vaapi_av1.c
@@ -21,28 +21,8 @@
#include "libavutil/pixdesc.h"
#include "hwconfig.h"
#include "vaapi_decode.h"
-#include "internal.h"
#include "av1dec.h"
-typedef struct VAAPIAV1FrameRef {
- ThreadFrame frame;
- int valid;
-} VAAPIAV1FrameRef;
-
-typedef struct VAAPIAV1DecContext {
- VAAPIDecodeContext base;
-
- /**
- * For film grain case, VAAPI generate 2 output for each frame,
- * current_frame will not apply film grain, and will be used for
- * references for next frames. Maintain the reference list without
- * applying film grain here. And current_display_picture will be
- * used to apply film grain and push to downstream.
- */
- VAAPIAV1FrameRef ref_tab[AV1_NUM_REF_FRAMES];
- ThreadFrame tmp_frame;
-} VAAPIAV1DecContext;
-
static VASurfaceID vaapi_av1_surface_id(AV1Frame *vf)
{
if (vf)
@@ -69,48 +49,6 @@ static int8_t vaapi_av1_get_bit_depth_idx(AVCodecContext *avctx)
return bit_depth == 8 ? 0 : bit_depth == 10 ? 1 : 2;
}
-static int vaapi_av1_decode_init(AVCodecContext *avctx)
-{
- VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data;
-
- ctx->tmp_frame.f = av_frame_alloc();
- if (!ctx->tmp_frame.f) {
- av_log(avctx, AV_LOG_ERROR,
- "Failed to allocate frame.\n");
- return AVERROR(ENOMEM);
- }
-
- for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) {
- ctx->ref_tab[i].frame.f = av_frame_alloc();
- if (!ctx->ref_tab[i].frame.f) {
- av_log(avctx, AV_LOG_ERROR,
- "Failed to allocate reference table frame %d.\n", i);
- return AVERROR(ENOMEM);
- }
- ctx->ref_tab[i].valid = 0;
- }
-
- return ff_vaapi_decode_init(avctx);
-}
-
-static int vaapi_av1_decode_uninit(AVCodecContext *avctx)
-{
- VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data;
-
- if (ctx->tmp_frame.f->buf[0])
- ff_thread_release_buffer(avctx, &ctx->tmp_frame);
- av_frame_free(&ctx->tmp_frame.f);
-
- for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) {
- if (ctx->ref_tab[i].frame.f->buf[0])
- ff_thread_release_buffer(avctx, &ctx->ref_tab[i].frame);
- av_frame_free(&ctx->ref_tab[i].frame.f);
- }
-
- return ff_vaapi_decode_uninit(avctx);
-}
-
-
static int vaapi_av1_start_frame(AVCodecContext *avctx,
av_unused const uint8_t *buffer,
av_unused uint32_t size)
@@ -120,62 +58,40 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx,
const AV1RawFrameHeader *frame_header = s->raw_frame_header;
const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain;
VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private;
- VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data;
VADecPictureParameterBufferAV1 pic_param;
int8_t bit_depth_idx;
int err = 0;
int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain;
uint8_t remap_lr_type[4] = {AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ};
- uint8_t segmentation_feature_signed[AV1_SEG_LVL_MAX] = {1, 1, 1, 1, 1, 0, 0, 0};
- uint8_t segmentation_feature_max[AV1_SEG_LVL_MAX] = {255, AV1_MAX_LOOP_FILTER,
- AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, 7 , 0 , 0 };
+
+ pic->output_surface = vaapi_av1_surface_id(&s->cur_frame);
bit_depth_idx = vaapi_av1_get_bit_depth_idx(avctx);
if (bit_depth_idx < 0)
goto fail;
- if (apply_grain) {
- if (ctx->tmp_frame.f->buf[0])
- ff_thread_release_buffer(avctx, &ctx->tmp_frame);
- err = ff_thread_get_buffer(avctx, &ctx->tmp_frame, AV_GET_BUFFER_FLAG_REF);
- if (err < 0)
- goto fail;
- pic->output_surface = ff_vaapi_get_surface_id(ctx->tmp_frame.f);
- } else {
- pic->output_surface = vaapi_av1_surface_id(&s->cur_frame);
- }
-
memset(&pic_param, 0, sizeof(VADecPictureParameterBufferAV1));
pic_param = (VADecPictureParameterBufferAV1) {
- .profile = seq->seq_profile,
- .order_hint_bits_minus_1 = seq->order_hint_bits_minus_1,
- .bit_depth_idx = bit_depth_idx,
- .matrix_coefficients = seq->color_config.matrix_coefficients,
- .current_frame = pic->output_surface,
- .current_display_picture = vaapi_av1_surface_id(&s->cur_frame),
- .frame_width_minus1 = frame_header->frame_width_minus_1,
- .frame_height_minus1 = frame_header->frame_height_minus_1,
- .primary_ref_frame = frame_header->primary_ref_frame,
- .order_hint = frame_header->order_hint,
- .tile_cols = frame_header->tile_cols,
- .tile_rows = frame_header->tile_rows,
- .context_update_tile_id = frame_header->context_update_tile_id,
- .superres_scale_denominator = frame_header->use_superres ?
- frame_header->coded_denom + AV1_SUPERRES_DENOM_MIN :
- AV1_SUPERRES_NUM,
- .interp_filter = frame_header->interpolation_filter,
- .filter_level[0] = frame_header->loop_filter_level[0],
- .filter_level[1] = frame_header->loop_filter_level[1],
- .filter_level_u = frame_header->loop_filter_level[2],
- .filter_level_v = frame_header->loop_filter_level[3],
- .base_qindex = frame_header->base_q_idx,
- .y_dc_delta_q = frame_header->delta_q_y_dc,
- .u_dc_delta_q = frame_header->delta_q_u_dc,
- .u_ac_delta_q = frame_header->delta_q_u_ac,
- .v_dc_delta_q = frame_header->delta_q_v_dc,
- .v_ac_delta_q = frame_header->delta_q_v_ac,
- .cdef_damping_minus_3 = frame_header->cdef_damping_minus_3,
- .cdef_bits = frame_header->cdef_bits,
+ .profile = seq->seq_profile,
+ .order_hint_bits_minus_1 = seq->order_hint_bits_minus_1,
+ .bit_depth_idx = bit_depth_idx,
+ .current_frame = pic->output_surface,
+ .current_display_picture = pic->output_surface,
+ .frame_width_minus1 = frame_header->frame_width_minus_1,
+ .frame_height_minus1 = frame_header->frame_height_minus_1,
+ .primary_ref_frame = frame_header->primary_ref_frame,
+ .order_hint = frame_header->order_hint,
+ .tile_cols = frame_header->tile_cols,
+ .tile_rows = frame_header->tile_rows,
+ .context_update_tile_id = frame_header->context_update_tile_id,
+ .interp_filter = frame_header->interpolation_filter,
+ .filter_level[0] = frame_header->loop_filter_level[0],
+ .filter_level[1] = frame_header->loop_filter_level[1],
+ .filter_level_u = frame_header->loop_filter_level[2],
+ .filter_level_v = frame_header->loop_filter_level[3],
+ .base_qindex = frame_header->base_q_idx,
+ .cdef_damping_minus_3 = frame_header->cdef_damping_minus_3,
+ .cdef_bits = frame_header->cdef_bits,
.seq_info_fields.fields = {
.still_picture = seq->still_picture,
.use_128x128_superblock = seq->use_128x128_superblock,
@@ -246,15 +162,12 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx,
.mode_ref_delta_update = frame_header->loop_filter_delta_update,
},
.mode_control_fields.bits = {
- .delta_q_present_flag = frame_header->delta_q_present,
- .log2_delta_q_res = frame_header->delta_q_res,
- .delta_lf_present_flag = frame_header->delta_lf_present,
- .log2_delta_lf_res = frame_header->delta_lf_res,
- .delta_lf_multi = frame_header->delta_lf_multi,
- .tx_mode = frame_header->tx_mode,
- .reference_select = frame_header->reference_select,
- .reduced_tx_set_used = frame_header->reduced_tx_set,
- .skip_mode_present = frame_header->skip_mode_present,
+ .delta_q_present_flag = frame_header->delta_q_present,
+ .log2_delta_q_res = frame_header->delta_q_res,
+ .tx_mode = frame_header->tx_mode,
+ .reference_select = frame_header->reference_select,
+ .reduced_tx_set_used = frame_header->reduced_tx_set,
+ .skip_mode_present = frame_header->skip_mode_present,
},
.loop_restoration_fields.bits = {
.yframe_restoration_type = remap_lr_type[frame_header->lr_type[0]],
@@ -265,9 +178,6 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx,
},
.qmatrix_fields.bits = {
.using_qmatrix = frame_header->using_qmatrix,
- .qm_y = frame_header->qm_y,
- .qm_u = frame_header->qm_u,
- .qm_v = frame_header->qm_v,
}
};
@@ -275,9 +185,7 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx,
if (pic_param.pic_info_fields.bits.frame_type == AV1_FRAME_KEY)
pic_param.ref_frame_map[i] = VA_INVALID_ID;
else
- pic_param.ref_frame_map[i] = ctx->ref_tab[i].valid ?
- ff_vaapi_get_surface_id(ctx->ref_tab[i].frame.f) :
- vaapi_av1_surface_id(&s->ref[i]);
+ pic_param.ref_frame_map[i] = vaapi_av1_surface_id(&s->ref[i]);
}
for (int i = 0; i < AV1_REFS_PER_FRAME; i++) {
pic_param.ref_frame_idx[i] = frame_header->ref_frame_idx[i];
@@ -305,22 +213,10 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx,
frame_header->height_in_sbs_minus_1[i];
}
for (int i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF; i++) {
- pic_param.wm[i - 1].invalid = s->cur_frame.gm_invalid[i];
- pic_param.wm[i - 1].wmtype = s->cur_frame.gm_type[i];
+ pic_param.wm[i - 1].wmtype = s->cur_frame.gm_type[i];
for (int j = 0; j < 6; j++)
pic_param.wm[i - 1].wmmat[j] = s->cur_frame.gm_params[i][j];
}
- for (int i = 0; i < AV1_MAX_SEGMENTS; i++) {
- for (int j = 0; j < AV1_SEG_LVL_MAX; j++) {
- pic_param.seg_info.feature_mask[i] |= (frame_header->feature_enabled[i][j] << j);
- if (segmentation_feature_signed[j])
- pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j],
- -segmentation_feature_max[j], segmentation_feature_max[j]);
- else
- pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j],
- 0, segmentation_feature_max[j]);
- }
- }
if (apply_grain) {
for (int i = 0; i < film_grain->num_y_points; i++) {
pic_param.film_grain_info.point_y_value[i] =
@@ -367,34 +263,8 @@ fail:
static int vaapi_av1_end_frame(AVCodecContext *avctx)
{
const AV1DecContext *s = avctx->priv_data;
- const AV1RawFrameHeader *header = s->raw_frame_header;
- const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain;
VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private;
- VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data;
-
- int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain;
- int ret;
- ret = ff_vaapi_decode_issue(avctx, pic);
- if (ret < 0)
- return ret;
-
- for (int i = 0; i < AV1_NUM_REF_FRAMES; i++) {
- if (header->refresh_frame_flags & (1 << i)) {
- if (ctx->ref_tab[i].frame.f->buf[0])
- ff_thread_release_buffer(avctx, &ctx->ref_tab[i].frame);
-
- if (apply_grain) {
- ret = ff_thread_ref_frame(&ctx->ref_tab[i].frame, &ctx->tmp_frame);
- if (ret < 0)
- return ret;
- ctx->ref_tab[i].valid = 1;
- } else {
- ctx->ref_tab[i].valid = 0;
- }
- }
- }
-
- return 0;
+ return ff_vaapi_decode_issue(avctx, pic);
}
static int vaapi_av1_decode_slice(AVCodecContext *avctx,
@@ -441,9 +311,9 @@ const AVHWAccel ff_av1_vaapi_hwaccel = {
.end_frame = vaapi_av1_end_frame,
.decode_slice = vaapi_av1_decode_slice,
.frame_priv_data_size = sizeof(VAAPIDecodePicture),
- .init = vaapi_av1_decode_init,
- .uninit = vaapi_av1_decode_uninit,
+ .init = ff_vaapi_decode_init,
+ .uninit = ff_vaapi_decode_uninit,
.frame_params = ff_vaapi_common_frame_params,
- .priv_data_size = sizeof(VAAPIAV1DecContext),
+ .priv_data_size = sizeof(VAAPIDecodeContext),
.caps_internal = HWACCEL_CAP_ASYNC_SAFE,
};
diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index 032e8531f2..57a0eb4e6e 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -577,10 +577,10 @@ static int vaapi_decode_make_config(AVCodecContext *avctx,
switch (avctx->codec_id) {
case AV_CODEC_ID_H264:
case AV_CODEC_ID_HEVC:
- case AV_CODEC_ID_AV1:
frames->initial_pool_size += 16;
break;
case AV_CODEC_ID_VP9:
+ case AV_CODEC_ID_AV1:
frames->initial_pool_size += 8;
break;
case AV_CODEC_ID_VP8:
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index b1fa3307cc..607858435f 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -2366,11 +2366,6 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
VAStatus vas;
int err;
- ctx->va_config = VA_INVALID_ID;
- ctx->va_context = VA_INVALID_ID;
-
- /* If you add something that can fail above this av_frame_alloc(),
- * modify ff_vaapi_encode_close() accordingly. */
ctx->frame = av_frame_alloc();
if (!ctx->frame) {
return AVERROR(ENOMEM);
@@ -2382,6 +2377,9 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
return AVERROR(EINVAL);
}
+ ctx->va_config = VA_INVALID_ID;
+ ctx->va_context = VA_INVALID_ID;
+
ctx->input_frames_ref = av_buffer_ref(avctx->hw_frames_ctx);
if (!ctx->input_frames_ref) {
err = AVERROR(ENOMEM);
@@ -2533,11 +2531,6 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
VAAPIEncodeContext *ctx = avctx->priv_data;
VAAPIEncodePicture *pic, *next;
- /* We check ctx->frame to know whether ff_vaapi_encode_init()
- * has been called and va_config/va_context initialized. */
- if (!ctx->frame)
- return 0;
-
for (pic = ctx->pic_start; pic; pic = next) {
next = pic->next;
vaapi_encode_free(avctx, pic);
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index d4ceb60791..fb7f839c5e 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
size = next - start - 4;
if (size <= 0)
continue;
- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
init_get_bits(&gb, buf2, buf2_size * 8);
switch (AV_RB32(start)) {
case VC1_CODE_SEQHDR:
@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
case VC1_CODE_FRAME:
if (avctx->hwaccel)
buf_start = start;
- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
break;
case VC1_CODE_FIELD: {
int buf_size3;
@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(start + 4, size,
- slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+ slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
break;
}
case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
init_get_bits(&s->gb, buf2, buf_size2 * 8);
ff_vc1_decode_entry_point(avctx, v, &s->gb);
break;
@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(start + 4, size,
- slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+ slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
n_slices1 = n_slices - 1;
n_slices++;
}
- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
} else {
- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
}
init_get_bits(&s->gb, buf2, buf_size2*8);
} else
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index c25a6f3adf..10182786b3 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -32,6 +32,7 @@
#include "rnd_avg.h"
#include "vc1dsp.h"
#include "startcode.h"
+#include "vc1_common.h"
/* Apply overlap transform to horizontal edge */
static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer;
if (ARCH_AARCH64)
ff_vc1dsp_init_aarch64(dsp);
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index 75db62b1b4..e192b431be 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
* one or more further zero bytes and a one byte.
*/
int (*startcode_find_candidate)(const uint8_t *buf, int size);
+
+ /* Copy a buffer, removing startcode emulation escape bytes as we go */
+ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
} VC1DSPContext;
void ff_vc1dsp_init(VC1DSPContext* c);
diff --git a/libavcodec/videodsp_template.c b/libavcodec/videodsp_template.c
index 8743d725c6..55123a5844 100644
--- a/libavcodec/videodsp_template.c
+++ b/libavcodec/videodsp_template.c
@@ -60,7 +60,7 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
av_assert2(start_x < end_x && block_w);
w = end_x - start_x;
- src += start_y * src_linesize + start_x * (ptrdiff_t)sizeof(pixel);
+ src += start_y * src_linesize + start_x * sizeof(pixel);
buf += start_x * sizeof(pixel);
// top
@@ -83,7 +83,7 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
buf += buf_linesize;
}
- buf -= block_h * buf_linesize + start_x * (ptrdiff_t)sizeof(pixel);
+ buf -= block_h * buf_linesize + start_x * sizeof(pixel);
while (block_h--) {
pixel *bufp = (pixel *) buf;
diff --git a/libavcodec/videotoolbox.c b/libavcodec/videotoolbox.c
index 2357401412..49e726a75f 100644
--- a/libavcodec/videotoolbox.c
+++ b/libavcodec/videotoolbox.c
@@ -608,7 +608,8 @@ static void videotoolbox_decoder_callback(void *opaque,
CMTime pts,
CMTime duration)
{
- VTContext *vtctx = opaque;
+ AVCodecContext *avctx = opaque;
+ VTContext *vtctx = avctx->internal->hwaccel_priv_data;
if (vtctx->frame) {
CVPixelBufferRelease(vtctx->frame);
@@ -616,8 +617,7 @@ static void videotoolbox_decoder_callback(void *opaque,
}
if (!image_buffer) {
- av_log(vtctx->logctx, AV_LOG_DEBUG,
- "vt decoder cb: output image buffer is null: %i\n", status);
+ av_log(avctx, AV_LOG_DEBUG, "vt decoder cb: output image buffer is null\n");
return;
}
@@ -828,7 +828,7 @@ static int videotoolbox_start(AVCodecContext *avctx)
videotoolbox->cv_pix_fmt_type);
decoder_cb.decompressionOutputCallback = videotoolbox_decoder_callback;
- decoder_cb.decompressionOutputRefCon = avctx->internal->hwaccel_priv_data;
+ decoder_cb.decompressionOutputRefCon = avctx;
status = VTDecompressionSessionCreate(NULL, // allocator
videotoolbox->cm_fmt_desc, // videoFormatDescription
@@ -1040,8 +1040,6 @@ static int videotoolbox_common_init(AVCodecContext *avctx)
AVHWFramesContext *hw_frames;
int err;
- vtctx->logctx = avctx;
-
// Old API - do nothing.
if (avctx->hwaccel_context)
return 0;
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 90c889182c..57c6eb1ff9 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2683,27 +2683,15 @@ static int vp3_decode_frame(AVCodecContext *avctx,
if ((ret = ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF)) < 0)
goto error;
- if (!s->edge_emu_buffer) {
+ if (!s->edge_emu_buffer)
s->edge_emu_buffer = av_malloc(9 * FFABS(s->current_frame.f->linesize[0]));
- if (!s->edge_emu_buffer) {
- ret = AVERROR(ENOMEM);
- goto error;
- }
- }
if (s->keyframe) {
if (!s->theora) {
skip_bits(&gb, 4); /* width code */
skip_bits(&gb, 4); /* height code */
if (s->version) {
- int version = get_bits(&gb, 5);
-#if !CONFIG_VP4_DECODER
- if (version >= 2) {
- av_log(avctx, AV_LOG_ERROR, "This build does not support decoding VP4.\n");
- return AVERROR_DECODER_NOT_FOUND;
- }
-#endif
- s->version = version;
+ s->version = get_bits(&gb, 5);
if (avctx->frame_number == 0)
av_log(s->avctx, AV_LOG_DEBUG,
"VP version: %d\n", s->version);
diff --git a/libavcodec/vqavideo.c b/libavcodec/vqavideo.c
index d0e1927444..f45390cfe5 100644
--- a/libavcodec/vqavideo.c
+++ b/libavcodec/vqavideo.c
@@ -588,14 +588,13 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
if (s->partial_countdown <= 0) {
bytestream2_init(&s->gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
/* decompress codebook */
- res = decode_format80(s, s->next_codebook_buffer_index,
- s->codebook, s->codebook_size, 0);
+ if ((res = decode_format80(s, s->next_codebook_buffer_index,
+ s->codebook, s->codebook_size, 0)) < 0)
+ return res;
/* reset accounting */
s->next_codebook_buffer_index = 0;
s->partial_countdown = s->partial_count;
- if (res < 0)
- return res;
}
}
diff --git a/libavcodec/vt_internal.h b/libavcodec/vt_internal.h
index 08d9c77090..fb64735b8c 100644
--- a/libavcodec/vt_internal.h
+++ b/libavcodec/vt_internal.h
@@ -42,8 +42,6 @@ typedef struct VTContext {
// Current H264 parameters (used to trigger decoder restart on SPS changes).
uint8_t sps[3];
bool reconfig_needed;
-
- void *logctx;
} VTContext;
int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame);
diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index 4b865087bb..2d49172eaf 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -128,7 +128,7 @@ static av_always_inline unsigned get_tail(GetBitContext *gb, int k)
e = (1 << (p + 1)) - k - 1;
res = get_bitsz(gb, p);
if (res >= e)
- res = res * 2U - e + get_bits1(gb);
+ res = (res << 1) - e + get_bits1(gb);
return res;
}
@@ -498,8 +498,6 @@ static int wv_unpack_dsd_high(WavpackFrameContext *s, uint8_t *dst_left, uint8_t
sp[0].fltr0 = 0;
}
- if (DSD_BYTE_READY(high, low) && !bytestream2_get_bytes_left(&s->gbyte))
- return AVERROR_INVALIDDATA;
while (DSD_BYTE_READY(high, low) && bytestream2_get_bytes_left(&s->gbyte)) {
value = (value << 8) | bytestream2_get_byte(&s->gbyte);
high = (high << 8) | 0xff;
@@ -535,8 +533,6 @@ static int wv_unpack_dsd_high(WavpackFrameContext *s, uint8_t *dst_left, uint8_t
sp[1].fltr0 = 0;
}
- if (DSD_BYTE_READY(high, low) && !bytestream2_get_bytes_left(&s->gbyte))
- return AVERROR_INVALIDDATA;
while (DSD_BYTE_READY(high, low) && bytestream2_get_bytes_left(&s->gbyte)) {
value = (value << 8) | bytestream2_get_byte(&s->gbyte);
high = (high << 8) | 0xff;
diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
new file mode 100644
index 0000000000..f234a985b9
--- /dev/null
+++ b/libavcodec/weak_link.c
@@ -0,0 +1,102 @@
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include "weak_link.h"
+
+struct ff_weak_link_master {
+ atomic_int ref_count; /* 0 is single ref for easier atomics */
+ pthread_rwlock_t lock;
+ void * ptr;
+};
+
+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
+{
+ return (struct ff_weak_link_master *)c;
+}
+
+struct ff_weak_link_master * ff_weak_link_new(void * p)
+{
+ struct ff_weak_link_master * w = malloc(sizeof(*w));
+ if (!w)
+ return NULL;
+    w->ptr = p;
+    atomic_init(&w->ref_count, 0); // start at single ref (0 by convention)
+ if (pthread_rwlock_init(&w->lock, NULL)) {
+ free(w);
+ return NULL;
+ }
+ return w;
+}
+
+static void weak_link_do_unref(struct ff_weak_link_master * const w)
+{
+ int n = atomic_fetch_sub(&w->ref_count, 1);
+ if (n)
+ return;
+
+ pthread_rwlock_destroy(&w->lock);
+ free(w);
+}
+
+// Unref & break link
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
+{
+ struct ff_weak_link_master * const w = *ppLink;
+ if (!w)
+ return;
+
+ *ppLink = NULL;
+ pthread_rwlock_wrlock(&w->lock);
+ w->ptr = NULL;
+ pthread_rwlock_unlock(&w->lock);
+
+ weak_link_do_unref(w);
+}
+
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
+{
+ if (!w)
+ return NULL;
+ atomic_fetch_add(&w->ref_count, 1);
+ return (struct ff_weak_link_client*)w;
+}
+
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
+{
+ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+ if (!w)
+ return;
+
+ *ppLink = NULL;
+ weak_link_do_unref(w);
+}
+
+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
+{
+ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+
+ if (!w)
+ return NULL;
+
+ if (pthread_rwlock_rdlock(&w->lock))
+ goto broken;
+
+ if (w->ptr)
+ return w->ptr;
+
+ pthread_rwlock_unlock(&w->lock);
+
+broken:
+ *ppLink = NULL;
+ weak_link_do_unref(w);
+ return NULL;
+}
+
+// Ignores a NULL c (so can be on the return path of both broken & live links)
+void ff_weak_link_unlock(struct ff_weak_link_client * c)
+{
+ struct ff_weak_link_master * const w = weak_link_x(c);
+ if (w)
+ pthread_rwlock_unlock(&w->lock);
+}
+
+
diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
new file mode 100644
index 0000000000..415b6a27a0
--- /dev/null
+++ b/libavcodec/weak_link.h
@@ -0,0 +1,23 @@
+struct ff_weak_link_master;
+struct ff_weak_link_client;
+
+struct ff_weak_link_master * ff_weak_link_new(void * p);
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
+
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
+
+// Returns NULL if link broken - in this case it will also zap
+// *ppLink and unref the weak_link.
+// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
+//
+// The above does mean that there is a race if this is called simultaneously
+// by two threads using the same weak_link_client (so don't do that)
+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
+void ff_weak_link_unlock(struct ff_weak_link_client * c);
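+
+// Typical client-side pattern (illustrative):
+//     void *p = ff_weak_link_lock(&c);
+//     if (p) {
+//         // ... use p ...
+//         ff_weak_link_unlock(c);
+//     }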
+
+
+
+
+
+
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index f5408a1789..8710414936 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -980,7 +980,6 @@ AVCodec ff_wmav1_decoder = {
.capabilities = AV_CODEC_CAP_DR1,
.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
AV_SAMPLE_FMT_NONE },
- .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
};
#endif
#if CONFIG_WMAV2_DECODER
@@ -997,6 +996,5 @@ AVCodec ff_wmav2_decoder = {
.capabilities = AV_CODEC_CAP_DR1,
.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
AV_SAMPLE_FMT_NONE },
- .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
};
#endif
diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index a28a0c387b..6a7e23d016 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -436,7 +436,6 @@ AVCodec ff_wmav1_encoder = {
.close = ff_wma_end,
.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
AV_SAMPLE_FMT_NONE },
- .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
};
#endif
#if CONFIG_WMAV2_ENCODER
@@ -451,6 +450,5 @@ AVCodec ff_wmav2_encoder = {
.close = ff_wma_end,
.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
AV_SAMPLE_FMT_NONE },
- .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
};
#endif
diff --git a/libavcodec/wnv1.c b/libavcodec/wnv1.c
index fd9721f4ca..dcf417763c 100644
--- a/libavcodec/wnv1.c
+++ b/libavcodec/wnv1.c
@@ -126,9 +126,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
{
static AVOnce init_static_once = AV_ONCE_INIT;
- if (avctx->width <= 1)
- return AVERROR_INVALIDDATA;
-
avctx->pix_fmt = AV_PIX_FMT_YUV422P;
ff_thread_once(&init_static_once, wnv1_init_static);
diff --git a/libavcodec/xpmdec.c b/libavcodec/xpmdec.c
index 6db95285ce..993873c595 100644
--- a/libavcodec/xpmdec.c
+++ b/libavcodec/xpmdec.c
@@ -355,9 +355,6 @@ static int xpm_decode_frame(AVCodecContext *avctx, void *data,
return AVERROR_INVALIDDATA;
}
- if (size > SIZE_MAX / 4)
- return AVERROR(ENOMEM);
-
size *= 4;
ptr += mod_strcspn(ptr, ",") + 1;
diff --git a/libavcodec/zmbvenc.c b/libavcodec/zmbvenc.c
index d050cc2ef0..319381dd48 100644
--- a/libavcodec/zmbvenc.c
+++ b/libavcodec/zmbvenc.c
@@ -73,7 +73,6 @@ typedef struct ZmbvEncContext {
int keyint, curfrm;
int bypp;
enum ZmbvFormat fmt;
- int zlib_init_ok;
z_stream zstream;
int score_tab[ZMBV_BLOCK * ZMBV_BLOCK * 4 + 1];
@@ -311,9 +310,8 @@ static av_cold int encode_end(AVCodecContext *avctx)
av_freep(&c->comp_buf);
av_freep(&c->work_buf);
+ deflateEnd(&c->zstream);
av_freep(&c->prev_buf);
- if (c->zlib_init_ok)
- deflateEnd(&c->zstream);
return 0;
}
@@ -383,6 +381,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
return AVERROR(EINVAL);
}
+ // Needed if zlib unused or init aborted before deflateInit
+ memset(&c->zstream, 0, sizeof(z_stream));
c->comp_size = avctx->width * c->bypp * avctx->height + 1024 +
((avctx->width + ZMBV_BLOCK - 1) / ZMBV_BLOCK) * ((avctx->height + ZMBV_BLOCK - 1) / ZMBV_BLOCK) * 2 + 4;
if (!(c->work_buf = av_malloc(c->comp_size))) {
@@ -424,7 +424,6 @@ static av_cold int encode_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
return -1;
}
- c->zlib_init_ok = 1;
return 0;
}
@@ -446,5 +445,4 @@ AVCodec ff_zmbv_encoder = {
#endif //ZMBV_ENABLE_24BPP
AV_PIX_FMT_BGR0,
AV_PIX_FMT_NONE },
- .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
};
diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index 0dfe47a1f4..ec7c7b4147 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -47,6 +47,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o
OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
+OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
+OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o
+OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o
OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
OBJS-$(CONFIG_XV_OUTDEV) += xv.o
diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
index 92b27a1d14..19d2a9de55 100644
--- a/libavdevice/alldevices.c
+++ b/libavdevice/alldevices.c
@@ -53,6 +53,9 @@ extern AVOutputFormat ff_sndio_muxer;
extern AVInputFormat ff_v4l2_demuxer;
extern AVOutputFormat ff_v4l2_muxer;
extern AVInputFormat ff_vfwcap_demuxer;
+extern AVOutputFormat ff_vout_drm_muxer;
+extern AVOutputFormat ff_vout_egl_muxer;
+extern AVOutputFormat ff_vout_rpi_muxer;
extern AVInputFormat ff_xcbgrab_demuxer;
extern AVOutputFormat ff_xv_muxer;
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
new file mode 100644
index 0000000000..c7b90e6dd8
--- /dev/null
+++ b/libavdevice/drm_vout.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+// limited to testing.
+
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavformat/internal.h"
+#include "avdevice.h"
+
+#include "pthread.h"
+#include <semaphore.h>
+#include <unistd.h>
+
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+#include <drm_fourcc.h>
+
+#define TRACE_ALL 0
+
+#define DRM_MODULE "vc4"
+
+#define ERRSTR strerror(errno)
+
+struct drm_setup {
+ int conId;
+ uint32_t crtcId;
+ int crtcIdx;
+ uint32_t planeId;
+ unsigned int out_fourcc;
+ struct {
+ int x, y, width, height;
+ } compose;
+};
+
+typedef struct drm_aux_s {
+ unsigned int fb_handle;
+ uint32_t bo_handles[AV_DRM_MAX_PLANES];
+ AVFrame * frame;
+} drm_aux_t;
+
+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
+// we get initial flicker probably due to dodgy drm timing
+#define AUX_SIZE 3
+typedef struct drm_display_env_s
+{
+ AVClass *class;
+
+ int drm_fd;
+ uint32_t con_id;
+ struct drm_setup setup;
+ enum AVPixelFormat avfmt;
+
+ int show_all;
+ const char * drm_module;
+
+ unsigned int ano;
+ drm_aux_t aux[AUX_SIZE];
+
+ pthread_t q_thread;
+ sem_t q_sem_in;
+ sem_t q_sem_out;
+ int q_terminate;
+ AVFrame * q_next;
+
+} drm_display_env_t;
+
+
+static int drm_vout_write_trailer(AVFormatContext *s)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+ return 0;
+}
+
+static int drm_vout_write_header(AVFormatContext *s)
+{
+ const AVCodecParameters * const par = s->streams[0]->codecpar;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+ if ( s->nb_streams > 1
+ || par->codec_type != AVMEDIA_TYPE_VIDEO
+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+static int find_plane(struct AVFormatContext * const avctx,
+ const int drmfd, const int crtcidx, const uint32_t format,
+ uint32_t * const pplane_id)
+{
+ drmModePlaneResPtr planes;
+ drmModePlanePtr plane;
+ drmModeObjectPropertiesPtr props = NULL;
+ drmModePropertyPtr prop = NULL;
+ unsigned int i;
+ unsigned int j;
+ int ret = -1;
+
+ planes = drmModeGetPlaneResources(drmfd);
+ if (!planes)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
+ return -1;
+ }
+
+ for (i = 0; i < planes->count_planes; ++i) {
+ plane = drmModeGetPlane(drmfd, planes->planes[i]);
+        if (!plane)
+        {
+            av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
+            goto fail;
+        }
+
+ if (!(plane->possible_crtcs & (1 << crtcidx))) {
+ drmModeFreePlane(plane);
+ continue;
+ }
+
+ for (j = 0; j < plane->count_formats; ++j) {
+ if (plane->formats[j] == format)
+ break;
+ }
+
+ if (j == plane->count_formats) {
+ drmModeFreePlane(plane);
+ continue;
+ }
+
+ *pplane_id = plane->plane_id;
+ drmModeFreePlane(plane);
+ break;
+ }
+
+ if (i == planes->count_planes) {
+ ret = -1;
+ goto fail;
+ }
+
+ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE);
+ if (!props)
+ goto fail;
+ for (i = 0; i != props->count_props; ++i) {
+ if (prop)
+ drmModeFreeProperty(prop);
+ prop = drmModeGetProperty(drmfd, props->props[i]);
+ if (!prop)
+ goto fail;
+ if (strcmp("zpos", prop->name) == 0) {
+ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0)
+ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]);
+ else
+ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n");
+ break;
+ }
+ }
+
+ ret = 0;
+fail:
+ if (props)
+ drmModeFreeObjectProperties(props);
+ if (prop)
+ drmModeFreeProperty(prop);
+ drmModeFreePlaneResources(planes);
+ return ret;
+}
+
+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
+{
+ if (da->fb_handle != 0) {
+ drmModeRmFB(de->drm_fd, da->fb_handle);
+ da->fb_handle = 0;
+ }
+
+ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
+ if (da->bo_handles[i]) {
+ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
+ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+ da->bo_handles[i] = 0;
+ }
+ }
+ av_frame_free(&da->frame);
+}
+
+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
+{
+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
+ drm_aux_t * da = de->aux + de->ano;
+ const uint32_t format = desc->layers[0].format;
+ int ret = 0;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
+#endif
+
+ if (de->setup.out_fourcc != format) {
+ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
+ av_frame_free(&frame);
+ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
+ return -1;
+ }
+ de->setup.out_fourcc = format;
+ }
+
+ {
+ drmVBlank vbl = {
+ .request = {
+ .type = DRM_VBLANK_RELATIVE,
+ .sequence = 0
+ }
+ };
+
+ while (drmWaitVBlank(de->drm_fd, &vbl)) {
+ if (errno != EINTR) {
+// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
+ break;
+ }
+ }
+ }
+
+ da_uninit(de, da);
+
+ {
+ uint32_t pitches[4] = {0};
+ uint32_t offsets[4] = {0};
+ uint64_t modifiers[4] = {0};
+ uint32_t bo_handles[4] = {0};
+ int has_mods = 0;
+ int i, j, n;
+
+ da->frame = frame;
+
+ for (i = 0; i < desc->nb_objects; ++i) {
+ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
+ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
+ return -1;
+ }
+ if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR &&
+ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID)
+ has_mods = 1;
+ }
+
+ n = 0;
+ for (i = 0; i < desc->nb_layers; ++i) {
+ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
+ pitches[n] = p->pitch;
+ offsets[n] = p->offset;
+ modifiers[n] = obj->format_modifier;
+ bo_handles[n] = da->bo_handles[p->object_index];
+ ++n;
+ }
+ }
+
+#if 1 && TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format,
+ bo_handles[0],
+ bo_handles[1],
+ bo_handles[2],
+ bo_handles[3],
+ pitches[0],
+ pitches[1],
+ pitches[2],
+ pitches[3],
+ offsets[0],
+ offsets[1],
+ offsets[2],
+ offsets[3],
+ (long long)modifiers[0],
+ (long long)modifiers[1],
+ (long long)modifiers[2],
+ (long long)modifiers[3]
+ );
+#endif
+
+ if (drmModeAddFB2WithModifiers(de->drm_fd,
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format, bo_handles,
+ pitches, offsets,
+ has_mods ? modifiers : NULL,
+ &da->fb_handle,
+ has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) {
+ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
+ return -1;
+ }
+ }
+
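+    // drmModeSetPlane takes 16.16 fixed-point source coordinates,
+    // hence the << 16 on the cropped source width/height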
+ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
+ da->fb_handle, 0,
+ de->setup.compose.x, de->setup.compose.y,
+ de->setup.compose.width,
+ de->setup.compose.height,
+ 0, 0,
+ av_frame_cropped_width(frame) << 16,
+ av_frame_cropped_height(frame) << 16);
+
+ if (ret != 0) {
+ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
+ }
+
+ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
+
+ return ret;
+}
+
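+// Wait on sem, retrying on EINTR; with nowait set this polls with
+// sem_trywait and returns -EAGAIN rather than blocking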
+static int do_sem_wait(sem_t * const sem, const int nowait)
+{
+ while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static void * display_thread(void * v)
+{
+ AVFormatContext * const s = v;
+ drm_display_env_t * const de = s->priv_data;
+ int i;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+#endif
+
+ sem_post(&de->q_sem_out);
+
+ for (;;) {
+ AVFrame * frame;
+
+ do_sem_wait(&de->q_sem_in, 0);
+
+ if (de->q_terminate)
+ break;
+
+ frame = de->q_next;
+ de->q_next = NULL;
+ sem_post(&de->q_sem_out);
+
+ do_display(s, de, frame);
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+#endif
+
+ for (i = 0; i != AUX_SIZE; ++i)
+ da_uninit(de, de->aux + i);
+
+ av_frame_free(&de->q_next);
+
+ return NULL;
+}
+
+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ const AVFrame * const src_frame = (AVFrame *)pkt->data;
+ AVFrame * frame;
+ drm_display_env_t * const de = s->priv_data;
+ int ret;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
+ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
+ return 0;
+ }
+
+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        av_frame_ref(frame, src_frame);
+    }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+        if (av_hwframe_map(frame, src_frame, 0) != 0)
+        {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format);
+            av_frame_free(&frame);
+            return AVERROR(EINVAL);
+        }
+    }
+    else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format);
+        return AVERROR(EINVAL);
+    }
+
+ ret = do_sem_wait(&de->q_sem_out, !de->show_all);
+ if (ret) {
+ av_frame_free(&frame);
+ }
+ else {
+ de->q_next = frame;
+ sem_post(&de->q_sem_in);
+ }
+
+ return 0;
+}
+
+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+ unsigned flags)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+#endif
+
+ /* drm_vout_write_header() should have accepted only supported formats */
+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
+ return 0;
+
+ return 0;
+}
+
+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
+#endif
+ switch(type) {
+ case AV_APP_TO_DEV_WINDOW_REPAINT:
+ return 0;
+ default:
+ break;
+ }
+ return AVERROR(ENOSYS);
+}
+
+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
+{
+ int ret = -1;
+ int i;
+ drmModeRes *res = drmModeGetResources(drmfd);
+ drmModeConnector *c;
+
+    if (!res)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drmModeGetResources failed: %s\n", ERRSTR);
+        return -1;
+    }
+
+    if (res->count_crtcs <= 0)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drm: no crtcs\n");
+        goto fail_res;
+    }
+
+ if (!s->conId) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "No connector ID specified. Choosing default from list:\n");
+
+ for (i = 0; i < res->count_connectors; i++) {
+ drmModeConnector *con =
+ drmModeGetConnector(drmfd, res->connectors[i]);
+ drmModeEncoder *enc = NULL;
+ drmModeCrtc *crtc = NULL;
+
+ if (con->encoder_id) {
+ enc = drmModeGetEncoder(drmfd, con->encoder_id);
+ if (enc->crtc_id) {
+ crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
+ }
+ }
+
+ if (!s->conId && crtc) {
+ s->conId = con->connector_id;
+ s->crtcId = crtc->crtc_id;
+ }
+
+ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
+ con->connector_id,
+ crtc ? crtc->crtc_id : 0,
+ con->connector_type,
+ crtc ? crtc->width : 0,
+ crtc ? crtc->height : 0,
+ (s->conId == (int)con->connector_id ?
+ " (chosen)" : ""));
+            drmModeFreeCrtc(crtc);
+            drmModeFreeEncoder(enc);
+            drmModeFreeConnector(con);
+        }
+
+ if (!s->conId) {
+ av_log(avctx, AV_LOG_ERROR,
+ "No suitable enabled connector found.\n");
+            goto fail_res;
+ }
+ }
+
+ s->crtcIdx = -1;
+
+ for (i = 0; i < res->count_crtcs; ++i) {
+ if (s->crtcId == res->crtcs[i]) {
+ s->crtcIdx = i;
+ break;
+ }
+ }
+
+ if (s->crtcIdx == -1)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
+ goto fail_res;
+ }
+
+ if (res->count_connectors <= 0)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
+ goto fail_res;
+ }
+
+ c = drmModeGetConnector(drmfd, s->conId);
+ if (!c)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
+ goto fail_res;
+ }
+
+ if (!c->count_modes)
+ {
+ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
+ goto fail_conn;
+ }
+
+ {
+ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
+ s->compose.x = crtc->x;
+ s->compose.y = crtc->y;
+ s->compose.width = crtc->width;
+ s->compose.height = crtc->height;
+ drmModeFreeCrtc(crtc);
+ }
+
+ if (pConId)
+ *pConId = c->connector_id;
+ ret = 0;
+
+fail_conn:
+ drmModeFreeConnector(c);
+
+fail_res:
+ drmModeFreeResources(res);
+
+ return ret;
+}
+
+// deinit is called if init fails so there is no need to clean up explicitly here
+static int drm_vout_init(struct AVFormatContext * s)
+{
+ drm_display_env_t * const de = s->priv_data;
+ int rv;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->drm_fd = -1;
+ de->con_id = 0;
+ de->setup = (struct drm_setup){0};
+ de->q_terminate = 0;
+
+ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0)
+ {
+ rv = AVERROR(errno);
+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv));
+ return rv;
+ }
+
+ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
+ {
+ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
+ rv = AVERROR(EINVAL);
+ goto fail_close;
+ }
+
+ sem_init(&de->q_sem_in, 0, 0);
+ sem_init(&de->q_sem_out, 0, 0);
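+    // The display thread posts q_sem_out once at startup (see display_thread)
+    // to mark its single queue slot free, so the first write_packet can queue.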
+    if ((rv = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
+        rv = AVERROR(rv); // pthread_create returns the error code; it does not set errno
+ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
+ goto fail_close;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+ return 0;
+
+fail_close:
+ close(de->drm_fd);
+ de->drm_fd = -1;
+ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
+
+ return rv;
+}
+
+static void drm_vout_deinit(struct AVFormatContext * s)
+{
+ drm_display_env_t * const de = s->priv_data;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->q_terminate = 1;
+ sem_post(&de->q_sem_in);
+ pthread_join(de->q_thread, NULL);
+ sem_destroy(&de->q_sem_in);
+ sem_destroy(&de->q_sem_out);
+
+ for (unsigned int i = 0; i != AUX_SIZE; ++i)
+ da_uninit(de, de->aux + i);
+
+ av_frame_free(&de->q_next);
+
+ if (de->drm_fd >= 0) {
+ close(de->drm_fd);
+ de->drm_fd = -1;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+
+#define OFFSET(x) offsetof(drm_display_env_t, x)
+static const AVOption options[] = {
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+};
+
+static const AVClass drm_vout_class = {
+ .class_name = "drm vid outdev",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+AVOutputFormat ff_vout_drm_muxer = {
+ .name = "vout_drm",
+    .long_name = NULL_IF_CONFIG_SMALL("DRM video output device"),
+ .priv_data_size = sizeof(drm_display_env_t),
+ .audio_codec = AV_CODEC_ID_NONE,
+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .write_header = drm_vout_write_header,
+ .write_packet = drm_vout_write_packet,
+ .write_uncoded_frame = drm_vout_write_frame,
+ .write_trailer = drm_vout_write_trailer,
+ .control_message = drm_vout_control_message,
+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+ .priv_class = &drm_vout_class,
+ .init = drm_vout_init,
+ .deinit = drm_vout_deinit,
+};
+
diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
new file mode 100644
index 0000000000..cc6e310551
--- /dev/null
+++ b/libavdevice/egl_vout.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+// limited to testing.
+// Amongst other issues it doesn't wait for the picture to be displayed
+// before returning the buffer, so flickering does occur.
+
+#include <epoxy/gl.h>
+#include <epoxy/egl.h>
+
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavformat/internal.h"
+#include "avdevice.h"
+
+#include "pthread.h"
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <unistd.h>
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+
+#include "libavutil/rpi_sand_fns.h"
+
+#define TRACE_ALL 0
+
+struct egl_setup {
+ int conId;
+
+ Display *dpy;
+ EGLDisplay egl_dpy;
+ EGLContext ctx;
+ EGLSurface surf;
+ Window win;
+
+ uint32_t crtcId;
+ int crtcIdx;
+ uint32_t planeId;
+ struct {
+ int x, y, width, height;
+ } compose;
+};
+
+typedef struct egl_aux_s {
+ int fd;
+ GLuint texture;
+
+} egl_aux_t;
+
+typedef struct egl_display_env_s {
+ AVClass *class;
+
+ struct egl_setup setup;
+ enum AVPixelFormat avfmt;
+
+ int show_all;
+ int window_width, window_height;
+ int window_x, window_y;
+ int fullscreen;
+
+ egl_aux_t aux[32];
+
+ pthread_t q_thread;
+ pthread_mutex_t q_lock;
+ sem_t display_start_sem;
+ sem_t q_sem;
+ int q_terminate;
+ AVFrame *q_this;
+ AVFrame *q_next;
+
+} egl_display_env_t;
+
+
+/**
+ * Remove window border/decorations.
+ */
+static void
+no_border(Display *dpy, Window w)
+{
+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
+
+ typedef struct {
+ unsigned long flags;
+ unsigned long functions;
+ unsigned long decorations;
+ long inputMode;
+ unsigned long status;
+ } PropMotifWmHints;
+
+ PropMotifWmHints motif_hints;
+ Atom prop, proptype;
+ unsigned long flags = 0;
+
+ /* setup the property */
+ motif_hints.flags = MWM_HINTS_DECORATIONS;
+ motif_hints.decorations = flags;
+
+ /* get the atom for the property */
+ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True);
+ if (!prop) {
+ /* something went wrong! */
+ return;
+ }
+
+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
+ proptype = prop;
+
+ XChangeProperty(dpy, w, /* display, window */
+ prop, proptype, /* property, type */
+ 32, /* format: 32-bit datums */
+ PropModeReplace, /* mode */
+ (unsigned char *)&motif_hints, /* data */
+ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */
+ );
+}
+
+
+/*
+ * Create an RGB, double-buffered window.
+ * Return the window and context handles.
+ */
+static int
+make_window(struct AVFormatContext *const s,
+ egl_display_env_t *const de,
+ Display *dpy, EGLDisplay egl_dpy, const char *name,
+ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
+{
+ int scrnum = DefaultScreen(dpy);
+ XSetWindowAttributes attr;
+ unsigned long mask;
+ Window root = RootWindow(dpy, scrnum);
+ Window win;
+ EGLContext ctx;
+ const int fullscreen = de->fullscreen;
+ EGLConfig config;
+ int x = de->window_x;
+ int y = de->window_y;
+ int width = de->window_width ? de->window_width : 1280;
+ int height = de->window_height ? de->window_height : 720;
+
+
+    if (fullscreen) {
+        x = 0; y = 0;
+        width = DisplayWidth(dpy, scrnum);
+        height = DisplayHeight(dpy, scrnum);
+    }
+
+ {
+ EGLint num_configs;
+ static const EGLint attribs[] = {
+ EGL_RED_SIZE, 1,
+ EGL_GREEN_SIZE, 1,
+ EGL_BLUE_SIZE, 1,
+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
+ EGL_NONE
+ };
+
+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
+ return -1;
+ }
+ }
+
+ {
+ EGLint vid;
+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
+ return -1;
+ }
+
+ {
+ XVisualInfo visTemplate = {
+ .visualid = vid,
+ };
+ int num_visuals;
+            XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
+                                                  &visTemplate, &num_visuals);
+            if (!visinfo) {
+                av_log(s, AV_LOG_ERROR, "Error: no X visual found for EGL config\n");
+                return -1;
+            }
+
+ /* window attributes */
+ attr.background_pixel = 0;
+ attr.border_pixel = 0;
+ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone);
+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
+ /* XXX this is a bad way to get a borderless window! */
+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
+
+ win = XCreateWindow(dpy, root, x, y, width, height,
+ 0, visinfo->depth, InputOutput,
+ visinfo->visual, mask, &attr);
+ XFree(visinfo);
+ }
+ }
+
+ if (fullscreen)
+ no_border(dpy, win);
+
+ /* set hints and properties */
+ {
+ XSizeHints sizehints;
+ sizehints.x = x;
+ sizehints.y = y;
+ sizehints.width = width;
+ sizehints.height = height;
+ sizehints.flags = USSize | USPosition;
+ XSetNormalHints(dpy, win, &sizehints);
+ XSetStandardProperties(dpy, win, name, name,
+ None, (char **)NULL, 0, &sizehints);
+ }
+
+ eglBindAPI(EGL_OPENGL_ES_API);
+
+ {
+ static const EGLint ctx_attribs[] = {
+ EGL_CONTEXT_CLIENT_VERSION, 2,
+ EGL_NONE
+ };
+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs);
+ if (!ctx) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
+ return -1;
+ }
+ }
+
+
+ XMapWindow(dpy, win);
+
+ {
+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
+ if (!surf) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
+ return -1;
+ }
+
+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
+            av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
+ return -1;
+ }
+
+ *winRet = win;
+ *ctxRet = ctx;
+ *surfRet = surf;
+ }
+
+ return 0;
+}
+
+static GLint
+compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source)
+{
+ GLuint s = glCreateShader(target);
+
+ if (s == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
+ return 0;
+ }
+
+ glShaderSource(s, 1, (const GLchar **)&source, NULL);
+ glCompileShader(s);
+
+ {
+ GLint ok;
+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
+
+ if (!ok) {
+ GLchar *info;
+ GLint size;
+
+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
+ info = malloc(size);
+
+ glGetShaderInfoLog(s, size, NULL, info);
+            av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
+            free(info);
+
+            return 0;
+ }
+ }
+
+ return s;
+}
+
+static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs)
+{
+ GLuint prog = glCreateProgram();
+
+ if (prog == 0) {
+ av_log(s, AV_LOG_ERROR, "Failed to create program\n");
+ return 0;
+ }
+
+ glAttachShader(prog, vs);
+ glAttachShader(prog, fs);
+ glLinkProgram(prog);
+
+ {
+ GLint ok;
+ glGetProgramiv(prog, GL_LINK_STATUS, &ok);
+ if (!ok) {
+ /* Some drivers return a size of 1 for an empty log. This is the size
+ * of a log that contains only a terminating NUL character.
+ */
+ GLint size;
+ GLchar *info = NULL;
+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
+ if (size > 1) {
+ info = malloc(size);
+ glGetProgramInfoLog(prog, size, NULL, info);
+ }
+
+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
+ (info != NULL) ? info : "<empty log>");
+            free(info);
+            return 0;
+ }
+ }
+
+ return prog;
+}
+
+static int
+gl_setup(struct AVFormatContext *const s)
+{
+ const char *vs =
+ "attribute vec4 pos;\n"
+ "varying vec2 texcoord;\n"
+ "\n"
+ "void main() {\n"
+ " gl_Position = pos;\n"
+ " texcoord.x = (pos.x + 1.0) / 2.0;\n"
+ " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
+ "}\n";
+ const char *fs =
+ "#extension GL_OES_EGL_image_external : enable\n"
+ "precision mediump float;\n"
+ "uniform samplerExternalOES s;\n"
+ "varying vec2 texcoord;\n"
+ "void main() {\n"
+ " gl_FragColor = texture2D(s, texcoord);\n"
+ "}\n";
+
+ GLuint vs_s;
+ GLuint fs_s;
+ GLuint prog;
+
+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
+ !(prog = link_program(s, vs_s, fs_s)))
+ return -1;
+
+ glUseProgram(prog);
+
+ {
+ static const float verts[] = {
+ -1, -1,
+ 1, -1,
+ 1, 1,
+ -1, 1,
+ };
+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
+ }
+
+ glEnableVertexAttribArray(0);
+ return 0;
+}
+
+static int egl_vout_write_trailer(AVFormatContext *s)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+ return 0;
+}
+
+static int egl_vout_write_header(AVFormatContext *s)
+{
+ const AVCodecParameters *const par = s->streams[0]->codecpar;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+ if (s->nb_streams > 1
+ || par->codec_type != AVMEDIA_TYPE_VIDEO
+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+
+static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame)
+{
+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
+ egl_aux_t *da = NULL;
+ unsigned int i;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
+#endif
+
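+    // Find the slot already caching this dmabuf fd, or claim the first free one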
+ for (i = 0; i != 32; ++i) {
+ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
+ da = de->aux + i;
+ break;
+ }
+ }
+
+ if (da == NULL) {
+ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
+ return AVERROR(EINVAL);
+ }
+
+ if (da->texture == 0) {
+ EGLint attribs[50];
+ EGLint *a = attribs;
+ int i, j;
+ static const EGLint anames[] = {
+ EGL_DMA_BUF_PLANE0_FD_EXT,
+ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE0_PITCH_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE1_FD_EXT,
+ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE1_PITCH_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE2_FD_EXT,
+ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE2_PITCH_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
+ };
+ const EGLint *b = anames;
+
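+        // Build the EGL_LINUX_DMA_BUF_EXT attribute list from the DRM frame
+        // descriptor: width/height/fourcc first, then fd/offset/pitch (and,
+        // when non-zero, the format modifier) for each plane in turn.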
+ *a++ = EGL_WIDTH;
+ *a++ = av_frame_cropped_width(frame);
+ *a++ = EGL_HEIGHT;
+ *a++ = av_frame_cropped_height(frame);
+ *a++ = EGL_LINUX_DRM_FOURCC_EXT;
+ *a++ = desc->layers[0].format;
+
+ for (i = 0; i < desc->nb_layers; ++i) {
+ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
+ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j;
+ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index;
+ *a++ = *b++;
+ *a++ = obj->fd;
+ *a++ = *b++;
+ *a++ = p->offset;
+ *a++ = *b++;
+ *a++ = p->pitch;
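+                // A zero modifier means linear; skip the MODIFIER_LO/HI
+                // attributes rather than passing them explicitly.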
+ if (obj->format_modifier == 0) {
+ b += 2;
+ }
+ else {
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier >> 32);
+ }
+ }
+ }
+
+ *a = EGL_NONE;
+
+#if TRACE_ALL
+ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
+ }
+#endif
+ {
+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
+ EGL_NO_CONTEXT,
+ EGL_LINUX_DMA_BUF_EXT,
+ NULL, attribs);
+ if (!image) {
+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
+ return -1;
+ }
+
+ glGenTextures(1, &da->texture);
+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
+
+ eglDestroyImageKHR(de->setup.egl_dpy, image);
+ }
+
+ da->fd = desc->objects[0].fd;
+ }
+
+ glClearColor(0.5, 0.5, 0.5, 0.5);
+ glClear(GL_COLOR_BUFFER_BIT);
+
+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
+
+ glDeleteTextures(1, &da->texture);
+ da->texture = 0;
+ da->fd = -1;
+
+ return 0;
+}
+
+static void* display_thread(void *v)
+{
+ AVFormatContext *const s = v;
+ egl_display_env_t *const de = s->priv_data;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
+#endif
+ {
+ EGLint egl_major, egl_minor;
+
+ de->setup.dpy = XOpenDisplay(NULL);
+ if (!de->setup.dpy) {
+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
+ goto fail;
+ }
+
+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
+ if (!de->setup.egl_dpy) {
+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
+ goto fail;
+ }
+
+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
+ goto fail;
+ }
+
+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
+
+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
+ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
+ goto fail;
+ }
+ }
+
+ if (!de->window_width || !de->window_height) {
+ de->window_width = 1280;
+ de->window_height = 720;
+ }
+ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
+ &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
+ goto fail;
+ }
+
+ if (gl_setup(s)) {
+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
+ goto fail;
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
+#endif
+ sem_post(&de->display_start_sem);
+
+ for (;;) {
+ AVFrame *frame;
+
+ while (sem_wait(&de->q_sem) != 0) {
+ av_assert0(errno == EINTR);
+ }
+
+ if (de->q_terminate)
+ break;
+
+ pthread_mutex_lock(&de->q_lock);
+ frame = de->q_next;
+ de->q_next = NULL;
+ pthread_mutex_unlock(&de->q_lock);
+
+ do_display(s, de, frame);
+
+ av_frame_free(&de->q_this);
+ de->q_this = frame;
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
+#endif
+
+ return NULL;
+
+fail:
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
+#endif
+ de->q_terminate = 1;
+ sem_post(&de->display_start_sem);
+
+ return NULL;
+}
+
+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ const AVFrame *const src_frame = (AVFrame *)pkt->data;
+ AVFrame *frame;
+ egl_display_env_t *const de = s->priv_data;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        frame = av_frame_alloc();
+        if (!frame || av_frame_ref(frame, src_frame) < 0) {
+            av_frame_free(&frame);
+            return AVERROR(ENOMEM);
+        }
+ }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+ if (av_hwframe_map(frame, src_frame, 0) != 0) {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format);
+ av_frame_free(&frame);
+ return AVERROR(EINVAL);
+ }
+ }
+ else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format);
+ return AVERROR(EINVAL);
+ }
+
+ // Really hacky sync
+ while (de->show_all && de->q_next) {
+ usleep(3000);
+ }
+
+ pthread_mutex_lock(&de->q_lock);
+ {
+ AVFrame *const t = de->q_next;
+ de->q_next = frame;
+ frame = t;
+ }
+ pthread_mutex_unlock(&de->q_lock);
+
+ if (frame == NULL)
+ sem_post(&de->q_sem);
+ else
+ av_frame_free(&frame);
+
+ return 0;
+}
+
+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+ unsigned flags)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+#endif
+
+ /* egl_vout_write_header() should have accepted only supported formats */
+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
+ return 0;
+
+ return 0;
+}
+
+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
+#endif
+ switch (type) {
+ case AV_APP_TO_DEV_WINDOW_REPAINT:
+ return 0;
+ default:
+ break;
+ }
+ return AVERROR(ENOSYS);
+}
+
+// deinit is called if init fails so there is no need to clean up explicitly here
+static int egl_vout_init(struct AVFormatContext *s)
+{
+ egl_display_env_t *const de = s->priv_data;
+ unsigned int i;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->setup = (struct egl_setup) { 0 };
+
+ for (i = 0; i != 32; ++i) {
+ de->aux[i].fd = -1;
+ }
+
+ de->q_terminate = 0;
+ pthread_mutex_init(&de->q_lock, NULL);
+ sem_init(&de->q_sem, 0, 0);
+ sem_init(&de->display_start_sem, 0, 0);
+ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
+
+ sem_wait(&de->display_start_sem);
+ if (de->q_terminate) {
+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
+ return -1;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+ return 0;
+}
+
+static void egl_vout_deinit(struct AVFormatContext *s)
+{
+ egl_display_env_t *const de = s->priv_data;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->q_terminate = 1;
+ sem_post(&de->q_sem);
+ pthread_join(de->q_thread, NULL);
+ sem_destroy(&de->q_sem);
+ pthread_mutex_destroy(&de->q_lock);
+
+ av_frame_free(&de->q_next);
+ av_frame_free(&de->q_this);
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+#define OFFSET(x) offsetof(egl_display_env_t, x)
+static const AVOption options[] = {
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+
+};
+
+static const AVClass egl_vout_class = {
+ .class_name = "egl vid outdev",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+AVOutputFormat ff_vout_egl_muxer = {
+ .name = "vout_egl",
+    .long_name = NULL_IF_CONFIG_SMALL("EGL video output device"),
+ .priv_data_size = sizeof(egl_display_env_t),
+ .audio_codec = AV_CODEC_ID_NONE,
+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .write_header = egl_vout_write_header,
+ .write_packet = egl_vout_write_packet,
+ .write_uncoded_frame = egl_vout_write_frame,
+ .write_trailer = egl_vout_write_trailer,
+ .control_message = egl_vout_control_message,
+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+ .priv_class = &egl_vout_class,
+ .init = egl_vout_init,
+ .deinit = egl_vout_deinit,
+};
+
diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c
new file mode 100644
index 0000000000..84723a34ad
--- /dev/null
+++ b/libavdevice/rpi_vout.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2013 Jeff Moguillansky
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Raspberry Pi (MMAL) video output device
+ *
+ * TODO:
+ * - add support for more formats
+ */
+
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/imgutils.h"
+#include "libavformat/internal.h"
+#include "avdevice.h"
+
+#include <stdatomic.h>
+#include <unistd.h>
+
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include <bcm_host.h>
+#include <interface/mmal/mmal.h>
+#include <interface/mmal/mmal_parameters_camera.h>
+#include <interface/mmal/mmal_buffer.h>
+#include <interface/mmal/mmal_port.h>
+#include <interface/mmal/util/mmal_util.h>
+#include <interface/mmal/util/mmal_default_components.h>
+#include <interface/mmal/util/mmal_connection.h>
+#include <interface/mmal/util/mmal_util_params.h>
+#pragma GCC diagnostic pop
+#include "libavutil/rpi_sand_fns.h"
+#include "libavcodec/rpi_zc.h"
+
+#define TRACE_ALL 0
+
+#define DISPLAY_PORT_DEPTH 4
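+// Max buffers in flight to the display; display_frame drops new frames (or,
+// with show_all set, waits) once the queue is nearly full.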
+
+typedef struct rpi_display_env_s
+{
+ AVClass *class;
+
+ MMAL_COMPONENT_T* display;
+ MMAL_COMPONENT_T* isp;
+ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup
+ MMAL_CONNECTION_T * conn;
+
+ MMAL_POOL_T *rpi_pool;
+ volatile int rpi_display_count;
+
+ MMAL_FOURCC_T req_fmt;
+ MMAL_VIDEO_FORMAT_T req_vfmt;
+
+ AVZcEnvPtr zc;
+
+ int window_width, window_height;
+ int window_x, window_y;
+ int layer, fullscreen;
+ int show_all;
+} rpi_display_env_t;
+
+
+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ mmal_buffer_header_release(buffer);
+}
+
+
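+// Map an AVPixelFormat to the MMAL encoding used for display; returns 0 for
+// formats we cannot handle.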
+static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
+{
+ switch (fmt) {
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ return MMAL_ENCODING_YUVUV128;
+ case AV_PIX_FMT_RPI4_10:
+ return MMAL_ENCODING_YUV10_COL;
+ case AV_PIX_FMT_SAND64_10:
+ return MMAL_ENCODING_YUVUV64_10;
+ case AV_PIX_FMT_SAND64_16:
+ return MMAL_ENCODING_YUVUV64_16;
+ case AV_PIX_FMT_YUV420P:
+ return MMAL_ENCODING_I420;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+
+static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
+ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
+{
+ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
+ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
+ if (av_rpi_is_sand_format(geo->format)) {
+ // Sand formats are a bit "special"
+ // stride1 implicit in format
+ // width = stride2
+ vfmt->width = geo->stripe_is_yc ?
+ geo->height_y + geo->height_c : geo->height_y;
+// es->height = geo->video_height; //*** When we get the FLAG this will change
+ vfmt->height = geo->height_y;
+ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE;
+ }
+ else {
+ vfmt->width = geo->stride_y / geo->bytes_per_pel;
+ vfmt->height = geo->height_y;
+ es_fmt->flags = 0;
+ }
+
+ es_fmt->type = MMAL_ES_TYPE_VIDEO;
+ es_fmt->encoding = mmfmt_from_avfmt(geo->format);
+ es_fmt->encoding_variant = 0;
+ es_fmt->bitrate = 0;
+
+ vfmt->crop.x = frame->crop_left;
+ vfmt->crop.y = frame->crop_top;
+ vfmt->crop.width = av_frame_cropped_width(frame);
+ vfmt->crop.height = av_frame_cropped_height(frame);
+
+ vfmt->frame_rate.den = 0; // Don't think I know it here
+ vfmt->frame_rate.num = 0;
+
+ vfmt->par.den = frame->sample_aspect_ratio.den;
+ vfmt->par.num = frame->sample_aspect_ratio.num;
+
+ vfmt->color_space = 0; // Unknown currently
+}
+
+static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
+{
+ rpi_display_env_t * const de = userdata;
+ if (buf->user_data != NULL) {
+ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data);
+ buf->user_data = NULL;
+ }
+ atomic_fetch_add(&de->rpi_display_count, -1);
+ return MMAL_FALSE;
+}
+
+static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
+{
+ return avfmt == AV_PIX_FMT_SAND64_10;
+}
+
+static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
+{
+ if (de->isp != NULL)
+ {
+ if (de->isp->input[0]->is_enabled)
+ mmal_port_disable(de->isp->input[0]);
+ if (de->isp->control->is_enabled)
+ mmal_port_disable(de->isp->control);
+ }
+ if (de->conn != NULL) {
+ mmal_connection_destroy(de->conn);
+ de->conn = NULL;
+ }
+ if (de->isp != NULL) {
+ mmal_component_destroy(de->isp);
+ de->isp = NULL;
+ }
+}
+
+static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
+{
+ MMAL_BUFFER_HEADER_T* buf = NULL;
+ AVRpiZcRefPtr fr_buf = NULL;
+
+ if (de == NULL)
+ return;
+
+ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+ return;
+ }
+
+ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) {
+ return;
+ }
+
+ buf = mmal_queue_get(de->rpi_pool->queue);
+ if (!buf) {
+ // Running too fast so drop the frame (unexpected)
+ goto fail;
+ }
+
+ buf->cmd = 0;
+ buf->offset = 0;
+ buf->flags = 0;
+ mmal_buffer_header_reset(buf);
+
+    atomic_fetch_add(&de->rpi_display_count, 1); // Decremented on release
+ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
+
+ buf->user_data = fr_buf;
+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
+ buf->offset = av_rpi_zc_offset(fr_buf);
+ buf->length = av_rpi_zc_length(fr_buf);
+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
+
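+    // With show_all we must not drop frames, so wait for the display queue
+    // to drain instead.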
+ while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
+ usleep(5000);
+ }
+
+ {
+ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
+ MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
+ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
+
+ video_format_from_zc_frame(&new_es, fr, fr_buf);
+ if (de->req_fmt != new_es.encoding ||
+ de->req_vfmt.width != new_vfmt->width ||
+ de->req_vfmt.height != new_vfmt->height ||
+ de->req_vfmt.crop.x != new_vfmt->crop.x ||
+ de->req_vfmt.crop.y != new_vfmt->crop.y ||
+ de->req_vfmt.crop.width != new_vfmt->crop.width ||
+ de->req_vfmt.crop.height != new_vfmt->crop.height) {
+ // Something has changed
+
+ // If we have an ISP tear it down
+ isp_remove(s, de);
+ de->port_in = de->display->input[0];
+
+ // If we still need an ISP create it now
+ if (avfmt_needs_isp(fr->format))
+ {
+ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
+ goto fail;
+ }
+ de->port_in = de->isp->input[0];
+ }
+
+ mmal_format_copy(de->port_in->format, &new_es);
+
+ if (mmal_port_format_commit(de->port_in)) {
+ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
+ goto fail;
+ }
+
+ // If we have an ISP then we must want to use it
+ if (de->isp != NULL) {
+ MMAL_PORT_T * const port_out = de->isp->output[0];
+ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
+ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
+
+ port_out->format->type = MMAL_ES_TYPE_VIDEO;
+ port_out->format->encoding = MMAL_ENCODING_YUVUV128;
+ port_out->format->encoding_variant = 0;
+ port_out->format->bitrate = 0;
+ port_out->format->flags = 0;
+ port_out->format->extradata = NULL;
+ port_out->format->extradata_size = 0;
+
+ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31;
+ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15;
+ vfmt_out->crop.x = 0;
+ vfmt_out->crop.y = 0;
+ vfmt_out->crop.width = vfmt_in->crop.width;
+ vfmt_out->crop.height = vfmt_in->crop.height;
+ vfmt_out->frame_rate = vfmt_in->frame_rate;
+ vfmt_out->par = vfmt_in->par;
+ vfmt_out->color_space = vfmt_in->color_space;
+
+ if (mmal_port_format_commit(port_out)) {
+ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
+ goto fail;
+ }
+
+ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
+ av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
+ goto fail;
+ }
+ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
+ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
+ goto fail;
+ }
+ mmal_port_enable(de->isp->control,display_cb_control);
+ mmal_component_enable(de->isp);
+ }
+
+ // Number of slots in my port Q
+ de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
+ // Size to keep it happy - isn't used for anything other than error checking
+ de->port_in->buffer_size = buf->alloc_size;
+ if (!de->port_in->is_enabled)
+ {
+ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
+ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
+ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
+ goto fail;
+ }
+ }
+
+ de->req_fmt = new_es.encoding;
+ de->req_vfmt = *new_vfmt;
+ }
+ }
+
+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
+ {
+        av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", atomic_load(&de->rpi_display_count));
+ goto fail;
+ }
+ return;
+
+fail:
+ // If we have a buf then fr_buf is held by that
+ if (buf != NULL)
+ mmal_buffer_header_release(buf);
+ else if (fr_buf != NULL)
+ av_rpi_zc_unref(fr_buf);
+}
+
+
+static int xv_write_trailer(AVFormatContext *s)
+{
+ rpi_display_env_t * const de = s->priv_data;
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+ if (de->port_in != NULL && de->port_in->is_enabled) {
+ mmal_port_disable(de->port_in);
+ }
+
+ // The above disable should kick out all buffers - check that
+ if (atomic_load(&de->rpi_display_count) != 0) {
+ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
+ }
+
+ isp_remove(s, de);
+ if (de->rpi_pool != NULL) {
+ mmal_pool_destroy(de->rpi_pool);
+ de->rpi_pool = NULL;
+ }
+ if (de->display != NULL) {
+ mmal_component_destroy(de->display);
+ de->display = NULL;
+ }
+
+ return 0;
+}
+
+static int xv_write_header(AVFormatContext *s)
+{
+ rpi_display_env_t * const de = s->priv_data;
+ const AVCodecParameters * const par = s->streams[0]->codecpar;
+ const unsigned int w = de->window_width ? de->window_width : par->width;
+ const unsigned int h = de->window_height ? de->window_height : par->height;
+ const unsigned int x = de->window_x;
+ const unsigned int y = de->window_y;
+ const int layer = de->layer ? de->layer : 2;
+ const MMAL_BOOL_T fullscreen = de->fullscreen;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h);
+#endif
+ if ( s->nb_streams > 1
+ || par->codec_type != AVMEDIA_TYPE_VIDEO
+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+ return AVERROR(EINVAL);
+ }
+
+ {
+ MMAL_DISPLAYREGION_T region =
+ {
+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
+ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA,
+ .layer = layer,
+ .fullscreen = fullscreen,
+ .dest_rect = {x, y, w, h},
+ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS,
+ };
+
+ bcm_host_init(); // Needs to be done by someone...
+
+ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
+ goto fail;
+ }
+ de->port_in = de->display->input[0];
+
+ mmal_port_parameter_set(de->display->input[0], &region.hdr);
+
+ if (mmal_component_enable(de->display) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
+ goto fail;
+ }
+ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS)
+ {
+ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
+ goto fail;
+ }
+
+ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL)
+ {
+ av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ xv_write_trailer(s);
+ return AVERROR_UNKNOWN;
+}
+
+static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ AVFrame * const frame = (AVFrame *)pkt->data;
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+ display_frame(s, s->priv_data, frame);
+ return 0;
+}
+
+static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+ unsigned flags)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+#endif
+
+ /* xv_write_header() should have accepted only supported formats */
+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
+ return 0;
+
+ display_frame(s, s->priv_data, *ppframe);
+ return 0;
+}
+
+static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
+#endif
+ switch(type) {
+ case AV_APP_TO_DEV_WINDOW_REPAINT:
+ return 0;
+ default:
+ break;
+ }
+ return AVERROR(ENOSYS);
+}
+
+// deinit is called if init fails so there is no need to clean up explicitly here
+static int rpi_vout_init(struct AVFormatContext * s)
+{
+ rpi_display_env_t * const de = s->priv_data;
+
+ // Get a ZC context in case we need one - has little overhead if unused
+ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
+        return AVERROR(ENOMEM);
+
+ return 0;
+}
+
+static void rpi_vout_deinit(struct AVFormatContext * s)
+{
+ rpi_display_env_t * const de = s->priv_data;
+
+ av_rpi_zc_int_env_freep(&de->zc);
+}
+
+
+#define OFFSET(x) offsetof(rpi_display_env_t, x)
+static const AVOption options[] = {
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+
+};
+
+static const AVClass xv_class = {
+ .class_name = "rpi vid outdev",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+AVOutputFormat ff_vout_rpi_muxer = {
+ .name = "vout_rpi",
+    .long_name = NULL_IF_CONFIG_SMALL("Raspberry Pi (MMAL) video output device"),
+ .priv_data_size = sizeof(rpi_display_env_t),
+ .audio_codec = AV_CODEC_ID_NONE,
+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .write_header = xv_write_header,
+ .write_packet = xv_write_packet,
+ .write_uncoded_frame = xv_write_frame,
+ .write_trailer = xv_write_trailer,
+ .control_message = xv_control_message,
+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+ .priv_class = &xv_class,
+ .init = rpi_vout_init,
+ .deinit = rpi_vout_deinit,
+};
diff --git a/libavdevice/xv.c b/libavdevice/xv.c
index 33507291d2..50dc4e0d04 100644
--- a/libavdevice/xv.c
+++ b/libavdevice/xv.c
@@ -296,7 +296,7 @@ static int write_picture(AVFormatContext *s, uint8_t *input_data[4],
{
XVContext *xv = s->priv_data;
XvImage *img = xv->yuv_image;
- uint8_t *data[4] = {
+ uint8_t *data[3] = {
img->data + img->offsets[0],
img->data + img->offsets[1],
img->data + img->offsets[2]
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index b2c254ea67..144fbda652 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -233,6 +233,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o
OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o
OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o
OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o
+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o
OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
@@ -459,6 +460,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o
OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o
OBJS-$(CONFIG_TRIM_FILTER) += trim.o
OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
opencl/unsharp.o
diff --git a/libavfilter/aeval.c b/libavfilter/aeval.c
index 7636063bcf..d5437431ab 100644
--- a/libavfilter/aeval.c
+++ b/libavfilter/aeval.c
@@ -124,10 +124,11 @@ static int parse_channel_expressions(AVFilterContext *ctx,
}
#define ADD_EXPRESSION(expr_) do { \
- ret = av_dynarray_add_nofree(&eval->expr, \
- &eval->nb_channels, NULL); \
- if (ret < 0) \
+ if (!av_dynarray2_add((void **)&eval->expr, &eval->nb_channels, \
+ sizeof(*eval->expr), NULL)) { \
+ ret = AVERROR(ENOMEM); \
goto end; \
+ } \
eval->expr[eval->nb_channels-1] = NULL; \
ret = av_expr_parse(&eval->expr[eval->nb_channels - 1], expr_, \
var_names, func1_names, func1, \
diff --git a/libavfilter/af_surround.c b/libavfilter/af_surround.c
index c0b8b002c2..d18b3146e7 100644
--- a/libavfilter/af_surround.c
+++ b/libavfilter/af_surround.c
@@ -203,13 +203,13 @@ static int config_input(AVFilterLink *inlink)
s->rdft = av_calloc(inlink->channels, sizeof(*s->rdft));
if (!s->rdft)
return AVERROR(ENOMEM);
- s->nb_in_channels = inlink->channels;
for (ch = 0; ch < inlink->channels; ch++) {
s->rdft[ch] = av_rdft_init(ff_log2(s->buf_size), DFT_R2C);
if (!s->rdft[ch])
return AVERROR(ENOMEM);
}
+ s->nb_in_channels = inlink->channels;
s->input_levels = av_malloc_array(s->nb_in_channels, sizeof(*s->input_levels));
if (!s->input_levels)
return AVERROR(ENOMEM);
@@ -266,13 +266,13 @@ static int config_output(AVFilterLink *outlink)
s->irdft = av_calloc(outlink->channels, sizeof(*s->irdft));
if (!s->irdft)
return AVERROR(ENOMEM);
- s->nb_out_channels = outlink->channels;
for (ch = 0; ch < outlink->channels; ch++) {
s->irdft[ch] = av_rdft_init(ff_log2(s->buf_size), IDFT_C2R);
if (!s->irdft[ch])
return AVERROR(ENOMEM);
}
+ s->nb_out_channels = outlink->channels;
s->output_levels = av_malloc_array(s->nb_out_channels, sizeof(*s->output_levels));
if (!s->output_levels)
return AVERROR(ENOMEM);
diff --git a/libavfilter/af_vibrato.c b/libavfilter/af_vibrato.c
index 64d6068b39..5db1f0f6c9 100644
--- a/libavfilter/af_vibrato.c
+++ b/libavfilter/af_vibrato.c
@@ -157,11 +157,11 @@ static int config_input(AVFilterLink *inlink)
int c;
AVFilterContext *ctx = inlink->dst;
VibratoContext *s = ctx->priv;
+ s->channels = inlink->channels;
s->buf = av_calloc(inlink->channels, sizeof(*s->buf));
if (!s->buf)
return AVERROR(ENOMEM);
- s->channels = inlink->channels;
s->buf_size = lrint(inlink->sample_rate * 0.005 + 0.5);
for (c = 0; c < s->channels; c++) {
s->buf[c] = av_malloc_array(s->buf_size, sizeof(*s->buf[c]));
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 0872c6e0f2..1dd05e4d75 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -218,6 +218,7 @@ extern AVFilter ff_vf_dedot;
extern AVFilter ff_vf_deflate;
extern AVFilter ff_vf_deflicker;
extern AVFilter ff_vf_deinterlace_qsv;
+extern AVFilter ff_vf_deinterlace_v4l2m2m;
extern AVFilter ff_vf_deinterlace_vaapi;
extern AVFilter ff_vf_dejudder;
extern AVFilter ff_vf_delogo;
@@ -377,6 +378,7 @@ extern AVFilter ff_vf_scale;
extern AVFilter ff_vf_scale_cuda;
extern AVFilter ff_vf_scale_npp;
extern AVFilter ff_vf_scale_qsv;
+extern AVFilter ff_vf_scale_v4l2m2m;
extern AVFilter ff_vf_scale_vaapi;
extern AVFilter ff_vf_scale_vulkan;
extern AVFilter ff_vf_scale2ref;
@@ -438,6 +440,7 @@ extern AVFilter ff_vf_transpose_opencl;
extern AVFilter ff_vf_transpose_vaapi;
extern AVFilter ff_vf_trim;
extern AVFilter ff_vf_unpremultiply;
+extern AVFilter ff_vf_unsand;
extern AVFilter ff_vf_unsharp;
extern AVFilter ff_vf_unsharp_opencl;
extern AVFilter ff_vf_untile;
diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
index 6373ae761d..3e543a3ab6 100644
--- a/libavfilter/asrc_flite.c
+++ b/libavfilter/asrc_flite.c
@@ -196,12 +196,10 @@ static av_cold void uninit(AVFilterContext *ctx)
{
FliteContext *flite = ctx->priv;
- if (flite->voice_entry) {
- if (!--flite->voice_entry->usage_count) {
- flite->voice_entry->unregister_fn(flite->voice);
- flite->voice_entry->voice = NULL;
- }
- }
+ if (!--flite->voice_entry->usage_count)
+ flite->voice_entry->unregister_fn(flite->voice);
+ flite->voice = NULL;
+ flite->voice_entry = NULL;
delete_wave(flite->wave);
flite->wave = NULL;
}
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index 6a344282eb..22ecad5f77 100644
--- a/libavfilter/avfilter.c
+++ b/libavfilter/avfilter.c
@@ -925,8 +925,6 @@ int avfilter_init_dict(AVFilterContext *ctx, AVDictionary **options)
ret = ctx->filter->init(ctx);
else if (ctx->filter->init_dict)
ret = ctx->filter->init_dict(ctx, options);
- if (ret < 0)
- return ret;
if (ctx->enable_str) {
ret = set_enable_expr(ctx, ctx->enable_str);
@@ -934,7 +932,7 @@ int avfilter_init_dict(AVFilterContext *ctx, AVDictionary **options)
return ret;
}
- return 0;
+ return ret;
}
int avfilter_init_str(AVFilterContext *filter, const char *args)
diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
index f6b572b3de..44fe8b679c 100644
--- a/libavfilter/avfiltergraph.c
+++ b/libavfilter/avfiltergraph.c
@@ -32,6 +32,9 @@
#include "libavutil/internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
+#if CONFIG_UNSAND_FILTER
+#include "libavutil/rpi_sand_fns.h"
+#endif
#define FF_INTERNAL_FIELDS 1
#include "framequeue.h"
@@ -422,6 +425,19 @@ static int formats_declared(AVFilterContext *f)
return 1;
}
+#if CONFIG_UNSAND_FILTER
+static int has_sand_format(const AVFilterFormats * const ff)
+{
+ int i;
+ for (i = 0; i != ff->nb_formats; ++i) {
+ if (av_rpi_is_sand_format(ff->formats[i])) {
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
/**
* Perform one round of query_formats() and merging formats lists on the
* filter graph.
@@ -462,6 +478,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
for (j = 0; j < filter->nb_inputs; j++) {
AVFilterLink *link = filter->inputs[j];
int convert_needed = 0;
+ unsigned int extra_convert_tried = 0;
if (!link)
continue;
@@ -504,11 +521,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
link->outcfg.formats, link->type)
#undef MERGE_DISPATCH
- if (convert_needed) {
+ while (convert_needed) {
AVFilterContext *convert;
const AVFilter *filter;
AVFilterLink *inlink, *outlink;
char inst_name[30];
+ int can_retry = 0;
+
+ convert_needed = 0;
if (graph->disable_auto_convert) {
av_log(log_ctx, AV_LOG_ERROR,
@@ -521,19 +541,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
/* couldn't merge format lists. auto-insert conversion filter */
switch (link->type) {
case AVMEDIA_TYPE_VIDEO:
- if (!(filter = avfilter_get_by_name("scale"))) {
- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
- "not present, cannot convert pixel formats.\n");
- return AVERROR(EINVAL);
- }
-
- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
- scaler_count++);
+#if CONFIG_UNSAND_FILTER
+ // Only try each extra conversion once
+ // The unsand output pad should never trigger has_sand_format
+ // but it is better to be safe
+ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->incfg.formats)) {
+ if (!(filter = avfilter_get_by_name("unsand"))) {
+ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
+ "not present, cannot convert pixel formats.\n");
+ return AVERROR(EINVAL);
+ }
+
+ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
+ scaler_count++);
+
+ if ((ret = avfilter_graph_create_filter(&convert, filter,
+ inst_name, "", NULL,
+ graph)) < 0)
+ return ret;
- if ((ret = avfilter_graph_create_filter(&convert, filter,
- inst_name, graph->scale_sws_opts, NULL,
- graph)) < 0)
- return ret;
+ extra_convert_tried |= 1;
+ can_retry = 1;
+ }
+ else
+#endif
+ {
+ if (!(filter = avfilter_get_by_name("scale"))) {
+ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
+ "not present, cannot convert pixel formats.\n");
+ return AVERROR(EINVAL);
+ }
+
+ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
+ scaler_count++);
+
+ if ((ret = avfilter_graph_create_filter(&convert, filter,
+ inst_name, graph->scale_sws_opts, NULL,
+ graph)) < 0)
+ return ret;
+ }
break;
case AVMEDIA_TYPE_AUDIO:
if (!(filter = avfilter_get_by_name("aresample"))) {
@@ -589,6 +635,13 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
outlink->outcfg.samplerates) ||
CHECKED_MERGE(channel_layouts, outlink->incfg.channel_layouts,
outlink->outcfg.channel_layouts))) {
+ // Try adding an unsand filter & see if that helps
+ if (ret < 0 && can_retry) {
+ link = outlink;
+ convert_needed = 1;
+ continue;
+ }
+
if (ret < 0)
return ret;
av_log(log_ctx, AV_LOG_ERROR,
diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c
index 15d897cff6..c134759bbf 100644
--- a/libavfilter/buffersink.c
+++ b/libavfilter/buffersink.c
@@ -58,6 +58,11 @@ typedef struct BufferSinkContext {
int sample_rates_size;
AVFrame *peeked_frame;
+
+ union {
+ av_buffersink_alloc_video_frame * video;
+ } alloc_cb;
+ void * alloc_v;
} BufferSinkContext;
#define NB_ITEMS(list) (list ## _size / sizeof(*list))
@@ -148,6 +153,22 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx,
return get_frame_internal(ctx, frame, 0, nb_samples);
}
+static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h)
+{
+ AVFilterContext * const ctx = link->dst;
+ BufferSinkContext * const bs = ctx->priv;
+ return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) :
+ ff_default_get_video_buffer(link, w, h);
+}
+
+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v)
+{
+ BufferSinkContext * const bs = ctx->priv;
+ bs->alloc_cb.video = cb;
+ bs->alloc_v = v;
+ return 0;
+}
+
#if FF_API_BUFFERSINK_ALLOC
AVBufferSinkParams *av_buffersink_params_alloc(void)
{
@@ -331,6 +352,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = {
{
.name = "default",
.type = AVMEDIA_TYPE_VIDEO,
+ .get_video_buffer = alloc_video_buffer,
},
{ NULL }
};
diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h
index 69ed0f29a8..a3aa6fcb3c 100644
--- a/libavfilter/buffersink.h
+++ b/libavfilter/buffersink.h
@@ -198,6 +198,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
*/
int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
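+
+/**
+ * Register a callback used to allocate video frames for this buffersink.
+ * Pass cb = NULL to restore the default frame allocation; v is an opaque
+ * pointer handed back to the callback.
+ */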
+typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h);
+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v);
+
/**
* @}
*/
diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
index da1cf9941e..c588ed23cb 100644
--- a/libavfilter/buffersrc.c
+++ b/libavfilter/buffersrc.c
@@ -188,7 +188,7 @@ int attribute_align_arg av_buffersrc_add_frame_flags(AVFilterContext *ctx, AVFra
switch (ctx->outputs[0]->type) {
case AVMEDIA_TYPE_VIDEO:
- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
frame->format, frame->pts);
break;
case AVMEDIA_TYPE_AUDIO:
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
new file mode 100644
index 0000000000..d4c11cfc51
--- /dev/null
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -0,0 +1,2115 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * deinterlace video filter - V4L2 M2M
+ */
+
+#include <drm_fourcc.h>
+
+#include <linux/videodev2.h>
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "config.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavutil/internal.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/time.h"
+
+#define FF_INTERNAL_FIELDS 1
+#include "framequeue.h"
+#include "filters.h"
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "scale_eval.h"
+#include "video.h"
+
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
+#endif
+
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in videodev2.h; hopefully they will be one day, but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
+typedef struct V4L2Queue V4L2Queue;
+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
+
+typedef enum filter_type_v4l2_e
+{
+ FILTER_V4L2_DEINTERLACE = 1,
+ FILTER_V4L2_SCALE,
+} filter_type_v4l2_t;
+
+typedef struct V4L2Buffer {
+ int enqueued;
+ int reenqueue;
+ struct v4l2_buffer buffer;
+ AVFrame frame;
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ int num_planes;
+ AVDRMFrameDescriptor drm_frame;
+ V4L2Queue *q;
+} V4L2Buffer;
+
+typedef struct V4L2Queue {
+ struct v4l2_format format;
+ struct v4l2_selection sel;
+ int eos;
+ int num_buffers;
+ V4L2Buffer *buffers;
+ const char * name;
+ DeintV4L2M2MContextShared *ctx;
+} V4L2Queue;
+
+typedef struct pts_stats_s
+{
+ void * logctx;
+ const char * name; // For debug
+ unsigned int last_count;
+ unsigned int last_interval;
+ int64_t last_pts;
+} pts_stats_t;
+
+#define PTS_TRACK_SIZE 32
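+// Circular log of frame properties indexed by a 32-bit sequence number; the
+// index travels with the buffer through the V4L2 queue in its timestamp and
+// is recovered by pts_track_get_frame() below.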
+typedef struct pts_track_el_s
+{
+ uint32_t n;
+ unsigned int interval;
+ AVFrame * props;
+} pts_track_el_t;
+
+typedef struct pts_track_s
+{
+ uint32_t n;
+ uint32_t last_n;
+ int got_2;
+ void * logctx;
+ pts_stats_t stats;
+ pts_track_el_t a[PTS_TRACK_SIZE];
+} pts_track_t;
+
+typedef enum drain_state_e
+{
+ DRAIN_NONE = 0, // Not draining
+ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame
+    DRAIN_LAST,     // Drain with long timeout; last_frame flag expected on output
+ DRAIN_EOS, // Drain with long timeout EOS expected
+ DRAIN_DONE // Drained
+} drain_state_t;
+
+typedef struct DeintV4L2M2MContextShared {
+ void * logctx; // For logging - will be NULL when done
+ filter_type_v4l2_t filter_type;
+
+ int fd;
+    int done;       // fd closed - awaiting all refs dropped
+ int width;
+ int height;
+
+ int drain; // EOS received (inlink status)
+ drain_state_t drain_state;
+    int64_t drain_pts; // PTS associated with the inlink status
+
+ unsigned int frames_rx;
+ unsigned int frames_tx;
+
+ // from options
+ int output_width;
+ int output_height;
+ enum AVPixelFormat output_format;
+
+ int has_enc_stop;
+ // We expect to get exactly the same number of frames out as we put in
+ // We can drain by matching input to output
+ int one_to_one;
+
+ int orig_width;
+ int orig_height;
+ atomic_uint refcount;
+
+ AVBufferRef *hw_frames_ctx;
+
+ unsigned int field_order;
+
+ pts_track_t track;
+
+ V4L2Queue output;
+ V4L2Queue capture;
+} DeintV4L2M2MContextShared;
+
+typedef struct DeintV4L2M2MContext {
+ const AVClass *class;
+
+ DeintV4L2M2MContextShared *shared;
+
+ char * w_expr;
+ char * h_expr;
+    char * output_format_string;
+
+ int force_original_aspect_ratio;
+ int force_divisible_by;
+
+ char *colour_primaries_string;
+ char *colour_transfer_string;
+ char *colour_matrix_string;
+ int colour_range;
+ char *chroma_location_string;
+
+ enum AVColorPrimaries colour_primaries;
+ enum AVColorTransferCharacteristic colour_transfer;
+ enum AVColorSpace colour_matrix;
+ enum AVChromaLocation chroma_location;
+} DeintV4L2M2MContext;
+
+
+static inline int drain_frame_expected(const drain_state_t d)
+{
+ return d == DRAIN_EOS || d == DRAIN_LAST;
+}
+
+// These just list the ones we know we can cope with
+static uint32_t
+fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
+{
+ switch (avfmt) {
+ case AV_PIX_FMT_YUV420P:
+ return V4L2_PIX_FMT_YUV420;
+ case AV_PIX_FMT_NV12:
+ return V4L2_PIX_FMT_NV12;
+#if CONFIG_SAND
+ case AV_PIX_FMT_RPI4_8:
+ case AV_PIX_FMT_SAND128:
+ return V4L2_PIX_FMT_NV12_COL128;
+#endif
+ default:
+ break;
+ }
+ return 0;
+}
+
+static enum AVPixelFormat
+fmt_v4l2_to_av(const uint32_t pixfmt)
+{
+ switch (pixfmt) {
+ case V4L2_PIX_FMT_YUV420:
+ return AV_PIX_FMT_YUV420P;
+ case V4L2_PIX_FMT_NV12:
+ return AV_PIX_FMT_NV12;
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ return AV_PIX_FMT_RPI4_8;
+#endif
+ default:
+ break;
+ }
+ return AV_PIX_FMT_NONE;
+}
+
+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
+{
+ return stats->last_interval;
+}
+
+// Pick 64 for max last count - that is >1sec at 60fps
+#define STATS_LAST_COUNT_MAX 64
+#define STATS_INTERVAL_MAX (1 << 30)
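+// Estimate the per-frame interval from incoming PTS.  Repeated or missing
+// PTS just bump last_count; on the next real PTS the elapsed interval is
+// divided by that count.  Worked example (illustrative, 90kHz, 25fps):
+// pts 0, AV_NOPTS_VALUE, 7200 -> interval 7200 over last_count 2
+// -> frame_time 3600.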
+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
+{
+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
+ if (stats->last_count < STATS_LAST_COUNT_MAX)
+ ++stats->last_count;
+ return;
+ }
+
+ if (stats->last_pts != AV_NOPTS_VALUE) {
+ const int64_t interval = pts - stats->last_pts;
+
+ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
+ stats->last_count >= STATS_LAST_COUNT_MAX) {
+ if (stats->last_interval != 0)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
+ __func__, stats->name, interval, stats->last_count);
+ stats->last_interval = 0;
+ }
+ else {
+ const int64_t frame_time = interval / (int64_t)stats->last_count;
+
+ if (frame_time != stats->last_interval)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
+ stats->last_interval = frame_time;
+ }
+ }
+
+ stats->last_pts = pts;
+ stats->last_count = 1;
+}
+
+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
+{
+ *stats = (pts_stats_t){
+ .logctx = logctx,
+ .name = name,
+ .last_count = 1,
+ .last_interval = 0,
+ .last_pts = AV_NOPTS_VALUE
+ };
+}
+
+static inline uint32_t pts_track_next_n(pts_track_t * const trk)
+{
+ if (++trk->n == 0)
+ trk->n = 1;
+ return trk->n;
+}
+
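+
+// The V4L2 buffer timestamp carries the track index n rather than a real
+// time: n -> {.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2} on the way
+// in (pts_track_add_frame) and n = tv_usec / 2 + tv_sec * 500000 on the way
+// out.  e.g. n = 1234567 -> {2, 469134} -> 469134 / 2 + 2 * 500000 = 1234567.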
+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
+{
+ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
+ pts_track_el_t * t;
+
+ // As a first guess assume that n==0 means last frame
+ if (n == 0) {
+ n = trk->last_n;
+ if (n == 0)
+ goto fail;
+ }
+
+ t = trk->a + (n & (PTS_TRACK_SIZE - 1));
+
+ if (t->n != n) {
+ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
+ goto fail;
+ }
+
+ // 1st frame is simple - just believe it
+ if (n != trk->last_n) {
+ trk->last_n = n;
+ trk->got_2 = 0;
+ return av_frame_copy_props(dst, t->props);
+ }
+
+ // Only believe in a single interpolated frame
+ if (trk->got_2)
+ goto fail;
+ trk->got_2 = 1;
+
+ av_frame_copy_props(dst, t->props);
+
+
+ // If we can't guess - don't
+ if (t->interval == 0) {
+ dst->best_effort_timestamp = AV_NOPTS_VALUE;
+ dst->pts = AV_NOPTS_VALUE;
+ dst->pkt_dts = AV_NOPTS_VALUE;
+ }
+ else {
+ if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
+ dst->best_effort_timestamp += t->interval / 2;
+ if (dst->pts != AV_NOPTS_VALUE)
+ dst->pts += t->interval / 2;
+ if (dst->pkt_dts != AV_NOPTS_VALUE)
+ dst->pkt_dts += t->interval / 2;
+ }
+
+ return 0;
+
+fail:
+ trk->last_n = 0;
+ trk->got_2 = 0;
+ dst->pts = AV_NOPTS_VALUE;
+ dst->pkt_dts = AV_NOPTS_VALUE;
+ return 0;
+}
+
+// We are only ever expecting in-order frames so nothing more clever is required
+static unsigned int
+pts_track_count(const pts_track_t * const trk)
+{
+ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1);
+}
+
+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
+{
+ const uint32_t n = pts_track_next_n(trk);
+ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
+
+ pts_stats_add(&trk->stats, src->pts);
+
+ t->n = n;
+ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
+ av_frame_unref(t->props);
+ av_frame_copy_props(t->props, src);
+
+ // We now know what the previous interval was, rather than having to guess,
+    // so set it. There is a better than decent chance that this happens
+    // before the value is used.
+ if (t->interval != 0) {
+ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
+ prev_t->interval = t->interval;
+ }
+
+ // In case deinterlace interpolates frames use every other usec
+ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
+}
+
+static void pts_track_uninit(pts_track_t * const trk)
+{
+ unsigned int i;
+ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
+ trk->a[i].n = 0;
+ av_frame_free(&trk->a[i].props);
+ }
+}
+
+static int pts_track_init(pts_track_t * const trk, void *logctx)
+{
+ unsigned int i;
+ trk->n = 1;
+ pts_stats_init(&trk->stats, logctx, "track");
+ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
+ trk->a[i].n = 0;
+ if ((trk->a[i].props = av_frame_alloc()) == NULL) {
+ pts_track_uninit(trk);
+ return AVERROR(ENOMEM);
+ }
+ }
+ return 0;
+}
+
+static inline uint32_t
+fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline;
+}
+
+static inline uint32_t
+fmt_height(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+}
+
+static inline uint32_t
+fmt_width(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+}
+
+static inline uint32_t
+fmt_pixelformat(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
+}
+
+static inline uint32_t
+buf_bytesused0(const struct v4l2_buffer * const buf)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused;
+}
+
+static void
+init_format(V4L2Queue * const q, const uint32_t format_type)
+{
+ memset(&q->format, 0, sizeof(q->format));
+ memset(&q->sel, 0, sizeof(q->sel));
+ q->format.type = format_type;
+ q->sel.type = format_type;
+}
+
+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
+{
+ struct v4l2_capability cap;
+ int ret;
+
+ memset(&cap, 0, sizeof(cap));
+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
+ if (ret < 0)
+ return ret;
+
+ if (ctx->filter_type == FILTER_V4L2_SCALE &&
+ strcmp("bcm2835-codec-isp", cap.card) != 0)
+ {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n");
+ return AVERROR(EINVAL);
+ }
+
+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n");
+ return AVERROR(EINVAL);
+ }
+
+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
+ }
+ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE);
+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT);
+ }
+ else {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+// Just use for probe - doesn't modify q format
+static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt)
+{
+ struct v4l2_format fmt = {.type = queue->format.type};
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ int ret, field;
+ // Pick YUV to test with if not otherwise specified
+ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt);
+ enum AVPixelFormat r_avfmt;
+
+
+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt);
+ if (ret)
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
+
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type))
+ field = V4L2_FIELD_INTERLACED_TB;
+ else
+ field = V4L2_FIELD_NONE;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
+ fmt.fmt.pix_mp.pixelformat = pixelformat;
+ fmt.fmt.pix_mp.field = field;
+ fmt.fmt.pix_mp.width = width;
+ fmt.fmt.pix_mp.height = height;
+ } else {
+ fmt.fmt.pix.pixelformat = pixelformat;
+ fmt.fmt.pix.field = field;
+ fmt.fmt.pix.width = width;
+ fmt.fmt.pix.height = height;
+ }
+
+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
+ fmt.fmt.pix_mp.pixelformat,
+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
+
+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt);
+ if (ret)
+ return AVERROR(EINVAL);
+
+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
+ fmt.fmt.pix_mp.pixelformat,
+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
+
+ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt));
+ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
+ return AVERROR(EINVAL);
+ }
+ if (r_avfmt == AV_PIX_FMT_NONE) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
+ return AVERROR(EINVAL);
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
+ if (fmt.fmt.pix_mp.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
+
+ return AVERROR(EINVAL);
+ }
+ } else {
+ if (fmt.fmt.pix.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
+
+ return AVERROR(EINVAL);
+ }
+ }
+
+ return 0;
+}
+
+static int
+do_s_fmt(V4L2Queue * const q)
+{
+ DeintV4L2M2MContextShared * const ctx = q->ctx;
+ const uint32_t pixelformat = fmt_pixelformat(&q->format);
+ int ret;
+
+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret));
+ return ret;
+ }
+
+ if (pixelformat != fmt_pixelformat(&q->format)) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format)));
+ return AVERROR(EINVAL);
+ }
+
+    q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE;
+ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE;
+
+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret));
+ }
+
+ return 0;
+}
+
+static void
+set_fmt_color(struct v4l2_format *const fmt,
+ const enum AVColorPrimaries avcp,
+ const enum AVColorSpace avcs,
+ const enum AVColorTransferCharacteristic avxc)
+{
+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
+
+ switch (avcp) {
+ case AVCOL_PRI_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ ycbcr = V4L2_YCBCR_ENC_709;
+ break;
+ case AVCOL_PRI_BT470M:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ ycbcr = V4L2_YCBCR_ENC_601;
+ break;
+ case AVCOL_PRI_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_PRI_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_PRI_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_PRI_BT2020:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ case AVCOL_PRI_SMPTE428:
+ case AVCOL_PRI_SMPTE431:
+ case AVCOL_PRI_SMPTE432:
+ case AVCOL_PRI_EBU3213:
+ case AVCOL_PRI_RESERVED:
+ case AVCOL_PRI_FILM:
+ case AVCOL_PRI_UNSPECIFIED:
+ default:
+ break;
+ }
+
+ switch (avcs) {
+ case AVCOL_SPC_RGB:
+ cs = V4L2_COLORSPACE_SRGB;
+ break;
+ case AVCOL_SPC_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ break;
+ case AVCOL_SPC_FCC:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ break;
+ case AVCOL_SPC_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_SPC_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_SPC_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_SPC_BT2020_CL:
+ cs = V4L2_COLORSPACE_BT2020;
+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
+ break;
+ case AVCOL_SPC_BT2020_NCL:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ default:
+ break;
+ }
+
+    switch (avxc) {
+ case AVCOL_TRC_BT709:
+ xfer = V4L2_XFER_FUNC_709;
+ break;
+ case AVCOL_TRC_IEC61966_2_1:
+ xfer = V4L2_XFER_FUNC_SRGB;
+ break;
+ case AVCOL_TRC_SMPTE240M:
+ xfer = V4L2_XFER_FUNC_SMPTE240M;
+ break;
+ case AVCOL_TRC_SMPTE2084:
+ xfer = V4L2_XFER_FUNC_SMPTE2084;
+ break;
+ default:
+ break;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.colorspace = cs;
+ fmt->fmt.pix_mp.ycbcr_enc = ycbcr;
+ fmt->fmt.pix_mp.xfer_func = xfer;
+ } else {
+ fmt->fmt.pix.colorspace = cs;
+ fmt->fmt.pix.ycbcr_enc = ycbcr;
+ fmt->fmt.pix.xfer_func = xfer;
+ }
+}
+
+static void
+set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr)
+{
+ const enum v4l2_quantization q =
+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
+ V4L2_QUANTIZATION_DEFAULT;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.quantization = q;
+ } else {
+ fmt->fmt.pix.quantization = q;
+ }
+}
+
+static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ switch(ycbcr) {
+ case V4L2_YCBCR_ENC_XV709:
+ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709;
+ case V4L2_YCBCR_ENC_XV601:
+ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M;
+ default:
+ break;
+ }
+
+ switch(cs) {
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020;
+ default:
+ break;
+ }
+
+ return AVCOL_PRI_UNSPECIFIED;
+}
+
+static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ switch(cs) {
+ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020:
+ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
+ return AVCOL_SPC_BT2020_CL;
+ else
+ return AVCOL_SPC_BT2020_NCL;
+ default:
+ break;
+ }
+
+ return AVCOL_SPC_UNSPECIFIED;
+}
+
+static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_xfer_func xfer;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.xfer_func:
+ fmt->fmt.pix.xfer_func;
+
+ switch (xfer) {
+ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709;
+ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1;
+ default:
+ break;
+ }
+
+ switch (cs) {
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M;
+ default:
+ break;
+ }
+
+ switch (ycbcr) {
+ case V4L2_YCBCR_ENC_XV709:
+ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG;
+ default:
+ break;
+ }
+
+ return AVCOL_TRC_UNSPECIFIED;
+}
+
+static enum AVColorRange get_color_range(const struct v4l2_format *const fmt)
+{
+ enum v4l2_quantization qt;
+
+ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.quantization :
+ fmt->fmt.pix.quantization;
+
+ switch (qt) {
+ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
+ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
+ default:
+ break;
+ }
+
+ return AVCOL_RANGE_UNSPECIFIED;
+}
+
+static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
+{
+ struct v4l2_format *const format = &q->format;
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ const uint32_t drm_fmt = src->layers[0].format;
+ // Treat INVALID as LINEAR
+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
+ uint32_t pix_fmt = 0;
+ uint32_t w = 0;
+ uint32_t h = 0;
+ uint32_t bpl = src->layers[0].planes[0].pitch;
+
+ // We really don't expect multiple layers
+ // All formats that we currently cope with are single object
+
+ if (src->nb_layers != 1 || src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ switch (drm_fmt) {
+ case DRM_FORMAT_YUV420:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 3)
+ break;
+ pix_fmt = V4L2_PIX_FMT_YUV420;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ break;
+
+ case DRM_FORMAT_NV12:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+#if CONFIG_SAND
+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
+ w = bpl;
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+#endif
+ break;
+
+ case DRM_FORMAT_P030:
+#if CONFIG_SAND
+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
+ w = bpl / 2; // Matching lie to how we construct this
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+#endif
+ break;
+
+ default:
+ break;
+ }
+
+ if (!pix_fmt)
+ return AVERROR(EINVAL);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->plane_fmt[0].bytesperline = bpl;
+ pix->num_planes = 1;
+ }
+ else {
+ struct v4l2_pix_format *const pix = &format->fmt.pix;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->bytesperline = bpl;
+ }
+
+ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc);
+ set_fmt_color_range(format, frame->color_range);
+
+ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right);
+ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom);
+ q->sel.r.left = frame->crop_left;
+ q->sel.r.top = frame->crop_top;
+
+ return 0;
+}
+
+
+static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height)
+{
+ struct v4l2_format * const fmt = &queue->format;
+ struct v4l2_selection *const sel = &queue->sel;
+
+ memset(&fmt->fmt, 0, sizeof(fmt->fmt));
+
+ // Align w/h to 16 here in case there are alignment requirements at the next
+ // stage of the filter chain (also RPi deinterlace setup is bust and this
+ // fixes it)
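+    // e.g. a 1920x1080 request is set as 1920x1088 here; the selection
+    // rectangle below keeps the true 1920x1080.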
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.pixelformat = pixelformat;
+ fmt->fmt.pix_mp.field = field;
+ fmt->fmt.pix_mp.width = FFALIGN(width, 16);
+ fmt->fmt.pix_mp.height = FFALIGN(height, 16);
+ } else {
+ fmt->fmt.pix.pixelformat = pixelformat;
+ fmt->fmt.pix.field = field;
+ fmt->fmt.pix.width = FFALIGN(width, 16);
+ fmt->fmt.pix.height = FFALIGN(height, 16);
+ }
+
+ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer);
+ set_fmt_color_range(fmt, priv->colour_range);
+
+ sel->r.width = width;
+ sel->r.height = height;
+ sel->r.left = 0;
+ sel->r.top = 0;
+
+ return do_s_fmt(queue);
+}
+
+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
+{
+ int ret;
+
+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
+ if (ctx->fd < 0)
+ return AVERROR(errno);
+
+ ret = deint_v4l2m2m_prepare_context(ctx);
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n");
+ goto fail;
+ }
+
+ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format);
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n");
+ goto fail;
+ }
+
+ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE);
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n");
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ close(ctx->fd);
+ ctx->fd = -1;
+
+ return ret;
+}
+
+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
+{
+ int ret = AVERROR(EINVAL);
+ struct dirent *entry;
+ char node[PATH_MAX];
+ DIR *dirp;
+
+ dirp = opendir("/dev");
+ if (!dirp)
+ return AVERROR(errno);
+
+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
+
+ if (strncmp(entry->d_name, "video", 5))
+ continue;
+
+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
+ ret = deint_v4l2m2m_probe_device(ctx, node);
+ if (!ret)
+ break;
+ }
+
+ closedir(dirp);
+
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
+ ctx->fd = -1;
+
+ return ret;
+ }
+
+ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
+
+ return 0;
+}
+
+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
+{
+ int ret;
+
+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
+ if (ret < 0)
+ return AVERROR(errno);
+
+ buf->enqueued = 1;
+
+ return 0;
+}
+
+static void
+drm_frame_init(AVDRMFrameDescriptor * const d)
+{
+ unsigned int i;
+ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) {
+ d->objects[i].fd = -1;
+ }
+}
+
+static void
+drm_frame_uninit(AVDRMFrameDescriptor * const d)
+{
+ unsigned int i;
+ for (i = 0; i != d->nb_objects; ++i) {
+ if (d->objects[i].fd != -1) {
+ close(d->objects[i].fd);
+ d->objects[i].fd = -1;
+ }
+ }
+}
+
+static void
+avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n)
+{
+ unsigned int i;
+ V4L2Buffer* const avbufs = *ppavbufs;
+
+ if (avbufs == NULL)
+ return;
+ *ppavbufs = NULL;
+
+ for (i = 0; i != n; ++i) {
+ V4L2Buffer* const avbuf = avbufs + i;
+ drm_frame_uninit(&avbuf->drm_frame);
+ }
+
+ av_free(avbufs);
+}
+
+static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
+{
+ struct v4l2_exportbuffer expbuf;
+ int i, ret;
+ uint64_t mod = DRM_FORMAT_MOD_LINEAR;
+
+ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame;
+ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
+ const struct v4l2_format *const fmt = &q->format;
+ const uint32_t height = fmt_height(fmt);
+ ptrdiff_t bpl0;
+
+ /* fill the DRM frame descriptor */
+ drm_desc->nb_layers = 1;
+ layer->nb_planes = avbuf->num_planes;
+
+ for (int i = 0; i < avbuf->num_planes; i++) {
+ layer->planes[i].object_index = i;
+ layer->planes[i].offset = 0;
+ layer->planes[i].pitch = fmt_bpl(fmt, i);
+ }
+ bpl0 = layer->planes[0].pitch;
+
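+    // For the single-V4L2-plane cases below, the chroma plane(s) live in the
+    // same dmabuf directly after the luma, so their offsets are synthesised
+    // here.  Worked example (illustrative): NV12 with bpl0 = 1024 and
+    // height = 600 gives planes[1].offset = 1024 * 600 = 614400, pitch 1024;
+    // YUV420 gets planes[1].pitch = 1024 / 2 = 512 and planes[2].offset =
+    // 614400 + (1024 * 600) / 4 = 768000.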
+ switch (fmt_pixelformat(fmt)) {
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
+        layer->format = DRM_FORMAT_NV12;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = fmt_width(fmt);
+ layer->planes[1].pitch = layer->planes[0].pitch;
+ break;
+#endif
+
+    case V4L2_PIX_FMT_NV12:
+        layer->format = DRM_FORMAT_NV12;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = bpl0 * height;
+ layer->planes[1].pitch = bpl0;
+ break;
+
+ case V4L2_PIX_FMT_YUV420:
+ layer->format = DRM_FORMAT_YUV420;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 3;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = bpl0 * height;
+ layer->planes[1].pitch = bpl0 / 2;
+ layer->planes[2].object_index = 0;
+ layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4);
+ layer->planes[2].pitch = bpl0 / 2;
+ break;
+
+ default:
+ drm_desc->nb_layers = 0;
+ return AVERROR(EINVAL);
+ }
+
+ drm_desc->nb_objects = 0;
+ for (i = 0; i < avbuf->num_planes; i++) {
+ memset(&expbuf, 0, sizeof(expbuf));
+
+ expbuf.index = avbuf->buffer.index;
+ expbuf.type = avbuf->buffer.type;
+ expbuf.plane = i;
+
+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
+ if (ret < 0)
+ return AVERROR(errno);
+
+ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ?
+ avbuf->buffer.m.planes[i].length : avbuf->buffer.length;
+ drm_desc->objects[i].fd = expbuf.fd;
+ drm_desc->objects[i].format_modifier = mod;
+ drm_desc->nb_objects = i + 1;
+ }
+
+ return 0;
+}
+
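+// Standard V4L2 buffer negotiation: VIDIOC_REQBUFS to allocate (DMABUF on
+// the output/source side, MMAP on capture), VIDIOC_QUERYBUF to fill in the
+// plane info, then capture buffers are queued and each plane exported as a
+// dmabuf (VIDIOC_EXPBUF) for the DRM frame descriptor.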
+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
+{
+ struct v4l2_format *fmt = &queue->format;
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ struct v4l2_requestbuffers req;
+ int ret, i, multiplanar;
+ uint32_t memory;
+
+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+
+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
+
+ memset(&req, 0, sizeof(req));
+ req.count = queue->num_buffers;
+ req.memory = memory;
+ req.type = fmt->type;
+
+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
+ if (ret < 0) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
+
+ return AVERROR(errno);
+ }
+
+ queue->num_buffers = req.count;
+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
+ if (!queue->buffers) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
+
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < queue->num_buffers; i++) {
+ V4L2Buffer * const buf = &queue->buffers[i];
+
+ buf->enqueued = 0;
+ buf->q = queue;
+
+ buf->buffer.type = fmt->type;
+ buf->buffer.memory = memory;
+ buf->buffer.index = i;
+
+ if (multiplanar) {
+ buf->buffer.length = VIDEO_MAX_PLANES;
+ buf->buffer.m.planes = buf->planes;
+ }
+
+ drm_frame_init(&buf->drm_frame);
+ }
+
+ for (i = 0; i < queue->num_buffers; i++) {
+ V4L2Buffer * const buf = &queue->buffers[i];
+
+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
+ if (ret < 0) {
+ ret = AVERROR(errno);
+
+ goto fail;
+ }
+
+ buf->num_planes = multiplanar ? buf->buffer.length : 1;
+
+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
+ ret = deint_v4l2m2m_enqueue_buffer(buf);
+ if (ret)
+ goto fail;
+
+ ret = v4l2_buffer_export_drm(queue, buf);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ avbufs_delete(&queue->buffers, queue->num_buffers);
+ queue->num_buffers = 0;
+ return ret;
+}
+
+static int deint_v4l2m2m_streamon(V4L2Queue *queue)
+{
+ DeintV4L2M2MContextShared * const ctx = queue->ctx;
+ int type = queue->format.type;
+ int ret;
+
+ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
+ if (ret < 0)
+ return AVERROR(errno);
+
+ return 0;
+}
+
+static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
+{
+ DeintV4L2M2MContextShared * const ctx = queue->ctx;
+ int type = queue->format.type;
+ int ret;
+
+ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
+ if (ret < 0)
+ return AVERROR(errno);
+
+ return 0;
+}
+
+// timeout in ms
+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
+{
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ struct v4l2_buffer buf = { 0 };
+ V4L2Buffer* avbuf = NULL;
+ struct pollfd pfd;
+ short events;
+ int ret;
+
+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
+ events = POLLOUT | POLLWRNORM;
+ else
+ events = POLLIN | POLLRDNORM;
+
+ pfd.events = events;
+ pfd.fd = ctx->fd;
+
+ for (;;) {
+ ret = poll(&pfd, 1, timeout);
+ if (ret > 0)
+ break;
+ if (errno == EINTR)
+ continue;
+ return NULL;
+ }
+
+ if (pfd.revents & POLLERR)
+ return NULL;
+
+ if (pfd.revents & events) {
+ memset(&buf, 0, sizeof(buf));
+ buf.memory = V4L2_MEMORY_MMAP;
+ buf.type = queue->format.type;
+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
+ memset(planes, 0, sizeof(planes));
+ buf.length = VIDEO_MAX_PLANES;
+ buf.m.planes = planes;
+ }
+
+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
+ if (ret) {
+ if (errno != EAGAIN)
+ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
+ av_err2str(AVERROR(errno)));
+ return NULL;
+ }
+
+ avbuf = &queue->buffers[buf.index];
+ avbuf->enqueued = 0;
+ avbuf->buffer = buf;
+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
+ memcpy(avbuf->planes, planes, sizeof(planes));
+ avbuf->buffer.m.planes = avbuf->planes;
+ }
+ return avbuf;
+ }
+
+ return NULL;
+}
+
+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
+{
+ int i;
+ V4L2Buffer *buf = NULL;
+
+ for (i = 0; i < queue->num_buffers; i++)
+ if (!queue->buffers[i].enqueued) {
+ buf = &queue->buffers[i];
+ break;
+ }
+ return buf;
+}
+
+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
+{
+ int i;
+ V4L2Buffer *buf = NULL;
+
+ if (!queue || !queue->buffers)
+ return;
+ for (i = 0; i < queue->num_buffers; i++) {
+ buf = &queue->buffers[i];
+ if (queue->buffers[i].enqueued)
+ av_frame_unref(&buf->frame);
+ }
+}
+
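+// Dequeue, with zero timeout, everything the driver has finished with so
+// that the source AVFrames backing those buffers are released promptly.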
+static void recycle_q(V4L2Queue * const queue)
+{
+ V4L2Buffer* avbuf;
+ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
+ av_frame_unref(&avbuf->frame);
+ }
+}
+
+static int count_enqueued(V4L2Queue *queue)
+{
+ int i;
+ int n = 0;
+
+ if (queue->buffers == NULL)
+ return 0;
+
+ for (i = 0; i < queue->num_buffers; i++)
+ if (queue->buffers[i].enqueued)
+ ++n;
+ return n;
+}
+
+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
+{
+ DeintV4L2M2MContextShared *const ctx = queue->ctx;
+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
+ V4L2Buffer *buf;
+ int i;
+
+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
+ recycle_q(queue);
+
+ buf = deint_v4l2m2m_find_free_buf(queue);
+ if (!buf) {
+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: no free buffer found\n", __func__);
+ return AVERROR(EAGAIN);
+ }
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
+ for (i = 0; i < drm_desc->nb_objects; i++)
+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
+ else
+ buf->buffer.m.fd = drm_desc->objects[0].fd;
+
+ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
+ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
+ V4L2_FIELD_INTERLACED_BT;
+
+ if (ctx->field_order != buf->buffer.field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
+ ctx->field_order = buf->buffer.field;
+ }
+
+ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
+
+ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
+
+ av_frame_move_ref(&buf->frame, frame);
+
+ return deint_v4l2m2m_enqueue_buffer(buf);
+}
+
+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
+{
+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
+ V4L2Queue *capture = &ctx->capture;
+ V4L2Queue *output = &ctx->output;
+
+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
+
+ if (ctx->fd >= 0) {
+ deint_v4l2m2m_streamoff(capture);
+ deint_v4l2m2m_streamoff(output);
+ }
+
+ avbufs_delete(&capture->buffers, capture->num_buffers);
+
+ deint_v4l2m2m_unref_queued(output);
+
+ av_buffer_unref(&ctx->hw_frames_ctx);
+
+ if (capture->buffers)
+ av_free(capture->buffers);
+
+ if (output->buffers)
+ av_free(output->buffers);
+
+ if (ctx->fd >= 0) {
+ close(ctx->fd);
+ ctx->fd = -1;
+ }
+
+ av_free(ctx);
+ }
+}
+
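+// AVBufferRef free callback for frames sent downstream: recycle the capture
+// buffer back to the driver unless the context has already been shut down,
+// then drop our reference on the shared context.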
+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
+{
+ V4L2Buffer *buf = opaque;
+ DeintV4L2M2MContextShared *ctx = buf->q->ctx;
+
+ if (!ctx->done)
+ deint_v4l2m2m_enqueue_buffer(buf);
+
+ deint_v4l2m2m_destroy_context(ctx);
+}
+
+// timeout in ms
+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
+{
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ V4L2Buffer* avbuf;
+ enum AVColorPrimaries color_primaries;
+ enum AVColorSpace colorspace;
+ enum AVColorTransferCharacteristic color_trc;
+ enum AVColorRange color_range;
+
+ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ if (queue->eos) {
+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__);
+ return AVERROR_EOF;
+ }
+
+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
+ if (!avbuf) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
+ return AVERROR(EAGAIN);
+ }
+
+ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) {
+ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0)
+ queue->eos = 1;
+ if (buf_bytesused0(&avbuf->buffer) == 0)
+ return queue->eos ? AVERROR_EOF : AVERROR(EINVAL);
+ }
+
+    // Fill in PTS and ancillary info from src frame
+ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
+
+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
+ sizeof(avbuf->drm_frame), v4l2_free_buffer,
+ avbuf, AV_BUFFER_FLAG_READONLY);
+ if (!frame->buf[0]) {
+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error creating buffer\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+
+ atomic_fetch_add(&ctx->refcount, 1);
+
+ frame->data[0] = (uint8_t *)&avbuf->drm_frame;
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ if (ctx->hw_frames_ctx)
+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
+ frame->height = ctx->output_height;
+ frame->width = ctx->output_width;
+
+ color_primaries = get_color_primaries(&ctx->capture.format);
+ colorspace = get_color_space(&ctx->capture.format);
+ color_trc = get_color_trc(&ctx->capture.format);
+ color_range = get_color_range(&ctx->capture.format);
+
+ // If the color parameters are unspecified by V4L2 then leave alone as they
+ // will have been copied from src
+ if (color_primaries != AVCOL_PRI_UNSPECIFIED)
+ frame->color_primaries = color_primaries;
+ if (colorspace != AVCOL_SPC_UNSPECIFIED)
+ frame->colorspace = colorspace;
+ if (color_trc != AVCOL_TRC_UNSPECIFIED)
+ frame->color_trc = color_trc;
+ if (color_range != AVCOL_RANGE_UNSPECIFIED)
+ frame->color_range = color_range;
+
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) {
+ // Not interlaced now
+ frame->interlaced_frame = 0; // *** Fill in from dst buffer?
+ frame->top_field_first = 0;
+ // Pkt duration halved
+ frame->pkt_duration /= 2;
+ }
+
+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
+ }
+
+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
+ return 0;
+}
+
+static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
+{
+ AVFilterLink *inlink = outlink->src->inputs[0];
+ AVFilterContext *avctx = outlink->src;
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+ int ret;
+
+ ctx->height = avctx->inputs[0]->h;
+ ctx->width = avctx->inputs[0]->w;
+
+ if (ctx->filter_type == FILTER_V4L2_SCALE) {
+ if ((ret = ff_scale_eval_dimensions(priv,
+ priv->w_expr, priv->h_expr,
+ inlink, outlink,
+ &ctx->output_width, &ctx->output_height)) < 0)
+ return ret;
+
+ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height,
+ priv->force_original_aspect_ratio, priv->force_divisible_by);
+ }
+ else {
+ ctx->output_width = ctx->width;
+ ctx->output_height = ctx->height;
+ }
+
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__,
+ ctx->width, ctx->height, ctx->output_width, ctx->output_height,
+ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den);
+
+ outlink->time_base = inlink->time_base;
+ outlink->w = ctx->output_width;
+ outlink->h = ctx->output_height;
+ outlink->format = inlink->format;
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
+ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den};
+
+ if (inlink->sample_aspect_ratio.num)
+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+ else
+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+ ret = deint_v4l2m2m_find_device(ctx);
+ if (ret)
+ return ret;
+
+ if (inlink->hw_frames_ctx) {
+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
+ if (!ctx->hw_frames_ctx)
+ return AVERROR(ENOMEM);
+ }
+ return 0;
+}
+
+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
+{
+ static const enum AVPixelFormat pixel_formats[] = {
+ AV_PIX_FMT_DRM_PRIME,
+// AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_NONE,
+ };
+
+ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats));
+}
+
+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
+{
+ const uint64_t mod = drm_desc->objects[0].format_modifier;
+ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID);
+
+    // We only currently support single-object layouts
+ if (drm_desc->nb_objects != 1)
+ return 0;
+
+ switch (drm_desc->layers[0].format) {
+ case DRM_FORMAT_YUV420:
+ return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
+ case DRM_FORMAT_NV12:
+ return is_linear ? V4L2_PIX_FMT_NV12 :
+#if CONFIG_SAND
+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 :
+#endif
+ 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+ AVFilterContext *avctx = link->dst;
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+ V4L2Queue *capture = &ctx->capture;
+ V4L2Queue *output = &ctx->output;
+ int ret;
+
+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n",
+ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
+ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
+ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
+
+ if (ctx->field_order == V4L2_FIELD_ANY) {
+ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
+ uint32_t pixelformat = desc_pixelformat(drm_desc);
+
+ if (pixelformat == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
+ av_fourcc2str(drm_desc->layers[0].format),
+ drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
+ return AVERROR(EINVAL);
+ }
+
+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
+
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
+ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
+
+ if ((ret = set_src_fmt(output, in)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n",
+ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier);
+ return ret;
+ }
+
+ ret = do_s_fmt(output);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n");
+ return ret;
+ }
+
+ if (ctx->output_format != AV_PIX_FMT_NONE)
+ pixelformat = fmt_av_to_v4l2(ctx->output_format);
+ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n");
+ return ret;
+ }
+
+ ret = deint_v4l2m2m_allocate_buffers(capture);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n");
+ return ret;
+ }
+
+ ret = deint_v4l2m2m_streamon(capture);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret));
+ return ret;
+ }
+
+ ret = deint_v4l2m2m_allocate_buffers(output);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n");
+ return ret;
+ }
+
+ ret = deint_v4l2m2m_streamon(output);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret));
+ return ret;
+ }
+
+ if (in->top_field_first)
+ ctx->field_order = V4L2_FIELD_INTERLACED_TB;
+ else
+ ctx->field_order = V4L2_FIELD_INTERLACED_BT;
+
+ {
+ struct v4l2_encoder_cmd ecmd = {
+ .cmd = V4L2_ENC_CMD_STOP
+ };
+ ctx->has_enc_stop = 0;
+ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n");
+ ctx->has_enc_stop = 1;
+ }
+ else {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno)));
+ }
+
+ }
+ }
+
+ ret = deint_v4l2m2m_enqueue_frame(output, in);
+
+ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
+ return ret;
+}
+
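+// Acknowledge upstream EOS and pick a drain strategy: if nothing has been
+// queued yet we are already done; one-to-one filters (scale) just wait for
+// the last tracked frame; otherwise try V4L2_ENC_CMD_STOP so the capture
+// queue signals EOS, falling back to a timeout-based drain.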
+static int
+ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s,
+ AVFilterLink * const inlink)
+{
+ int instatus;
+ int64_t inpts;
+
+ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0)
+ return 0;
+
+ s->drain = instatus;
+ s->drain_pts = inpts;
+ s->drain_state = DRAIN_TIMEOUT;
+
+ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started
+ s->drain_state = DRAIN_DONE;
+ }
+ else if (s->one_to_one) {
+ s->drain_state = DRAIN_LAST;
+ }
+ else if (s->has_enc_stop) {
+ struct v4l2_encoder_cmd ecmd = {
+ .cmd = V4L2_ENC_CMD_STOP
+ };
+ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) {
+ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n");
+ s->drain_state = DRAIN_EOS;
+ }
+ else {
+ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno)));
+ }
+ }
+ return 1;
+}
+
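+// Activate: first try to dequeue a completed capture buffer and forward it
+// downstream, then top the output (source) queue back up, consuming input
+// frames or requesting more / propagating EOS as the ff_inlink/ff_outlink
+// contract requires.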
+static int deint_v4l2m2m_activate(AVFilterContext *avctx)
+{
+ DeintV4L2M2MContext * const priv = avctx->priv;
+ DeintV4L2M2MContextShared *const s = priv->shared;
+ AVFilterLink * const outlink = avctx->outputs[0];
+ AVFilterLink * const inlink = avctx->inputs[0];
+ int n = 0;
+ int cn = 99;
+ int did_something = 0;
+
+ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
+
+ ack_inlink(avctx, s, inlink);
+
+ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup!
+ {
+ AVFrame * frame = av_frame_alloc();
+ int rv;
+
+ recycle_q(&s->output);
+ n = count_enqueued(&s->output);
+
+ if (frame == NULL) {
+ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+
+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame,
+ drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0);
+ if (rv != 0) {
+ av_frame_free(&frame);
+ if (rv == AVERROR_EOF) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
+ else if (rv == AVERROR(EAGAIN)) {
+ if (s->drain_state != DRAIN_NONE) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
+ }
+ else {
+ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
+ }
+ else {
+ frame->interlaced_frame = 0;
+ // frame is always consumed by filter_frame - even on error despite
+ // a somewhat confusing comment in the header
+ rv = ff_filter_frame(outlink, frame);
+ ++s->frames_tx;
+
+ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
+ did_something = 1;
+
+ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
+ }
+
+ cn = count_enqueued(&s->capture);
+ }
+
+ if (s->drain_state == DRAIN_DONE) {
+ ff_outlink_set_status(outlink, s->drain, s->drain_pts);
+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain));
+ return 0;
+ }
+
+ recycle_q(&s->output);
+ n = count_enqueued(&s->output);
+
+ while (n < 6 && !s->drain) {
+ AVFrame * frame;
+ int rv;
+
+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
+
+ if (frame == NULL) {
+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
+ if (!ack_inlink(avctx, s, inlink)) {
+ ff_inlink_request_frame(inlink);
+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
+ }
+ break;
+ }
+ ++s->frames_rx;
+
+ rv = deint_v4l2m2m_filter_frame(inlink, frame);
+ av_frame_free(&frame);
+
+ if (rv != 0)
+ return rv;
+
+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
+ did_something = 1;
+ ++n;
+ }
+
+ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) {
+ ff_filter_set_ready(avctx, 1);
+ did_something = 1;
+ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
+ }
+
+ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
+ return did_something ? 0 : FFERROR_NOT_READY;
+}
+
+static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type)
+{
+ DeintV4L2M2MContext * const priv = avctx->priv;
+ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
+
+ if (!ctx) {
+        av_log(priv, AV_LOG_ERROR, "%s: error allocating context\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+ priv->shared = ctx;
+ ctx->logctx = priv;
+ ctx->filter_type = filter_type;
+ ctx->fd = -1;
+ ctx->output.ctx = ctx;
+ ctx->output.num_buffers = 8;
+ ctx->output.name = "OUTPUT";
+ ctx->capture.ctx = ctx;
+ ctx->capture.num_buffers = 12;
+ ctx->capture.name = "CAPTURE";
+ ctx->done = 0;
+ ctx->field_order = V4L2_FIELD_ANY;
+
+    if (pts_track_init(&ctx->track, priv) != 0)
+        return AVERROR(ENOMEM);
+
+ atomic_init(&ctx->refcount, 1);
+
+ if (priv->output_format_string) {
+ ctx->output_format = av_get_pix_fmt(priv->output_format_string);
+ if (ctx->output_format == AV_PIX_FMT_NONE) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string);
+ return AVERROR(EINVAL);
+ }
+ if (fmt_av_to_v4l2(ctx->output_format) == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format));
+ return AVERROR(EINVAL);
+ }
+ } else {
+ // Use the input format once that is configured.
+ ctx->output_format = AV_PIX_FMT_NONE;
+ }
+
+#define STRING_OPTION(var_name, func_name, default_value) do { \
+ if (priv->var_name ## _string) { \
+ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \
+ if (var < 0) { \
+ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \
+ return AVERROR(EINVAL); \
+ } \
+ priv->var_name = var; \
+ } else { \
+ priv->var_name = default_value; \
+ } \
+ } while (0)
+
+ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
+ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED);
+ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED);
+ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED);
+
+ return 0;
+}
+
+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
+{
+ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE);
+}
+
+static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
+{
+ int rv;
+ DeintV4L2M2MContext * priv;
+ DeintV4L2M2MContextShared * ctx;
+
+ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0)
+ return rv;
+
+ priv = avctx->priv;
+ ctx = priv->shared;
+
+ ctx->one_to_one = 1;
+ return 0;
+}
+
+static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
+{
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+
+ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n",
+ ctx->frames_rx, ctx->frames_tx);
+ ctx->done = 1;
+    ctx->logctx = NULL; // Logging to NULL works; logging via a freed context crashes
+ pts_track_uninit(&ctx->track);
+ deint_v4l2m2m_destroy_context(ctx);
+}
+
+static const AVOption deinterlace_v4l2m2m_options[] = {
+ { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
+
+#define OFFSET(x) offsetof(DeintV4L2M2MContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption scale_v4l2m2m_options[] = {
+ { "w", "Output video width",
+ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
+ { "h", "Output video height",
+ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
+ { "format", "Output video format (software format of hardware frames)",
+ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS },
+ // These colour properties match the ones of the same name in vf_scale.
+ { "out_color_matrix", "Output colour matrix coefficient set",
+ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
+ { "out_range", "Output colour range",
+ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED },
+ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" },
+ { "full", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ { "limited", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "jpeg", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ { "mpeg", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "tv", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "pc", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ // These colour properties match the ones in the VAAPI scaler
+ { "out_color_primaries", "Output colour primaries",
+ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "out_color_transfer", "Output colour transfer characteristics",
+ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "out_chroma_location", "Output chroma sample location",
+ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
+ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS },
+ { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(scale_v4l2m2m);
+
+static const AVFilterPad deint_v4l2m2m_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ },
+ { NULL }
+};
+
+static const AVFilterPad deint_v4l2m2m_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .config_props = deint_v4l2m2m_config_props,
+ },
+ { NULL }
+};
+
+AVFilter ff_vf_deinterlace_v4l2m2m = {
+ .name = "deinterlace_v4l2m2m",
+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
+ .priv_size = sizeof(DeintV4L2M2MContext),
+ .init = &deint_v4l2m2m_init,
+ .uninit = &deint_v4l2m2m_uninit,
+ .query_formats = &deint_v4l2m2m_query_formats,
+ .inputs = deint_v4l2m2m_inputs,
+ .outputs = deint_v4l2m2m_outputs,
+ .priv_class = &deinterlace_v4l2m2m_class,
+ .activate = deint_v4l2m2m_activate,
+};
+
+AVFilter ff_vf_scale_v4l2m2m = {
+ .name = "scale_v4l2m2m",
+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"),
+ .priv_size = sizeof(DeintV4L2M2MContext),
+ .init = &scale_v4l2m2m_init,
+ .uninit = &deint_v4l2m2m_uninit,
+ .query_formats = &deint_v4l2m2m_query_formats,
+ .inputs = deint_v4l2m2m_inputs,
+ .outputs = deint_v4l2m2m_outputs,
+ .priv_class = &scale_v4l2m2m_class,
+ .activate = deint_v4l2m2m_activate,
+};
+
diff --git a/libavfilter/vf_frei0r.c b/libavfilter/vf_frei0r.c
index ed0ba9f866..2ec4707d97 100644
--- a/libavfilter/vf_frei0r.c
+++ b/libavfilter/vf_frei0r.c
@@ -353,20 +353,14 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
Frei0rContext *s = inlink->dst->priv;
AVFilterLink *outlink = inlink->dst->outputs[0];
- AVFrame *out = ff_default_get_video_buffer2(outlink, outlink->w, outlink->h, 16);
- if (!out)
- goto fail;
+ AVFrame *out;
- av_frame_copy_props(out, in);
-
- if (in->linesize[0] != out->linesize[0]) {
- AVFrame *in2 = ff_default_get_video_buffer2(outlink, outlink->w, outlink->h, 16);
- if (!in2)
- goto fail;
- av_frame_copy(in2, in);
+ out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+ if (!out) {
av_frame_free(&in);
- in = in2;
+ return AVERROR(ENOMEM);
}
+ av_frame_copy_props(out, in);
s->update(s->instance, in->pts * av_q2d(inlink->time_base) * 1000,
(const uint32_t *)in->data[0],
@@ -375,10 +369,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
av_frame_free(&in);
return ff_filter_frame(outlink, out);
-fail:
- av_frame_free(&in);
- av_frame_free(&out);
- return AVERROR(ENOMEM);
}
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
@@ -477,7 +467,7 @@ static int source_config_props(AVFilterLink *outlink)
static int source_request_frame(AVFilterLink *outlink)
{
Frei0rContext *s = outlink->src->priv;
- AVFrame *frame = ff_default_get_video_buffer2(outlink, outlink->w, outlink->h, 16);
+ AVFrame *frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
if (!frame)
return AVERROR(ENOMEM);
diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c
index cc08722b06..02ae2edcb9 100644
--- a/libavfilter/vf_idet.c
+++ b/libavfilter/vf_idet.c
@@ -336,19 +336,20 @@ static int request_frame(AVFilterLink *link)
static av_cold void uninit(AVFilterContext *ctx)
{
IDETContext *idet = ctx->priv;
+ int level = strncmp(ctx->name, "auto-inserted", 13) ? AV_LOG_INFO : AV_LOG_DEBUG;
- av_log(ctx, AV_LOG_INFO, "Repeated Fields: Neither:%6"PRId64" Top:%6"PRId64" Bottom:%6"PRId64"\n",
+ av_log(ctx, level, "Repeated Fields: Neither:%6"PRId64" Top:%6"PRId64" Bottom:%6"PRId64"\n",
idet->total_repeats[REPEAT_NONE],
idet->total_repeats[REPEAT_TOP],
idet->total_repeats[REPEAT_BOTTOM]
);
- av_log(ctx, AV_LOG_INFO, "Single frame detection: TFF:%6"PRId64" BFF:%6"PRId64" Progressive:%6"PRId64" Undetermined:%6"PRId64"\n",
+ av_log(ctx, level, "Single frame detection: TFF:%6"PRId64" BFF:%6"PRId64" Progressive:%6"PRId64" Undetermined:%6"PRId64"\n",
idet->total_prestat[TFF],
idet->total_prestat[BFF],
idet->total_prestat[PROGRESSIVE],
idet->total_prestat[UNDETERMINED]
);
- av_log(ctx, AV_LOG_INFO, "Multi frame detection: TFF:%6"PRId64" BFF:%6"PRId64" Progressive:%6"PRId64" Undetermined:%6"PRId64"\n",
+ av_log(ctx, level, "Multi frame detection: TFF:%6"PRId64" BFF:%6"PRId64" Progressive:%6"PRId64" Undetermined:%6"PRId64"\n",
idet->total_poststat[TFF],
idet->total_poststat[BFF],
idet->total_poststat[PROGRESSIVE],
diff --git a/libavfilter/vf_scale.c b/libavfilter/vf_scale.c
index 788e4bab5a..3ca6ba2368 100644
--- a/libavfilter/vf_scale.c
+++ b/libavfilter/vf_scale.c
@@ -493,19 +493,19 @@ static int config_props(AVFilterLink *outlink)
if ((ret = scale_eval_dimensions(ctx)) < 0)
goto fail;
- outlink->w = scale->w;
- outlink->h = scale->h;
-
- ff_scale_adjust_dimensions(inlink, &outlink->w, &outlink->h,
+ ff_scale_adjust_dimensions(inlink, &scale->w, &scale->h,
scale->force_original_aspect_ratio,
scale->force_divisible_by);
- if (outlink->w > INT_MAX ||
- outlink->h > INT_MAX ||
- (outlink->h * inlink->w) > INT_MAX ||
- (outlink->w * inlink->h) > INT_MAX)
+ if (scale->w > INT_MAX ||
+ scale->h > INT_MAX ||
+ (scale->h * inlink->w) > INT_MAX ||
+ (scale->w * inlink->h) > INT_MAX)
av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
+ outlink->w = scale->w;
+ outlink->h = scale->h;
+
/* TODO: make algorithm configurable */
scale->input_is_pal = desc->flags & AV_PIX_FMT_FLAG_PAL;
@@ -684,9 +684,9 @@ static int scale_frame(AVFilterLink *link, AVFrame *in, AVFrame **frame_out)
goto scale;
if (scale->eval_mode == EVAL_MODE_INIT) {
- snprintf(buf, sizeof(buf) - 1, "%d", scale->w);
+ snprintf(buf, sizeof(buf)-1, "%d", outlink->w);
av_opt_set(scale, "w", buf, 0);
- snprintf(buf, sizeof(buf) - 1, "%d", scale->h);
+ snprintf(buf, sizeof(buf)-1, "%d", outlink->h);
av_opt_set(scale, "h", buf, 0);
ret = scale_parse_expr(ctx, NULL, &scale->w_pexpr, "width", scale->w_expr);
diff --git a/libavfilter/vf_showinfo.c b/libavfilter/vf_showinfo.c
index 0b67cd7205..6208892005 100644
--- a/libavfilter/vf_showinfo.c
+++ b/libavfilter/vf_showinfo.c
@@ -454,15 +454,12 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
av_log(ctx, AV_LOG_INFO, " %08"PRIX32, plane_checksum[plane]);
av_log(ctx, AV_LOG_INFO, "] mean:[");
for (plane = 0; plane < 4 && frame->data[plane] && frame->linesize[plane]; plane++)
- av_log(ctx, AV_LOG_INFO, "%s%"PRId64,
- plane ? " ":"",
- (sum[plane] + pixelcount[plane]/2) / pixelcount[plane]);
- av_log(ctx, AV_LOG_INFO, "] stdev:[");
+ av_log(ctx, AV_LOG_INFO, "%"PRId64" ", (sum[plane] + pixelcount[plane]/2) / pixelcount[plane]);
+ av_log(ctx, AV_LOG_INFO, "\b] stdev:[");
for (plane = 0; plane < 4 && frame->data[plane] && frame->linesize[plane]; plane++)
- av_log(ctx, AV_LOG_INFO, "%s%3.1f",
- plane ? " ":"",
+ av_log(ctx, AV_LOG_INFO, "%3.1f ",
sqrt((sum2[plane] - sum[plane]*(double)sum[plane]/pixelcount[plane])/pixelcount[plane]));
- av_log(ctx, AV_LOG_INFO, "]");
+ av_log(ctx, AV_LOG_INFO, "\b]");
}
av_log(ctx, AV_LOG_INFO, "\n");
diff --git a/libavfilter/vf_signature.c b/libavfilter/vf_signature.c
index 1205168f8f..32a6405e14 100644
--- a/libavfilter/vf_signature.c
+++ b/libavfilter/vf_signature.c
@@ -224,7 +224,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
dw1 = inlink->w / 32;
if (inlink->w % 32)
dw2 = dw1 + 1;
- denom = (sc->divide) ? dh1 * (int64_t)dh2 * dw1 * dw2 : 1;
+ denom = (sc->divide) ? dh1 * dh2 * dw1 * dw2 : 1;
for (i = 0; i < 32; i++) {
rowcount = 0;
@@ -250,7 +250,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
}
}
- denom = (sc->divide) ? 1 : dh1 * (int64_t)dh2 * dw1 * dw2;
+ denom = (sc->divide) ? 1 : dh1 * dh2 * dw1 * dw2;
for (i = 0; i < ELEMENT_COUNT; i++) {
const ElemCat* elemcat = elements[i];
diff --git a/libavfilter/vf_subtitles.c b/libavfilter/vf_subtitles.c
index b57dd80b13..de74afa2b7 100644
--- a/libavfilter/vf_subtitles.c
+++ b/libavfilter/vf_subtitles.c
@@ -145,16 +145,9 @@ static int config_input(AVFilterLink *inlink)
ff_draw_init(&ass->draw, inlink->format, ass->alpha ? FF_DRAW_PROCESS_ALPHA : 0);
ass_set_frame_size (ass->renderer, inlink->w, inlink->h);
- if (ass->original_w && ass->original_h) {
+ if (ass->original_w && ass->original_h)
ass_set_aspect_ratio(ass->renderer, (double)inlink->w / inlink->h,
(double)ass->original_w / ass->original_h);
-#if LIBASS_VERSION > 0x01010000
- ass_set_storage_size(ass->renderer, ass->original_w, ass->original_h);
- } else {
- ass_set_storage_size(ass->renderer, inlink->w, inlink->h);
-#endif
- }
-
if (ass->shaping != -1)
ass_set_shaper(ass->renderer, ass->shaping);
diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
new file mode 100644
index 0000000000..61c03a385c
--- /dev/null
+++ b/libavfilter/vf_unsand.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2007 Bobby Bingham
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * unsand video filter: convert Broadcom sand pixel formats to planar YUV
+ */
+
+#include <string.h>
+
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+typedef struct UnsandContext {
+ const AVClass *class;
+} UnsandContext;
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+}
+
+static av_cold int init(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+
+ return 0;
+}
+
+
+static int filter_frame(AVFilterLink *link, AVFrame *in)
+{
+ AVFilterLink * const outlink = link->dst->outputs[0];
+ AVFrame *out = NULL;
+ int rv = 0;
+
+ if (outlink->format == in->format) {
+ // If nothing to do then do nothing
+ out = in;
+ }
+ else
+ {
+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
+ {
+ rv = AVERROR(ENOMEM);
+ goto fail;
+ }
+ if (av_rpi_sand_to_planar_frame(out, in) != 0)
+ {
+ rv = -1;
+ goto fail;
+ }
+
+ av_frame_free(&in);
+ }
+
+ return ff_filter_frame(outlink, out);
+
+fail:
+ av_frame_free(&out);
+ av_frame_free(&in);
+ return rv;
+}
+
+#if 0
+static void dump_fmts(const AVFilterFormats * fmts)
+{
+ int i;
+ if (fmts == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ for (i = 0; i < fmts->nb_formats; ++i) {
+ printf(" %d", fmts->formats[i]);
+ }
+ printf("\n");
+}
+#endif
+
+static int query_formats(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+ int ret;
+
+ // If we aren't connected at both ends then just do nothing
+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
+ return 0;
+
+ // Our output formats depend on our input formats and we can't/don't
+ // want to convert between bit depths, so we need to wait for the source
+ // to have an opinion before we do
+ if (ctx->inputs[0]->incfg.formats == NULL)
+ return AVERROR(EAGAIN);
+
+ // Accept anything
+ if (ctx->inputs[0]->outcfg.formats == NULL &&
+ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0)
+ return ret;
+
+ // Filter out sand formats
+
+ // Generate a container if we don't already have one
+ if (ctx->outputs[0]->incfg.formats == NULL)
+ {
+ // Somewhat rubbish way of ensuring we have a good structure
+ static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
+ AVFilterFormats *formats = ff_make_format_list(out_fmts);
+
+ if (formats == NULL)
+ return AVERROR(ENOMEM);
+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
+ return ret;
+ }
+
+ // Replace old format list with new filtered list derived from what our
+ // input says it can do
+ {
+ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats;
+ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats;
+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
+ int i;
+ int n = 0;
+ int seen_420p = 0;
+ int seen_420p10 = 0;
+
+ if (dst_fmts == NULL)
+ return AVERROR(ENOMEM);
+
+ for (i = 0; i < src_ff->nb_formats; ++i) {
+ const enum AVPixelFormat f = src_ff->formats[i];
+
+ switch (f){
+ case AV_PIX_FMT_YUV420P:
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ if (!seen_420p) {
+ seen_420p = 1;
+ dst_fmts[n++] = AV_PIX_FMT_YUV420P;
+ }
+ break;
+ case AV_PIX_FMT_SAND64_10:
+ case AV_PIX_FMT_YUV420P10:
+ case AV_PIX_FMT_RPI4_10:
+ if (!seen_420p10) {
+ seen_420p10 = 1;
+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
+ }
+ break;
+ default:
+ dst_fmts[n++] = f;
+ break;
+ }
+ }
+
+ av_freep(&dst_ff->formats);
+ dst_ff->formats = dst_fmts;
+ dst_ff->nb_formats = n;
+ }
+
+// printf("Unsand: %s calc: ", __func__);
+// dump_fmts(ctx->outputs[0]->incfg.formats);
+
+ return 0;
+}
+
+
+#define OFFSET(x) offsetof(UnsandContext, x)
+static const AVOption unsand_options[] = {
+ { NULL }
+};
+
+
+AVFILTER_DEFINE_CLASS(unsand);
+
+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .filter_frame = filter_frame,
+ },
+ { NULL }
+};
+
+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO
+ },
+ { NULL }
+};
+
+AVFilter ff_vf_unsand = {
+ .name = "unsand",
+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
+
+ .init = init,
+ .uninit = uninit,
+
+ .query_formats = query_formats,
+
+ .priv_size = sizeof(UnsandContext),
+ .priv_class = &unsand_class,
+
+ .inputs = avfilter_vf_unsand_inputs,
+ .outputs = avfilter_vf_unsand_outputs,
+};
+
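
The new vf_unsand filter exists because the Pi's hardware decoders emit
frames in Broadcom's column-interleaved "sand" layouts (AV_PIX_FMT_SAND128,
AV_PIX_FMT_SAND64_10 and the RPI4 variants) that ordinary filters and
encoders cannot address; query_formats maps each sand format onto the planar
format of the same bit depth, waiting (EAGAIN) until the source has declared
its formats, and filter_frame copies through av_rpi_sand_to_planar_frame(),
passing already-planar frames straight through. A rough sketch of the
addressing this implies for the 8-bit SAND128 case, on the assumption that
the image is stored as vertical stripes 128 bytes wide, row-contiguous
within a stripe, with a fixed number of rows per stripe; illustrative only,
not the rpi_sand_fns implementation:

#include <stddef.h>
#include <stdint.h>

/* Address of luma byte (x, y); stripe_rows is the stripe height in rows,
 * carried alongside the base pointer in the frame geometry. */
static inline uint8_t *sand128_pixel(uint8_t *base, size_t stripe_rows,
                                     unsigned x, unsigned y)
{
    return base
        + (size_t)(x / 128) * stripe_rows * 128  /* skip earlier stripes  */
        + (size_t)y * 128                        /* row within the stripe */
        + x % 128;                               /* byte within the row   */
}

In a filtergraph the conversion is requested simply as "unsand"; downstream
elements then see plain yuv420p or yuv420p10.
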
diff --git a/libavfilter/vf_untile.c b/libavfilter/vf_untile.c
index df805141e0..9a2eb24901 100644
--- a/libavfilter/vf_untile.c
+++ b/libavfilter/vf_untile.c
@@ -139,8 +139,8 @@ static int activate(AVFilterContext *ctx)
if (!(s->desc->flags & AV_PIX_FMT_FLAG_PAL || s->desc->flags & FF_PSEUDOPAL)) {
for (i = 1; i < 3; i ++) {
if (out->data[i]) {
- out->data[i] += (y >> s->desc->log2_chroma_h) * out->linesize[i];
- out->data[i] += (x >> s->desc->log2_chroma_w) * s->max_step[i];
+ out->data[i] += (y >> s->desc->log2_chroma_w) * out->linesize[i];
+ out->data[i] += (x >> s->desc->log2_chroma_h) * s->max_step[i];
}
}
}
diff --git a/libavfilter/vf_w3fdif.c b/libavfilter/vf_w3fdif.c
index d380fdd4de..1a64b2b953 100644
--- a/libavfilter/vf_w3fdif.c
+++ b/libavfilter/vf_w3fdif.c
@@ -283,7 +283,7 @@ static int config_input(AVFilterLink *inlink)
AVFilterContext *ctx = inlink->dst;
W3FDIFContext *s = ctx->priv;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
- int ret, i, depth, nb_threads;
+ int ret, i, depth;
if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
return ret;
@@ -297,11 +297,10 @@ static int config_input(AVFilterLink *inlink)
}
s->nb_planes = av_pix_fmt_count_planes(inlink->format);
- nb_threads = ff_filter_get_nb_threads(ctx);
- s->work_line = av_calloc(nb_threads, sizeof(*s->work_line));
+ s->nb_threads = ff_filter_get_nb_threads(ctx);
+ s->work_line = av_calloc(s->nb_threads, sizeof(*s->work_line));
if (!s->work_line)
return AVERROR(ENOMEM);
- s->nb_threads = nb_threads;
for (i = 0; i < s->nb_threads; i++) {
s->work_line[i] = av_calloc(FFALIGN(s->linesize[0], 32), sizeof(*s->work_line[0]));
diff --git a/libavfilter/video.c b/libavfilter/video.c
index b049804419..7a8e587798 100644
--- a/libavfilter/video.c
+++ b/libavfilter/video.c
@@ -41,7 +41,7 @@ AVFrame *ff_null_get_video_buffer(AVFilterLink *link, int w, int h)
return ff_get_video_buffer(link->dst->outputs[0], w, h);
}
-AVFrame *ff_default_get_video_buffer2(AVFilterLink *link, int w, int h, int align)
+AVFrame *ff_default_get_video_buffer(AVFilterLink *link, int w, int h)
{
AVFrame *frame = NULL;
int pool_width = 0;
@@ -96,11 +96,6 @@ AVFrame *ff_default_get_video_buffer2(AVFilterLink *link, int w, int h, int alig
return frame;
}
-AVFrame *ff_default_get_video_buffer(AVFilterLink *link, int w, int h)
-{
- return ff_default_get_video_buffer2(link, w, h, av_cpu_max_align());
-}
-
AVFrame *ff_get_video_buffer(AVFilterLink *link, int w, int h)
{
AVFrame *ret = NULL;
diff --git a/libavfilter/video.h b/libavfilter/video.h
index f9174a4a0b..56c58d6766 100644
--- a/libavfilter/video.h
+++ b/libavfilter/video.h
@@ -24,7 +24,6 @@
#include "avfilter.h"
AVFrame *ff_default_get_video_buffer(AVFilterLink *link, int w, int h);
-AVFrame *ff_default_get_video_buffer2(AVFilterLink *link, int w, int h, int align);
AVFrame *ff_null_get_video_buffer(AVFilterLink *link, int w, int h);
/**
diff --git a/libavfilter/vsrc_mandelbrot.c b/libavfilter/vsrc_mandelbrot.c
index ed31a23c31..761c915103 100644
--- a/libavfilter/vsrc_mandelbrot.c
+++ b/libavfilter/vsrc_mandelbrot.c
@@ -134,9 +134,6 @@ static av_cold int init(AVFilterContext *ctx)
s-> next_cache= av_malloc_array(s->cache_allocated, sizeof(*s-> next_cache));
s-> zyklus = av_malloc_array(s->maxiter + 16, sizeof(*s->zyklus));
- if (!s->point_cache || !s->next_cache || !s->zyklus)
- return AVERROR(ENOMEM);
-
return 0;
}
diff --git a/libavformat/4xm.c b/libavformat/4xm.c
index cfee8a02f4..30f1b05324 100644
--- a/libavformat/4xm.c
+++ b/libavformat/4xm.c
@@ -137,8 +137,7 @@ static int parse_strk(AVFormatContext *s,
return AVERROR_INVALIDDATA;
track = AV_RL32(buf + 8);
- if ((unsigned)track >= UINT_MAX / sizeof(AudioTrack) - 1 ||
- track >= s->max_streams) {
+ if ((unsigned)track >= UINT_MAX / sizeof(AudioTrack) - 1) {
av_log(s, AV_LOG_ERROR, "current_track too large\n");
return AVERROR_INVALIDDATA;
}
@@ -149,9 +148,6 @@ static int parse_strk(AVFormatContext *s,
memset(&fourxm->tracks[fourxm->track_count], 0,
sizeof(AudioTrack) * (track + 1 - fourxm->track_count));
fourxm->track_count = track + 1;
- } else {
- if (fourxm->tracks[track].bits)
- return AVERROR_INVALIDDATA;
}
fourxm->tracks[track].adpcm = AV_RL32(buf + 12);
fourxm->tracks[track].channels = AV_RL32(buf + 36);
diff --git a/libavformat/aadec.c b/libavformat/aadec.c
index 90796c9599..2575e98153 100644
--- a/libavformat/aadec.c
+++ b/libavformat/aadec.c
@@ -130,8 +130,8 @@ static int aa_read_header(AVFormatContext *s)
AV_WB32(&header_key[idx * 4], header_key_part[idx]); // convert each part to BE!
}
av_log(s, AV_LOG_DEBUG, "Processed HeaderKey is ");
- for (int j = 0; j < 16; j++)
- av_log(s, AV_LOG_DEBUG, "%02x", header_key[j]);
+ for (i = 0; i < 16; i++)
+ av_log(s, AV_LOG_DEBUG, "%02x", header_key[i]);
av_log(s, AV_LOG_DEBUG, "\n");
} else {
av_dict_set(&s->metadata, key, val, 0);
diff --git a/libavformat/aaxdec.c b/libavformat/aaxdec.c
index ad893efadd..e69e5615ee 100644
--- a/libavformat/aaxdec.c
+++ b/libavformat/aaxdec.c
@@ -262,8 +262,6 @@ static int aax_read_header(AVFormatContext *s)
start = avio_rb32(pb);
size = avio_rb32(pb);
- if (!size)
- return AVERROR_INVALIDDATA;
a->segments[r].start = start + a->data_offset;
a->segments[r].end = a->segments[r].start + size;
} else {
diff --git a/libavformat/act.c b/libavformat/act.c
index f6edfb44ab..26425ca1bb 100644
--- a/libavformat/act.c
+++ b/libavformat/act.c
@@ -66,7 +66,6 @@ static int read_header(AVFormatContext *s)
AVIOContext *pb = s->pb;
int size;
AVStream* st;
- int ret;
int min,sec,msec;
@@ -76,9 +75,7 @@ static int read_header(AVFormatContext *s)
avio_skip(pb, 16);
size=avio_rl32(pb);
- ret = ff_get_wav_header(s, pb, st->codecpar, size, 0);
- if (ret < 0)
- return ret;
+ ff_get_wav_header(s, pb, st->codecpar, size, 0);
/*
8000Hz (Fine-rec) file format has 10 bytes long
diff --git a/libavformat/aiffdec.c b/libavformat/aiffdec.c
index f14044d61c..8b85fea809 100644
--- a/libavformat/aiffdec.c
+++ b/libavformat/aiffdec.c
@@ -53,9 +53,9 @@ static enum AVCodecID aiff_codec_get_id(int bps)
}
/* returns the size of the found tag */
-static int64_t get_tag(AVIOContext *pb, uint32_t * tag)
+static int get_tag(AVIOContext *pb, uint32_t * tag)
{
- int64_t size;
+ int size;
if (avio_feof(pb))
return AVERROR(EIO);
@@ -63,16 +63,16 @@ static int64_t get_tag(AVIOContext *pb, uint32_t * tag)
*tag = avio_rl32(pb);
size = avio_rb32(pb);
+ if (size < 0)
+ size = 0x7fffffff;
+
return size;
}
/* Metadata string read */
-static void get_meta(AVFormatContext *s, const char *key, int64_t size)
+static void get_meta(AVFormatContext *s, const char *key, int size)
{
- uint8_t *str = NULL;
-
- if (size < SIZE_MAX)
- str = av_malloc(size+1);
+ uint8_t *str = av_malloc(size+1);
if (str) {
int res = avio_read(s->pb, str, size);
@@ -89,7 +89,7 @@ static void get_meta(AVFormatContext *s, const char *key, int64_t size)
}
/* Returns the number of sound data frames or negative on error */
-static int get_aiff_header(AVFormatContext *s, int64_t size,
+static int get_aiff_header(AVFormatContext *s, int size,
unsigned version)
{
AVIOContext *pb = s->pb;
@@ -100,6 +100,9 @@ static int get_aiff_header(AVFormatContext *s, int64_t size,
int sample_rate;
unsigned int num_frames;
+ if (size == INT_MAX)
+ return AVERROR_INVALIDDATA;
+
if (size & 1)
size++;
par->codec_type = AVMEDIA_TYPE_AUDIO;
@@ -117,9 +120,6 @@ static int get_aiff_header(AVFormatContext *s, int64_t size,
sample_rate = val << exp;
else
sample_rate = (val + (1ULL<<(-exp-1))) >> -exp;
- if (sample_rate <= 0)
- return AVERROR_INVALIDDATA;
-
par->sample_rate = sample_rate;
if (size < 18)
return AVERROR_INVALIDDATA;
@@ -182,10 +182,8 @@ static int get_aiff_header(AVFormatContext *s, int64_t size,
par->block_align = (av_get_bits_per_sample(par->codec_id) * par->channels) >> 3;
if (aiff->block_duration) {
- par->bit_rate = av_rescale(par->sample_rate, par->block_align * 8LL,
- aiff->block_duration);
- if (par->bit_rate < 0)
- par->bit_rate = 0;
+ par->bit_rate = (int64_t)par->sample_rate * (par->block_align << 3) /
+ aiff->block_duration;
}
/* Chunk is over */
@@ -210,8 +208,7 @@ static int aiff_probe(const AVProbeData *p)
/* aiff input */
static int aiff_read_header(AVFormatContext *s)
{
- int ret;
- int64_t filesize, size;
+ int ret, size, filesize;
int64_t offset = 0, position;
uint32_t tag;
unsigned version = AIFF_C_VERSION1;
@@ -222,7 +219,7 @@ static int aiff_read_header(AVFormatContext *s)
/* check FORM header */
filesize = get_tag(pb, &tag);
- if (filesize < 4 || tag != MKTAG('F', 'O', 'R', 'M'))
+ if (filesize < 0 || tag != MKTAG('F', 'O', 'R', 'M'))
return AVERROR_INVALIDDATA;
/* AIFF data type */
@@ -249,7 +246,10 @@ static int aiff_read_header(AVFormatContext *s)
if (size < 0)
return size;
- filesize -= size + 8;
+ if (size >= 0x7fffffff - 8)
+ filesize = 0;
+ else
+ filesize -= size + 8;
switch (tag) {
case MKTAG('C', 'O', 'M', 'M'): /* Common chunk */
@@ -365,12 +365,10 @@ got_sound:
if (!st->codecpar->block_align && st->codecpar->codec_id == AV_CODEC_ID_QCELP) {
av_log(s, AV_LOG_WARNING, "qcelp without wave chunk, assuming full rate\n");
st->codecpar->block_align = 35;
- } else if (st->codecpar->block_align <= 0) {
+ } else if (!st->codecpar->block_align) {
av_log(s, AV_LOG_ERROR, "could not find COMM tag or invalid block_align value\n");
return -1;
}
- if (aiff->block_duration < 0)
- return AVERROR_INVALIDDATA;
/* Now positioned, get the sound data start and end */
avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate);
@@ -425,7 +423,7 @@ static int aiff_read_packet(AVFormatContext *s,
pkt->flags &= ~AV_PKT_FLAG_CORRUPT;
/* Only one stream in an AIFF file */
pkt->stream_index = 0;
- pkt->duration = (res / st->codecpar->block_align) * (int64_t) aiff->block_duration;
+ pkt->duration = (res / st->codecpar->block_align) * aiff->block_duration;
return 0;
}
diff --git a/libavformat/ape.c b/libavformat/ape.c
index 7ced92cf76..2698c770ee 100644
--- a/libavformat/ape.c
+++ b/libavformat/ape.c
@@ -42,8 +42,8 @@
typedef struct APEFrame {
int64_t pos;
- int64_t size;
int nblocks;
+ int size;
int skip;
int64_t pts;
} APEFrame;
@@ -130,7 +130,7 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx)
av_log(s, AV_LOG_DEBUG, "\nFrames\n\n");
for (i = 0; i < ape_ctx->totalframes; i++)
- av_log(s, AV_LOG_DEBUG, "%8d %8"PRId64" %8"PRId64" (%d samples)\n", i,
+ av_log(s, AV_LOG_DEBUG, "%8d %8"PRId64" %8d (%d samples)\n", i,
ape_ctx->frames[i].pos, ape_ctx->frames[i].size,
ape_ctx->frames[i].nblocks);
@@ -148,8 +148,7 @@ static int ape_read_header(AVFormatContext * s)
AVStream *st;
uint32_t tag;
int i, ret;
- int total_blocks;
- int64_t final_size = 0;
+ int total_blocks, final_size = 0;
int64_t pts, file_size;
/* Skip any leading junk such as id3v2 tags */
@@ -301,8 +300,6 @@ static int ape_read_header(AVFormatContext * s)
ape->frames[i].pos -= ape->frames[i].skip;
ape->frames[i].size += ape->frames[i].skip;
}
- if (ape->frames[i].size > INT_MAX - 3)
- return AVERROR_INVALIDDATA;
ape->frames[i].size = (ape->frames[i].size + 3) & ~3;
}
if (ape->fileversion < 3810) {
@@ -400,7 +397,7 @@ static int ape_read_packet(AVFormatContext * s, AVPacket * pkt)
if (ape->frames[ape->currentframe].size <= 0 ||
ape->frames[ape->currentframe].size > INT_MAX - extra_size) {
- av_log(s, AV_LOG_ERROR, "invalid packet size: %8"PRId64"\n",
+ av_log(s, AV_LOG_ERROR, "invalid packet size: %d\n",
ape->frames[ape->currentframe].size);
ape->currentframe++;
return AVERROR(EIO);
diff --git a/libavformat/aqtitledec.c b/libavformat/aqtitledec.c
index 960a5d8ef5..81630d73b0 100644
--- a/libavformat/aqtitledec.c
+++ b/libavformat/aqtitledec.c
@@ -74,8 +74,7 @@ static int aqt_read_header(AVFormatContext *s)
new_event = 1;
pos = avio_tell(s->pb);
if (sub) {
- if (frame >= sub->pts && (uint64_t)frame - sub->pts < INT64_MAX)
- sub->duration = frame - sub->pts;
+ sub->duration = frame - sub->pts;
sub = NULL;
}
} else if (*line) {
diff --git a/libavformat/argo_asf.c b/libavformat/argo_asf.c
index 06d62442b3..8e2bf21c71 100644
--- a/libavformat/argo_asf.c
+++ b/libavformat/argo_asf.c
@@ -422,7 +422,7 @@ static int argo_asf_write_trailer(AVFormatContext *s)
ArgoASFMuxContext *ctx = s->priv_data;
int64_t ret;
- if ((ret = avio_seek(s->pb, ASF_FILE_HEADER_SIZE, SEEK_SET)) < 0)
+ if ((ret = avio_seek(s->pb, ASF_FILE_HEADER_SIZE, SEEK_SET) < 0))
return ret;
avio_wl32(s->pb, (uint32_t)ctx->nb_blocks);
diff --git a/libavformat/asfdec_f.c b/libavformat/asfdec_f.c
index add0d33540..c0265af20d 100644
--- a/libavformat/asfdec_f.c
+++ b/libavformat/asfdec_f.c
@@ -104,7 +104,7 @@ typedef struct ASFContext {
int ts_is_pts;
int packet_multi_size;
int packet_time_delta;
- int64_t packet_time_start;
+ int packet_time_start;
int64_t packet_pos;
int stream_index;
@@ -1321,12 +1321,10 @@ static int asf_parse_packet(AVFormatContext *s, AVIOContext *pb, AVPacket *pkt)
if ((ret = av_new_packet(&asf_st->pkt, asf_st->packet_obj_size)) < 0)
return ret;
asf_st->seq = asf->packet_seq;
- if (asf->packet_frag_timestamp != AV_NOPTS_VALUE) {
- if (asf->ts_is_pts) {
- asf_st->pkt.pts = asf->packet_frag_timestamp - asf->hdr.preroll;
- } else
- asf_st->pkt.dts = asf->packet_frag_timestamp - asf->hdr.preroll;
- }
+ if (asf->ts_is_pts) {
+ asf_st->pkt.pts = asf->packet_frag_timestamp - asf->hdr.preroll;
+ } else
+ asf_st->pkt.dts = asf->packet_frag_timestamp - asf->hdr.preroll;
asf_st->pkt.stream_index = asf->stream_index;
asf_st->pkt.pos = asf_st->packet_pos = asf->packet_pos;
asf_st->pkt_clean = 0;
diff --git a/libavformat/asfdec_o.c b/libavformat/asfdec_o.c
index 3a9e590a5b..f98ffc76fa 100644
--- a/libavformat/asfdec_o.c
+++ b/libavformat/asfdec_o.c
@@ -113,7 +113,6 @@ typedef struct ASFContext {
int64_t data_offset;
int64_t first_packet_offset; // packet offset
int64_t unknown_offset; // for top level header objects or subobjects without specified behavior
- int in_asf_read_unknown;
// ASF file must not contain more than 128 streams according to the specification
ASFStream *asf_st[ASF_MAX_STREAMS];
@@ -178,7 +177,7 @@ static int asf_read_unknown(AVFormatContext *s, const GUIDParseTable *g)
uint64_t size = avio_rl64(pb);
int ret;
- if (size > INT64_MAX || asf->in_asf_read_unknown > 5)
+ if (size > INT64_MAX)
return AVERROR_INVALIDDATA;
if (asf->is_header)
@@ -187,11 +186,8 @@ static int asf_read_unknown(AVFormatContext *s, const GUIDParseTable *g)
if (!g->is_subobject) {
if (!(ret = strcmp(g->name, "Header Extension")))
avio_skip(pb, 22); // skip reserved fields and Data Size
- asf->in_asf_read_unknown ++;
- ret = detect_unknown_subobject(s, asf->unknown_offset,
- asf->unknown_size);
- asf->in_asf_read_unknown --;
- if (ret < 0)
+ if ((ret = detect_unknown_subobject(s, asf->unknown_offset,
+ asf->unknown_size)) < 0)
return ret;
} else {
if (size < 24) {
@@ -1354,8 +1350,6 @@ static int asf_read_packet_header(AVFormatContext *s)
unsigned char error_flags, len_flags, pay_flags;
asf->packet_offset = avio_tell(pb);
- if (asf->packet_offset > INT64_MAX/2)
- asf->packet_offset = 0;
error_flags = avio_r8(pb); // read Error Correction Flags
if (error_flags & ASF_PACKET_FLAG_ERROR_CORRECTION_PRESENT) {
if (!(error_flags & ASF_ERROR_CORRECTION_LENGTH_TYPE)) {
diff --git a/libavformat/avidec.c b/libavformat/avidec.c
index 75b05ab5d5..542161e360 100644
--- a/libavformat/avidec.c
+++ b/libavformat/avidec.c
@@ -79,8 +79,6 @@ typedef struct AVIContext {
int stream_index;
DVDemuxContext *dv_demux;
int odml_depth;
- int64_t odml_read;
- int64_t odml_max_pos;
int use_odml;
#define MAX_ODML_DEPTH 1000
int64_t dts_max;
@@ -200,7 +198,7 @@ static int read_odml_index(AVFormatContext *s, int64_t frame_num)
st = s->streams[stream_id];
ast = st->priv_data;
- if (index_sub_type || entries_in_use < 0)
+ if (index_sub_type)
return AVERROR_INVALIDDATA;
avio_rl32(pb);
@@ -221,18 +219,11 @@ static int read_odml_index(AVFormatContext *s, int64_t frame_num)
}
for (i = 0; i < entries_in_use; i++) {
- avi->odml_max_pos = FFMAX(avi->odml_max_pos, avio_tell(pb));
-
- // If we read more than there are bytes then we must have been reading something twice
- if (avi->odml_read > avi->odml_max_pos)
- return AVERROR_INVALIDDATA;
-
if (index_type) {
int64_t pos = avio_rl32(pb) + base - 8;
int len = avio_rl32(pb);
int key = len >= 0;
len &= 0x7FFFFFFF;
- avi->odml_read += 8;
av_log(s, AV_LOG_TRACE, "pos:%"PRId64", len:%X\n", pos, len);
@@ -250,9 +241,6 @@ static int read_odml_index(AVFormatContext *s, int64_t frame_num)
} else {
int64_t offset, pos;
int duration;
- int ret;
- avi->odml_read += 16;
-
offset = avio_rl64(pb);
avio_rl32(pb); /* size */
duration = avio_rl32(pb);
@@ -270,7 +258,7 @@ static int read_odml_index(AVFormatContext *s, int64_t frame_num)
if (avio_seek(pb, offset + 8, SEEK_SET) < 0)
return -1;
avi->odml_depth++;
- ret = read_odml_index(s, frame_num);
+ read_odml_index(s, frame_num);
avi->odml_depth--;
frame_num += duration;
@@ -278,8 +266,7 @@ static int read_odml_index(AVFormatContext *s, int64_t frame_num)
av_log(s, AV_LOG_ERROR, "Failed to restore position after reading index\n");
return -1;
}
- if (ret < 0)
- return ret;
+
}
}
avi->index_loaded = 2;
@@ -869,8 +856,6 @@ static int avi_read_header(AVFormatContext *s)
memcpy(st->codecpar->extradata + st->codecpar->extradata_size - 9,
"BottomUp", 9);
}
- if (st->codecpar->height == INT_MIN)
- return AVERROR_INVALIDDATA;
st->codecpar->height = FFABS(st->codecpar->height);
// avio_skip(pb, size - 5 * 4);
diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c
index 1fb30644ff..518cb11129 100644
--- a/libavformat/aviobuf.c
+++ b/libavformat/aviobuf.c
@@ -1005,9 +1005,6 @@ int ffio_ensure_seekback(AVIOContext *s, int64_t buf_size)
if (buf_size <= s->buf_end - s->buf_ptr)
return 0;
- if (buf_size > INT_MAX - max_buffer_size)
- return AVERROR(EINVAL);
-
buf_size += max_buffer_size - 1;
if (buf_size + s->buf_ptr - s->buffer <= s->buffer_size || s->seekable || !s->read_packet)
diff --git a/libavformat/bfi.c b/libavformat/bfi.c
index 35b6816aad..f9e0bb2e30 100644
--- a/libavformat/bfi.c
+++ b/libavformat/bfi.c
@@ -140,12 +140,12 @@ static int bfi_read_packet(AVFormatContext * s, AVPacket * pkt)
audio_offset = avio_rl32(pb);
avio_rl32(pb);
video_offset = avio_rl32(pb);
- if (audio_offset < 0 || video_offset < audio_offset || chunk_size < video_offset) {
+ audio_size = video_offset - audio_offset;
+ bfi->video_size = chunk_size - video_offset;
+ if (audio_size < 0 || bfi->video_size < 0) {
av_log(s, AV_LOG_ERROR, "Invalid audio/video offsets or chunk size\n");
return AVERROR_INVALIDDATA;
}
- audio_size = video_offset - audio_offset;
- bfi->video_size = chunk_size - video_offset;
//Tossing an audio packet at the audio decoder.
ret = av_get_packet(pb, pkt, audio_size);
diff --git a/libavformat/cafdec.c b/libavformat/cafdec.c
index 1842c3c0ae..7f09a27977 100644
--- a/libavformat/cafdec.c
+++ b/libavformat/cafdec.c
@@ -241,8 +241,6 @@ static void read_info_chunk(AVFormatContext *s, int64_t size)
char value[1024];
avio_get_str(pb, INT_MAX, key, sizeof(key));
avio_get_str(pb, INT_MAX, value, sizeof(value));
- if (!*key)
- continue;
av_dict_set(&s->metadata, key, value, 0);
}
}
@@ -342,7 +340,7 @@ static int read_header(AVFormatContext *s)
found_data:
if (caf->bytes_per_packet > 0 && caf->frames_per_packet > 0) {
- if (caf->data_size > 0 && caf->data_size / caf->bytes_per_packet < INT64_MAX / caf->frames_per_packet)
+ if (caf->data_size > 0)
st->nb_frames = (caf->data_size / caf->bytes_per_packet) * caf->frames_per_packet;
} else if (st->nb_index_entries && st->duration > 0) {
if (st->codecpar->sample_rate && caf->data_size / st->duration > INT64_MAX / st->codecpar->sample_rate / 8) {
diff --git a/libavformat/cafenc.c b/libavformat/cafenc.c
index c5e47f20a6..7e44797a52 100644
--- a/libavformat/cafenc.c
+++ b/libavformat/cafenc.c
@@ -28,6 +28,7 @@
typedef struct {
int64_t data;
+ uint8_t *pkt_sizes;
int size_buffer_size;
int size_entries_used;
int packets;
@@ -208,29 +209,30 @@ static int caf_write_header(AVFormatContext *s)
static int caf_write_packet(AVFormatContext *s, AVPacket *pkt)
{
CAFContext *caf = s->priv_data;
- AVStream *const st = s->streams[0];
- if (!st->codecpar->block_align) {
- uint8_t *pkt_sizes;
- int i, alloc_size = caf->size_entries_used + 5U;
- if (alloc_size < 0)
- return AVERROR(ERANGE);
-
- pkt_sizes = av_fast_realloc(st->priv_data,
- &caf->size_buffer_size,
- alloc_size);
- if (!pkt_sizes)
+ avio_write(s->pb, pkt->data, pkt->size);
+ if (!s->streams[0]->codecpar->block_align) {
+ void *pkt_sizes = caf->pkt_sizes;
+ int i, alloc_size = caf->size_entries_used + 5;
+ if (alloc_size < 0) {
+ caf->pkt_sizes = NULL;
+ } else {
+ caf->pkt_sizes = av_fast_realloc(caf->pkt_sizes,
+ &caf->size_buffer_size,
+ alloc_size);
+ }
+ if (!caf->pkt_sizes) {
+ av_free(pkt_sizes);
return AVERROR(ENOMEM);
- st->priv_data = pkt_sizes;
+ }
for (i = 4; i > 0; i--) {
unsigned top = pkt->size >> i * 7;
if (top)
- pkt_sizes[caf->size_entries_used++] = 128 | top;
+ caf->pkt_sizes[caf->size_entries_used++] = 128 | top;
}
- pkt_sizes[caf->size_entries_used++] = pkt->size & 127;
+ caf->pkt_sizes[caf->size_entries_used++] = pkt->size & 127;
caf->packets++;
}
- avio_write(s->pb, pkt->data, pkt->size);
return 0;
}
@@ -238,8 +240,7 @@ static int caf_write_trailer(AVFormatContext *s)
{
CAFContext *caf = s->priv_data;
AVIOContext *pb = s->pb;
- AVStream *st = s->streams[0];
- AVCodecParameters *par = st->codecpar;
+ AVCodecParameters *par = s->streams[0]->codecpar;
if (pb->seekable & AVIO_SEEKABLE_NORMAL) {
int64_t file_size = avio_tell(pb);
@@ -249,14 +250,16 @@ static int caf_write_trailer(AVFormatContext *s)
avio_seek(pb, file_size, SEEK_SET);
if (!par->block_align) {
ffio_wfourcc(pb, "pakt");
- avio_wb64(pb, caf->size_entries_used + 24U);
+ avio_wb64(pb, caf->size_entries_used + 24);
avio_wb64(pb, caf->packets); ///< mNumberPackets
avio_wb64(pb, caf->packets * samples_per_packet(par->codec_id, par->channels, par->block_align)); ///< mNumberValidFrames
avio_wb32(pb, 0); ///< mPrimingFrames
avio_wb32(pb, 0); ///< mRemainderFrames
- avio_write(pb, st->priv_data, caf->size_entries_used);
+ avio_write(pb, caf->pkt_sizes, caf->size_entries_used);
+ caf->size_buffer_size = 0;
}
}
+ av_freep(&caf->pkt_sizes);
return 0;
}
diff --git a/libavformat/dxa.c b/libavformat/dxa.c
index 2a5487710f..cd9c489851 100644
--- a/libavformat/dxa.c
+++ b/libavformat/dxa.c
@@ -118,12 +118,9 @@ static int dxa_read_header(AVFormatContext *s)
if(tag == MKTAG('d', 'a', 't', 'a')) break;
avio_skip(pb, fsize);
}
- c->bpc = (fsize + (int64_t)c->frames - 1) / c->frames;
- if(ast->codecpar->block_align) {
- if (c->bpc > INT_MAX - ast->codecpar->block_align + 1)
- return AVERROR_INVALIDDATA;
+ c->bpc = (fsize + c->frames - 1) / c->frames;
+ if(ast->codecpar->block_align)
c->bpc = ((c->bpc + ast->codecpar->block_align - 1) / ast->codecpar->block_align) * ast->codecpar->block_align;
- }
c->bytes_left = fsize;
c->wavpos = avio_tell(pb);
avio_seek(pb, c->vidpos, SEEK_SET);
diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c
index 4a1c01a714..79c810f963 100644
--- a/libavformat/flvdec.c
+++ b/libavformat/flvdec.c
@@ -64,7 +64,7 @@ typedef struct FLVContext {
uint8_t resync_buffer[2*RESYNC_BUFFER_SIZE];
int broken_sizes;
- int64_t sum_flv_tag_size;
+ int sum_flv_tag_size;
int last_keyframe_stream_index;
int keyframe_count;
@@ -459,10 +459,6 @@ static int parse_keyframes_index(AVFormatContext *s, AVIOContext *ioc, int64_t m
d = av_int2double(avio_rb64(ioc));
if (isnan(d) || d < INT64_MIN || d > INT64_MAX)
goto invalid;
- if (current_array == &times && (d <= INT64_MIN / 1000 || d >= INT64_MAX / 1000))
- goto invalid;
- if (avio_feof(ioc))
- goto invalid;
current_array[0][i] = d;
}
if (times && filepositions) {
@@ -1033,7 +1029,7 @@ retry:
type = (avio_r8(s->pb) & 0x1F);
orig_size =
size = avio_rb24(s->pb);
- flv->sum_flv_tag_size += size + 11LL;
+ flv->sum_flv_tag_size += size + 11;
dts = avio_rb24(s->pb);
dts |= (unsigned)avio_r8(s->pb) << 24;
av_log(s, AV_LOG_TRACE, "type:%d, size:%d, last:%d, dts:%"PRId64" pos:%"PRId64"\n", type, size, last, dts, avio_tell(s->pb));
@@ -1335,7 +1331,7 @@ leave:
!avio_feof(s->pb) &&
(last != orig_size || !last) && last != flv->sum_flv_tag_size &&
!flv->broken_sizes) {
- av_log(s, AV_LOG_ERROR, "Packet mismatch %d %d %"PRId64"\n", last, orig_size + 11, flv->sum_flv_tag_size);
+ av_log(s, AV_LOG_ERROR, "Packet mismatch %d %d %d\n", last, orig_size + 11, flv->sum_flv_tag_size);
avio_seek(s->pb, pos + 1, SEEK_SET);
ret = resync(s);
av_packet_unref(pkt);
diff --git a/libavformat/genh.c b/libavformat/genh.c
index 0b55a8884a..698104a9d6 100644
--- a/libavformat/genh.c
+++ b/libavformat/genh.c
@@ -67,9 +67,6 @@ static int genh_read_header(AVFormatContext *s)
return AVERROR_INVALIDDATA;
st->codecpar->block_align = align * st->codecpar->channels;
st->codecpar->sample_rate = avio_rl32(s->pb);
- if (st->codecpar->sample_rate < 0)
- return AVERROR_INVALIDDATA;
-
avio_skip(s->pb, 4);
st->duration = avio_rl32(s->pb);
diff --git a/libavformat/hls.c b/libavformat/hls.c
index e17cb23897..597bea7f25 100644
--- a/libavformat/hls.c
+++ b/libavformat/hls.c
@@ -236,7 +236,6 @@ static void free_init_section_list(struct playlist *pls)
{
int i;
for (i = 0; i < pls->n_init_sections; i++) {
- av_freep(&pls->init_sections[i]->key);
av_freep(&pls->init_sections[i]->url);
av_freep(&pls->init_sections[i]);
}
@@ -811,26 +810,20 @@ static int parse_playlist(HLSContext *c, const char *url,
&info);
new_rendition(c, &info, url);
} else if (av_strstart(line, "#EXT-X-TARGETDURATION:", &ptr)) {
- int64_t t;
ret = ensure_playlist(c, &pls, url);
if (ret < 0)
goto fail;
- t = strtoll(ptr, NULL, 10);
- if (t < 0 || t >= INT64_MAX / AV_TIME_BASE) {
- ret = AVERROR_INVALIDDATA;
- goto fail;
- }
- pls->target_duration = t * AV_TIME_BASE;
+ pls->target_duration = strtoll(ptr, NULL, 10) * AV_TIME_BASE;
} else if (av_strstart(line, "#EXT-X-MEDIA-SEQUENCE:", &ptr)) {
uint64_t seq_no;
ret = ensure_playlist(c, &pls, url);
if (ret < 0)
goto fail;
seq_no = strtoull(ptr, NULL, 10);
- if (seq_no > INT64_MAX/2) {
+ if (seq_no > INT64_MAX) {
av_log(c->ctx, AV_LOG_DEBUG, "MEDIA-SEQUENCE higher than "
- "INT64_MAX/2, mask out the highest bit\n");
- seq_no &= INT64_MAX/2;
+ "INT64_MAX, mask out the highest bit\n");
+ seq_no &= INT64_MAX;
}
pls->start_seq_no = seq_no;
} else if (av_strstart(line, "#EXT-X-PLAYLIST-TYPE:", &ptr)) {
@@ -910,7 +903,7 @@ static int parse_playlist(HLSContext *c, const char *url,
if (has_iv) {
memcpy(seg->iv, iv, sizeof(iv));
} else {
- uint64_t seq = pls->start_seq_no + (uint64_t)pls->n_segments;
+ int64_t seq = pls->start_seq_no + pls->n_segments;
memset(seg->iv, 0, sizeof(seg->iv));
AV_WB64(seg->iv + 8, seq);
}
diff --git a/libavformat/icodec.c b/libavformat/icodec.c
index b321ad6007..93179bb41e 100644
--- a/libavformat/icodec.c
+++ b/libavformat/icodec.c
@@ -203,9 +203,6 @@ static int read_packet(AVFormatContext *s, AVPacket *pkt)
AV_WL32(buf + 32, image->nb_pal);
}
- if (image->nb_pal > INT_MAX / 4 - 14 - 40)
- return AVERROR_INVALIDDATA;
-
AV_WL32(buf - 4, 14 + 40 + image->nb_pal * 4);
AV_WL32(buf + 8, AV_RL32(buf + 8) / 2);
}
diff --git a/libavformat/id3v2.c b/libavformat/id3v2.c
index a40f858477..1377cef4b8 100644
--- a/libavformat/id3v2.c
+++ b/libavformat/id3v2.c
@@ -376,10 +376,10 @@ static void read_uslt(AVFormatContext *s, AVIOContext *pb, int taglen,
lang[3] = '\0';
taglen -= 3;
- if (decode_str(s, pb, encoding, &descriptor, &taglen) < 0 || taglen < 0)
+ if (decode_str(s, pb, encoding, &descriptor, &taglen) < 0)
goto error;
- if (decode_str(s, pb, encoding, &text, &taglen) < 0 || taglen < 0)
+ if (decode_str(s, pb, encoding, &text, &taglen) < 0)
goto error;
// FFmpeg does not support hierarchical metadata, so concatenate the keys.
diff --git a/libavformat/iff.c b/libavformat/iff.c
index 06785c748b..c15302d3c5 100644
--- a/libavformat/iff.c
+++ b/libavformat/iff.c
@@ -385,7 +385,7 @@ static int read_dst_frame(AVFormatContext *s, AVPacket *pkt)
avio_skip(pb, 1);
pkt->flags |= AV_PKT_FLAG_KEY;
pkt->stream_index = 0;
- pkt->duration = s->streams[0]->codecpar->sample_rate / 75;
+ pkt->duration = 588LL * s->streams[0]->codecpar->sample_rate / 44100;
pkt->pos = chunk_pos;
chunk_pos = avio_tell(pb);
@@ -398,8 +398,7 @@ static int read_dst_frame(AVFormatContext *s, AVPacket *pkt)
case ID_FRTE:
if (data_size < 4)
return AVERROR_INVALIDDATA;
- s->streams[0]->duration = avio_rb32(pb) * (uint64_t)s->streams[0]->codecpar->sample_rate / 75;
-
+ s->streams[0]->duration = avio_rb32(pb) * 588LL * s->streams[0]->codecpar->sample_rate / 44100;
break;
}
@@ -502,9 +501,6 @@ static int iff_read_header(AVFormatContext *s)
case ID_DST:
case ID_MDAT:
iff->body_pos = avio_tell(pb);
- if (iff->body_pos < 0 || iff->body_pos + data_size > INT64_MAX)
- return AVERROR_INVALIDDATA;
-
iff->body_end = iff->body_pos + data_size;
iff->body_size = data_size;
if (chunk_id == ID_DST) {
diff --git a/libavformat/jacosubdec.c b/libavformat/jacosubdec.c
index 59544bb507..2ccbf4c9de 100644
--- a/libavformat/jacosubdec.c
+++ b/libavformat/jacosubdec.c
@@ -152,7 +152,7 @@ static int get_shift(int timeres, const char *buf)
ret = 0;
switch (n) {
case 4:
- ret = sign * (((int64_t)a*3600 + (int64_t)b*60 + c) * timeres + d);
+ ret = sign * (((int64_t)a*3600 + b*60 + c) * timeres + d);
break;
case 3:
ret = sign * (( (int64_t)a*60 + b) * timeres + c);
diff --git a/libavformat/jacosubenc.c b/libavformat/jacosubenc.c
index 1213a58d52..77575c6b3c 100644
--- a/libavformat/jacosubenc.c
+++ b/libavformat/jacosubenc.c
@@ -24,7 +24,7 @@ static int jacosub_write_header(AVFormatContext *s)
const AVCodecParameters *par = s->streams[0]->codecpar;
if (par->extradata_size) {
- avio_write(s->pb, par->extradata, par->extradata_size);
+ avio_write(s->pb, par->extradata, par->extradata_size - 1);
}
return 0;
}
diff --git a/libavformat/libzmq.c b/libavformat/libzmq.c
index 04c72ac601..1b0d8638db 100644
--- a/libavformat/libzmq.c
+++ b/libavformat/libzmq.c
@@ -51,7 +51,7 @@ static int zmq_proto_wait(URLContext *h, void *socket, int write)
zmq_pollitem_t items = { .socket = socket, .fd = 0, .events = ev, .revents = 0 };
ret = zmq_poll(&items, 1, POLLING_TIME);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_poll(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_poll(): %s\n", ZMQ_STRERROR);
return AVERROR_EXTERNAL;
}
return items.revents & ev ? 0 : AVERROR(EAGAIN);
@@ -90,7 +90,7 @@ static int zmq_proto_open(URLContext *h, const char *uri, int flags)
s->context = zmq_ctx_new();
if (!s->context) {
/*errno not set on failure during zmq_ctx_new()*/
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_ctx_new()\n");
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_ctx_new()\n");
return AVERROR_EXTERNAL;
}
@@ -100,13 +100,13 @@ static int zmq_proto_open(URLContext *h, const char *uri, int flags)
if (h->flags & AVIO_FLAG_WRITE) {
s->socket = zmq_socket(s->context, ZMQ_PUB);
if (!s->socket) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_socket(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_socket(): %s\n", ZMQ_STRERROR);
goto fail_term;
}
ret = zmq_bind(s->socket, uri);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_bind(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_bind(): %s\n", ZMQ_STRERROR);
goto fail_close;
}
}
@@ -115,19 +115,19 @@ static int zmq_proto_open(URLContext *h, const char *uri, int flags)
if (h->flags & AVIO_FLAG_READ) {
s->socket = zmq_socket(s->context, ZMQ_SUB);
if (!s->socket) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_socket(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_socket(): %s\n", ZMQ_STRERROR);
goto fail_term;
}
ret = zmq_setsockopt(s->socket, ZMQ_SUBSCRIBE, "", 0);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_setsockopt(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_setsockopt(): %s\n", ZMQ_STRERROR);
goto fail_close;
}
ret = zmq_connect(s->socket, uri);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_connect(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_connect(): %s\n", ZMQ_STRERROR);
goto fail_close;
}
}
@@ -150,7 +150,7 @@ static int zmq_proto_write(URLContext *h, const unsigned char *buf, int size)
return ret;
ret = zmq_send(s->socket, buf, size, 0);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_send(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_send(): %s\n", ZMQ_STRERROR);
return AVERROR_EXTERNAL;
}
return ret; /*number of bytes sent*/
@@ -166,7 +166,7 @@ static int zmq_proto_read(URLContext *h, unsigned char *buf, int size)
return ret;
ret = zmq_recv(s->socket, buf, size, 0);
if (ret == -1) {
- av_log(h, AV_LOG_ERROR, "Error occurred during zmq_recv(): %s\n", ZMQ_STRERROR);
+ av_log(h, AV_LOG_ERROR, "Error occured during zmq_recv(): %s\n", ZMQ_STRERROR);
return AVERROR_EXTERNAL;
}
if (ret > size) {
diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index c47518b73a..fb1849f9c3 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -1690,7 +1690,7 @@ static int matroska_decode_buffer(uint8_t **buf, int *buf_size,
case MATROSKA_TRACK_ENCODING_COMP_ZLIB:
{
z_stream zstream = { 0 };
- if (!pkt_size || inflateInit(&zstream) != Z_OK)
+ if (inflateInit(&zstream) != Z_OK)
return -1;
zstream.next_in = data;
zstream.avail_in = isize;
@@ -1723,7 +1723,7 @@ static int matroska_decode_buffer(uint8_t **buf, int *buf_size,
case MATROSKA_TRACK_ENCODING_COMP_BZLIB:
{
bz_stream bzstream = { 0 };
- if (!pkt_size || BZ2_bzDecompressInit(&bzstream, 0, 0) != BZ_OK)
+ if (BZ2_bzDecompressInit(&bzstream, 0, 0) != BZ_OK)
return -1;
bzstream.next_in = data;
bzstream.avail_in = isize;
@@ -2802,14 +2802,11 @@ static int matroska_parse_tracks(AVFormatContext *s)
mkv_stereo_mode_display_mul(track->video.stereo_mode, &display_width_mul, &display_height_mul);
if (track->video.display_unit < MATROSKA_VIDEO_DISPLAYUNIT_UNKNOWN) {
- if (track->video.display_width && track->video.display_height &&
- st->codecpar->height < INT64_MAX / track->video.display_width / display_width_mul &&
- st->codecpar->width < INT64_MAX / track->video.display_height / display_height_mul)
- av_reduce(&st->sample_aspect_ratio.num,
- &st->sample_aspect_ratio.den,
- st->codecpar->height * track->video.display_width * display_width_mul,
- st->codecpar->width * track->video.display_height * display_height_mul,
- INT_MAX);
+ av_reduce(&st->sample_aspect_ratio.num,
+ &st->sample_aspect_ratio.den,
+ st->codecpar->height * track->video.display_width * display_width_mul,
+ st->codecpar->width * track->video.display_height * display_height_mul,
+ 255);
}
if (st->codecpar->codec_id != AV_CODEC_ID_HEVC)
st->need_parsing = AVSTREAM_PARSE_HEADERS;
@@ -2978,8 +2975,6 @@ static int matroska_read_header(AVFormatContext *s)
if (!matroska->time_scale)
matroska->time_scale = 1000000;
- if (isnan(matroska->duration))
- matroska->duration = 0;
if (matroska->duration)
matroska->ctx->duration = matroska->duration * matroska->time_scale *
1000 / AV_TIME_BASE;
@@ -3940,9 +3935,7 @@ static CueDesc get_cue_desc(AVFormatContext *s, int64_t ts, int64_t cues_start)
int i;
int nb_index_entries = s->streams[0]->nb_index_entries;
AVIndexEntry *index_entries = s->streams[0]->index_entries;
-
- if (ts >= (int64_t)(matroska->duration * matroska->time_scale))
- return (CueDesc) {-1, -1, -1, -1};
+ if (ts >= matroska->duration * matroska->time_scale) return (CueDesc) {-1, -1, -1, -1};
for (i = 1; i < nb_index_entries; i++) {
if (index_entries[i - 1].timestamp * matroska->time_scale <= ts &&
index_entries[i].timestamp * matroska->time_scale > ts) {
@@ -4131,8 +4124,6 @@ static int64_t webm_dash_manifest_compute_bandwidth(AVFormatContext *s, int64_t
// prebuffered.
pre_bytes = desc_end.end_offset - desc_end.start_offset;
pre_ns = desc_end.end_time_ns - desc_end.start_time_ns;
- if (pre_ns <= 0)
- return -1;
pre_sec = pre_ns / nano_seconds_per_second;
prebuffer_bytes +=
pre_bytes * ((temp_prebuffer_ns / nano_seconds_per_second) / pre_sec);
@@ -4144,16 +4135,12 @@ static int64_t webm_dash_manifest_compute_bandwidth(AVFormatContext *s, int64_t
do {
int64_t desc_bytes = desc_end.end_offset - desc_beg.start_offset;
int64_t desc_ns = desc_end.end_time_ns - desc_beg.start_time_ns;
- double desc_sec, calc_bits_per_second, percent, mod_bits_per_second;
- if (desc_bytes <= 0)
- return -1;
-
- desc_sec = desc_ns / nano_seconds_per_second;
- calc_bits_per_second = (desc_bytes * 8) / desc_sec;
+ double desc_sec = desc_ns / nano_seconds_per_second;
+ double calc_bits_per_second = (desc_bytes * 8) / desc_sec;
// Drop the bps by the percentage of bytes buffered.
- percent = (desc_bytes - prebuffer_bytes) / desc_bytes;
- mod_bits_per_second = calc_bits_per_second * percent;
+ double percent = (desc_bytes - prebuffer_bytes) / desc_bytes;
+ double mod_bits_per_second = calc_bits_per_second * percent;
if (prebuffer < desc_sec) {
double search_sec =
diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
index b4284a8778..692265593c 100644
--- a/libavformat/matroskaenc.c
+++ b/libavformat/matroskaenc.c
@@ -58,6 +58,9 @@
* Info, Tracks, Chapters, Attachments, Tags (potentially twice) and Cues */
#define MAX_SEEKHEAD_ENTRIES 7
+/* Reserved size for H264 headers if not extant at init time */
+#define MAX_H264_HEADER_SIZE 1024
+
#define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \
!(mkv)->is_live)
@@ -721,8 +724,12 @@ static int mkv_write_native_codecprivate(AVFormatContext *s, AVIOContext *pb,
case AV_CODEC_ID_WAVPACK:
return put_wv_codecpriv(dyn_cp, par);
case AV_CODEC_ID_H264:
- return ff_isom_write_avcc(dyn_cp, par->extradata,
- par->extradata_size);
+ if (par->extradata_size)
+ return ff_isom_write_avcc(dyn_cp, par->extradata,
+ par->extradata_size);
+ else
+ put_ebml_void(pb, MAX_H264_HEADER_SIZE);
+ break;
case AV_CODEC_ID_HEVC:
return ff_isom_write_hvcc(dyn_cp, par->extradata,
par->extradata_size, 0);
@@ -2259,7 +2266,9 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
break;
// FIXME: Remove the following once libaom starts propagating extradata during init()
// See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012
+ // H264 V4L2 has a similar issue
case AV_CODEC_ID_AV1:
+ case AV_CODEC_ID_H264:
if (side_data_size && mkv->track.bc && !par->extradata_size) {
AVIOContext *dyn_cp;
uint8_t *codecpriv;
@@ -2267,7 +2276,10 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
ret = avio_open_dyn_buf(&dyn_cp);
if (ret < 0)
return ret;
- ff_isom_write_av1c(dyn_cp, side_data, side_data_size);
+ if (par->codec_id == AV_CODEC_ID_H264)
+ ff_isom_write_avcc(dyn_cp, side_data, side_data_size);
+ else
+ ff_isom_write_av1c(dyn_cp, side_data, side_data_size);
codecpriv_size = avio_get_dyn_buf(dyn_cp, &codecpriv);
if ((ret = dyn_cp->error) < 0 ||
!codecpriv_size && (ret = AVERROR_INVALIDDATA)) {
@@ -2275,8 +2287,25 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
return ret;
}
avio_seek(mkv->track.bc, track->codecpriv_offset, SEEK_SET);
- // Do not write the OBUs as we don't have space saved for them
- put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4);
+ if (par->codec_id == AV_CODEC_ID_H264) {
+ int filler;
+ // Up to 6 bytes for the element header, and the trailing Void filler must be at least 2 bytes
+ if (codecpriv_size > MAX_H264_HEADER_SIZE - 8) {
+ av_log(s, AV_LOG_ERROR, "H264 header size %d > %d bytes\n", codecpriv_size, MAX_H264_HEADER_SIZE - 8);
+ return AVERROR_INVALIDDATA;
+ }
+ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, codecpriv_size);
+ filler = MAX_H264_HEADER_SIZE - (avio_tell(mkv->track.bc) - track->codecpriv_offset);
+ if (filler < 2) {
+ av_log(s, AV_LOG_ERROR, "Unexpected SPS/PPS filler length: %d\n", filler);
+ return AVERROR_BUG;
+ }
+ put_ebml_void(mkv->track.bc, filler);
+ }
+ else {
+ // Do not write the OBUs as we don't have space saved for them
+ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4);
+ }
ffio_free_dyn_buf(&dyn_cp);
ret = ff_alloc_extradata(par, side_data_size);
if (ret < 0)
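
The Matroska changes above cope with stateful V4L2 H264 encoders that only
deliver SPS/PPS with the first coded packets: at header-writing time an EBML
Void of MAX_H264_HEADER_SIZE is reserved in place of CodecPrivate, and once
AV_PKT_DATA_NEW_EXTRADATA side data arrives the real avcC is written into
that hole with a trailing Void over the remainder (the filler must be at
least 2 bytes because a Void element is a one-byte ID plus at least a
one-byte length). A sketch of the Annex-B-to-avcC step both muxer paths use,
with the same internal helpers as above; an illustrative wrapper, not muxer
code:

#include "libavutil/mem.h"
#include "libavformat/avio.h"
#include "libavformat/avc.h"
#include "libavformat/avio_internal.h"

/* Convert raw Annex-B SPS/PPS into an avcC blob; caller frees *avcc
 * with av_free(). */
static int annexb_to_avcc(const uint8_t *sd, int sd_size,
                          uint8_t **avcc, int *avcc_size)
{
    AVIOContext *dyn = NULL;
    int ret = avio_open_dyn_buf(&dyn);
    if (ret < 0)
        return ret;
    ret = ff_isom_write_avcc(dyn, sd, sd_size); /* parses and lays out avcC */
    if (ret < 0) {
        ffio_free_dyn_buf(&dyn);
        return ret;
    }
    *avcc_size = avio_close_dyn_buf(dyn, avcc);
    return *avcc_size > 0 ? 0 : AVERROR_INVALIDDATA;
}
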
diff --git a/libavformat/moflex.c b/libavformat/moflex.c
index ca40b51c3e..0706f88e64 100644
--- a/libavformat/moflex.c
+++ b/libavformat/moflex.c
@@ -172,7 +172,7 @@ static int moflex_read_sync(AVFormatContext *s)
unsigned type, ssize, codec_id = 0;
unsigned codec_type, width = 0, height = 0, sample_rate = 0, channels = 0;
int stream_index = -1;
- AVRational tb = av_make_q(0, 1);
+ AVRational fps;
read_var_byte(s, &type);
read_var_byte(s, &ssize);
@@ -195,7 +195,6 @@ static int moflex_read_sync(AVFormatContext *s)
return AVERROR_PATCHWELCOME;
}
sample_rate = avio_rb24(pb) + 1;
- tb = av_make_q(1, sample_rate);
channels = avio_r8(pb) + 1;
break;
case 1:
@@ -209,8 +208,8 @@ static int moflex_read_sync(AVFormatContext *s)
av_log(s, AV_LOG_ERROR, "Unsupported video codec: %d\n", codec_id);
return AVERROR_PATCHWELCOME;
}
- tb.den = avio_rb16(pb);
- tb.num = avio_rb16(pb);
+ fps.num = avio_rb16(pb);
+ fps.den = avio_rb16(pb);
width = avio_rb16(pb);
height = avio_rb16(pb);
avio_skip(pb, type == 3 ? 3 : 2);
@@ -238,8 +237,10 @@ static int moflex_read_sync(AVFormatContext *s)
if (!st->priv_data)
return AVERROR(ENOMEM);
- if (tb.num)
- avpriv_set_pts_info(st, 63, tb.num, tb.den);
+ if (sample_rate)
+ avpriv_set_pts_info(st, 63, 1, sample_rate);
+ else
+ avpriv_set_pts_info(st, 63, fps.den, fps.num);
}
}
diff --git a/libavformat/mov.c b/libavformat/mov.c
index 295d9826de..4af796ee31 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -607,13 +607,11 @@ static int mov_read_dref(MOVContext *c, AVIOContext *pb, MOVAtom atom)
for (i = 0; i < entries; i++) {
MOVDref *dref = &sc->drefs[i];
uint32_t size = avio_rb32(pb);
- int64_t next = avio_tell(pb);
+ int64_t next = avio_tell(pb) + size - 4;
- if (size < 12 || next < 0 || next > INT64_MAX - size)
+ if (size < 12)
return AVERROR_INVALIDDATA;
- next += size - 4;
-
dref->type = avio_rl32(pb);
avio_rb32(pb); // version + flags
@@ -1944,8 +1942,6 @@ static int mov_read_glbl(MOVContext *c, AVIOContext *pb, MOVAtom atom)
// wrap a whole fiel atom inside of a glbl atom.
unsigned size = avio_rb32(pb);
unsigned type = avio_rl32(pb);
- if (avio_feof(pb))
- return AVERROR_INVALIDDATA;
avio_seek(pb, -8, SEEK_CUR);
if (type == MKTAG('f','i','e','l') && size == atom.size)
return mov_read_default(c, pb, atom);
@@ -2555,10 +2551,6 @@ int ff_mov_read_stsd_entries(MOVContext *c, AVIOContext *pb, int entries)
av_log(c->fc, AV_LOG_ERROR, "Invalid sample rate %d\n", st->codecpar->sample_rate);
return AVERROR_INVALIDDATA;
}
- if (st->codecpar->channels < 0) {
- av_log(c->fc, AV_LOG_ERROR, "Invalid channels %d\n", st->codecpar->channels);
- return AVERROR_INVALIDDATA;
- }
} else if (st->codecpar->codec_type==AVMEDIA_TYPE_SUBTITLE){
mov_parse_stsd_subtitle(c, pb, st, sc,
size - (avio_tell(pb) - start_pos));
@@ -3963,13 +3955,6 @@ static void mov_build_index(MOVContext *mov, AVStream *st)
if (keyframe)
distance = 0;
sample_size = sc->stsz_sample_size > 0 ? sc->stsz_sample_size : sc->sample_sizes[current_sample];
- if (current_offset > INT64_MAX - sample_size) {
- av_log(mov->fc, AV_LOG_ERROR, "Current offset %"PRId64" or sample size %u is too large\n",
- current_offset,
- sample_size);
- return;
- }
-
if (sc->pseudo_stream_id == -1 ||
sc->stsc_data[stsc_index].id - 1 == sc->pseudo_stream_id) {
AVIndexEntry *e;
@@ -5131,8 +5116,6 @@ static int mov_read_sidx(MOVContext *c, AVIOContext *pb, MOVAtom atom)
avio_rb16(pb); // reserved
item_count = avio_rb16(pb);
- if (item_count == 0)
- return AVERROR_INVALIDDATA;
for (i = 0; i < item_count; i++) {
int index;
@@ -5458,9 +5441,6 @@ static int mov_read_smdm(MOVContext *c, AVIOContext *pb, MOVAtom atom)
av_log(c->fc, AV_LOG_WARNING, "Unsupported Mastering Display Metadata box version %d\n", version);
return 0;
}
- if (sc->mastering)
- return AVERROR_INVALIDDATA;
-
avio_skip(pb, 3); /* flags */
sc->mastering = av_mastering_display_metadata_alloc();
@@ -6149,8 +6129,6 @@ static int mov_read_senc(MOVContext *c, AVIOContext *pb, MOVAtom atom)
}
if (pb->eof_reached) {
av_log(c->fc, AV_LOG_ERROR, "Hit EOF while reading senc\n");
- if (ret >= 0)
- av_encryption_info_free(encryption_index->encrypted_samples[i]);
ret = AVERROR_INVALIDDATA;
}
@@ -7089,8 +7067,6 @@ static int mov_read_default(MOVContext *c, AVIOContext *pb, MOVAtom atom)
if (a.size == 0) {
a.size = atom.size - total_size + 8;
}
- if (a.size < 0)
- break;
a.size -= 8;
if (a.size < 0)
break;
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 8a06de2fd2..0cbbc094de 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -91,7 +91,7 @@ static const AVOption options[] = {
{ "frag_duration", "Maximum fragment duration", offsetof(MOVMuxContext, max_fragment_duration), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM},
{ "min_frag_duration", "Minimum fragment duration", offsetof(MOVMuxContext, min_fragment_duration), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM},
{ "frag_size", "Maximum fragment size", offsetof(MOVMuxContext, max_fragment_size), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM},
- { "ism_lookahead", "Number of lookahead entries for ISM files", offsetof(MOVMuxContext, ism_lookahead), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 255, AV_OPT_FLAG_ENCODING_PARAM},
+ { "ism_lookahead", "Number of lookahead entries for ISM files", offsetof(MOVMuxContext, ism_lookahead), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM},
{ "video_track_timescale", "set timescale of all video tracks", offsetof(MOVMuxContext, video_track_timescale), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM},
{ "brand", "Override major brand", offsetof(MOVMuxContext, major_brand), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = AV_OPT_FLAG_ENCODING_PARAM },
{ "use_editlist", "use edit list", offsetof(MOVMuxContext, use_editlist), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AV_OPT_FLAG_ENCODING_PARAM},
@@ -5926,6 +5926,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
if (trk->par->codec_id == AV_CODEC_ID_MP4ALS ||
trk->par->codec_id == AV_CODEC_ID_AAC ||
trk->par->codec_id == AV_CODEC_ID_AV1 ||
+ trk->par->codec_id == AV_CODEC_ID_H264 ||
trk->par->codec_id == AV_CODEC_ID_FLAC) {
buffer_size_t side_size;
uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
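
The movenc change mirrors the Matroska one: H264 joins the codecs whose
packets are checked for AV_PKT_DATA_NEW_EXTRADATA, so parameter sets that
only appear after encoder init still reach the MP4 headers. A sketch of the
producer half, i.e. how an encoder wrapper that learns its SPS/PPS late
might attach them (real libavcodec API, illustrative names):

#include <string.h>
#include "libavcodec/packet.h"
#include "libavutil/error.h"

static int attach_late_extradata(AVPacket *pkt,
                                 const uint8_t *spspps, int size)
{
    /* The muxer picks this side data up from the first packet. */
    uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, size);
    if (!sd)
        return AVERROR(ENOMEM);
    memcpy(sd, spspps, size);
    return 0;
}
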
diff --git a/libavformat/mxfdec.c b/libavformat/mxfdec.c
index 32c2464fb1..50174fcd5f 100644
--- a/libavformat/mxfdec.c
+++ b/libavformat/mxfdec.c
@@ -60,7 +60,6 @@
#include "mxf.h"
#define MXF_MAX_CHUNK_SIZE (32 << 20)
-#define RUN_IN_MAX (65535+1) // S377m-2004 section 5.5 and S377-1-2009 section 6.5, the +1 is to be slightly more tolerant
typedef enum {
Header,
@@ -876,27 +875,15 @@ static int mxf_read_cryptographic_context(void *arg, AVIOContext *pb, int tag, i
static int mxf_read_strong_ref_array(AVIOContext *pb, UID **refs, int *count)
{
- int64_t ret;
- unsigned c = avio_rb32(pb);
-
- //avio_read() used int
- if (c > INT_MAX / sizeof(UID))
- return AVERROR_PATCHWELCOME;
- *count = c;
-
+ *count = avio_rb32(pb);
av_free(*refs);
- *refs = av_malloc_array(*count, sizeof(UID));
+ *refs = av_calloc(*count, sizeof(UID));
if (!*refs) {
*count = 0;
return AVERROR(ENOMEM);
}
avio_skip(pb, 4); /* useless size of objects, always 16 according to specs */
- ret = avio_read(pb, (uint8_t *)*refs, *count * sizeof(UID));
- if (ret != *count * sizeof(UID)) {
- *count = ret < 0 ? 0 : ret / sizeof(UID);
- return ret < 0 ? ret : AVERROR_INVALIDDATA;
- }
-
+ avio_read(pb, (uint8_t *)*refs, *count * sizeof(UID));
return 0;
}
@@ -1105,9 +1092,6 @@ static int mxf_read_index_entry_array(AVIOContext *pb, MXFIndexTableSegment *seg
{
int i, length;
- if (segment->temporal_offset_entries)
- return AVERROR_INVALIDDATA;
-
segment->nb_index_entries = avio_rb32(pb);
length = avio_rb32(pb);
@@ -2269,12 +2253,12 @@ static enum AVColorRange mxf_get_color_range(MXFContext *mxf, MXFDescriptor *des
/* CDCI range metadata */
if (!descriptor->component_depth)
return AVCOL_RANGE_UNSPECIFIED;
- if (descriptor->black_ref_level == 0 && descriptor->component_depth < 31 &&
+ if (descriptor->black_ref_level == 0 &&
descriptor->white_ref_level == ((1<<descriptor->component_depth) - 1) &&
(descriptor->color_range == (1<<descriptor->component_depth) ||
descriptor->color_range == ((1<<descriptor->component_depth) - 1)))
return AVCOL_RANGE_JPEG;
- if (descriptor->component_depth >= 8 && descriptor->component_depth < 31 &&
+ if (descriptor->component_depth >= 8 &&
descriptor->black_ref_level == (1 <<(descriptor->component_depth - 4)) &&
descriptor->white_ref_level == (235<<(descriptor->component_depth - 8)) &&
descriptor->color_range == ((14<<(descriptor->component_depth - 4)) + 1))
@@ -3358,7 +3342,6 @@ static int mxf_read_header(AVFormatContext *s)
KLVPacket klv;
int64_t essence_offset = 0;
int ret;
- int64_t run_in;
mxf->last_forward_tell = INT64_MAX;
@@ -3369,10 +3352,7 @@ static int mxf_read_header(AVFormatContext *s)
}
avio_seek(s->pb, -14, SEEK_CUR);
mxf->fc = s;
- run_in = avio_tell(s->pb);
- if (run_in < 0 || run_in > RUN_IN_MAX)
- return AVERROR_INVALIDDATA;
- mxf->run_in = run_in;
+ mxf->run_in = avio_tell(s->pb);
mxf_read_random_index_pack(s);
@@ -3516,8 +3496,8 @@ static int64_t mxf_compute_sample_count(MXFContext *mxf, AVStream *st,
if ((sample_rate.num / sample_rate.den) == 48000) {
return av_rescale_q(edit_unit, sample_rate, track->edit_rate);
} else {
- int64_t remainder = (sample_rate.num * (int64_t) time_base.num) %
- ( time_base.den * (int64_t)sample_rate.den);
+ int remainder = (sample_rate.num * time_base.num) %
+ (time_base.den * sample_rate.den);
if (remainder)
av_log(mxf->fc, AV_LOG_WARNING,
"seeking detected on stream #%d with time base (%d/%d) and "
@@ -3785,7 +3765,7 @@ static int mxf_read_close(AVFormatContext *s)
static int mxf_probe(const AVProbeData *p) {
const uint8_t *bufp = p->buf;
- const uint8_t *end = p->buf + FFMIN(p->buf_size, RUN_IN_MAX + 1 + sizeof(mxf_header_partition_pack_key));
+ const uint8_t *end = p->buf + p->buf_size;
if (p->buf_size < sizeof(mxf_header_partition_pack_key))
return 0;
diff --git a/libavformat/nutdec.c b/libavformat/nutdec.c
index 5de3ee553a..58a74612a4 100644
--- a/libavformat/nutdec.c
+++ b/libavformat/nutdec.c
@@ -199,8 +199,6 @@ static int decode_main_header(NUTContext *nut)
int tmp_stream, tmp_mul, tmp_pts, tmp_size, tmp_res, tmp_head_idx;
length = get_packetheader(nut, bc, 1, MAIN_STARTCODE);
- if (length == (uint64_t)-1)
- return AVERROR_INVALIDDATA;
end = length + avio_tell(bc);
nut->version = ffio_read_varlen(bc);
@@ -244,11 +242,6 @@ static int decode_main_header(NUTContext *nut)
for (i = 0; i < 256;) {
int tmp_flags = ffio_read_varlen(bc);
int tmp_fields = ffio_read_varlen(bc);
- if (tmp_fields < 0) {
- av_log(s, AV_LOG_ERROR, "fields %d is invalid\n", tmp_fields);
- ret = AVERROR_INVALIDDATA;
- goto fail;
- }
if (tmp_fields > 0)
tmp_pts = get_s(bc);
@@ -358,12 +351,8 @@ static int decode_main_header(NUTContext *nut)
ret = AVERROR(ENOMEM);
goto fail;
}
- for (i = 0; i < stream_count; i++) {
- if (!avformat_new_stream(s, NULL)) {
- ret = AVERROR(ENOMEM);
- goto fail;
- }
- }
+ for (i = 0; i < stream_count; i++)
+ avformat_new_stream(s, NULL);
return 0;
fail:
@@ -811,23 +800,19 @@ static int nut_read_header(AVFormatContext *s)
NUTContext *nut = s->priv_data;
AVIOContext *bc = s->pb;
int64_t pos;
- int initialized_stream_count, ret;
+ int initialized_stream_count;
nut->avf = s;
/* main header */
pos = 0;
- ret = 0;
do {
- if (ret == AVERROR(ENOMEM))
- return ret;
-
pos = find_startcode(bc, MAIN_STARTCODE, pos) + 1;
if (pos < 0 + 1) {
av_log(s, AV_LOG_ERROR, "No main startcode found.\n");
goto fail;
}
- } while ((ret = decode_main_header(nut)) < 0);
+ } while (decode_main_header(nut) < 0);
/* stream headers */
pos = 0;
diff --git a/libavformat/omadec.c b/libavformat/omadec.c
index 0f1c93c0be..d31b475fd2 100644
--- a/libavformat/omadec.c
+++ b/libavformat/omadec.c
@@ -494,7 +494,7 @@ static int oma_read_header(AVFormatContext *s)
AV_WL16(&edata[6], jsflag); // coding mode
AV_WL16(&edata[8], jsflag); // coding mode
AV_WL16(&edata[10], 1); // always 1
- AV_WL16(&edata[12], 0); // always 0
+ // AV_WL16(&edata[12], 0); // always 0
avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate);
break;
diff --git a/libavformat/replaygain.c b/libavformat/replaygain.c
index 01db483257..707d3cd4f1 100644
--- a/libavformat/replaygain.c
+++ b/libavformat/replaygain.c
@@ -61,7 +61,7 @@ static int32_t parse_value(const char *value, int32_t min)
}
}
- if (llabs(db) > (INT32_MAX - mb) / 100000)
+ if (abs(db) > (INT32_MAX - mb) / 100000)
return min;
return db * 100000 + sign * mb;
diff --git a/libavformat/rmdec.c b/libavformat/rmdec.c
index c3945a9166..97378703d1 100644
--- a/libavformat/rmdec.c
+++ b/libavformat/rmdec.c
@@ -128,6 +128,10 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVIOContext *pb,
uint32_t version;
int ret;
+ // Duplicate tags
+ if (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
+ return AVERROR_INVALIDDATA;
+
/* ra type header */
version = avio_rb16(pb); /* version */
if (version == 3) {
@@ -327,11 +331,6 @@ int ff_rm_read_mdpr_codecdata(AVFormatContext *s, AVIOContext *pb,
if (codec_data_size == 0)
return 0;
- // Duplicate tags
- if ( st->codecpar->codec_type != AVMEDIA_TYPE_UNKNOWN
- && st->codecpar->codec_type != AVMEDIA_TYPE_DATA)
- return AVERROR_INVALIDDATA;
-
avpriv_set_pts_info(st, 64, 1, 1000);
codec_pos = avio_tell(pb);
v = avio_rb32(pb);
@@ -565,8 +564,6 @@ static int rm_read_header(AVFormatContext *s)
}
tag_size = avio_rb32(pb);
- if (tag_size < 0)
- return AVERROR_INVALIDDATA;
avio_skip(pb, tag_size - 8);
for(;;) {
diff --git a/libavformat/rpl.c b/libavformat/rpl.c
index 10cde679f8..ad3659e936 100644
--- a/libavformat/rpl.c
+++ b/libavformat/rpl.c
@@ -276,7 +276,7 @@ static int rpl_read_header(AVFormatContext *s)
error |= read_line(pb, line, sizeof(line)); // size of "helpful" sprite
if (vst) {
error |= read_line(pb, line, sizeof(line)); // offset to key frame list
- vst->duration = number_of_chunks * (int64_t)rpl->frames_per_chunk;
+ vst->duration = number_of_chunks * rpl->frames_per_chunk;
}
// Read the index
diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index fae3a371e0..25bdf475b3 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -952,8 +952,6 @@ static void rtsp_parse_transport(AVFormatContext *s,
";,", &p);
}
th->transport = RTSP_TRANSPORT_RAW;
- } else {
- break;
}
if (!av_strcasecmp(lower_transport, "TCP"))
th->lower_transport = RTSP_LOWER_TRANSPORT_TCP;
diff --git a/libavformat/sbgdec.c b/libavformat/sbgdec.c
index c86bc40862..36cfff20fc 100644
--- a/libavformat/sbgdec.c
+++ b/libavformat/sbgdec.c
@@ -1316,8 +1316,6 @@ static int generate_intervals(void *log, struct sbg_script *s, int sample_rate,
/* Pseudo event before the first one */
ev0 = s->events[s->nb_events - 1];
- if (av_sat_sub64(ev0.ts_int, period) != (uint64_t)ev0.ts_int - period)
- return AVERROR_INVALIDDATA;
ev0.ts_int -= period;
ev0.ts_trans -= period;
ev0.ts_next -= period;
diff --git a/libavformat/sccdec.c b/libavformat/sccdec.c
index d420f3c461..1786520944 100644
--- a/libavformat/sccdec.c
+++ b/libavformat/sccdec.c
@@ -63,7 +63,8 @@ static int scc_read_header(AVFormatContext *s)
{
SCCContext *scc = s->priv_data;
AVStream *st = avformat_new_stream(s, NULL);
- AVPacket *sub = NULL;
+ char line2[4096], line[4096];
+ int64_t pos, ts, next_ts = AV_NOPTS_VALUE;
ptrdiff_t len;
uint8_t out[4096];
FFTextReader tr;
@@ -76,26 +77,47 @@ static int scc_read_header(AVFormatContext *s)
st->codecpar->codec_type = AVMEDIA_TYPE_SUBTITLE;
st->codecpar->codec_id = AV_CODEC_ID_EIA_608;
- while (1) {
+ while (!ff_text_eof(&tr) || next_ts == AV_NOPTS_VALUE || line2[0]) {
char *saveptr = NULL, *lline;
int hh, mm, ss, fs, i;
- char line[4096];
- int64_t pos, ts;
+ AVPacket *sub;
- len = ff_subtitles_read_line(&tr, line, sizeof(line));
- if (len <= 13) {
- if (ff_text_eof(&tr))
- break;
- continue;
- }
+ if (next_ts == AV_NOPTS_VALUE) {
+ while (!ff_text_eof(&tr)) {
+ len = ff_subtitles_read_line(&tr, line, sizeof(line));
+ if (len <= 13)
+ continue;
if (!strncmp(line, "Scenarist_SCC V1.0", 18))
continue;
- if (av_sscanf(line, "%d:%d:%d%*[:;]%d", &hh, &mm, &ss, &fs) != 4)
- continue;
+ if (av_sscanf(line, "%d:%d:%d%*[:;]%d", &hh, &mm, &ss, &fs) == 4)
+ break;
+ }
+
+ ts = (hh * 3600LL + mm * 60LL + ss) * 1000LL + fs * 33LL;
+
+ while (!ff_text_eof(&tr)) {
+ len = ff_subtitles_read_line(&tr, line2, sizeof(line2));
+ if (len <= 13)
+ continue;
+
+ if (av_sscanf(line2, "%d:%d:%d%*[:;]%d", &hh, &mm, &ss, &fs) == 4)
+ break;
+ }
+ } else {
+ memmove(line, line2, sizeof(line));
+ line2[0] = 0;
+
+ while (!ff_text_eof(&tr)) {
+ len = ff_subtitles_read_line(&tr, line2, sizeof(line2));
+ if (len <= 13)
+ continue;
+
+ if (av_sscanf(line2, "%d:%d:%d%*[:;]%d", &hh, &mm, &ss, &fs) == 4)
+ break;
+ }
+ }
- ts = (hh * 3600LL + mm * 60LL + ss) * 1000LL + fs * 33LL;
- if (sub)
- sub->duration = ts - sub->pts;
+ next_ts = (hh * 3600LL + mm * 60LL + ss) * 1000LL + fs * 33LL;
pos = ff_text_pos(&tr);
lline = (char *)&line;
@@ -146,6 +168,8 @@ static int scc_read_header(AVFormatContext *s)
sub->pos = pos;
sub->pts = ts;
+ sub->duration = next_ts - ts;
+ ts = next_ts;
}
ff_subtitles_queue_finalize(s, &scc->q);
diff --git a/libavformat/sctp.c b/libavformat/sctp.c
index be0cb47865..9a80e9b015 100644
--- a/libavformat/sctp.c
+++ b/libavformat/sctp.c
@@ -282,8 +282,6 @@ fail:
goto restart;
}
fail1:
- if (fd >= 0)
- closesocket(fd);
ret = AVERROR(EIO);
freeaddrinfo(ai);
return ret;
diff --git a/libavformat/sdsdec.c b/libavformat/sdsdec.c
index 2289e1bdac..c70f5af849 100644
--- a/libavformat/sdsdec.c
+++ b/libavformat/sdsdec.c
@@ -112,7 +112,7 @@ static int sds_read_header(AVFormatContext *ctx)
st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
st->codecpar->channels = 1;
st->codecpar->sample_rate = sample_period ? 1000000000 / sample_period : 16000;
- st->duration = av_rescale((avio_size(pb) - 21) / 127, s->size, 4);
+ st->duration = (avio_size(pb) - 21) / (127) * s->size / 4;
avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate);
diff --git a/libavformat/spdifdec.c b/libavformat/spdifdec.c
index 03b95bd48a..1808fa9d65 100644
--- a/libavformat/spdifdec.c
+++ b/libavformat/spdifdec.c
@@ -226,7 +226,7 @@ int ff_spdif_read_packet(AVFormatContext *s, AVPacket *pkt)
if (!s->bit_rate && s->streams[0]->codecpar->sample_rate)
/* stream bitrate matches 16-bit stereo PCM bitrate for currently
supported codecs */
- s->bit_rate = 2 * 16LL * s->streams[0]->codecpar->sample_rate;
+ s->bit_rate = 2 * 16 * s->streams[0]->codecpar->sample_rate;
return 0;
}
diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c
index 576b2c49f1..6368ec74f9 100644
--- a/libavformat/subtitles.c
+++ b/libavformat/subtitles.c
@@ -418,7 +418,6 @@ ptrdiff_t ff_subtitles_read_line(FFTextReader *tr, char *buf, size_t size)
size_t cur = 0;
if (!size)
return 0;
- buf[0] = '\0';
while (cur + 1 < size) {
unsigned char c = ff_text_r8(tr);
if (!c)
diff --git a/libavformat/subviewerdec.c b/libavformat/subviewerdec.c
index 0a2f0da3b1..5c2fe676f1 100644
--- a/libavformat/subviewerdec.c
+++ b/libavformat/subviewerdec.c
@@ -51,32 +51,26 @@ static int subviewer_probe(const AVProbeData *p)
return 0;
}
-static int get_multiplier(int e) {
- switch (e) {
- case 1 : return 100;
- case 2 : return 10;
- case 3 : return 1;
- default : return -1;
- }
-}
-
static int read_ts(const char *s, int64_t *start, int *duration)
{
int64_t end;
int hh1, mm1, ss1, ms1;
int hh2, mm2, ss2, ms2;
- int multiplier1, multiplier2;
- int ms1p1, ms1p2, ms2p1, ms2p2;
-
- if (sscanf(s, "%u:%u:%u.%n%u%n,%u:%u:%u.%n%u%n",
- &hh1, &mm1, &ss1, &ms1p1, &ms1, &ms1p2, &hh2, &mm2, &ss2, &ms2p1, &ms2, &ms2p2) == 8) {
- multiplier1 = get_multiplier(ms1p2 - ms1p1);
- multiplier2 = get_multiplier(ms2p2 - ms2p1);
- if (multiplier1 <= 0 ||multiplier2 <= 0)
- return -1;
-
- end = (hh2*3600LL + mm2*60LL + ss2) * 1000LL + ms2 * multiplier2;
- *start = (hh1*3600LL + mm1*60LL + ss1) * 1000LL + ms1 * multiplier1;
+ int multiplier = 1;
+
+ if (sscanf(s, "%u:%u:%u.%2u,%u:%u:%u.%2u",
+ &hh1, &mm1, &ss1, &ms1, &hh2, &mm2, &ss2, &ms2) == 8) {
+ multiplier = 10;
+ } else if (sscanf(s, "%u:%u:%u.%1u,%u:%u:%u.%1u",
+ &hh1, &mm1, &ss1, &ms1, &hh2, &mm2, &ss2, &ms2) == 8) {
+ multiplier = 100;
+ }
+ if (sscanf(s, "%u:%u:%u.%u,%u:%u:%u.%u",
+ &hh1, &mm1, &ss1, &ms1, &hh2, &mm2, &ss2, &ms2) == 8) {
+ ms1 = FFMIN(ms1, 999);
+ ms2 = FFMIN(ms2, 999);
+ end = (hh2*3600LL + mm2*60LL + ss2) * 1000LL + ms2 * multiplier;
+ *start = (hh1*3600LL + mm1*60LL + ss1) * 1000LL + ms1 * multiplier;
*duration = end - *start;
return 0;
}
diff --git a/libavformat/tee.c b/libavformat/tee.c
index 6fafc0a99d..c0b69a386c 100644
--- a/libavformat/tee.c
+++ b/libavformat/tee.c
@@ -124,7 +124,6 @@ static int close_slave(TeeSlave *tee_slave)
unsigned i;
int ret = 0;
- av_dict_free(&tee_slave->fifo_options);
avf = tee_slave->avf;
if (!avf)
return 0;
@@ -230,7 +229,6 @@ static int open_slave(AVFormatContext *avf, char *slave, TeeSlave *tee_slave)
av_dict_free(&options);
options = tee_slave->fifo_options;
- tee_slave->fifo_options = NULL;
}
ret = avformat_alloc_output_context2(&avf2, NULL,
tee_slave->use_fifo ? "fifo" :format, filename);
@@ -405,8 +403,6 @@ end:
av_free(format);
av_free(select);
av_free(on_fail);
- av_free(use_fifo);
- av_free(fifo_options_str);
av_dict_free(&options);
av_dict_free(&bsf_options);
av_freep(&tmp_select);
diff --git a/libavformat/tls_mbedtls.c b/libavformat/tls_mbedtls.c
index beb6e1cf08..aadf17760d 100644
--- a/libavformat/tls_mbedtls.c
+++ b/libavformat/tls_mbedtls.c
@@ -19,7 +19,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <mbedtls/version.h>
+#include <mbedtls/certs.h>
+#include <mbedtls/config.h>
#include <mbedtls/ctr_drbg.h>
#include <mbedtls/entropy.h>
#include <mbedtls/net_sockets.h>
@@ -129,15 +130,9 @@ static void handle_pk_parse_error(URLContext *h, int ret)
static void handle_handshake_error(URLContext *h, int ret)
{
switch (ret) {
-#if MBEDTLS_VERSION_MAJOR < 3
case MBEDTLS_ERR_SSL_NO_USABLE_CIPHERSUITE:
av_log(h, AV_LOG_ERROR, "None of the common ciphersuites is usable. Was the local certificate correctly set?\n");
break;
-#else
- case MBEDTLS_ERR_SSL_HANDSHAKE_FAILURE:
- av_log(h, AV_LOG_ERROR, "TLS handshake failed.\n");
- break;
-#endif
case MBEDTLS_ERR_SSL_FATAL_ALERT_MESSAGE:
av_log(h, AV_LOG_ERROR, "A fatal alert message was received from the peer, has the peer a correct certificate?\n");
break;
@@ -200,30 +195,25 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op
}
}
- // seed the random number generator
- if ((ret = mbedtls_ctr_drbg_seed(&tls_ctx->ctr_drbg_context,
- mbedtls_entropy_func,
- &tls_ctx->entropy_context,
- NULL, 0)) != 0) {
- av_log(h, AV_LOG_ERROR, "mbedtls_ctr_drbg_seed returned %d\n", ret);
- goto fail;
- }
-
// load key file
if (shr->key_file) {
if ((ret = mbedtls_pk_parse_keyfile(&tls_ctx->priv_key,
shr->key_file,
- tls_ctx->priv_key_pw
-#if MBEDTLS_VERSION_MAJOR >= 3
- , mbedtls_ctr_drbg_random,
- &tls_ctx->ctr_drbg_context
-#endif
- )) != 0) {
+ tls_ctx->priv_key_pw)) != 0) {
handle_pk_parse_error(h, ret);
goto fail;
}
}
+ // seed the random number generator
+ if ((ret = mbedtls_ctr_drbg_seed(&tls_ctx->ctr_drbg_context,
+ mbedtls_entropy_func,
+ &tls_ctx->entropy_context,
+ NULL, 0)) != 0) {
+ av_log(h, AV_LOG_ERROR, "mbedtls_ctr_drbg_seed returned %d\n", ret);
+ goto fail;
+ }
+
if ((ret = mbedtls_ssl_config_defaults(&tls_ctx->ssl_config,
shr->listen ? MBEDTLS_SSL_IS_SERVER : MBEDTLS_SSL_IS_CLIENT,
MBEDTLS_SSL_TRANSPORT_STREAM,
diff --git a/libavformat/udp.c b/libavformat/udp.c
index 1f8b85cfca..9b9d3de197 100644
--- a/libavformat/udp.c
+++ b/libavformat/udp.c
@@ -740,10 +740,8 @@ static int udp_open(URLContext *h, const char *uri, int flags)
/* XXX: fix av_url_split */
if (hostname[0] == '\0' || hostname[0] == '?') {
/* only accepts null hostname if input */
- if (!(flags & AVIO_FLAG_READ)) {
- ret = AVERROR(EINVAL);
+ if (!(flags & AVIO_FLAG_READ))
goto fail;
- }
} else {
if ((ret = ff_udp_set_remote_url(h, uri)) < 0)
goto fail;
@@ -756,10 +754,8 @@ static int udp_open(URLContext *h, const char *uri, int flags)
udp_fd = udp_socket_create(h, &my_addr, &len, localaddr);
else
udp_fd = udp_socket_create(h, &my_addr, &len, s->localaddr);
- if (udp_fd < 0) {
- ret = AVERROR(EIO);
+ if (udp_fd < 0)
goto fail;
- }
s->local_addr_storage=my_addr; //store for future multicast join
diff --git a/libavformat/utils.c b/libavformat/utils.c
index b2d011a0db..e10b493dae 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -3013,6 +3013,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
return 1;
}
+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
+// This should be quite general purpose, but we avoid possible conflicts
+// by limiting usage to cases where we know it works.
+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
+{
+ // Only try fallback if we know it is supported (HEVC only)
+ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
+ int err;
+
+ // Failed to find fallback or we are already at the fallback
+ if (new_codec == NULL || new_codec == old_codec)
+ {
+ return AVERROR_DECODER_NOT_FOUND;
+ }
+
+ // * This may be dodgy - header says to not use this fn,
+ // especially if we are going to reopen the context...
+ // (but it does seem to work for our cases)
+ if (avcodec_is_open(avctx)) {
+ avcodec_close(avctx);
+ }
+
+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
+ {
+ return err;
+ }
+
+ return 0;
+}
+#else
+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
+#endif
+
/* returns 1 or 0 if or if not decoded data was returned, or a negative error */
static int try_decode_frame(AVFormatContext *s, AVStream *st,
const AVPacket *avpkt, AVDictionary **options)
@@ -3051,7 +3085,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
av_dict_set(options ? options : &thread_opt, "lowres", "0", 0);
if (s->codec_whitelist)
av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
+ {
+ // Try fallback if it looks worth a try
+ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
+ }
if (!options)
av_dict_free(&thread_opt);
if (ret < 0) {
@@ -3082,6 +3120,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
ret = avcodec_send_packet(avctx, &pkt);
+
+ // If we are going to want to fall back we should know here
+ if (ret == AVERROR_DECODER_NOT_FOUND) {
+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
+ break;
+ continue;
+ }
+
if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
break;
if (ret >= 0)
@@ -3710,9 +3756,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
// Try to just open decoders, in case this is enough to get parameters.
if (!has_codec_parameters(st, NULL) && st->internal->request_probe <= 0) {
if (codec && !avctx->codec)
- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
- av_log(ic, AV_LOG_WARNING,
- "Failed to open codec in %s\n",__FUNCTION__);
+ {
+ int err;
+
+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
+ {
+ if (err == AVERROR_DECODER_NOT_FOUND) {
+ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
+ }
+ if (err < 0) {
+ av_log(ic, AV_LOG_WARNING,
+ "Failed to open codec in %s\n",__FUNCTION__);
+ }
+ }
+ }
}
if (!options)
av_dict_free(&thread_opt);
@@ -4997,7 +5054,7 @@ void ff_parse_key_value(const char *str, ff_parse_key_val_cb callback_get_buf,
key_len = ptr - key;
callback_get_buf(context, key, key_len, &dest, &dest_len);
- dest_end = dest ? dest + dest_len - 1 : NULL;
+ dest_end = dest + dest_len - 1;
if (*ptr == '\"') {
ptr++;
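The HEVC fallback-decoder changes to libavformat/utils.c above all follow one pattern: open the preferred (rpi) decoder, and if it reports AVERROR_DECODER_NOT_FOUND at open or send_packet time, retry once with the generic software decoder. A minimal C sketch of that pattern, assuming the avcodec_find_decoder_by_id_and_fmt() helper this branch adds elsewhere in the patch (it is not stock FFmpeg API):

#include "libavcodec/avcodec.h"

static int open_with_fallback(AVCodecContext *avctx, const AVCodec *codec,
                              AVDictionary **opts)
{
    const AVCodec *fb;
    int ret = avcodec_open2(avctx, codec, opts);

    // Only HEVC has a known-good fallback, mirroring try_fallback_decoder()
    if (ret != AVERROR_DECODER_NOT_FOUND || codec->id != AV_CODEC_ID_HEVC)
        return ret;

    fb = avcodec_find_decoder_by_id_and_fmt(codec->id, AV_PIX_FMT_NONE);
    if (!fb || fb == codec)
        return AVERROR_DECODER_NOT_FOUND;

    // The context may still be open if the failure came from send_packet;
    // closing and reopening is flagged as dodgy in the patch, but works here
    if (avcodec_is_open(avctx))
        avcodec_close(avctx);

    return avcodec_open2(avctx, fb, opts);
}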
diff --git a/libavformat/vividas.c b/libavformat/vividas.c
index e253b376ab..d35a646bde 100644
--- a/libavformat/vividas.c
+++ b/libavformat/vividas.c
@@ -683,7 +683,6 @@ static int viv_read_packet(AVFormatContext *s,
if (viv->sb_entries[viv->current_sb_entry].flag == 0) {
uint64_t v_size = ffio_read_varlen(pb);
- int last = 0, last_start;
if (!viv->num_audio)
return AVERROR_INVALIDDATA;
@@ -707,18 +706,12 @@ static int viv_read_packet(AVFormatContext *s,
if (i > 0 && start == 0)
break;
- if (start < last)
- return AVERROR_INVALIDDATA;
viv->n_audio_subpackets = i + 1;
- last =
viv->audio_subpackets[i].start = start;
viv->audio_subpackets[i].pcm_bytes = pcm_bytes;
}
- last_start =
viv->audio_subpackets[viv->n_audio_subpackets].start = (int)(off - avio_tell(pb));
- if (last_start < last)
- return AVERROR_INVALIDDATA;
viv->current_audio_subpacket = 0;
} else {
diff --git a/libavformat/vivo.c b/libavformat/vivo.c
index 78d1377e6b..fb58aa6178 100644
--- a/libavformat/vivo.c
+++ b/libavformat/vivo.c
@@ -26,7 +26,6 @@
* @sa http://wiki.multimedia.cx/index.php?title=Vivo
*/
-#include "libavutil/avstring.h"
#include "libavutil/parseutils.h"
#include "avformat.h"
#include "internal.h"
@@ -121,7 +120,7 @@ static int vivo_get_packet_header(AVFormatContext *s)
static int vivo_read_header(AVFormatContext *s)
{
VivoContext *vivo = s->priv_data;
- AVRational fps = { 0 };
+ AVRational fps = { 1, 25};
AVStream *ast, *vst;
unsigned char *line, *line_end, *key, *value;
long value_int;
@@ -207,21 +206,17 @@ static int vivo_read_header(AVFormatContext *s)
return AVERROR_INVALIDDATA;
value_used = 1;
} else if (!strcmp(key, "FPS")) {
- double d;
- if (av_sscanf(value, "%f", &d) != 1)
- return AVERROR_INVALIDDATA;
+ AVRational tmp;
value_used = 1;
- if (!fps.num && !fps.den)
- fps = av_inv_q(av_d2q(d, 10000));
+ if (!av_parse_ratio(&tmp, value, 10000, AV_LOG_WARNING, s))
+ fps = av_inv_q(tmp);
}
if (!value_used)
av_dict_set(&s->metadata, key, value, 0);
}
}
- if (!fps.num || !fps.den)
- fps = (AVRational){ 1, 25 };
avpriv_set_pts_info(ast, 64, 1, ast->codecpar->sample_rate);
avpriv_set_pts_info(vst, 64, fps.num, fps.den);
diff --git a/libavformat/webmdashenc.c b/libavformat/webmdashenc.c
index 181ae9db69..1304c1a8c3 100644
--- a/libavformat/webmdashenc.c
+++ b/libavformat/webmdashenc.c
@@ -93,7 +93,7 @@ static int write_header(AVFormatContext *s)
}
avio_printf(pb, " minBufferTime=\"PT%gS\"\n", min_buffer_time);
avio_printf(pb, " profiles=\"%s\"%s",
- w->is_live ? "urn:mpeg:dash:profile:isoff-live:2011" : "urn:mpeg:dash:profile:webm-on-demand:2012",
+ w->is_live ? "urn:mpeg:dash:profile:isoff-live:2011" : "urn:webm:dash:profile:webm-on-demand:2012",
w->is_live ? "\n" : ">\n");
if (w->is_live) {
time_t local_time = time(NULL);
diff --git a/libavformat/xwma.c b/libavformat/xwma.c
index 6997d5420b..aedadcf140 100644
--- a/libavformat/xwma.c
+++ b/libavformat/xwma.c
@@ -278,7 +278,7 @@ static int xwma_read_header(AVFormatContext *s)
* the total duration using the average bits per sample and the
* total data length.
*/
- st->duration = av_rescale((size<<3), st->codecpar->sample_rate, st->codecpar->bit_rate);
+ st->duration = (size<<3) * st->codecpar->sample_rate / st->codecpar->bit_rate;
}
fail:
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 27bafe9e12..c9075ddf8a 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -68,6 +68,7 @@ HEADERS = adler32.h \
rational.h \
replaygain.h \
ripemd.h \
+ rpi_sand_fns.h \
samplefmt.h \
sha.h \
sha512.h \
@@ -87,6 +88,7 @@ HEADERS = adler32.h \
film_grain_params.h \
HEADERS-$(CONFIG_LZO) += lzo.h
+HEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h
ARCH_HEADERS = bswap.h \
intmath.h \
@@ -182,6 +184,7 @@ OBJS-$(CONFIG_LZO) += lzo.o
OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o
OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index 5613813ba8..ab8bcfcf34 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -1,4 +1,6 @@
OBJS += aarch64/cpu.o \
aarch64/float_dsp_init.o \
-NEON-OBJS += aarch64/float_dsp_neon.o
+NEON-OBJS += aarch64/float_dsp_neon.o \
+ aarch64/rpi_sand_neon.o \
+
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
new file mode 100644
index 0000000000..2f07d9674c
--- /dev/null
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -0,0 +1,781 @@
+/*
+Copyright (c) 2021 Michael Eiler
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: Michael Eiler <eiler.mike@gmail.com>
+*/
+
+#include "asm.S"
+
+// void ff_rpi_sand8_lines_to_planar_y8(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+
+function ff_rpi_sand8_lines_to_planar_y8, export=1
+ // w15 contains the number of rows we need to process
+ ldr w15, [sp, #0]
+
+ // w8 will contain the number of blocks per row
+ // w8 = floor(_w/stride1)
+ // stride1 is assumed to always be 128
+ mov w8, w1
+ lsr w8, w8, #7
+
+ // in case the width of the image is not a multiple of 128, there will
+ // be an incomplete block at the end of every row
+ // w9 contains the number of pixels stored within this block
+ // w9 = _w - w8 * 128
+ lsl w9, w8, #7
+ sub w9, w7, w9
+
+ // this is the value we have to add to the src pointer after reading a complete block
+ // it will move the address to the start of the next block
+ // w10 = stride2 * stride1 - stride1
+ mov w10, w4
+ lsl w10, w10, #7
+ sub w10, w10, #128
+
+ // w11 is the row offset, meaning the start offset of the first block of every column
+ // it is increased by stride1 within every iteration of the row_loop
+ eor w11, w11, w11
+
+ // w12 = 0, processed row count
+ eor w12, w12, w12
+row_loop:
+ // start of the first block within the current row
+ // x13 = row offset + src
+ mov x13, x2
+ add x13, x13, x11
+
+ // w14 = 0, processed block count
+ eor w14, w14, w14
+
+ cmp w8, #0
+ beq no_main_y8
+
+block_loop:
+ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
+ // fortunately these aren't callee-saved registers, so we don't need to back them up
+ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64
+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64
+
+ // write these registers back to the destination vector and increase the dst address by 128
+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64
+
+ // move the source register to the beginning of the next block (x13 = src + block offset)
+ add x13, x13, x10
+ // increase the block counter
+ add w14, w14, #1
+
+ // continue with the block_loop if we haven't copied all full blocks yet
+ cmp w8, w14
+ bgt block_loop
+
+ // handle the last block at the end of each row
+ // at most 127 byte values copied from src to dst
+no_main_y8:
+ eor w5, w5, w5 // i = 0
+incomplete_block_loop_y8:
+ cmp w5, w9
+ bge incomplete_block_loop_end_y8
+
+ ldrb w6, [x13]
+ strb w6, [x0]
+ add x13, x13, #1
+ add x0, x0, #1
+
+ add w5, w5, #1
+ b incomplete_block_loop_y8
+incomplete_block_loop_end_y8:
+
+
+ // increase the row offset by 128 (stride1)
+ add w11, w11, #128
+ // increment the row counter
+ add w12, w12, #1
+
+ // process the next row if we haven't finished yet
+ cmp w15, w12
+ bgt row_loop
+
+ ret
+endfunc
+
+
+
+// void ff_rpi_sand8_lines_to_planar_c8(
+// uint8_t * dst_u, : x0
+// unsigned int dst_stride_u, : w1 == width
+// uint8_t * dst_v, : x2
+// unsigned int dst_stride_v, : w3 == width
+// const uint8_t * src, : x4
+// unsigned int stride1, : w5 == 128
+// unsigned int stride2, : w6
+// unsigned int _x, : w7
+// unsigned int y, : [sp, #0]
+// unsigned int _w, : [sp, #8]
+// unsigned int h); : [sp, #16]
+
+function ff_rpi_sand8_lines_to_planar_c8, export=1
+ // w7 = width
+ ldr w7, [sp, #8]
+
+ // w15 contains the number of rows we need to process
+ // counts down
+ ldr w15, [sp, #16]
+
+ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
+ mov w8, w7
+ lsr w8, w8, #6
+
+ // number of pixels in block at the end of every row
+ // w9 = _w - (w8 * 64)
+ lsl w9, w8, #6
+ sub w9, w7, w9
+
+ // Skip at the end of the line to account for stride
+ sub w12, w1, w7
+
+ // address delta to the beginning of the next block
+ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
+ lsl w10, w6, #7
+ sub w10, w10, #128
+
+ // w11 = row address start offset = 0
+ eor w11, w11, w11
+
+row_loop_c8:
+ // start of the first block within the current row
+ // x13 = row offset + src
+ mov x13, x4
+ add x13, x13, x11
+
+ // w14 = 0, processed block count
+ eor w14, w14, w14
+
+ cmp w8, #0
+ beq no_main_c8
+
+block_loop_c8:
+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
+ ld2 { v0.16b, v1.16b }, [x13], #32
+ ld2 { v2.16b, v3.16b }, [x13], #32
+ ld2 { v4.16b, v5.16b }, [x13], #32
+ ld2 { v6.16b, v7.16b }, [x13], #32
+
+ // swap registers so that we can write them out with a single instruction
+ mov v16.16b, v1.16b
+ mov v17.16b, v3.16b
+ mov v18.16b, v5.16b
+ mov v1.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v6.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+
+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64
+
+ // increment row counter and move src to the beginning of the next block
+ add w14, w14, #1
+ add x13, x13, x10
+
+ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
+ cmp w8, w14
+ bgt block_loop_c8
+
+no_main_c8:
+ // handle incomplete block at the end of every row
+ eor w5, w5, w5 // point counter
+incomplete_block_loop_c8:
+ cmp w5, w9
+ bge incomplete_block_loop_end_c8
+
+ ldrb w1, [x13]
+ strb w1, [x0]
+ add x13, x13, #1
+
+ ldrb w1, [x13]
+ strb w1, [x2]
+ add x13, x13, #1
+
+ add x0, x0, #1
+ add x2, x2, #1
+
+ add w5, w5, #1
+ b incomplete_block_loop_c8
+incomplete_block_loop_end_c8:
+
+ // increase row_offset by stride1
+ add w11, w11, #128
+ add x0, x0, w12, sxtw
+ add x2, x2, w12, sxtw
+
+ // jump to row_loop_c8 iff the row count is smaller than the height
+ subs w15, w15, #1
+ bgt row_loop_c8
+
+ ret
+endfunc
+
+//void ff_rpi_sand30_lines_to_planar_c16(
+// uint8_t * dst_u, // [x0]
+// unsigned int dst_stride_u, // [w1] == _w*2
+// uint8_t * dst_v, // [x2]
+// unsigned int dst_stride_v, // [w3] == _w*2
+// const uint8_t * src, // [x4]
+// unsigned int stride1, // [w5] == 128
+// unsigned int stride2, // [w6]
+// unsigned int _x, // [w7] == 0
+// unsigned int y, // [sp, #0] == 0
+// unsigned int _w, // [sp, #8] -> w3
+// unsigned int h); // [sp, #16] -> w7
+
+.macro rpi_sand30_lines_to_planar_c16_block_half
+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
+
+ xtn v4.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v5.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v6.4h, v0.4s
+ xtn2 v4.8h, v1.4s
+ ushr v1.4s, v1.4s, #10
+ xtn2 v5.8h, v1.4s
+ ushr v1.4s, v1.4s, #10
+ xtn2 v6.8h, v1.4s
+ and v4.16b, v4.16b, v16.16b
+ and v5.16b, v5.16b, v16.16b
+ and v6.16b, v6.16b, v16.16b
+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
+
+ xtn v4.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v5.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v6.4h, v2.4s
+ xtn2 v4.8h, v3.4s
+ ushr v3.4s, v3.4s, #10
+ xtn2 v5.8h, v3.4s
+ ushr v3.4s, v3.4s, #10
+ xtn2 v6.8h, v3.4s
+ and v4.16b, v4.16b, v16.16b
+ and v5.16b, v5.16b, v16.16b
+ and v6.16b, v6.16b, v16.16b
+ st3 { v4.8h, v5.8h, v6.8h }, [sp]
+ sub sp, sp, #48
+.endm
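+// Scalar view of the macro above: each 64-byte ld1 pulls in 16 little-endian
+// 32-bit words, and each word packs three 10-bit samples
+// (s0 = w & 0x3ff, s1 = (w >> 10) & 0x3ff, s2 = (w >> 20) & 0x3ff).
+// xtn keeps bits 0-9, the two ushr #10 steps expose bits 10-19 and 20-29,
+// the 0x03ff mask in v16 clears the cross-sample bits, and the st3 stores
+// spill 48 samples to the stack in source order. Two invocations of the
+// macro therefore unpack one full 128-byte block.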
+
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+ stp x19, x20, [sp, #-48]!
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+
+ ldr w3, [sp, #48+8] // w3 = width
+ ldr w7, [sp, #48+16] // w7 = height
+
+ // reserve space on the stack for intermediate results
+ sub sp, sp, #256
+
+ // number of 128-byte blocks per row, w8 = width / 48
+ mov w9, #48
+ udiv w8, w3, w9
+
+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
+ mul w9, w8, w9
+ sub w9, w3, w9
+
+ // row offset, the beginning of the next row to process
+ eor w10, w10, w10
+
+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128
+ lsl w11, w6, #7
+ sub w11, w11, #128
+
+ // decrease the height by one and in case of remaining pixels increase the block count by one
+ sub w7, w7, #1
+ cmp w9, #0
+ cset w19, ne // w19 == 1 iff remaining pixels != 0
+ add w8, w8, w19
+
+ // bytes we have to move dst back by at the end of every row
+ mov w21, #48*2
+ mul w21, w21, w8
+ sub w21, w1, w21
+
+ mov w20, #0 // w20 = flag, last row processed
+
+ mov x12, #0x03ff03ff03ff03ff
+ dup v16.2d, x12
+
+ // iterate through rows, row counter = w12 = 0
+ eor w12, w12, w12
+row_loop_c16:
+ cmp w12, w7
+ bge row_loop_c16_fin
+
+ // address of row data = src + row_offset
+ mov x13, x4
+ add x13, x13, x10
+
+ eor w14, w14, w14
+block_loop_c16:
+ cmp w14, w8
+ bge block_loop_c16_fin
+
+ rpi_sand30_lines_to_planar_c16_block_half
+
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #64
+
+ st1 { v0.8h }, [x0], #16
+ st1 { v2.8h }, [x0], #16
+ st1 { v4.8h }, [x0], #16
+ st1 { v1.8h }, [x2], #16
+ st1 { v3.8h }, [x2], #16
+ st1 { v5.8h }, [x2], #16
+
+ rpi_sand30_lines_to_planar_c16_block_half
+
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #64
+
+ st1 { v0.8h }, [x0], #16
+ st1 { v2.8h }, [x0], #16
+ st1 { v4.8h }, [x0], #16
+ st1 { v1.8h }, [x2], #16
+ st1 { v3.8h }, [x2], #16
+ st1 { v5.8h }, [x2], #16
+
+ add x13, x13, x11 // offset to next block
+ add w14, w14, #1
+ b block_loop_c16
+block_loop_c16_fin:
+
+ add w10, w10, #128
+ add w12, w12, #1
+ add x0, x0, w21, sxtw // move dst pointers back by w21
+ add x2, x2, w21, sxtw
+ b row_loop_c16
+row_loop_c16_fin:
+
+ cmp w20, #1
+ beq row_loop_c16_fin2
+ mov w20, #1
+ sub w8, w8, w19 // decrease block count by w19
+ add w7, w7, #1 // increase height
+ b row_loop_c16
+
+row_loop_c16_fin2:
+ sub x0, x0, w21, sxtw // re-add w21 in case of the last row
+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
+
+ // last incomplete block to be finished
+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0
+ rpi_sand30_lines_to_planar_c16_block_half
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp], #32
+ rpi_sand30_lines_to_planar_c16_block_half
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #160
+
+ mov x4, sp
+ eor w20, w20, w20
+rem_pix_c16_loop:
+ cmp w20, w9
+ bge rem_pix_c16_fin
+
+ ldr w22, [x4], #4
+ str w22, [x0], #2
+ lsr w22, w22, #16
+ str w22, [x2], #2
+
+ add w20, w20, #1
+ b rem_pix_c16_loop
+rem_pix_c16_fin:
+
+ add sp, sp, #256
+
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp], #48
+ ret
+endfunc
+
+
+
+//void ff_rpi_sand30_lines_to_planar_p010(
+// uint8_t * dest,
+// unsigned int dst_stride,
+// const uint8_t * src,
+// unsigned int src_stride1,
+// unsigned int src_stride2,
+// unsigned int _x,
+// unsigned int y,
+// unsigned int _w,
+// unsigned int h);
+
+// void ff_rpi_sand30_lines_to_planar_y16(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+//
+// Assumes that we are starting on a stripe boundary and that overreading
+// within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+ lsl w4, w4, #7
+ sub w4, w4, #64
+ sub w1, w1, w7, lsl #1
+ uxtw x6, w6
+ add x8, x2, x6, lsl #7
+ ldr w6, [sp, #0]
+
+10:
+ mov x2, x8
+ mov w5, w7
+1:
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
+
+ subs w5, w5, #96
+
+ // v0, v1
+
+ shrn v18.4h, v0.4s, #14
+ xtn v16.4h, v0.4s
+ shrn v17.4h, v0.4s, #10
+
+ shrn2 v18.8h, v1.4s, #14
+ xtn2 v16.8h, v1.4s
+ shrn2 v17.8h, v1.4s, #10
+
+ ushr v18.8h, v18.8h, #6
+ bic v16.8h, #0xfc, lsl #8
+ bic v17.8h, #0xfc, lsl #8
+
+ // v2, v3
+
+ shrn v21.4h, v2.4s, #14
+ xtn v19.4h, v2.4s
+ shrn v20.4h, v2.4s, #10
+
+ shrn2 v21.8h, v3.4s, #14
+ xtn2 v19.8h, v3.4s
+ shrn2 v20.8h, v3.4s, #10
+
+ ushr v21.8h, v21.8h, #6
+ bic v19.8h, #0xfc, lsl #8
+ bic v20.8h, #0xfc, lsl #8
+
+ // v4, v5
+
+ shrn v24.4h, v4.4s, #14
+ xtn v22.4h, v4.4s
+ shrn v23.4h, v4.4s, #10
+
+ shrn2 v24.8h, v5.4s, #14
+ xtn2 v22.8h, v5.4s
+ shrn2 v23.8h, v5.4s, #10
+
+ ushr v24.8h, v24.8h, #6
+ bic v22.8h, #0xfc, lsl #8
+ bic v23.8h, #0xfc, lsl #8
+
+ // v6, v7
+
+ shrn v27.4h, v6.4s, #14
+ xtn v25.4h, v6.4s
+ shrn v26.4h, v6.4s, #10
+
+ shrn2 v27.8h, v7.4s, #14
+ xtn2 v25.8h, v7.4s
+ shrn2 v26.8h, v7.4s, #10
+
+ ushr v27.8h, v27.8h, #6
+ bic v25.8h, #0xfc, lsl #8
+ bic v26.8h, #0xfc, lsl #8
+
+ blt 2f
+
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
+ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48
+ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48
+
+ bne 1b
+
+11:
+ subs w6, w6, #1
+ add x0, x0, w1, uxtw
+ add x8, x8, #128
+ bne 10b
+
+ ret
+
+// Partial final write
+2:
+ cmp w5, #48-96
+ blt 1f
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
+ beq 11b
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ sub w5, w5, #48
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ mov v20.16b, v26.16b
+ mov v21.16b, v27.16b
+1:
+ cmp w5, #24-96
+ blt 1f
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ beq 11b
+ mov v16.16b, v19.16b
+ mov v17.16b, v20.16b
+ sub w5, w5, #24
+ mov v18.16b, v21.16b
+1:
+ cmp w5, #12-96
+ blt 1f
+ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24
+ beq 11b
+ mov v16.2d[0], v16.2d[1]
+ sub w5, w5, #12
+ mov v17.2d[0], v17.2d[1]
+ mov v18.2d[0], v18.2d[1]
+1:
+ cmp w5, #6-96
+ blt 1f
+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
+ st3 {v16.h, v17.h, v18.h}[1], [x0], #6
+ beq 11b
+ mov v16.2s[0], v16.2s[1]
+ sub w5, w5, #6
+ mov v17.2s[0], v17.2s[1]
+ mov v18.2s[0], v18.2s[1]
+1:
+ cmp w5, #3-96
+ blt 1f
+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
+ beq 11b
+ mov v16.4h[0], v16.4h[1]
+ sub w5, w5, #3
+ mov v17.4h[0], v17.4h[1]
+1:
+ cmp w5, #2-96
+ blt 1f
+ st2 {v16.h, v17.h}[0], [x0], #4
+ b 11b
+1:
+ st1 {v16.h}[0], [x0], #2
+ b 11b
+
+endfunc
+
+// void ff_rpi_sand30_lines_to_planar_y8(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+//
+// Assumes that we are starting on a stripe boundary and that overreading
+// within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y8, export=1
+ lsl w4, w4, #7
+ sub w4, w4, #64
+ sub w1, w1, w7
+ uxtw x6, w6
+ add x8, x2, x6, lsl #7
+ ldr w6, [sp, #0]
+
+10:
+ mov x2, x8
+ mov w5, w7
+1:
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
+
+ subs w5, w5, #96
+
+ // v0, v1
+
+ shrn v18.4h, v0.4s, #16
+ xtn v16.4h, v0.4s
+ shrn v17.4h, v0.4s, #12
+
+ shrn2 v18.8h, v1.4s, #16
+ xtn2 v16.8h, v1.4s
+ shrn2 v17.8h, v1.4s, #12
+
+ shrn v18.8b, v18.8h, #6
+ shrn v16.8b, v16.8h, #2
+ xtn v17.8b, v17.8h
+
+ // v2, v3
+
+ shrn v21.4h, v2.4s, #16
+ xtn v19.4h, v2.4s
+ shrn v20.4h, v2.4s, #12
+
+ shrn2 v21.8h, v3.4s, #16
+ xtn2 v19.8h, v3.4s
+ shrn2 v20.8h, v3.4s, #12
+
+ shrn2 v18.16b, v21.8h, #6
+ shrn2 v16.16b, v19.8h, #2
+ xtn2 v17.16b, v20.8h
+
+ // v4, v5
+
+ shrn v24.4h, v4.4s, #16
+ xtn v22.4h, v4.4s
+ shrn v23.4h, v4.4s, #12
+
+ shrn2 v24.8h, v5.4s, #16
+ xtn2 v22.8h, v5.4s
+ shrn2 v23.8h, v5.4s, #12
+
+ shrn v21.8b, v24.8h, #6
+ shrn v19.8b, v22.8h, #2
+ xtn v20.8b, v23.8h
+
+ // v6, v7
+
+ shrn v27.4h, v6.4s, #16
+ xtn v25.4h, v6.4s
+ shrn v26.4h, v6.4s, #12
+
+ shrn2 v27.8h, v7.4s, #16
+ xtn2 v25.8h, v7.4s
+ shrn2 v26.8h, v7.4s, #12
+
+ shrn2 v21.16b, v27.8h, #6
+ shrn2 v19.16b, v25.8h, #2
+ xtn2 v20.16b, v26.8h
+
+ blt 2f
+
+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48
+
+ bne 1b
+
+11:
+ subs w6, w6, #1
+ add x0, x0, w1, uxtw
+ add x8, x8, #128
+ bne 10b
+
+ ret
+
+// Partial final write
+2:
+ cmp w5, #48-96
+ blt 1f
+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ beq 11b
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ sub w5, w5, #48
+ mov v18.16b, v24.16b
+1:
+ cmp w5, #24-96
+ blt 1f
+ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24
+ beq 11b
+ mov v16.2d[0], v16.2d[1]
+ sub w5, w5, #24
+ mov v17.2d[0], v17.2d[1]
+ mov v18.2d[0], v18.2d[1]
+1:
+ cmp w5, #12-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[2], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[3], [x0], #3
+ beq 11b
+ mov v16.2s[0], v16.2s[1]
+ sub w5, w5, #12
+ mov v17.2s[0], v17.2s[1]
+ mov v18.2s[0], v18.2s[1]
+1:
+ cmp w5, #6-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
+ beq 11b
+ mov v16.4h[0], v16.4h[1]
+ sub w5, w5, #6
+ mov v17.4h[0], v17.4h[1]
+ mov v18.4h[0], v18.4h[1]
+1:
+ cmp w5, #3-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ beq 11b
+ mov v16.8b[0], v16.8b[1]
+ sub w5, w5, #3
+ mov v17.8b[0], v17.8b[1]
+1:
+ cmp w5, #2-96
+ blt 1f
+ st2 {v16.b, v17.b}[0], [x0], #2
+ b 11b
+1:
+ st1 {v16.b}[0], [x0], #1
+ b 11b
+
+endfunc
+
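All of the routines in rpi_sand_neon.S above walk the same "sand" layout: the image is stored as vertical stripes stride1 (128) bytes wide and stride2 rows tall, laid end to end in memory. A hedged scalar reference of the addressing the NEON code implements (argument meanings taken from the prototype comments above; the helper name is illustrative):

#include <stddef.h>
#include <stdint.h>

static void sand8_y8_ref(uint8_t *dst, unsigned int dst_stride,
                         const uint8_t *src,
                         unsigned int stride1, unsigned int stride2,
                         unsigned int x0, unsigned int y0,
                         unsigned int w, unsigned int h)
{
    for (unsigned int y = 0; y != h; ++y) {
        for (unsigned int x = 0; x != w; ++x) {
            const unsigned int sx = x0 + x;
            // stripe index, then the byte offset of (sx, y0 + y) inside it
            const size_t off = (size_t)(sx / stride1) * stride1 * stride2 +
                               (size_t)(y0 + y) * stride1 + sx % stride1;
            dst[(size_t)y * dst_stride + x] = src[off];
        }
    }
}

The c8 variant applies the same formula to byte position 2 * sx, since chroma stripes interleave U,V byte pairs; the bytes at off and off + 1 go to dst_u and dst_v respectively.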
diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
new file mode 100644
index 0000000000..2a56135bc3
--- /dev/null
+++ b/libavutil/aarch64/rpi_sand_neon.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2021 Michael Eiler
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: Michael Eiler <eiler.mike@gmail.com>
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
+ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+#ifdef __cplusplus
+}
+#endif
+
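A hedged usage sketch for these entry points, converting the luma of a SAND30 surface to 16-bit planar. The geometry values are illustrative (stride1 is always 128 bytes, stride2 is the stripe height in rows of the buffer at hand), and dst_stride is in bytes, as the argument comments in the .S file indicate:

#include <stdint.h>
#include "libavutil/aarch64/rpi_sand_neon.h"

static void unpack_y16_example(uint8_t *dst, const uint8_t *src,
                               unsigned int stride2)
{
    ff_rpi_sand30_lines_to_planar_y16(dst, 1920 * 2,  /* dst, dst_stride */
                                      src, 128, stride2,
                                      0, 0,           /* _x, y */
                                      1920, 1080);    /* _w, h */
}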
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
NEON-OBJS += arm/float_dsp_init_neon.o \
arm/float_dsp_neon.o \
+ arm/rpi_sand_neon.o \
diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
new file mode 100644
index 0000000000..60e697f681
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.S
@@ -0,0 +1,925 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#include "libavutil/arm/asm.S"
+
+
+@ General notes:
+@ Having done some timing on this in sand8->y8 (Pi4)
+@ vst1 (680fps) is a bit faster than vstm (660fps)
+@ vldm (680fps) is noticeably faster than vld1 (480fps)
+@ (or it might be that a mix is what is required)
+@
+@ At least on a Pi4 it is no more expensive to have a single auto-inc register
+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
+@ the latter was better)
+@
+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
+@ the memory is uncached.
+@ As these are Sand -> planar we can assume that src is going to be aligned but
+@ it is possible that dest isn't (converting to .yuv or other packed format).
+@ Luckily vst1 is faster than vstm :-) so all is well
+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
+@ .8 stores would let us do non-word aligned stores into uncached but it
+@ probably isn't worth it.
+
+
+
+
+@ void ff_rpi_sand128b_stripe_to_8_10(
+@ uint8_t * dest, // [r0]
+@ const uint8_t * src1, // [r1]
+@ const uint8_t * src2, // [r2]
+@ unsigned int lines); // [r3]
+
+.macro stripe2_to_8, bit_depth
+ vpush {q4-q7}
+1:
+ vldm r1!, {q0-q7}
+ subs r3, #1
+ vldm r2!, {q8-q15}
+ vqrshrn.u16 d0, q0, #\bit_depth - 8
+ vqrshrn.u16 d1, q1, #\bit_depth - 8
+ vqrshrn.u16 d2, q2, #\bit_depth - 8
+ vqrshrn.u16 d3, q3, #\bit_depth - 8
+ vqrshrn.u16 d4, q4, #\bit_depth - 8
+ vqrshrn.u16 d5, q5, #\bit_depth - 8
+ vqrshrn.u16 d6, q6, #\bit_depth - 8
+ vqrshrn.u16 d7, q7, #\bit_depth - 8
+ vqrshrn.u16 d8, q8, #\bit_depth - 8
+ vqrshrn.u16 d9, q9, #\bit_depth - 8
+ vqrshrn.u16 d10, q10, #\bit_depth - 8
+ vqrshrn.u16 d11, q11, #\bit_depth - 8
+ vqrshrn.u16 d12, q12, #\bit_depth - 8
+ vqrshrn.u16 d13, q13, #\bit_depth - 8
+ vqrshrn.u16 d14, q14, #\bit_depth - 8
+ vqrshrn.u16 d15, q15, #\bit_depth - 8
+ vstm r0!, {q0-q7}
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+.endm
+
+function ff_rpi_sand128b_stripe_to_8_10, export=1
+ stripe2_to_8 10
+endfunc
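+
+@ Scalar equivalent of stripe2_to_8 for bit_depth 10: vqrshrn.u16 #2 is a
+@ rounding, saturating narrow, so each 16-bit sample x becomes
+@ clip((x + 2) >> 2, 0, 255). Every iteration consumes 128 bytes from each
+@ of the two source stripes and emits 128 bytes of 8-bit output, with r3
+@ counting the lines remaining.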
+
+@ void ff_rpi_sand8_lines_to_planar_y8(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand8_lines_to_planar_y8, export=1
+ push {r4-r8, lr} @ +24 L
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ lsl r3, #7
+ sub r1, r6
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2, {q8-q15}
+ add r2, r3
+ subs r5, #128
+ blt 2f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d20, d21, d22, d23}, [r0]!
+ vst1.8 {d24, d25, d26, d27}, [r0]!
+ vst1.8 {d28, d29, d30, d31}, [r0]!
+ bne 1b
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #64-128
+ blt 1f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d20, d21, d22, d23}, [r0]!
+ beq 11b
+ vmov q8, q12
+ vmov q9, q13
+ sub r5, #64
+ vmov q10, q14
+ vmov q11, q15
+1:
+ cmp r5, #32-128
+ blt 1f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ beq 11b
+ vmov q8, q10
+ sub r5, #32
+ vmov q9, q11
+1:
+ cmp r5, #16-128
+ blt 1f
+ vst1.8 {d16, d17}, [r0]!
+ beq 11b
+ sub r5, #16
+ vmov q8, q9
+1:
+ cmp r5, #8-128
+ blt 1f
+ vst1.8 {d16}, [r0]!
+ beq 11b
+ sub r5, #8
+ vmov d16, d17
+1:
+ cmp r5, #4-128
+ blt 1f
+ vst1.32 {d16[0]}, [r0]!
+ beq 11b
+ sub r5, #4
+ vshr.u64 d16, #32
+1:
+ cmp r5, #2-128
+ blt 1f
+ vst1.16 {d16[0]}, [r0]!
+ beq 11b
+ vst1.8 {d16[2]}, [r0]!
+ b 11b
+1:
+ vst1.8 {d16[0]}, [r0]!
+ b 11b
+endfunc
+
+@ void ff_rpi_sand8_lines_to_planar_c8(
+@ uint8_t * dst_u, // [r0]
+@ unsigned int dst_stride_u, // [r1]
+@ uint8_t * dst_v, // [r2]
+@ unsigned int dst_stride_v, // [r3]
+@ const uint8_t * src, // [sp, #0] -> r4, r5
+@ unsigned int stride1, // [sp, #4] 128
+@ unsigned int stride2, // [sp, #8] -> r8
+@ unsigned int _x, // [sp, #12] 0
+@ unsigned int y, // [sp, #16] (r7 in prefix)
+@ unsigned int _w, // [sp, #20] -> r12, r6
+@ unsigned int h); // [sp, #24] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand8_lines_to_planar_c8, export=1
+ push {r4-r8, lr} @ +24
+
+ ldr r5, [sp, #24]
+ ldr r8, [sp, #32]
+ ldr r7, [sp, #40]
+ ldr r6, [sp, #44]
+ lsl r8, #7
+ add r5, r5, r7, lsl #7
+ sub r1, r1, r6
+ sub r3, r3, r6
+ ldr r7, [sp, #48]
+ vpush {q4-q7}
+
+10:
+ mov r4, r5
+ mov r12, r6
+1:
+ subs r12, #64
+ vldm r4, {q0-q7}
+ add r4, r8
+ it gt
+ vldmgt r4, {q8-q15}
+ add r4, r8
+
+ vuzp.8 q0, q1
+ vuzp.8 q2, q3
+ vuzp.8 q4, q5
+ vuzp.8 q6, q7
+
+ vuzp.8 q8, q9
+ vuzp.8 q10, q11
+ vuzp.8 q12, q13
+ vuzp.8 q14, q15
+ subs r12, #64
+
+ @ Rearrange regs so we can use vst1 with 4 regs
+ vswp q1, q2
+ vswp q5, q6
+ vswp q9, q10
+ vswp q13, q14
+ blt 2f
+
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d8, d9, d10, d11}, [r0]!
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d24, d25, d26, d27}, [r0]!
+
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ vst1.8 {d12, d13, d14, d15}, [r2]!
+ vst1.8 {d20, d21, d22, d23}, [r2]!
+ vst1.8 {d28, d29, d30, d31}, [r2]!
+ bne 1b
+11:
+ subs r7, #1
+ add r5, #128
+ add r0, r1
+ add r2, r3
+ bne 10b
+ vpop {q4-q7}
+ pop {r4-r8,pc}
+
+2:
+ cmp r12, #64-128
+ blt 1f
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d8, d9, d10, d11}, [r0]!
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ vst1.8 {d12, d13, d14, d15}, [r2]!
+ beq 11b
+ sub r12, #64
+ vmov q0, q8
+ vmov q1, q9
+ vmov q2, q10
+ vmov q3, q11
+ vmov q4, q12
+ vmov q5, q13
+ vmov q6, q14
+ vmov q7, q15
+1:
+ cmp r12, #32-128
+ blt 1f
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ beq 11b
+ sub r12, #32
+ vmov q0, q4
+ vmov q1, q5
+ vmov q2, q6
+ vmov q3, q7
+1:
+ cmp r12, #16-128
+ blt 1f
+ vst1.8 {d0, d1 }, [r0]!
+ vst1.8 {d4, d5 }, [r2]!
+ beq 11b
+ sub r12, #16
+ vmov q0, q1
+ vmov q2, q3
+1:
+ cmp r12, #8-128
+ blt 1f
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d4}, [r2]!
+ beq 11b
+ sub r12, #8
+ vmov d0, d1
+ vmov d4, d5
+1:
+ cmp r12, #4-128
+ blt 1f
+ vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d4[0]}, [r2]!
+ beq 11b
+ sub r12, #4
+ vmov s0, s1
+ vmov s8, s9
+1:
+ cmp r12, #2-128
+ blt 1f
+ vst1.16 {d0[0]}, [r0]!
+ vst1.16 {d4[0]}, [r2]!
+ beq 11b
+ vst1.8 {d0[2]}, [r0]!
+ vst1.8 {d4[2]}, [r2]!
+ b 11b
+1:
+ vst1.8 {d0[0]}, [r0]!
+ vst1.8 {d4[0]}, [r2]!
+ b 11b
+endfunc
+
+
+
+@ void ff_rpi_sand30_lines_to_planar_y16(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ sub r3, #1
+ lsl r3, #7
+ sub r1, r1, r6, lsl #1
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2!, {q10-q13}
+ add lr, #64
+
+ vshrn.u32 d4, q10, #14 @ Cannot vshrn.u32 #20!
+ ands lr, #127
+ vshrn.u32 d2, q10, #10
+ vmovn.u32 d0, q10
+
+ vshrn.u32 d5, q11, #14
+ it eq
+ addeq r2, r3
+ vshrn.u32 d3, q11, #10
+ vmovn.u32 d1, q11
+
+ subs r5, #48
+ vshr.u16 q2, #6
+ vbic.u16 q0, #0xfc00
+ vbic.u16 q1, #0xfc00
+
+ vshrn.u32 d20, q12, #14
+ vshrn.u32 d18, q12, #10
+ vmovn.u32 d16, q12
+
+ vshrn.u32 d21, q13, #14
+ vshrn.u32 d19, q13, #10
+ vmovn.u32 d17, q13
+
+ vshr.u16 q10, #6
+ vbic.u16 q8, #0xfc00
+ vbic.u16 q9, #0xfc00
+ blt 2f
+
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4], r12
+ vst3.16 {d16, d18, d20}, [r0], r12
+ vst3.16 {d17, d19, d21}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #24-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4]
+ beq 11b
+ vmov q0, q8
+ sub r5, #24
+ vmov q1, q9
+ vmov q2, q10
+1:
+ cmp r5, #12-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0]!
+ beq 11b
+ vmov d0, d1
+ sub r5, #12
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r5, #6-48
+ add r4, r0, #6 @ avoid [r0]! on sequential instructions
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
+ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
+ add r0, #12
+ beq 11b
+ vmov s0, s1
+ sub r5, #6
+ vmov s4, s5
+ vmov s8, s9
+1:
+ cmp r5, #3-48
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #2-48
+ blt 1f
+ vst2.16 {d0[0], d2[0]}, [r0]!
+ b 11b
+1:
+ vst1.16 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
+
+@ void ff_rpi_sand30_lines_to_planar_c16(
+@ uint8_t * dst_u, // [r0]
+@ unsigned int dst_stride_u, // [r1]
+@ uint8_t * dst_v, // [r2]
+@ unsigned int dst_stride_v, // [r3]
+@ const uint8_t * src, // [sp, #0] -> r4, r5
+@ unsigned int stride1, // [sp, #4] 128
+@ unsigned int stride2, // [sp, #8] -> r8
+@ unsigned int _x, // [sp, #12] 0
+@ unsigned int y, // [sp, #16] (r7 in prefix)
+@ unsigned int _w, // [sp, #20] -> r6, r9
+@ unsigned int h); // [sp, #24] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
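+@
+@ Word packing is as for the y16 variant (three 10-bit samples per 32-bit
+@ word) but samples alternate U and V across word pairs, hence the vuzp
+@ de-interleave after unpacking.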
+
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+ push {r4-r10, lr} @ +32
+ ldr r5, [sp, #32]
+ ldr r8, [sp, #40]
+ ldr r7, [sp, #48]
+ ldr r9, [sp, #52]
+ mov r12, #48
+ sub r8, #1
+ lsl r8, #7
+ add r5, r5, r7, lsl #7
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+ ldr r7, [sp, #56]
+10:
+ mov lr, #0
+ mov r4, r5
+ mov r6, r9
+1:
+ vldm r4!, {q0-q3}
+ add lr, #64
+
+ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
+ vshrn.u32 d20, q0, #14
+ vmovn.u32 d18, q0
+ vshrn.u32 d0, q0, #10
+ ands lr, #127
+
+ vshrn.u32 d21, q1, #14
+ vmovn.u32 d19, q1
+ vshrn.u32 d1, q1, #10
+
+ vshrn.u32 d22, q2, #10
+ vmovn.u32 d2, q2
+ vshrn.u32 d4, q2, #14
+
+ add r10, r0, #24
+ vshrn.u32 d23, q3, #10
+ vmovn.u32 d3, q3
+ vshrn.u32 d5, q3, #14
+
+ it eq
+ addeq r4, r8
+ vuzp.16 q0, q11
+ vuzp.16 q9, q1
+ vuzp.16 q10, q2
+
+ @ q0 V0, V3,..
+ @ q9 U0, U3...
+ @ q10 U1, U4...
+ @ q11 U2, U5,..
+ @ q1 V1, V4,
+ @ q2 V2, V5,..
+
+ subs r6, #24
+ vbic.u16 q11, #0xfc00
+ vbic.u16 q9, #0xfc00
+ vshr.u16 q10, #6
+ vshr.u16 q2, #6
+ vbic.u16 q0, #0xfc00
+ vbic.u16 q1, #0xfc00
+
+ blt 2f
+
+ vst3.16 {d18, d20, d22}, [r0], r12
+ vst3.16 {d19, d21, d23}, [r10]
+ add r10, r2, #24
+ vst3.16 {d0, d2, d4}, [r2], r12
+ vst3.16 {d1, d3, d5}, [r10]
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r5, #128
+ add r0, r1
+ add r2, r3
+ bne 10b
+
+ pop {r4-r10, pc}
+
+@ Partial final write
+2:
+ cmp r6, #-12
+ blt 1f
+ vst3.16 {d18, d20, d22}, [r0]!
+ vst3.16 {d0, d2, d4}, [r2]!
+ beq 11b
+ vmov d18, d19
+ vmov d20, d21
+ vmov d22, d23
+ sub r6, #12
+ vmov d0, d1
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r6, #-18
+ @ Rezip here as it makes the remaining tail handling easier
+ vzip.16 d0, d18
+ vzip.16 d2, d20
+ vzip.16 d4, d22
+ blt 1f
+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
+ vst3.16 {d0[3], d2[3], d4[3]}, [r0]!
+ vst3.16 {d0[2], d2[2], d4[2]}, [r2]!
+ beq 11b
+ vmov d0, d18
+ vmov d2, d20
+ sub r6, #6
+ vmov d4, d22
+1:
+ cmp r6, #-21
+ blt 1f
+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
+ beq 11b
+ vmov s4, s5
+ sub r6, #3
+ vmov s0, s1
+1:
+ cmp r6, #-22
+ blt 1f
+ vst2.16 {d0[1], d2[1]}, [r0]!
+ vst2.16 {d0[0], d2[0]}, [r2]!
+ b 11b
+1:
+ vst1.16 {d0[1]}, [r0]!
+ vst1.16 {d0[0]}, [r2]!
+ b 11b
+
+endfunc
+
+@ void ff_rpi_sand30_lines_to_planar_p010(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
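+@
+@ P010 keeps its 10 significant bits at the top of each 16-bit sample, so
+@ words are unpacked with a left shift of 6 and masked with 0xffc0 rather
+@ than right-aligned as in the y16 variant.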
+
+function ff_rpi_sand30_lines_to_planar_p010, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ vmov.u16 q15, #0xffc0
+ sub r3, #1
+ lsl r3, #7
+ sub r1, r1, r6, lsl #1
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2!, {q10-q13}
+ add lr, #64
+
+ vshl.u32 q14, q10, #6
+ ands lr, #127
+ vshrn.u32 d4, q10, #14
+ vshrn.u32 d2, q10, #4
+ vmovn.u32 d0, q14
+
+ vshl.u32 q14, q11, #6
+ it eq
+ addeq r2, r3
+ vshrn.u32 d5, q11, #14
+ vshrn.u32 d3, q11, #4
+ vmovn.u32 d1, q14
+
+ subs r5, #48
+ vand q2, q15
+ vand q1, q15
+ vand q0, q15
+
+ vshl.u32 q14, q12, #6
+ vshrn.u32 d20, q12, #14
+ vshrn.u32 d18, q12, #4
+ vmovn.u32 d16, q14
+
+ vshl.u32 q14, q13, #6
+ vshrn.u32 d21, q13, #14
+ vshrn.u32 d19, q13, #4
+ vmovn.u32 d17, q14
+
+ vand q10, q15
+ vand q9, q15
+ vand q8, q15
+ blt 2f
+
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4], r12
+ vst3.16 {d16, d18, d20}, [r0], r12
+ vst3.16 {d17, d19, d21}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #24-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4]
+ beq 11b
+ vmov q0, q8
+ sub r5, #24
+ vmov q1, q9
+ vmov q2, q10
+1:
+ cmp r5, #12-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0]!
+ beq 11b
+ vmov d0, d1
+ sub r5, #12
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r5, #6-48
+ add r4, r0, #6 @ avoid [r0]! on sequential instructions
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
+ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
+ add r0, #12
+ beq 11b
+ vmov s0, s1
+ sub r5, #6
+ vmov s4, s5
+ vmov s8, s9
+1:
+ cmp r5, #3-48
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #2-48
+ blt 1f
+ vst2.16 {d0[0], d2[0]}, [r0]!
+ b 11b
+1:
+ vst1.16 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
+
+@ void ff_rpi_sand30_lines_to_planar_y8(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
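+@
+@ 8-bit output is formed by keeping the top 8 bits of each 10-bit sample
+@ (plain truncation - no rounding or dithering).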
+
+function ff_rpi_sand30_lines_to_planar_y8, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ lsl r3, #7
+ sub r1, r1, r6
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+1:
+ vldm r2, {q8-q15}
+
+ subs r5, #96
+
+ vmovn.u32 d0, q8
+ vshrn.u32 d2, q8, #12
+ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
+
+ add r2, r3
+
+ vmovn.u32 d1, q9
+ vshrn.u32 d3, q9, #12
+ vshrn.u32 d5, q9, #16
+
+ pld [r2, #0]
+
+ vshrn.u16 d0, q0, #2
+ vmovn.u16 d1, q1
+ vshrn.u16 d2, q2, #6
+
+ vmovn.u32 d16, q10
+ vshrn.u32 d18, q10, #12
+ vshrn.u32 d20, q10, #16
+
+ vmovn.u32 d17, q11
+ vshrn.u32 d19, q11, #12
+ vshrn.u32 d21, q11, #16
+
+ pld [r2, #64]
+
+ vshrn.u16 d4, q8, #2
+ vmovn.u16 d5, q9
+ vshrn.u16 d6, q10, #6
+
+ vmovn.u32 d16, q12
+ vshrn.u32 d18, q12, #12
+ vshrn.u32 d20, q12, #16
+
+ vmovn.u32 d17, q13
+ vshrn.u32 d19, q13, #12
+ vshrn.u32 d21, q13, #16
+
+ vshrn.u16 d16, q8, #2
+ vmovn.u16 d17, q9
+ vshrn.u16 d18, q10, #6
+
+ vmovn.u32 d20, q14
+ vshrn.u32 d22, q14, #12
+ vshrn.u32 d24, q14, #16
+
+ vmovn.u32 d21, q15
+ vshrn.u32 d23, q15, #12
+ vshrn.u32 d25, q15, #16
+
+ vshrn.u16 d20, q10, #2
+ vmovn.u16 d21, q11
+ vshrn.u16 d22, q12, #6
+
+ blt 2f
+
+ vst3.8 {d0, d1, d2}, [r0], r12
+ vst3.8 {d4, d5, d6}, [r4], r12
+ vst3.8 {d16, d17, d18}, [r0], r12
+ vst3.8 {d20, d21, d22}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #48-96
+ blt 1f
+ vst3.8 {d0, d1, d2}, [r0], r12
+ vst3.8 {d4, d5, d6}, [r4], r12
+ beq 11b
+ vmov q0, q8
+ vmov q2, q10
+ sub r5, #48
+ vmov d2, d18
+ vmov d6, d22
+1:
+ cmp r5, #24-96
+ blt 1f
+ vst3.8 {d0, d1, d2}, [r0]!
+ beq 11b
+ vmov q0, q2
+ sub r5, #24
+ vmov d2, d6
+1:
+ cmp r5, #12-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
+ vst3.8 {d0[2], d1[2], d2[2]}, [r0]!
+ vst3.8 {d0[3], d1[3], d2[3]}, [r0]!
+ beq 11b
+ vmov s0, s1
+ sub r5, #12
+ vmov s2, s3
+ vmov s4, s5
+1:
+ cmp r5, #6-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
+ add r0, #12
+ beq 11b
+ vshr.u32 d0, #16
+ sub r5, #6
+ vshr.u32 d1, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #3-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #8
+ vshr.u32 d1, #8
+1:
+ cmp r5, #2-96
+ blt 1f
+ vst2.8 {d0[0], d1[0]}, [r0]!
+ b 11b
+1:
+ vst1.8 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
+
diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
new file mode 100644
index 0000000000..d457c10870
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.h
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#ifndef AVUTIL_ARM_SAND_NEON_H
+#define AVUTIL_ARM_SAND_NEON_H
+
+void ff_rpi_sand128b_stripe_to_8_10(
+ uint8_t * dest, // [r0]
+ const uint8_t * src1, // [r1]
+ const uint8_t * src2, // [r2]
+ unsigned int lines); // [r3]
+
+void ff_rpi_sand8_lines_to_planar_y8(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+void ff_rpi_sand8_lines_to_planar_c8(
+ uint8_t * dst_u, // [r0]
+ unsigned int dst_stride_u, // [r1]
+ uint8_t * dst_v, // [r2]
+ unsigned int dst_stride_v, // [r3]
+ const uint8_t * src, // [sp, #0] -> r4, r5
+ unsigned int stride1, // [sp, #4] 128
+ unsigned int stride2, // [sp, #8] -> r8
+ unsigned int _x, // [sp, #12] 0
+ unsigned int y, // [sp, #16] (r7 in prefix)
+ unsigned int _w, // [sp, #20] -> r12, r6
+ unsigned int h); // [sp, #24] -> r7
+
+void ff_rpi_sand30_lines_to_planar_y16(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+void ff_rpi_sand30_lines_to_planar_c16(
+ uint8_t * dst_u, // [r0]
+ unsigned int dst_stride_u, // [r1]
+ uint8_t * dst_v, // [r2]
+ unsigned int dst_stride_v, // [r3]
+ const uint8_t * src, // [sp, #0] -> r4, r5
+ unsigned int stride1, // [sp, #4] 128
+ unsigned int stride2, // [sp, #8] -> r8
+ unsigned int _x, // [sp, #12] 0
+ unsigned int y, // [sp, #16] (r7 in prefix)
+ unsigned int _w, // [sp, #20] -> r6, r9
+ unsigned int h); // [sp, #24] -> r7
+
+void ff_rpi_sand30_lines_to_planar_p010(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+void ff_rpi_sand30_lines_to_planar_y8(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+#endif // AVUTIL_ARM_SAND_NEON_H
+
diff --git a/libavutil/frame.c b/libavutil/frame.c
index 75e347bf2f..daa6477485 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -16,6 +16,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
#include "channel_layout.h"
#include "avassert.h"
#include "buffer.h"
@@ -26,6 +28,9 @@
#include "mem.h"
#include "samplefmt.h"
#include "hwcontext.h"
+#if CONFIG_SAND
+#include "rpi_sand_fns.h"
+#endif
#if FF_API_FRAME_GET_SET
MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
@@ -903,6 +908,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
(frame->crop_top + frame->crop_bottom) >= frame->height)
return AVERROR(ERANGE);
+#if CONFIG_SAND
+ // Sand cannot be cropped - do not try
+ if (av_rpi_is_sand_format(frame->format))
+ return 0;
+#endif
+
desc = av_pix_fmt_desc_get(frame->format);
if (!desc)
return AVERROR_BUG;
diff --git a/libavutil/frame.h b/libavutil/frame.h
index 7d1f8e2935..a4e7dc915d 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -990,6 +990,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
*/
const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
+static inline int av_frame_cropped_width(const AVFrame * const frame)
+{
+ return frame->width - (frame->crop_left + frame->crop_right);
+}
+static inline int av_frame_cropped_height(const AVFrame * const frame)
+{
+ return frame->height - (frame->crop_top + frame->crop_bottom);
+}
+
/**
* @}
*/
diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
index 7a9fdbd263..2f825b7e16 100644
--- a/libavutil/hwcontext_drm.c
+++ b/libavutil/hwcontext_drm.c
@@ -21,6 +21,7 @@
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
+#include <sys/ioctl.h>
/* This was introduced in version 4.6. And may not exist all without an
* optional package. So to prevent a hard dependency on needing the Linux
@@ -31,6 +32,7 @@
#endif
#include <drm.h>
+#include <libdrm/drm_fourcc.h>
#include <xf86drm.h>
#include "avassert.h"
@@ -38,7 +40,9 @@
#include "hwcontext_drm.h"
#include "hwcontext_internal.h"
#include "imgutils.h"
-
+#if CONFIG_SAND
+#include "libavutil/rpi_sand_fns.h"
+#endif
static void drm_device_free(AVHWDeviceContext *hwdev)
{
@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device,
AVDRMDeviceContext *hwctx = hwdev->hwctx;
drmVersionPtr version;
+ if (device == NULL) {
+ hwctx->fd = -1;
+ return 0;
+ }
+
hwctx->fd = open(device, O_RDWR);
if (hwctx->fd < 0)
return AVERROR(errno);
@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
if (flags & AV_HWFRAME_MAP_WRITE)
mmap_prot |= PROT_WRITE;
+ if (dst->format == AV_PIX_FMT_NONE)
+ dst->format = hwfc->sw_format;
#if HAVE_LINUX_DMA_BUF_H
if (flags & AV_HWFRAME_MAP_READ)
map->sync_flags |= DMA_BUF_SYNC_READ;
@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
dst->width = src->width;
dst->height = src->height;
+ dst->crop_top = src->crop_top;
+ dst->crop_bottom = src->crop_bottom;
+ dst->crop_left = src->crop_left;
+ dst->crop_right = src->crop_right;
+
+#if CONFIG_SAND
+ // Rework for sand frames
+ if (av_rpi_is_sand_frame(dst)) {
+ // As it stands the sand formats hold stride2 in linesize[3]
+ // linesize[0] & [1] contain stride1 which is always 128 for everything we do
+ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
+ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
+ dst->linesize[0] = 128;
+ dst->linesize[1] = 128;
+ // *** Are we sure src->height is actually what we want ???
+ }
+#endif
err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
&drm_unmap_frame, map);
@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
enum AVHWFrameTransferDirection dir,
enum AVPixelFormat **formats)
{
- enum AVPixelFormat *pix_fmts;
+ enum AVPixelFormat *p;
- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
- if (!pix_fmts)
+ p = *formats = av_malloc_array(3, sizeof(*p));
+ if (!p)
return AVERROR(ENOMEM);
- pix_fmts[0] = ctx->sw_format;
- pix_fmts[1] = AV_PIX_FMT_NONE;
+ // **** Offer native sand too ????
+ *p++ =
+#if CONFIG_SAND
+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
+ AV_PIX_FMT_YUV420P :
+ ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
+ AV_PIX_FMT_YUV420P10LE :
+#endif
+ ctx->sw_format;
+
+#if CONFIG_SAND
+ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
+ *p++ = AV_PIX_FMT_NV12;
+#endif
- *formats = pix_fmts;
+ *p = AV_PIX_FMT_NONE;
return 0;
}
@@ -231,18 +272,63 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
map = av_frame_alloc();
if (!map)
return AVERROR(ENOMEM);
- map->format = dst->format;
+ // Map to default
+ map->format = AV_PIX_FMT_NONE;
err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
if (err)
goto fail;
- map->width = dst->width;
- map->height = dst->height;
+#if 0
+ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
+ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
+ map->width, map->height,
+ map->linesize[0],
+ map->linesize[1],
+ map->linesize[2],
+ map->linesize[3],
+ dst->width, dst->height,
+ dst->linesize[0],
+ dst->linesize[1],
+ dst->linesize[2]);
+#endif
+#if CONFIG_SAND
+ if (av_rpi_is_sand_frame(map)) {
+ // Preserve crop - later ffmpeg code assumes that we have done so, in
+ // that it overwrites any crop that we create with the old values
+ unsigned int stride2 = map->linesize[3];
+ const unsigned int w = FFMIN(dst->width, map->width);
+ const unsigned int h = FFMIN(dst->height, map->height);
+
+ map->crop_top = 0;
+ map->crop_bottom = 0;
+ map->crop_left = 0;
+ map->crop_right = 0;
+
+ if (av_rpi_sand_to_planar_frame(dst, map) != 0)
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
+ err = AVERROR(EINVAL);
+ goto fail;
+ }
+
+ dst->width = w;
+ dst->height = h;
+ }
+ else
+#endif
+ {
+ // Kludge the mapped width/height so that av_frame_copy works
+ map->width = dst->width;
+ map->height = dst->height;
+ err = av_frame_copy(dst, map);
+ }
- err = av_frame_copy(dst, map);
if (err)
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
goto fail;
+ }
err = 0;
fail:
@@ -257,7 +343,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc,
int err;
if (src->width > hwfc->width || src->height > hwfc->height)
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height);
return AVERROR(EINVAL);
+ }
map = av_frame_alloc();
if (!map)
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 18c7a0efc8..bab13a4d50 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2395,6 +2395,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.name = "vulkan",
.flags = AV_PIX_FMT_FLAG_HWACCEL,
},
+ [AV_PIX_FMT_SAND128] = {
+ .name = "sand128",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_10] = {
+ .name = "sand64_10",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
+ { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */
+ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_16] = {
+ .name = "sand64_16",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */
+ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */
+ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_RPI4_8] = {
+ .name = "rpi4_8",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
+ [AV_PIX_FMT_RPI4_10] = {
+ .name = "rpi4_10",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
};
#if FF_API_PLUS1_MINUS1
FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 46ef211add..9195ead15f 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -357,6 +357,14 @@ enum AVPixelFormat {
AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
+// RPI - not under an ifdef so calling programs can get at it
+// #define so code that uses this can know it is there
+#define AVUTIL_HAVE_PIX_FMT_SAND 1
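+// A sand image is stored as a sequence of vertical stripes stride1 bytes
+// wide; rows within a stripe are stride1 bytes apart and the next stripe
+// starts stride1*stride2 bytes on (see av_rpi_sand_frame_off_y in
+// rpi_sand_fns.h for the byte-offset calculation)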
+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_RPI4_8,
+ AV_PIX_FMT_RPI4_10,
AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined
AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined
diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
new file mode 100644
index 0000000000..0d5d203dc3
--- /dev/null
+++ b/libavutil/rpi_sand_fn_pw.h
@@ -0,0 +1,227 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+// * Included twice from rpi_sand_fns.c with different PW
+
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
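+
+// e.g. with PW == 2, pixel is uint16_t and FUNC(av_rpi_sand_to_planar_y)
+// expands to av_rpi_sand_to_planar_y16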
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x;
+ const unsigned int w = _w;
+ const unsigned int mask = stride1 - 1;
+
+#if PW == 1 && HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
+ memcpy(dst, p, w);
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
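+ // w1: tail of the first stripe, w2: whole intermediate stripes,
+ // w3: head of the final stripe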
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const uint8_t * p = p2;
+ uint8_t * d = dst;
+ memcpy(d, p1, w1);
+ d += w1;
+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
+ memcpy(d, p, stride1);
+ }
+ memcpy(d, p, w3);
+ }
+ }
+}
+
+// x & w are in bytes of a single component, not of the U/V interleave
+// (i.e. the byte offset within the sand plane is x*2 for U&V)
+
+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+
+#if PW == 1 && HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ const pixel * p = (const pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * p = (const pixel *)p1;
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+}
+
+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+}
+
+
+#undef pixel
+#undef STRCAT
+#undef FUNC
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
index 0000000000..b6071e2928
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
@@ -0,0 +1,445 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#include "config.h"
+#include <stdint.h>
+#include <string.h>
+#include "rpi_sand_fns.h"
+#include "avassert.h"
+#include "frame.h"
+
+#if ARCH_ARM && HAVE_NEON
+#include "arm/rpi_sand_neon.h"
+#define HAVE_SAND_ASM 1
+#elif ARCH_AARCH64 && HAVE_NEON
+#include "aarch64/rpi_sand_neon.h"
+#define HAVE_SAND_ASM 1
+#else
+#define HAVE_SAND_ASM 0
+#endif
+
+#define PW 1
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#define PW 2
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#if 1
+// Simple round
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ const unsigned int rnd = (1 << shr) >> 1;
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ *dst++ = (*src++ + rnd) >> shr;
+ }
+}
+#else
+// Dithered variation
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ unsigned int rnd = (1 << shr) >> 1;
+ const unsigned int mask = ((1 << shr) - 1);
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ rnd = *src++ + (rnd & mask);
+ *dst++ = rnd >> shr;
+ }
+}
+#endif
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+// _x & _w in pixels, strides in bytes
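+// Each 32-bit word holds three 10-bit samples in bits 0-9, 10-19 and 20-29,
+// hence the /3 and *4 in the offset maths below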
+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 4;
+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint16_t * d = (uint16_t *)dst;
+
+ if (xskip0 != 0) {
+ const uint32_t p3 = *p++;
+
+ if (xskip0 == 1)
+ *d++ = (p3 >> 10) & 0x3ff;
+ *d++ = (p3 >> 20) & 0x3ff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3 = *p++;
+ *d++ = p3 & 0x3ff;
+ *d++ = (p3 >> 10) & 0x3ff;
+ *d++ = (p3 >> 20) & 0x3ff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3 = *p;
+
+ *d++ = p3 & 0x3ff;
+ if (xrem1 == 2)
+ *d++ = (p3 >> 10) & 0x3ff;
+ }
+ }
+}
+
+
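+// As av_rpi_sand30_to_planar_y16 but for the interleaved chroma plane: each
+// pair of 32-bit words carries three U/V sample pairs (U,V,U then V,U,V),
+// split below into separate U and V planes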
+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 3) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 8;
+ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint16_t * du = (uint16_t *)dst_u;
+ uint16_t * dv = (uint16_t *)dst_v;
+
+ if (xskip0 != 0) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ if (xskip0 == 1)
+ {
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = (p3b >> 0) & 0x3ff;
+ }
+ *du++ = (p3b >> 10) & 0x3ff;
+ *dv++ = (p3b >> 20) & 0x3ff;
+
+ if (((x += 8) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ *du++ = p3a & 0x3ff;
+ *dv++ = (p3a >> 10) & 0x3ff;
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = p3b & 0x3ff;
+ *du++ = (p3b >> 10) & 0x3ff;
+ *dv++ = (p3b >> 20) & 0x3ff;
+
+ if (((x += 8) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ *du++ = p3a & 0x3ff;
+ *dv++ = (p3a >> 10) & 0x3ff;
+ if (xrem1 == 2)
+ {
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = p3b & 0x3ff;
+ }
+ }
+ }
+}
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// simple truncation - loses the bottom 2 bits
+// _x & _w in pixels, strides in bytes
+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 4;
+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint8_t * d = dst;
+
+ if (xskip0 != 0) {
+ const uint32_t p3 = *p++;
+
+ if (xskip0 == 1)
+ *d++ = (p3 >> 12) & 0xff;
+ *d++ = (p3 >> 22) & 0xff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3 = *p++;
+ *d++ = (p3 >> 2) & 0xff;
+ *d++ = (p3 >> 12) & 0xff;
+ *d++ = (p3 >> 22) & 0xff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3 = *p;
+
+ *d++ = (p3 >> 2) & 0xff;
+ if (xrem1 == 2)
+ *d++ = (p3 >> 12) & 0xff;
+ }
+ }
+}
+
+
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr)
+{
+ const unsigned int n = dst_stride1 / 2;
+ unsigned int j;
+
+ // This is true for our current layouts
+ av_assert0(dst_stride1 == src_stride1);
+
+ // As we have the same stride1 for src & dest, and src is wider than dest,
+ // if we loop on src we can always write contiguously to dest.
+ // We make no effort to copy an exact width - round up to the nearest src
+ // stripe, as we will always have storage in dest for that.
+
+#if ARCH_ARM && HAVE_NEON
+ if (shr == 3 && src_stride1 == 128) {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+ }
+ }
+ else
+#endif
+ {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ cpy16_to_8(d + n, s2, n, shr);
+ }
+ }
+ }
+
+ // Fix up a trailing dest half stripe
+ if (j < w) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ }
+ }
+}
+
+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
+{
+ const int w = av_frame_cropped_width(src);
+ const int h = av_frame_cropped_height(src);
+ const int x = src->crop_left;
+ const int y = src->crop_top;
+
+ // We will crop as part of the conversion
+ dst->crop_top = 0;
+ dst->crop_left = 0;
+ dst->crop_bottom = 0;
+ dst->crop_right = 0;
+
+ switch (src->format){
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P:
+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w/2, h/2);
+ break;
+ case AV_PIX_FMT_NV12:
+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ case AV_PIX_FMT_SAND64_10:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P10:
+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x*2, y, w*2, h);
+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y/2, w, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ case AV_PIX_FMT_RPI4_10:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P10:
+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w/2, h/2);
+ break;
+ case AV_PIX_FMT_NV12:
+ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return av_frame_copy_props(dst, src);
+}
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
index 0000000000..462ccb8abd
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
+
+#include "libavutil/frame.h"
+
+// For all these fns _x & _w are measured as coord * PW
+// For the C fns coords are in chroma pels (so luma / 2)
+// Strides are in bytes
+
+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr);
+
+
+// dst must contain required pixel format & allocated data buffers
+// Cropping on the src buffer will be honoured and dst crop will be set to zero
+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
+
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
+#ifdef RPI_ZC_SAND128_ONLY
+ // If we are sure we only support 128 byte sand formats replace the
+ // var with a constant which should allow for better optimisation
+ return 128;
+#else
+ return frame->linesize[0];
+#endif
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+{
+ return frame->linesize[3];
+}
+
+
+static inline int av_rpi_is_sand_format(const int format)
+{
+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
+}
+
+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+{
+ return av_rpi_is_sand_format(frame->format);
+}
+
+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
+}
+
+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+{
+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_RPI4_10);
+}
+
+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+{
+ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+}
+
+// If x is measured in bytes (not pixels) then this works for sand64_16 as
+// well as sand128 - but in the general case we work that out
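+// e.g. for sand128 (stride1 == 128) the luma byte offset of pixel (x, y) is
+//   (x & 127) + y * 128 + (x & ~127) * stride2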
+
+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y + stride2 * x2;
+}
+
+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y_c + stride2 * x2;
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+}
+
+#endif
+
diff --git a/libavutil/utils.c b/libavutil/utils.c
index ea9b5097b8..c1cd452eee 100644
--- a/libavutil/utils.c
+++ b/libavutil/utils.c
@@ -37,6 +37,10 @@ const char *av_version_info(void)
unsigned avutil_version(void)
{
+ static int checks_done;
+ if (checks_done)
+ return LIBAVUTIL_VERSION_INT;
+
av_assert0(AV_SAMPLE_FMT_DBLP == 9);
av_assert0(AVMEDIA_TYPE_ATTACHMENT == 4);
av_assert0(AV_PICTURE_TYPE_BI == 7);
@@ -54,6 +58,7 @@ unsigned avutil_version(void)
av_log(NULL, AV_LOG_ERROR, "Libavutil has been linked to a broken llrint()\n");
}
+ checks_done = 1;
return LIBAVUTIL_VERSION_INT;
}
diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index f341268c5d..f4b220fb60 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -118,8 +118,8 @@
.endm
.macro increment_yuv422p
- add x6, x6, w7, SXTW // srcU += incU
- add x13, x13, w14, SXTW // srcV += incV
+ add x6, x6, w7, UXTW // srcU += incU
+ add x13, x13, w14, UXTW // srcV += incV
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
@@ -189,8 +189,8 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
subs w8, w8, #16 // width -= 16
b.gt 2b
- add x2, x2, w3, SXTW // dst += padding
- add x4, x4, w5, SXTW // srcY += paddingY
+ add x2, x2, w3, UXTW // dst += padding
+ add x4, x4, w5, UXTW // srcY += paddingY
increment_\ifmt
subs w1, w1, #1 // height -= 1
b.gt 1b
diff --git a/libswscale/input.c b/libswscale/input.c
index 197152f65b..6850801a44 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -84,9 +84,9 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1==src2);
for (i = 0; i < width; i++) {
- unsigned r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
- unsigned g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
- unsigned b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+ int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+ int g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
@@ -156,9 +156,9 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
- unsigned r_b = input_pixel(&src1[i * 3 + 0]);
- unsigned g = input_pixel(&src1[i * 3 + 1]);
- unsigned b_r = input_pixel(&src1[i * 3 + 2]);
+ int r_b = input_pixel(&src1[i * 3 + 0]);
+ int g = input_pixel(&src1[i * 3 + 1]);
+ int b_r = input_pixel(&src1[i * 3 + 2]);
dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
@@ -178,12 +178,12 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
- unsigned r_b = (input_pixel(&src1[6 * i + 0]) +
- input_pixel(&src1[6 * i + 3]) + 1) >> 1;
- unsigned g = (input_pixel(&src1[6 * i + 1]) +
- input_pixel(&src1[6 * i + 4]) + 1) >> 1;
- unsigned b_r = (input_pixel(&src1[6 * i + 2]) +
- input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+ int r_b = (input_pixel(&src1[6 * i + 0]) +
+ input_pixel(&src1[6 * i + 3]) + 1) >> 1;
+ int g = (input_pixel(&src1[6 * i + 1]) +
+ input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[6 * i + 2]) +
+ input_pixel(&src1[6 * i + 5]) + 1) >> 1;
dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
diff --git a/libswscale/output.c b/libswscale/output.c
index aef0e7f82a..e855ad606a 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -1043,8 +1043,8 @@ yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter,
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
- Y1 += (1 << 13) - (1 << 29); // 21
- Y2 += (1 << 13) - (1 << 29);
+ Y1 += 1 << 13; // 21
+ Y2 += 1 << 13;
// 8 bits: 17 + 13 bits = 30 bits, 16 bits: 17 + 13 bits = 30 bits
R = V * c->yuv2rgb_v2r_coeff;
@@ -1052,20 +1052,20 @@ yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter,
B = U * c->yuv2rgb_u2b_coeff;
// 8 bits: 30 - 22 = 8 bits, 16 bits: 30 bits - 14 = 16 bits
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y1) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
- output_pixel(&dest[4], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[6], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
- output_pixel(&dest[3], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[4], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
@@ -1102,8 +1102,8 @@ yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2],
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
- Y1 += (1 << 13) - (1 << 29);
- Y2 += (1 << 13) - (1 << 29);
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
@@ -1117,20 +1117,20 @@ yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2],
A2 += 1 << 13;
}
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y1) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
- output_pixel(&dest[4], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[6], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
- output_pixel(&dest[3], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[4], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
@@ -1158,8 +1158,8 @@ yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
- Y1 += (1 << 13) - (1 << 29);
- Y2 += (1 << 13) - (1 << 29);
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
if (hasAlpha) {
A1 = abuf0[i * 2 ] << 11;
@@ -1173,20 +1173,20 @@ yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y1) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
- output_pixel(&dest[4], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[6], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
- output_pixel(&dest[3], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[4], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
@@ -1204,8 +1204,8 @@ yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
- Y1 += (1 << 13) - (1 << 29);
- Y2 += (1 << 13) - (1 << 29);
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
if (hasAlpha) {
A1 = abuf0[i * 2 ] << 11;
@@ -1219,20 +1219,20 @@ yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y1) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y1) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
- output_pixel(&dest[4], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[6], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
- output_pixel(&dest[3], av_clip_uintp2(((R_B + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[4], av_clip_uintp2((( G + Y2) >> 14) + (1<<15), 16));
- output_pixel(&dest[5], av_clip_uintp2(((B_R + Y2) >> 14) + (1<<15), 16));
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
@@ -1283,7 +1283,7 @@ yuv2rgba64_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
// 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
- Y += (1 << 13) - (1<<29); // 21
+ Y += 1 << 13; // 21
// 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
R = V * c->yuv2rgb_v2r_coeff;
@@ -1291,9 +1291,9 @@ yuv2rgba64_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
B = U * c->yuv2rgb_u2b_coeff;
// 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y)>>14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y)>>14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y)>>14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
@@ -1331,7 +1331,7 @@ yuv2rgba64_full_2_c_template(SwsContext *c, const int32_t *buf[2],
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
- Y += (1 << 13) - (1 << 29);
+ Y += 1 << 13;
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
@@ -1343,9 +1343,9 @@ yuv2rgba64_full_2_c_template(SwsContext *c, const int32_t *buf[2],
A += 1 << 13;
}
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
@@ -1374,7 +1374,7 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
- Y += (1 << 13) - (1 << 29);
+ Y += 1 << 13;
if (hasAlpha) {
A = abuf0[i] << 11;
@@ -1386,9 +1386,9 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
@@ -1407,7 +1407,7 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
- Y += (1 << 13) - (1 << 29);
+ Y += 1 << 13;
if (hasAlpha) {
A = abuf0[i] << 11;
@@ -1419,9 +1419,9 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
- output_pixel(&dest[0], av_clip_uintp2(((R_B + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[1], av_clip_uintp2((( G + Y) >> 14) + (1<<15), 16));
- output_pixel(&dest[2], av_clip_uintp2(((B_R + Y) >> 14) + (1<<15), 16));
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
new file mode 100644
index 0000000000..b050971f63
--- /dev/null
+++ b/pi-util/BUILD.txt
@@ -0,0 +1,59 @@
+Building Pi FFmpeg
+==================
+
+Currently only building on a Pi is supported.
+This builds ffmpeg the way I've tested it.
+
+Get all dependencies - the current package dependencies are good enough
+
+$ sudo apt-get build-dep ffmpeg
+
+Configure using the pi-util/conf_native.sh script
+-------------------------------------------------
+
+This sets the normal release options and creates an output dir to build into.
+The directory name will depend on system and options but will be under out/
+
+There are a few choices here:
+  --mmal      build including the legacy mmal-based decoders and
+              zero-copy code; this requires appropriate libraries
+              which currently exist for armv7 but not arm64
+  --noshared  Build a static image rather than a shared library one.
+              Static is easier for testing as there is no need to
+              worry about library paths being confused and therefore
+              running the wrong code. Shared is what is needed, in
+              most cases, when building for use by other programs.
+
+So for a static build
+---------------------
+
+$ pi-util/conf_native.sh --noshared
+
+$ make -j8 -C out/<wherever the script said it was building to>
+
+You can now run ffmpeg directly from where it was built
+
+For a shared build
+------------------
+
+$ pi-util/conf_native.sh
+
+You will normally want an install target if shared. Note that the script has
+set this up to be generated in out/<builddir>/install, so you don't have to
+worry about overwriting your system libs.
+
+$ make -j8 -C out/<builddir> install
+
+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
+built, or install the image on the system - you have to be careful to get rid
+of all other ffmpeg libs or confusion may result. There is a little script
+that wipes all other versions - obviously use with care!
+
+$ sudo pi-util/clean_usr_libs.sh
+
+Then simply copying from the install to /usr works
+
+$ sudo cp -r out/<builddir>/install/* /usr
+
+
diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
new file mode 100644
index 0000000000..fcce72226a
--- /dev/null
+++ b/pi-util/NOTES.txt
@@ -0,0 +1,69 @@
+Notes on the hevc_rpi decoder & associated support code
+-------------------------------------------------------
+
+There are 3 main parts to the existing code:
+
+1) The decoder - this is all in libavcodec as rpi_hevc*.
+
+2) A few filters to deal with Sand frames and a small patch to
+automatically select the sand->i420 converter when required.
+
+3) A kludge in ffmpeg.c to display the decoded video. This could & should
+be converted into a proper ffmpeg display module.
+
+
+Decoder
+-------
+
+The decoder is a modified version of the existing ffmpeg hevc decoder.
+Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
+More complex bitstreams can be up to ~200% faster but particularly easy
+streams can cut its advantage down to ~50%. This means that a Pi3+ can
+display nearly all 8-bit 1080p30 streams and with some overclocking it can
+display most lower bitrate 10-bit 1080p30 streams - this latter case is
+not helped by the requirement to downsample to 8-bit before display on a
+Pi.
+
+It has had co-processor offload added for inter-pred and large block
+residual transform. Various parts have had optimized ARM NEON assembler
+added and the existing ARM asm sections have been profiled and
+re-optimized for A53. The main C code has been substantially reworked at
+its lower levels in an attempt to optimize it and minimize memory
+bandwidth. To some extent code paths that deal with frame types that it
+doesn't support have been pruned.
+
+It outputs frames in Broadcom Sand format. This is a somewhat annoying
+layout that doesn't fit into ffmpeg's standard frame descriptions. It has
+vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
+the stripe followed by interleaved U & V, that is then followed by the Y
+for the next stripe, etc. The final stripe is always padded to
+stripe-width. This is used in an attempt to help with cache locality and
+cut down on the number of dram bank switches. It is annoying to use for
+inter-pred with conventional processing but the way the Pi QPU (which is
+used for inter-pred) works means that it has negligible downsides here and
+the improved memory performance exceeds the overhead of the increased
+complexity in the rest of the code.
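+
+A minimal sketch of the byte-offset arithmetic this layout implies
+(illustrative only - not taken from the decoder source; the names and
+the 8-bit 4:2:0 assumption are mine):
+
+    #include <stddef.h>
+
+    #define SAND_STRIPE_W 128   /* 64 in the 10-bit forms */
+
+    /* Offset of luma sample (x, y) in an 8-bit 4:2:0 Sand frame of
+     * height h: each stripe holds h rows of Y, then h/2 rows of
+     * interleaved U/V, and stripes are laid out one after another. */
+    static size_t sand8_y_offset(size_t x, size_t y, size_t h)
+    {
+        size_t stripe_size = SAND_STRIPE_W * (h + h / 2);
+        return (x / SAND_STRIPE_W) * stripe_size  /* whole stripes    */
+             + y * SAND_STRIPE_W                  /* Y rows in stripe */
+             + (x % SAND_STRIPE_W);               /* column in stripe */
+    }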
+
+Frames must be allocated out of GPU memory (as otherwise they can't be
+accessed by the co-processors). Utility functions (in rpi_zc.c) have been
+written to make this easier. As the frames are already in GPU memory they
+can be displayed by the Pi h/w without any further copying.
+
+
+Known non-features
+------------------
+
+Frame allocation should probably be done in some other way in order to fit
+into the standard framework better.
+
+Sand frames are currently declared as software frames; there is an
+argument that they should be hardware frames but they aren't really.
+
+There must be a better way of auto-selecting the hevc_rpi decoder over the
+normal s/w hevc decoder, but I became confused by the existing h/w
+acceleration framework, and what I wanted to do didn't seem to fit in
+neatly.
+
+Display should be a proper device rather than a kludge in ffmpeg.c.
+
+
diff --git a/pi-util/TESTMESA.txt b/pi-util/TESTMESA.txt
new file mode 100644
index 0000000000..92bc13a3df
--- /dev/null
+++ b/pi-util/TESTMESA.txt
@@ -0,0 +1,82 @@
+# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
+
+# These assume that the drm_mmal test for Sand8 has been built on this Pi
+# as the build relies on many of the same files
+
+# 1st get everything required to build ffmpeg
+# If sources aren't already enabled on your Pi then enable them
+sudo su
+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
+mv /tmp/sources.list /etc/apt/
+mv /tmp/raspi.list /etc/apt/sources.list.d/
+apt update
+
+# Get dependencies
+sudo apt build-dep ffmpeg
+
+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
+
+# Enable H265 V4L2 request decoder
+sudo su
+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
+# You may also want to add more CMA if you are going to try 4k videos
+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
+# dtoverlay=vc4-fkms-v3d,cma-512
+reboot
+# Check it has turned up
+ls -la /dev/video*
+# This should include video19
+# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19
+
+# Currently on the Pi the linux headers from the debian distro don't match
+# the kernel that we ship, so we need to update them - hopefully this step
+# will be unneeded in the future
+sudo apt install git bc bison flex libssl-dev make
+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
+cd linux
+KERNEL=kernel7l
+make bcm2711_defconfig
+make headers_install
+sudo cp -r usr/include/linux /usr/include
+cd ..
+
+# Config - this builds a statically linked ffmpeg which is easier for testing
+pi-util/conf_native.sh --noshared
+
+# Build (this is a bit dull)
+# If you want to poke the source, libavdevice/egl_vout.c contains the
+# output code
+cd out/armv7-static-rel
+
+# Check that you have actually configured V4L2 request
+grep HEVC_V4L2REQUEST config.h
+# You are hoping for
+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
+# if you get 0 then the config has failed
+
+make -j6
+
+# Grab test streams
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
+
+# Test i420 output (works currently)
+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
+
+# Test Sand8 output - doesn't currently work but should once you have
+# Sand8 working in drm_mmal. I can't guarantee that this will work as
+# I can't test this path with a known working format, but the debug looks
+# good. If this doesn't work & drm_mmal does with sand8 then come back to me
+# The "show_all 1" forces vout to display every frame; otherwise it drops any
+# frame that would cause it to block
+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
+
+# Test Sand30 - doesn't currently work
+# (Beware that when FFmpeg errors out it often leaves your terminal window
+# in a state where you need to reset it)
+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
+
+
+
diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
new file mode 100755
index 0000000000..01bd6a6a22
--- /dev/null
+++ b/pi-util/clean_usr_libs.sh
@@ -0,0 +1,42 @@
+set -e
+U=/usr/include/arm-linux-gnueabihf
+rm -rf $U/libavcodec
+rm -rf $U/libavdevice
+rm -rf $U/libavfilter
+rm -rf $U/libavformat
+rm -rf $U/libavutil
+rm -rf $U/libswresample
+rm -rf $U/libswscale
+U=/usr/include/aarch64-linux-gnu
+rm -rf $U/libavcodec
+rm -rf $U/libavdevice
+rm -rf $U/libavfilter
+rm -rf $U/libavformat
+rm -rf $U/libavutil
+rm -rf $U/libswresample
+rm -rf $U/libswscale
+U=/usr/lib/arm-linux-gnueabihf
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+U=/usr/lib/arm-linux-gnueabihf/neon/vfp
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+U=/usr/lib/aarch64-linux-gnu
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+
diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh
new file mode 100644
index 0000000000..9e3bbfa190
--- /dev/null
+++ b/pi-util/conf_arm64_native.sh
@@ -0,0 +1,45 @@
+echo "Configure for ARM64 native build"
+
+#RPI_KEEPS="-save-temps=obj"
+
+SHARED_LIBS="--enable-shared"
+if [ "$1" == "--noshared" ]; then
+ SHARED_LIBS="--disable-shared"
+ echo Static libs
+ OUT=out/arm64-static-rel
+else
+ echo Shared libs
+ OUT=out/arm64-shared-rel
+fi
+
+mkdir -p $OUT
+cd $OUT
+
+A=aarch64-linux-gnu
+USR_PREFIX=`pwd`/install
+LIB_PREFIX=$USR_PREFIX/lib/$A
+INC_PREFIX=$USR_PREFIX/include/$A
+
+../../configure \
+ --prefix=$USR_PREFIX\
+ --libdir=$LIB_PREFIX\
+ --incdir=$INC_PREFIX\
+ --disable-stripping\
+ --disable-thumb\
+ --disable-mmal\
+ --enable-sand\
+ --enable-v4l2-request\
+ --enable-libdrm\
+ --enable-epoxy\
+ --enable-libudev\
+ --enable-vout-drm\
+ --enable-vout-egl\
+ $SHARED_LIBS\
+ --extra-cflags="-ggdb"
+
+# --enable-decoder=hevc_rpi\
+# --enable-extra-warnings\
+# --arch=armv71\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
new file mode 100644
index 0000000000..4efd5d1c67
--- /dev/null
+++ b/pi-util/conf_h265.2016.csv
@@ -0,0 +1,195 @@
+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
new file mode 100644
index 0000000000..6082641271
--- /dev/null
+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
@@ -0,0 +1,147 @@
+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000000..fc14f2a3c2
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
new file mode 100755
index 0000000000..a9e053801c
--- /dev/null
+++ b/pi-util/conf_native.sh
@@ -0,0 +1,107 @@
+echo "Configure for native build"
+
+FFSRC=`pwd`
+MC=`dpkg --print-architecture`
+BUILDBASE=$FFSRC/out
+
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+NOSHARED=
+MMAL=
+
+while [ "$1" != "" ] ; do
+ case $1 in
+ --noshared)
+ NOSHARED=1
+ ;;
+ --mmal)
+ MMAL=1
+ ;;
+ *)
+            echo "Usage: $0 [--noshared] [--mmal]"
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+
+MCOPTS=
+RPI_INCLUDES=
+RPI_LIBDIRS=
+RPI_DEFINES=
+RPI_EXTRALIBS=
+
+if [ "$MC" == "arm64" ]; then
+ echo "M/C aarch64"
+ A=aarch64-linux-gnu
+ B=arm64
+elif [ "$MC" == "armhf" ]; then
+ echo "M/C armv7"
+ A=arm-linux-gnueabihf
+ B=armv7
+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
+ RPI_DEFINES=-mfpu=neon-vfpv4
+else
+ echo Unexpected architecture $MC
+ exit 1
+fi
+
+if [ $MMAL ]; then
+ RPI_OPT_VC=/opt/vc
+ RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+ RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
+ RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
+ RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
+ RPIOPTS="--enable-mmal --enable-rpi"
+else
+ RPIOPTS="--disable-mmal --enable-sand"
+fi
+
+C=`lsb_release -sc`
+V=`cat RELEASE`
+
+SHARED_LIBS="--enable-shared"
+if [ $NOSHARED ]; then
+ SHARED_LIBS="--disable-shared"
+ OUT=$BUILDBASE/$B-$C-$V-static-rel
+ echo Static libs
+else
+ echo Shared libs
+ OUT=$BUILDBASE/$B-$C-$V-shared-rel
+fi
+
+USR_PREFIX=$OUT/install
+LIB_PREFIX=$USR_PREFIX/lib/$A
+INC_PREFIX=$USR_PREFIX/include/$A
+
+echo Destination directory: $OUT
+mkdir -p $OUT
+# Nothing under here need worry git - including this .gitignore!
+echo "**" > $BUILDBASE/.gitignore
+cd $OUT
+
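+# Out-of-tree configure using the architecture and MMAL/static choices
+# made above.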
+$FFSRC/configure \
+ --prefix=$USR_PREFIX\
+ --libdir=$LIB_PREFIX\
+ --incdir=$INC_PREFIX\
+ $MCOPTS\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-v4l2-request\
+ --enable-libdrm\
+ --enable-vout-egl\
+ --enable-vout-drm\
+ --enable-gpl\
+ $SHARED_LIBS\
+ $RPIOPTS\
+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS"\
+ --extra-libs="$RPI_EXTRALIBS"\
+ --extra-version="rpi"
+
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
index 0000000000..657568014e
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+CODEC_HEVC_RPI = 1
+HWACCEL_RPI = 2
+HWACCEL_DRM = 3
+HWACCEL_VAAPI = 4
+
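+# Decode one conformance stream to an MD5 (hardware decode when a hwaccel
+# is selected, otherwise the software vcodec) and compare it against the
+# reference checksum. Returns 0 on match, 1 on mismatch, 2 if the decode
+# MD5 is missing, 3 if the reference MD5 is missing.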
+def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
+ hwaccel = ""
+ if dectype == HWACCEL_RPI:
+ hwaccel = "rpi"
+ elif dectype == HWACCEL_DRM:
+ hwaccel = "drm"
+ elif dectype == HWACCEL_VAAPI:
+ hwaccel = "vaapi"
+
+ pix_fmt = []
+ if pix == "8":
+ pix_fmt = ["-pix_fmt", "yuv420p"]
+ elif pix == "10":
+ pix_fmt = ["-pix_fmt", "yuv420p10le"]
+ elif pix == "12":
+ pix_fmt = ["-pix_fmt", "yuv420p12le"]
+
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
+ while len(names) > 1:
+ tmp_root = os.path.join(tmp_root, names[0])
+ del names[0]
+ name = names[0]
+
+ if not os.path.exists(tmp_root):
+ os.makedirs(tmp_root)
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+    except OSError:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
+
+ # Unaligned needed for cropping conformance
+ if hwaccel:
+ rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
+ else:
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+    except OSError:
+ pass
+
+ if m1 and m2 and m1.group() == m2.group():
+ print("Match: " + m1.group(), file=flog)
+ rv = 0
+ elif not m1:
+ print("****** Cannot find m1", file=flog)
+ rv = 3
+ elif not m2:
+ print("****** Cannot find m2", file=flog)
+ rv = 2
+ else:
+ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
+ rv = 1
+ flog.close()
+ return rv
+
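+# Walk a conformance directory tree and build CSV rows: use the .bit/.bin
+# file as the elementary stream and prefer a yuv-suffixed .md5 as the
+# reference checksum when more than one is present.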
+def scandir(root):
+ aconf = []
+ ents = os.listdir(root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t or name.find("/" + t) != -1:
+ return True
+ return False
+
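+# Run every selected test from the CSV. Column 0 is the expectation code:
+# 0 = skip, 1 = expect pass, 2 = expect mismatch, 3 = expect crash.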
+def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
+ unx_failures = []
+ unx_success = []
+ failures = 0
+ successes = 0
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print ("==== ", name, end="")
+ sys.stdout.flush()
+
+            rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4] if len(a) > 4 else "", dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
+ if (rv == 0):
+ successes += 1
+ else:
+ failures += 1
+
+ if (rv == 0):
+ if exp_test == 2:
+ print(": * OK *")
+ unx_success.append(name)
+ else:
+ print(": ok")
+ elif exp_test == 2 and rv == 1:
+ print(": fail")
+ elif exp_test == 3 and rv == 2:
+ # Call an expected "crash" an abort
+ print(": abort")
+ else:
+ unx_failures.append(name)
+ if rv == 1:
+ print(": * FAIL *")
+ elif (rv == 2) :
+ print(": * CRASH *")
+ elif (rv == 3) :
+ print(": * MD5 MISSING *")
+ else :
+ print(": * BANG *")
+
+ if unx_failures or unx_success:
+ print("Unexpected Failures:", unx_failures)
+ print("Unexpected Success: ", unx_success)
+ else:
+ print("All tests normal:", successes, "ok,", failures, "failed")
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
+ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
+ argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
+ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(args.test_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
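+    # Autodetect the decode path from the devices the kernel exposes,
+    # then let the explicit command-line flags override it.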
+ dectype = CODEC_HEVC_RPI
+ if os.path.exists("/dev/rpivid-hevcmem"):
+ dectype = HWACCEL_RPI
+ if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
+ dectype = HWACCEL_DRM
+
+ if args.pi4:
+ dectype = HWACCEL_RPI
+ elif args.drm:
+ dectype = HWACCEL_DRM
+ elif args.vaapi:
+ dectype = HWACCEL_VAAPI
+
+ doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
+
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
index 0000000000..65c5224cd8
--- /dev/null
+++ b/pi-util/ffperf.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
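+    # Decode up to 30 seconds of the stream with hevc_rpi to the rpi
+    # vout and collect wall-clock plus child CPU times via os.wait4().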
+ def time_file(name, prefix, ffmpeg="./ffmpeg"):
+ stats = tstats()
+ stats.name = name
+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+        cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
+                                  "-vcodec", "hevc_rpi",
+                                  "-t", "30", "-i", prefix + name,
+                                  "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog)
+        pinfo = os.wait4(cproc.pid, 0)
+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
+def main():
+ global flog
+
+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
+To blank the screen before starting use "xdg-screensaver activate"
+(For some reason this doesn't seem to work from within python).
+""")
+
+ argp.add_argument("streams", nargs='*')
+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
+ argp.add_argument("--csv_in", help="CSV input filename")
+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
+
+ args = argp.parse_args()
+
+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
+ csv_out.writeheader()
+
+ stats_in = {}
+ if args.csv_in != None:
+ with open(args.csv_in, 'r', newline='') as f_in:
+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
+ streams = args.streams
+ if not streams:
+ if not stats_in:
+ print ("No source streams specified")
+ return 1
+ prefix = "" if args.prefix == None else args.prefix
+ streams = [k for k in stats_in]
+ elif args.prefix != None:
+ prefix = args.prefix
+ else:
+ prefix = streams[0]
+ for f in streams[1:]:
+ prefix = common_prefix(prefix, f)
+ pp = prefix.rpartition(os.sep)
+ prefix = pp[0] + pp[1]
+ streams = [s[len(prefix):] for s in streams]
+
+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
+ print ("====", f)
+
+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
+ for i in range(args.repeat):
+ t = tstats.time_file(f, prefix, args.ffmpeg)
+ print ("...", t.times_str())
+ if t0 > t:
+ t0 = t
+
+ if t0.name in stats_in:
+ pstat = stats_in[t0.name]
+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
+
+ csv_out.writerow(t0.dict())
+
+ print ()
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh
new file mode 100755
index 0000000000..0948a68a7a
--- /dev/null
+++ b/pi-util/genpatch.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
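+# Tag the tree as pi/<version>/<patch_tag> (unless --notag) and write the
+# diff against the upstream n<version> tag to ../ffmpeg-<version>-<patch_tag>.patch.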
+
+NOPATCH=
+if [ "$1" == "--notag" ]; then
+ shift
+ NOPATCH=1
+fi
+
+if [ "$1" == "" ]; then
+ echo Usage: $0 [--notag] \<patch_tag\>
+ echo e.g.: $0 mmal_4
+ exit 1
+fi
+
+VERSION=`cat RELEASE`
+if [ "$VERSION" == "" ]; then
+    echo Can\'t find version in RELEASE
+ exit 1
+fi
+
+PATCHFILE=../ffmpeg-$VERSION-$1.patch
+
+if [ $NOPATCH ]; then
+ echo Not tagged
+else
+    # Only continue if we are all committed
+ git diff --name-status --exit-code
+
+ PATCHTAG=pi/$VERSION/$1
+ echo Tagging: $PATCHTAG
+
+ git tag $PATCHTAG
+fi
+echo Generating patch: $PATCHFILE
+git diff n$VERSION -- > $PATCHFILE
diff --git a/pi-util/make_array.py b/pi-util/make_array.py
new file mode 100755
index 0000000000..67b22d2d51
--- /dev/null
+++ b/pi-util/make_array.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+# Usage
+# make_array file.bin
+# Produces file.h with array of bytes.
+#
+import sys
+for file in sys.argv[1:]:
+ prefix,suffix = file.split('.')
+ assert suffix=='bin'
+ name=prefix.split('/')[-1]
+    print('Converting', file)
+    with open(prefix + '.h', 'w') as out:
+        print('static const unsigned char', name, '[] = {', file=out)
+        with open(file, 'rb') as fd:
+            i = 0
+            for byte in fd.read():
+                # iterating over bytes yields ints in Python 3
+                print('0x%02x, ' % byte, end='', file=out)
+                i = i + 1
+                if i % 8 == 0:
+                    print(' // %04x' % (i - 8), file=out)
+        print('};', file=out)
+
diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh
new file mode 100755
index 0000000000..271a39e846
--- /dev/null
+++ b/pi-util/mkinst.sh
@@ -0,0 +1,5 @@
+set -e
+
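+# Install into the build prefix, then mirror the result into the VLC
+# raspbian sysroot used for cross builds.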
+make install
+
+cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
diff --git a/pi-util/patkodi.sh b/pi-util/patkodi.sh
new file mode 100644
index 0000000000..dcd05a606e
--- /dev/null
+++ b/pi-util/patkodi.sh
@@ -0,0 +1,9 @@
+set -e
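+# Rebuild ffmpeg, regenerate kodi's pfcd_hevc_optimisations.patch as the
+# diff against the xbmc/release/4.3-kodi branch, then rebuild and install
+# kodi's ffmpeg dependency.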
+KODIBASE=/home/jc/rpi/kodi/xbmc
+JOBS=-j20
+make $JOBS
+git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
+make -C $KODIBASE/build install
+
+
diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
new file mode 100755
index 0000000000..e44cfa0c3c
--- /dev/null
+++ b/pi-util/perfcmp.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
+ def time_file(name, prefix):
+ stats = tstats()
+ stats.name = name
+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
+                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog)
+        pinfo = os.wait4(cproc.pid, 0)
+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
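+# Compare two ffperf CSV outputs stream by stream; the arrow gauge marks
+# which run was faster, one '<' or '>' per 0.3% difference in elapsed time.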
+def main():
+ argp = argparse.ArgumentParser(description="FFmpeg performance compare")
+
+ argp.add_argument("stream0", help="CSV to compare")
+ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
+
+ args = argp.parse_args()
+
+ with open(args.stream0, 'r', newline='') as f_in:
+ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+ with open(args.stream1, 'r', newline='') as f_in:
+ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ print (args.stream0, "<<-->>", args.stream1)
+ print ()
+
+ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
+        if f not in stats0:
+            print(" XX (only in %s) :" % args.stream1, f)
+            continue
+        if f not in stats1:
+            print(" XX (only in %s) :" % args.stream0, f)
+            continue
+
+ s0 = stats0[f]
+ s1 = stats1[f]
+
+ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
+ thresh = 0.3
+ tc = 6
+
+ nchar = min(tc - 1, int(abs(pcent) / thresh))
+ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
+
+ print ("%6.2f %s%6.2f (%+5.2f) : %s" %
+ (s0.elapsed, cc, s1.elapsed, pcent, f))
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
new file mode 100755
index 0000000000..a4dbb6eacd
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
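+# Rebuild the HEVC QPU shader C/H sources with qasm and copy them,
+# together with the shader command header, into the VC4
+# user_shader_example_tex tutorial tree.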
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_hevc_shader.qasm
+DST_BASE=shader
+
+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py
new file mode 100755
index 0000000000..b322dac0c2
--- /dev/null
+++ b/pi-util/testfilt.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class validator:
+ def __init__(self):
+ self.ok = False
+
+ def isok(self):
+ return self.ok
+
+ def setok(self):
+ self.ok = True
+
+class valid_regex(validator):
+ def __init__(self, regex):
+ super().__init__()
+ self.regex = re.compile(regex)
+
+ def scanline(self, line):
+ if self.isok() or self.regex.search(line):
+ self.setok()
+
+
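+# Feed every log line to every validator; the test passes only if each
+# validator matched at least once somewhere in the log.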
+def validate(validators, flog):
+ for line in flog:
+ for v in validators:
+ v.scanline(line)
+
+ ok = True
+ for v in validators:
+ if not v.isok():
+ ok = False
+ # complain
+ print("Test failed")
+
+ if ok:
+ print("OK")
+ return ok
+
+def runtest(name, ffmpeg, args, suffix, validators):
+ log_root = os.path.join("/tmp", "testfilt", name)
+ ofilename = os.path.join(log_root, name + suffix)
+
+ if not os.path.exists(log_root):
+ os.makedirs(log_root)
+
+ try:
+ os.remove(ofilename)
+    except OSError:
+ pass
+
+ flog = open(os.path.join(log_root, name + ".log"), "wb")
+ ffargs = [ffmpeg] + args + [ofilename]
+
+ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False)
+    flog.close()
+
+ flog = open(os.path.join(log_root, name + ".log"), "rt")
+ return validate(validators, flog)
+
+def sayok(log_root, flog):
+ print("Woohoo")
+ return True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg filter tester")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
+ args = argp.parse_args()
+
+ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i",
+ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv",
+# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv",
+ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv",
+ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')])
diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
new file mode 100755
index 0000000000..5935a11ca5
--- /dev/null
+++ b/pi-util/v3dusage.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+import re
+
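+# Summarise a "sudo vcdbg log msg" capture: accumulate per-unit busy and
+# idle time from start/done pairs, and total the QPU cycle, TMU-stall and
+# L2 cache counters per QPU job.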
+def do_logparse(logname):
+
+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+
+ ttotal = {'idle':0.0}
+ tstart = {}
+ qctotal = {}
+ qtstotal = {}
+ l2hits = {}
+ l2total = {}
+ time0 = None
+ idle_start = None
+ qpu_op_no = 0
+ op_count = 0
+
+ with open(logname, "rt") as infile:
+ for line in infile:
+ match = rmatch.match(line)
+ if match:
+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+ time = float(match.group(1))
+ unit = match.group(3)
+ opstart = not match.group(2)
+ optype = match.group(7)
+ hascb = match.group(8) != "0"
+
+ if unit == 'qpu1':
+ unit = unit + "." + str(qpu_op_no)
+ if not opstart:
+ if hascb or optype == 'EXECUTE_SYNC':
+ qpu_op_no = 0
+ else:
+ qpu_op_no += 1
+
+ # Ignore sync type
+ if optype == 'EXECUTE_SYNC':
+ continue
+
+ if not time0:
+ time0 = time
+
+ if opstart:
+                tstart[unit] = time
+ elif unit in tstart:
+ op_count += 1
+ if not unit in ttotal:
+ ttotal[unit] = 0.0
+ ttotal[unit] += time - tstart[unit]
+ del tstart[unit]
+
+ if not idle_start and not tstart:
+ idle_start = time
+ elif idle_start and tstart:
+ ttotal['idle'] += time - idle_start
+ idle_start = None
+
+ match = rqcycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qctotal:
+ qctotal[unit] = 0
+ qctotal[unit] += int(match.group(2))
+
+ match = rqtscycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qtstotal:
+ qtstotal[unit] = 0
+ qtstotal[unit] += int(match.group(2))
+
+ match = rl2hits.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in l2total:
+ l2total[unit] = 0
+ l2hits[unit] = 0
+ l2total[unit] += int(match.group(3))
+ if match.group(2) == "hits":
+ l2hits[unit] += int(match.group(3))
+
+
+    if not time0:
+        print("No v3d profile records found")
+    else:
+        tlogged = time - time0
+
+        print("Logged time:", tlogged, " Op count:", op_count)
+        for unit in sorted(ttotal):
+            print('%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged))
+        print()
+        for unit in sorted(qctotal):
+            if unit not in qtstotal:
+                qtstotal[unit] = 0
+            print('%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0) / qctotal[unit]))
+            if unit in l2total:
+                print('        L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0) / l2total[unit]))
+
+
+
+if __name__ == '__main__':
+ argp = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="QPU/VPU perf summary from VC logging",
+ epilog = """
+Will also summarise TMU stalls if logging requests are set in the qpu
+noflush param in the profiled code.
+
+Example use:
+ vcgencmd set_logging level=0xc0
+ <command to profile>
+ sudo vcdbg log msg >& t.log
+ v3dusage.py t.log
+""")
+
+ argp.add_argument("logfile")
+ args = argp.parse_args()
+
+ do_logparse(args.logfile)
+
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 1827a4e134..08da4166ef 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o
AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
+AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
+AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o
AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8338e8ff58..81ef182f04 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -131,6 +131,9 @@ static const struct {
#if CONFIG_HUFFYUV_DECODER
{ "huffyuvdsp", checkasm_check_huffyuvdsp },
#endif
+ #if CONFIG_IDCTDSP
+ { "idctdsp", checkasm_check_idctdsp },
+ #endif
#if CONFIG_JPEG2000_DECODER
{ "jpeg2000dsp", checkasm_check_jpeg2000dsp },
#endif
@@ -155,6 +158,9 @@ static const struct {
#if CONFIG_V210_ENCODER
{ "v210enc", checkasm_check_v210enc },
#endif
+ #if CONFIG_VC1DSP
+ { "vc1dsp", checkasm_check_vc1dsp },
+ #endif
#if CONFIG_VP8DSP
{ "vp8dsp", checkasm_check_vp8dsp },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index ef6645e3a2..1a1e17d835 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -70,6 +70,7 @@ void checkasm_check_hevc_epel_bi(void);
void checkasm_check_hevc_epel_bi_w(void);
void checkasm_check_hevc_sao(void);
void checkasm_check_huffyuvdsp(void);
+void checkasm_check_idctdsp(void);
void checkasm_check_jpeg2000dsp(void);
void checkasm_check_llviddsp(void);
void checkasm_check_llviddspenc(void);
@@ -83,6 +84,7 @@ void checkasm_check_sw_scale(void);
void checkasm_check_utvideodsp(void);
void checkasm_check_v210dec(void);
void checkasm_check_v210enc(void);
+void checkasm_check_vc1dsp(void);
void checkasm_check_vf_eq(void);
void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c
new file mode 100644
index 0000000000..02724536a7
--- /dev/null
+++ b/tests/checkasm/idctdsp.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022 Ben Avison
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+
+#include "libavcodec/idctdsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
+
+typedef struct {
+ const char *name;
+ size_t offset;
+} test;
+
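+/* Fill both copies of a 16-bit buffer with the same random values in
+ * [-256, 256] so the reference and optimised paths see identical input. */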
+#define RANDOMIZE_BUFFER16(name, size) \
+ do { \
+ int i; \
+ for (i = 0; i < size; ++i) { \
+ uint16_t r = rnd() % 0x201 - 0x100; \
+ AV_WN16A(name##0 + i, r); \
+ AV_WN16A(name##1 + i, r); \
+ } \
+ } while (0)
+
+#define RANDOMIZE_BUFFER8(name, size) \
+ do { \
+ int i; \
+ for (i = 0; i < size; ++i) { \
+ uint8_t r = rnd(); \
+ name##0[i] = r; \
+ name##1[i] = r; \
+ } \
+ } while (0)
+
+static void check_add_put_clamped(void)
+{
+ /* Source buffers are only as big as needed, since any over-read won't affect results */
+ LOCAL_ALIGNED_16(int16_t, src0, [64]);
+ LOCAL_ALIGNED_16(int16_t, src1, [64]);
+ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
+ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
+ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
+
+ AVCodecContext avctx = { 0 };
+ IDCTDSPContext h;
+
+ const test tests[] = {
+ IDCTDSP_TEST(add_pixels_clamped)
+ IDCTDSP_TEST(put_pixels_clamped)
+ IDCTDSP_TEST(put_signed_pixels_clamped)
+ };
+
+ ff_idctdsp_init(&h, &avctx);
+
+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+        void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset);
+ if (check_func(func, "idctdsp.%s", tests[t].name)) {
+ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
+ RANDOMIZE_BUFFER16(src, 64);
+ RANDOMIZE_BUFFER8(dst, 10 * 24);
+ call_ref(src0, dst0 + 24 + 8, 24);
+ call_new(src1, dst1 + 24 + 8, 24);
+ if (memcmp(dst0, dst1, 10 * 24))
+ fail();
+ bench_new(src1, dst1 + 24 + 8, 24);
+ }
+ }
+}
+
+void checkasm_check_idctdsp(void)
+{
+ check_add_put_clamped();
+ report("idctdsp");
+}
diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
new file mode 100644
index 0000000000..52628d15e4
--- /dev/null
+++ b/tests/checkasm/vc1dsp.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2022 Ben Avison
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+
+#include "libavcodec/vc1dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
+#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
+
+typedef struct {
+ const char *name;
+ size_t offset;
+ int width;
+ int height;
+} test;
+
+typedef struct matrix {
+ size_t width;
+ size_t height;
+ float d[];
+} matrix;
+
+static const matrix T8 = { 8, 8, {
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 16, 15, 9, 4, -4, -9, -15, -16,
+ 16, 6, -6, -16, -16, -6, 6, 16,
+ 15, -4, -16, -9, 9, 16, 4, -15,
+ 12, -12, -12, 12, 12, -12, -12, 12,
+ 9, -16, 4, 15, -15, -4, 16, -9,
+ 6, -16, 16, -6, -6, 16, -16, 6,
+ 4, -9, 15, -16, 16, -15, 9, -4
+} };
+
+static const matrix T4 = { 4, 4, {
+ 17, 17, 17, 17,
+ 22, 10, -10, -22,
+ 17, -17, -17, 17,
+ 10, -22, 22, -10
+} };
+
+static const matrix T8t = { 8, 8, {
+ 12, 16, 16, 15, 12, 9, 6, 4,
+ 12, 15, 6, -4, -12, -16, -16, -9,
+ 12, 9, -6, -16, -12, 4, 16, 15,
+ 12, 4, -16, -9, 12, 15, -6, -16,
+ 12, -4, -16, 9, 12, -15, -6, 16,
+ 12, -9, -6, 16, -12, -4, 16, -15,
+ 12, -15, 6, 4, -12, 16, -16, 9,
+ 12, -16, 16, -15, 12, -9, 6, -4
+} };
+
+static const matrix T4t = { 4, 4, {
+ 17, 22, 17, 10,
+ 17, 10, -17, -22,
+ 17, -10, -17, 22,
+ 17, -22, 17, -10
+} };
+
+static matrix *new_matrix(size_t width, size_t height)
+{
+ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
+ if (out == NULL) {
+ fprintf(stderr, "Memory allocation failure\n");
+ exit(EXIT_FAILURE);
+ }
+ out->width = width;
+ out->height = height;
+ return out;
+}
+
+static matrix *multiply(const matrix *a, const matrix *b)
+{
+ matrix *out;
+ if (a->width != b->height) {
+ fprintf(stderr, "Incompatible multiplication\n");
+ exit(EXIT_FAILURE);
+ }
+ out = new_matrix(b->width, a->height);
+ for (int j = 0; j < out->height; ++j)
+ for (int i = 0; i < out->width; ++i) {
+ float sum = 0;
+ for (int k = 0; k < a->width; ++k)
+ sum += a->d[j * a->width + k] * b->d[k * b->width + i];
+ out->d[j * out->width + i] = sum;
+ }
+ return out;
+}
+
+static void normalise(matrix *a)
+{
+ for (int j = 0; j < a->height; ++j)
+ for (int i = 0; i < a->width; ++i) {
+ float *p = a->d + j * a->width + i;
+ *p *= 64;
+ if (a->height == 4)
+ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
+ else
+ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
+ if (a->width == 4)
+ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
+ else
+ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
+ }
+}
+
+static void divide_and_round_nearest(matrix *a, float by)
+{
+ for (int j = 0; j < a->height; ++j)
+ for (int i = 0; i < a->width; ++i) {
+ float *p = a->d + j * a->width + i;
+ *p = rintf(*p / by);
+ }
+}
+
+static void tweak(matrix *a)
+{
+ for (int j = 4; j < a->height; ++j)
+ for (int i = 0; i < a->width; ++i) {
+ float *p = a->d + j * a->width + i;
+ *p += 1;
+ }
+}
+
+/* The VC-1 spec places restrictions on the values permitted at three
+ * different stages:
+ * - D: the input coefficients in frequency domain
+ * - E: the intermediate coefficients, inverse-transformed only horizontally
+ * - R: the fully inverse-transformed coefficients
+ *
+ * To fully cater for the ranges specified requires various intermediate
+ * values to be held to 17-bit precision; yet these conditions do not appear
+ * to be utilised in real-world streams. At least some assembly
+ * implementations have chosen to restrict these values to 16-bit precision,
+ * to accelerate the decoding of real-world streams at the cost of strict
+ * adherence to the spec. To avoid our test marking these as failures,
+ * reduce our random inputs.
+ */
+#define ATTENUATION 4
+
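+/* Produce coefficients that satisfy the (attenuated) D, E and R range
+ * constraints by transforming random pixel data and retrying on the rare
+ * out-of-range result. */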
+static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
+{
+ matrix *raw, *tmp, *D, *E, *R;
+ raw = new_matrix(width, height);
+ for (int i = 0; i < width * height; ++i)
+ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
+ tmp = multiply(height == 8 ? &T8 : &T4, raw);
+ D = multiply(tmp, width == 8 ? &T8t : &T4t);
+ normalise(D);
+ divide_and_round_nearest(D, 1);
+ for (int i = 0; i < width * height; ++i) {
+ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
+ /* Rare, so simply try again */
+ av_free(raw);
+ av_free(tmp);
+ av_free(D);
+ return generate_inverse_quantized_transform_coefficients(width, height);
+ }
+ }
+ E = multiply(D, width == 8 ? &T8 : &T4);
+ divide_and_round_nearest(E, 8);
+ for (int i = 0; i < width * height; ++i)
+ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
+ /* Rare, so simply try again */
+ av_free(raw);
+ av_free(tmp);
+ av_free(D);
+ av_free(E);
+ return generate_inverse_quantized_transform_coefficients(width, height);
+ }
+ R = multiply(height == 8 ? &T8t : &T4t, E);
+ tweak(R);
+ divide_and_round_nearest(R, 128);
+ for (int i = 0; i < width * height; ++i)
+ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
+ /* Rare, so simply try again */
+ av_free(raw);
+ av_free(tmp);
+ av_free(D);
+ av_free(E);
+ av_free(R);
+ return generate_inverse_quantized_transform_coefficients(width, height);
+ }
+ av_free(raw);
+ av_free(tmp);
+ av_free(E);
+ av_free(R);
+ return D;
+}
+
+#define RANDOMIZE_BUFFER16(name, size) \
+ do { \
+ int i; \
+ for (i = 0; i < size; ++i) { \
+ uint16_t r = rnd(); \
+ AV_WN16A(name##0 + i, r); \
+ AV_WN16A(name##1 + i, r); \
+ } \
+ } while (0)
+
+#define RANDOMIZE_BUFFER8(name, size) \
+ do { \
+ int i; \
+ for (i = 0; i < size; ++i) { \
+ uint8_t r = rnd(); \
+ name##0[i] = r; \
+ name##1[i] = r; \
+ } \
+ } while (0)
+
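+/* Fill both buffer copies with values clustered around mid-grey (0x80),
+ * with roughly exponentially distributed offsets, so block edges span the
+ * loop filter's clipping thresholds. */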
+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
+ do { \
+ uint8_t *p##0 = name##0, *p##1 = name##1; \
+ int i = (size); \
+ while (i-- > 0) { \
+ int x = 0x80 | (rnd() & 0x7F); \
+ x >>= rnd() % 9; \
+ if (rnd() & 1) \
+ x = -x; \
+ *p##1++ = *p##0++ = 0x80 + x; \
+ } \
+ } while (0)
+
+static void check_inv_trans_inplace(void)
+{
+ /* Inverse transform input coefficients are stored in a 16-bit buffer
+ * with row stride of 8 coefficients irrespective of transform size.
+ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
+ * are stored in column-major order, and the outputs are written back
+ * to the input buffer, so we oversize it slightly to catch overruns. */
+ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
+ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
+
+ VC1DSPContext h;
+
+ ff_vc1dsp_init(&h);
+
+ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
+ matrix *coeffs;
+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
+ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
+ coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
+ for (int j = 0; j < 8; ++j)
+ for (int i = 0; i < 8; ++i) {
+ int idx = 8 + i * 8 + j;
+ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
+ }
+ call_ref(inv_trans_in0 + 8);
+ call_new(inv_trans_in1 + 8);
+ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
+ fail();
+ bench_new(inv_trans_in1 + 8);
+ av_free(coeffs);
+ }
+}
+
+static void check_inv_trans_adding(void)
+{
+ /* Inverse transform input coefficients are stored in a 16-bit buffer
+ * with row stride of 8 coefficients irrespective of transform size. */
+ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
+ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
+
+ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
+ * added with saturation to an array of unsigned 8-bit values. Oversize
+ * this by 8 samples left and right and one row above and below. */
+ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
+ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
+
+ VC1DSPContext h;
+
+ const test tests[] = {
+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
+ };
+
+ ff_vc1dsp_init(&h);
+
+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
+ if (check_func(func, "vc1dsp.%s", tests[t].name)) {
+ matrix *coeffs;
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
+ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
+ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
+ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
+ for (int j = 0; j < tests[t].height; ++j)
+ for (int i = 0; i < tests[t].width; ++i) {
+ int idx = j * 8 + i;
+ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
+ }
+ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
+ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
+ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
+ fail();
+            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
+ av_free(coeffs);
+ }
+ }
+}
+
+static void check_loop_filter(void)
+{
+ /* Deblocking filter buffers are big enough to hold a 16x16 block,
+ * plus 16 columns left and 4 rows above to hold filter inputs
+ * (depending on whether v or h neighbouring block edge, oversized
+ * horizontally to maintain 16-byte alignment) plus 16 columns and
+ * 4 rows below to catch write overflows */
+ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
+ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
+
+ VC1DSPContext h;
+
+ const test tests[] = {
+ VC1DSP_TEST(vc1_v_loop_filter4)
+ VC1DSP_TEST(vc1_h_loop_filter4)
+ VC1DSP_TEST(vc1_v_loop_filter8)
+ VC1DSP_TEST(vc1_h_loop_filter8)
+ VC1DSP_TEST(vc1_v_loop_filter16)
+ VC1DSP_TEST(vc1_h_loop_filter16)
+ };
+
+ ff_vc1dsp_init(&h);
+
+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
+ if (check_func(func, "vc1dsp.%s", tests[t].name)) {
+ for (int count = 1000; count > 0; --count) {
+ int pq = rnd() % 31 + 1;
+ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
+ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
+ call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
+ if (memcmp(filter_buf0, filter_buf1, 24 * 48))
+ fail();
+ }
+ }
+ for (int j = 0; j < 24; ++j)
+ for (int i = 0; i < 48; ++i)
+ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
+ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
+ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
+ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
+ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
+ }
+}
+
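+/* Run 100 rounds over the shared escaped buffer at random byte offsets
+ * and lengths, comparing both the returned length and the full output. */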
+#define TEST_UNESCAPE \
+ do { \
+ for (int count = 100; count > 0; --count) { \
+ escaped_offset = rnd() & 7; \
+ unescaped_offset = rnd() & 7; \
+ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \
+ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \
+ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
+ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
+ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \
+ fail(); \
+ } \
+ } while (0)
+
+static void check_unescape(void)
+{
+ /* This appears to be a typical length of buffer in use */
+#define LOG2_UNESCAPE_BUF_SIZE 17
+#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
+ LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
+ LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
+ LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
+ LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
+
+ VC1DSPContext h;
+
+ ff_vc1dsp_init(&h);
+
+ if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
+ int len0, len1, escaped_offset, unescaped_offset, escaped_len;
+ declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
+
+ /* Test data which consists of escapes sequences packed as tightly as possible */
+ for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
+ escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
+ TEST_UNESCAPE;
+
+ /* Test random data */
+ RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
+ TEST_UNESCAPE;
+
+ /* Test data with escape sequences at random intervals */
+ for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
+ int gap, gap_msb;
+ escaped1[x+0] = escaped0[x+0] = 0;
+ escaped1[x+1] = escaped0[x+1] = 0;
+ escaped1[x+2] = escaped0[x+2] = 3;
+ escaped1[x+3] = escaped0[x+3] = rnd() & 3;
+ gap_msb = 2u << (rnd() % 8);
+ gap = (rnd() &~ -gap_msb) | gap_msb;
+ x += gap;
+ }
+ TEST_UNESCAPE;
+
+ /* Test data which is known to contain no escape sequences */
+ memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
+ memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
+ TEST_UNESCAPE;
+
+ /* Benchmark the no-escape-sequences case */
+ bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
+ }
+}
+
+void checkasm_check_vc1dsp(void)
+{
+ check_inv_trans_inplace();
+ check_inv_trans_adding();
+ report("inv_trans");
+
+ check_loop_filter();
+ report("loop_filter");
+
+ check_unescape();
+ report("unescape_buffer");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 07f1d8238e..aa5f45ec8f 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-hevc_add_res \
fate-checkasm-hevc_idct \
fate-checkasm-hevc_sao \
+ fate-checkasm-idctdsp \
fate-checkasm-jpeg2000dsp \
fate-checkasm-llviddsp \
fate-checkasm-llviddspenc \
@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-sw_scale \
fate-checkasm-v210dec \
fate-checkasm-v210enc \
+ fate-checkasm-vc1dsp \
fate-checkasm-vf_blend \
fate-checkasm-vf_colorspace \
fate-checkasm-vf_eq \
diff --git a/tests/ref/fate/webm-dash-manifest b/tests/ref/fate/webm-dash-manifest
index 3a557fc39f..f5fc9121da 100644
--- a/tests/ref/fate/webm-dash-manifest
+++ b/tests/ref/fate/webm-dash-manifest
@@ -6,7 +6,7 @@
type="static"
mediaPresentationDuration="PT32.501S"
minBufferTime="PT1S"
- profiles="urn:mpeg:dash:profile:webm-on-demand:2012">
+ profiles="urn:webm:dash:profile:webm-on-demand:2012">
<Period id="0" start="PT0S" duration="PT32.501S" >
<AdaptationSet id="0" mimeType="video/webm" codecs="vp8" lang="eng" width="640" height="360" bitstreamSwitching="true" subsegmentAlignment="true" subsegmentStartsWithSAP="1">
<Representation id="0" bandwidth="302355">
diff --git a/tests/ref/fate/webm-dash-manifest-representations b/tests/ref/fate/webm-dash-manifest-representations
index 41713bb367..8556ecebee 100644
--- a/tests/ref/fate/webm-dash-manifest-representations
+++ b/tests/ref/fate/webm-dash-manifest-representations
@@ -6,7 +6,7 @@
type="static"
mediaPresentationDuration="PT32.48S"
minBufferTime="PT1S"
- profiles="urn:mpeg:dash:profile:webm-on-demand:2012">
+ profiles="urn:webm:dash:profile:webm-on-demand:2012">
<Period id="0" start="PT0S" duration="PT32.48S" >
<AdaptationSet id="0" mimeType="video/webm" codecs="vp8" lang="eng" bitstreamSwitching="true" subsegmentAlignment="false" subsegmentStartsWithSAP="1">
<Representation id="0" bandwidth="302355" width="640" height="360">
diff --git a/tests/ref/fate/webm-dash-manifest-unaligned-audio-streams b/tests/ref/fate/webm-dash-manifest-unaligned-audio-streams
index b1bc7ecea1..6e9de211fb 100644
--- a/tests/ref/fate/webm-dash-manifest-unaligned-audio-streams
+++ b/tests/ref/fate/webm-dash-manifest-unaligned-audio-streams
@@ -6,7 +6,7 @@
type="static"
mediaPresentationDuration="PT32.501S"
minBufferTime="PT1S"
- profiles="urn:mpeg:dash:profile:webm-on-demand:2012">
+ profiles="urn:webm:dash:profile:webm-on-demand:2012">
<Period id="0" start="PT0S" duration="PT32.501S" >
<AdaptationSet id="0" mimeType="audio/webm" codecs="vorbis" lang="eng" audioSamplingRate="44100" bitstreamSwitching="false" subsegmentAlignment="false" subsegmentStartsWithSAP="1">
<Representation id="0" bandwidth="82867">
diff --git a/tests/ref/fate/webm-dash-manifest-unaligned-video-streams b/tests/ref/fate/webm-dash-manifest-unaligned-video-streams
index 690c2aabe3..ce205638b6 100644
--- a/tests/ref/fate/webm-dash-manifest-unaligned-video-streams
+++ b/tests/ref/fate/webm-dash-manifest-unaligned-video-streams
@@ -6,7 +6,7 @@
type="static"
mediaPresentationDuration="PT32.48S"
minBufferTime="PT1S"
- profiles="urn:mpeg:dash:profile:webm-on-demand:2012">
+ profiles="urn:webm:dash:profile:webm-on-demand:2012">
<Period id="0" start="PT0S" duration="PT32.48S" >
<AdaptationSet id="0" mimeType="video/webm" codecs="vp8" lang="eng" width="640" height="360" bitstreamSwitching="true" subsegmentAlignment="false" subsegmentStartsWithSAP="0">
<Representation id="0" bandwidth="302355">
diff --git a/tests/ref/seek/vsynth_lena-snow b/tests/ref/seek/vsynth_lena-snow
index b2d2d22cda..33d6c27463 100644
--- a/tests/ref/seek/vsynth_lena-snow
+++ b/tests/ref/seek/vsynth_lena-snow
@@ -2,45 +2,45 @@ ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st:-1 flags:0 ts:-1.000000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st:-1 flags:1 ts: 1.894167
-ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39690 size: 3640
+ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39806 size: 3640
ret: 0 st: 0 flags:0 ts: 0.800000
-ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27382 size: 3493
+ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27442 size: 3494
ret:-1 st: 0 flags:1 ts:-0.320000
ret:-1 st:-1 flags:0 ts: 2.576668
ret: 0 st:-1 flags:1 ts: 1.470835
-ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39690 size: 3640
+ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39806 size: 3640
ret: 0 st: 0 flags:0 ts: 0.360000
-ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16074 size: 3245
+ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16134 size: 3244
ret:-1 st: 0 flags:1 ts:-0.760000
ret:-1 st:-1 flags:0 ts: 2.153336
ret: 0 st:-1 flags:1 ts: 1.047503
-ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27382 size: 3493
+ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27442 size: 3494
ret: 0 st: 0 flags:0 ts:-0.040000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st: 0 flags:1 ts: 2.840000
-ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52538 size: 3582
+ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52608 size: 3582
ret: 0 st:-1 flags:0 ts: 1.730004
-ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52538 size: 3582
+ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52608 size: 3582
ret: 0 st:-1 flags:1 ts: 0.624171
-ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16074 size: 3245
+ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16134 size: 3244
ret: 0 st: 0 flags:0 ts:-0.480000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st: 0 flags:1 ts: 2.400000
-ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52538 size: 3582
+ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52608 size: 3582
ret: 0 st:-1 flags:0 ts: 1.306672
-ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39690 size: 3640
+ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39806 size: 3640
ret: 0 st:-1 flags:1 ts: 0.200839
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st: 0 flags:0 ts:-0.920000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 5652 size: 3035
ret: 0 st: 0 flags:1 ts: 2.000000
-ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52538 size: 3582
+ret: 0 st: 0 flags:1 dts: 1.920000 pts: 1.920000 pos: 52608 size: 3582
ret: 0 st:-1 flags:0 ts: 0.883340
-ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27382 size: 3493
+ret: 0 st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 27442 size: 3494
ret:-1 st:-1 flags:1 ts:-0.222493
ret:-1 st: 0 flags:0 ts: 2.680000
ret: 0 st: 0 flags:1 ts: 1.560000
-ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39690 size: 3640
+ret: 0 st: 0 flags:1 dts: 1.440000 pts: 1.440000 pos: 39806 size: 3640
ret: 0 st:-1 flags:0 ts: 0.460008
-ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16074 size: 3245
+ret: 0 st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 16134 size: 3244
ret:-1 st:-1 flags:1 ts:-0.645825
diff --git a/tests/ref/vsynth/vsynth1-snow b/tests/ref/vsynth/vsynth1-snow
index b0e3a0bfd7..f20abd2ee4 100644
--- a/tests/ref/vsynth/vsynth1-snow
+++ b/tests/ref/vsynth/vsynth1-snow
@@ -1,4 +1,4 @@
-c4c77a6fb926b89fe6591c398f5cd4db *tests/data/fate/vsynth1-snow.avi
-136160 tests/data/fate/vsynth1-snow.avi
-dcf8b3f62d9c3ae2b2d0fbbacbf83e4e *tests/data/fate/vsynth1-snow.out.rawvideo
-stddev: 22.74 PSNR: 20.99 MAXDIFF: 173 bytes: 7603200/ 7603200
+67c10f8d52fcd1103caa675a1408bf6e *tests/data/fate/vsynth1-snow.avi
+136088 tests/data/fate/vsynth1-snow.avi
+bfc0bcc4bc7b956933aa58acc587018d *tests/data/fate/vsynth1-snow.out.rawvideo
+stddev: 22.77 PSNR: 20.98 MAXDIFF: 175 bytes: 7603200/ 7603200
diff --git a/tests/ref/vsynth/vsynth1-snow-hpel b/tests/ref/vsynth/vsynth1-snow-hpel
index 72b082b2ce..39780ad8a2 100644
--- a/tests/ref/vsynth/vsynth1-snow-hpel
+++ b/tests/ref/vsynth/vsynth1-snow-hpel
@@ -1,4 +1,4 @@
-5c9eb93646eb0e5570d37e9adc9625e4 *tests/data/fate/vsynth1-snow-hpel.avi
-138580 tests/data/fate/vsynth1-snow-hpel.avi
-3382bdde624d8bb4af206a5ac6614605 *tests/data/fate/vsynth1-snow-hpel.out.rawvideo
-stddev: 22.71 PSNR: 21.00 MAXDIFF: 171 bytes: 7603200/ 7603200
+e62ae25d5040d04622a965bcb27fdb1e *tests/data/fate/vsynth1-snow-hpel.avi
+138446 tests/data/fate/vsynth1-snow-hpel.avi
+57c914cd150f8fc260b5989ce3e5884c *tests/data/fate/vsynth1-snow-hpel.out.rawvideo
+stddev: 22.74 PSNR: 20.99 MAXDIFF: 172 bytes: 7603200/ 7603200
diff --git a/tests/ref/vsynth/vsynth2-snow b/tests/ref/vsynth/vsynth2-snow
index 355f89d5f4..e9607bb7d0 100644
--- a/tests/ref/vsynth/vsynth2-snow
+++ b/tests/ref/vsynth/vsynth2-snow
@@ -1,4 +1,4 @@
-5e130d6a48b69348eee7f7c76c5869a3 *tests/data/fate/vsynth2-snow.avi
-72942 tests/data/fate/vsynth2-snow.avi
-9b6cee60e3ec0d1f312a8a25a7878fcc *tests/data/fate/vsynth2-snow.out.rawvideo
-stddev: 13.39 PSNR: 25.59 MAXDIFF: 154 bytes: 7603200/ 7603200
+0a41e73ddd2f54936490655b46dad4a3 *tests/data/fate/vsynth2-snow.avi
+72868 tests/data/fate/vsynth2-snow.avi
+34a75f5cf8a71159f1a572d9cedcfef9 *tests/data/fate/vsynth2-snow.out.rawvideo
+stddev: 13.73 PSNR: 25.37 MAXDIFF: 162 bytes: 7603200/ 7603200
diff --git a/tests/ref/vsynth/vsynth2-snow-hpel b/tests/ref/vsynth/vsynth2-snow-hpel
index ec3b5dfad2..66839fd6f6 100644
--- a/tests/ref/vsynth/vsynth2-snow-hpel
+++ b/tests/ref/vsynth/vsynth2-snow-hpel
@@ -1,4 +1,4 @@
-8edcf0fd7f066972ff77d5b891ed6dde *tests/data/fate/vsynth2-snow-hpel.avi
-79798 tests/data/fate/vsynth2-snow-hpel.avi
-7e0f2a24feda6fb3e54b85511a28c45f *tests/data/fate/vsynth2-snow-hpel.out.rawvideo
-stddev: 13.35 PSNR: 25.62 MAXDIFF: 157 bytes: 7603200/ 7603200
+9bc409e4794ee50691a26c9c836d31a7 *tests/data/fate/vsynth2-snow-hpel.avi
+79728 tests/data/fate/vsynth2-snow-hpel.avi
+2cc64d8171175a1532fd7d3ed3011fbf *tests/data/fate/vsynth2-snow-hpel.out.rawvideo
+stddev: 13.70 PSNR: 25.39 MAXDIFF: 162 bytes: 7603200/ 7603200
diff --git a/tests/ref/vsynth/vsynth_lena-snow b/tests/ref/vsynth/vsynth_lena-snow
index 582c294531..ec29a78483 100644
--- a/tests/ref/vsynth/vsynth_lena-snow
+++ b/tests/ref/vsynth/vsynth_lena-snow
@@ -1,4 +1,4 @@
-bf2cf9cacc1d98388798be98872049ee *tests/data/fate/vsynth_lena-snow.avi
-57604 tests/data/fate/vsynth_lena-snow.avi
-707a42eb20195913be55ba8dfadf72fb *tests/data/fate/vsynth_lena-snow.out.rawvideo
-stddev: 10.37 PSNR: 27.81 MAXDIFF: 120 bytes: 7603200/ 7603200
+8e96f337e8f4ccac7d72ef517e1d2208 *tests/data/fate/vsynth_lena-snow.avi
+57680 tests/data/fate/vsynth_lena-snow.avi
+90963cfd2359d460001c94d94256dc2b *tests/data/fate/vsynth_lena-snow.out.rawvideo
+stddev: 10.48 PSNR: 27.72 MAXDIFF: 119 bytes: 7603200/ 7603200
diff --git a/tests/ref/vsynth/vsynth_lena-snow-hpel b/tests/ref/vsynth/vsynth_lena-snow-hpel
index 67effebc8a..2d6edd8a79 100644
--- a/tests/ref/vsynth/vsynth_lena-snow-hpel
+++ b/tests/ref/vsynth/vsynth_lena-snow-hpel
@@ -1,4 +1,4 @@
-c6ec87a11415a99b1a781f9f5bacb722 *tests/data/fate/vsynth_lena-snow-hpel.avi
-61814 tests/data/fate/vsynth_lena-snow-hpel.avi
-40f330397b7acf6bdbb3ec6d908be451 *tests/data/fate/vsynth_lena-snow-hpel.out.rawvideo
-stddev: 10.34 PSNR: 27.83 MAXDIFF: 118 bytes: 7603200/ 7603200
+56b14cb1cbb637536233982e87f7ac3e *tests/data/fate/vsynth_lena-snow-hpel.avi
+61764 tests/data/fate/vsynth_lena-snow-hpel.avi
+244b0266127fa354d8485234b2c388e4 *tests/data/fate/vsynth_lena-snow-hpel.out.rawvideo
+stddev: 10.45 PSNR: 27.74 MAXDIFF: 119 bytes: 7603200/ 7603200
diff --git a/tools/target_dec_fuzzer.c b/tools/target_dec_fuzzer.c
index 825ca2d7eb..9e15216e59 100644
--- a/tools/target_dec_fuzzer.c
+++ b/tools/target_dec_fuzzer.c
@@ -172,7 +172,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
case AV_CODEC_ID_INTERPLAY_ACM: maxsamples /= 16384; break;
case AV_CODEC_ID_LAGARITH: maxpixels /= 1024; break;
case AV_CODEC_ID_LSCR: maxpixels /= 16; break;
- case AV_CODEC_ID_MMVIDEO: maxpixels /= 256; break;
case AV_CODEC_ID_MOTIONPIXELS:maxpixels /= 256; break;
case AV_CODEC_ID_MP4ALS: maxsamples /= 65536; break;
case AV_CODEC_ID_MSA1: maxpixels /= 16384; break;