From f954f3383c91a51271af002295b9e50a8d5af1ab Mon Sep 17 00:00:00 2001
From: graysky <therealgraysky@proton.me>
Date: Tue, 9 May 2023 15:03:09 -0400
Subject: [PATCH] alarm/ffmpeg-rpi to 4.4.4-2

---
 alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch | 1237 ++++++++++++++++-
 alarm/ffmpeg-rpi/PKGBUILD                     |    4 +-
 2 files changed, 1208 insertions(+), 33 deletions(-)

diff --git a/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch b/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch
index fafa470e0..3abaa3c99 100644
--- a/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch
+++ b/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch
@@ -2,8 +2,8 @@ ffmpeg: jc-kynesim/test/4.4.1/main
 
 git diff-index --binary n4.4.4
 
-https://github.com/jc-kynesim/rpi-ffmpeg/commit/06605ea7f20102aa140632007aa07edd6bf86546
-21-Mar-2023
+https://github.com/jc-kynesim/rpi-ffmpeg/commit/4185270f334d006a108a878be8a62bab7dce38ee
+05-May-2023
 
 diff --git a/CREDITS b/CREDITS
 index f1aea93d6b..e29f0b853c 100644
@@ -70511,6 +70511,44 @@ index 10cde679f8..ad3659e936 100644
      }
  
      // Read the index
+diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
+index 38e4c65c4e..5e04c1df08 100644
+--- a/libavformat/rtpenc.c
++++ b/libavformat/rtpenc.c
+@@ -19,6 +19,7 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "avc.h"
+ #include "avformat.h"
+ #include "mpegts.h"
+ #include "internal.h"
+@@ -582,8 +583,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
+         ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
+         break;
+     case AV_CODEC_ID_H264:
++    {
++        uint8_t *side_data;
++        int side_data_size = 0;
++
++        side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
++                                            &side_data_size);
++
++        if (side_data_size != 0) {
++            int ps_size = side_data_size;
++            uint8_t * ps_buf = NULL;
++
++            ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
++            av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
++            ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
++            av_free(ps_buf);
++        }
+         ff_rtp_send_h264_hevc(s1, pkt->data, size);
+         break;
++    }
+     case AV_CODEC_ID_H261:
+         ff_rtp_send_h261(s1, pkt->data, size);
+         break;
 diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
 index fae3a371e0..25bdf475b3 100644
 --- a/libavformat/rtsp.c
@@ -74233,6 +74271,484 @@ index ea9b5097b8..c1cd452eee 100644
      return LIBAVUTIL_VERSION_INT;
  }
  
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index a9bf6ff9e0..6a0e2dcc09 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -30,6 +30,12 @@
+ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride);
++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+ av_cold void rgb2rgb_init_aarch64(void)
+ {
+@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void)
+ 
+     if (have_neon(cpu_flags)) {
+         interleaveBytes = ff_interleave_bytes_neon;
++        ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
++        ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
+     }
+ }
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index d81110ec57..978ab443ea 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1
+ 0:
+         ret
+ endfunc
++
++// void ff_rgb24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStride,                  // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++function ff_rgb24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]
++        ld1             {v3.s}[2], [x15], #4
++        ld1             {v3.s}[1], [x15], #4
++        ld1             {v3.s}[0], [x15], #4
++        ld1             {v4.s}[2], [x15], #4
++        ld1             {v4.s}[1], [x15], #4
++        ld1             {v4.s}[0], [x15], #4
++        ld1             {v5.s}[2], [x15], #4
++        ld1             {v5.s}[1], [x15], #4
++        ld1             {v5.s}[0], [x15]
++        b               99f
++endfunc
++
++// void ff_bgr24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStride,                  // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++// regs
++// v0-2         Src bytes - reused as chroma src
++// v3-5         Coeffs (packed very inefficiently - could be squashed)
++// v6           128h
++// v7           128b
++// v8-15        Reserved
++// v16-18       Lo Src expanded as H
++// v19          -
++// v20-22       Hi Src expanded as H
++// v23          -
++// v24          U out
++// v25          U tmp
++// v26          Y out
++// v27-29       Y tmp
++// v30          V out
++// v31          V tmp
++
++// Assumes Little Endian in tail stores & conversion matrix
++
++function ff_bgr24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]
++        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[2], [x15]
++99:
++        ldr             w14, [sp, #0]
++        movi            v7.8b, #128
++        uxtl            v6.8h, v7.8b
++        // Ensure if nothing to do then we do nothing
++        cmp             w4, #0
++        b.le            90f
++        cmp             w5, #0
++        b.le            90f
++        // If w % 16 != 0 then -16 so we do main loop 1 fewer times with
++        // the remainder done in the tail
++        tst             w4, #15
++        b.eq            1f
++        sub             w4, w4, #16
++1:
++
++// -------------------- Even line body - YUV
++11:
++        subs            w9,  w4, #0
++        mov             x10, x0
++        mov             x11, x1
++        mov             x12, x2
++        mov             x13, x3
++        b.lt            12f
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
++
++        // V
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
++
++        subs            w9, w9, #16
++
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++
++        b.gt            10b
++
++// -------------------- Even line tail - YUV
++// If width % 16 == 0 this simply runs once on the preloaded RGB.
++// Otherwise it converts any preloaded RGB, then loads & converts the tail.
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
++
++        // V
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        cmp             w9, #-16
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++        st1             {v24.s}[0],  [x12], #4
++        st1             {v30.s}[0],  [x13], #4
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++        st1             {v24.h}[2],  [x12], #2
++        st1             {v30.h}[2],  [x13], #2
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++        st1             {v24.b}[6],  [x12], #1
++        st1             {v30.b}[6],  [x13], #1
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++        st1             {v24.b}[7],  [x12]
++        st1             {v30.b}[7],  [x13]
++1:
++3:
++
++// -------------------- Odd line body - Y only
++
++        subs            w5, w5, #1
++        b.eq            90f
++
++        subs            w9,  w4, #0
++        add             x0, x0, w14, SXTX
++        add             x1, x1, w6, SXTX
++        mov             x10, x0
++        mov             x11, x1
++        b.lt            12f
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        subs            w9, w9, #16
++
++        st1             {v26.16b}, [x11], #16
++
++        b.gt            10b
++
++// -------------------- Odd line tail - Y
++// If width % 16 == 0 this simply runs once on the preloaded RGB.
++// Otherwise it converts any preloaded RGB, then loads & converts the tail.
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        cmp             w9, #-16
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++1:
++3:
++
++// ------------------- Loop to start
++
++        add             x0, x0, w14, SXTX
++        add             x1, x1, w6, SXTX
++        add             x2, x2, w7, SXTX
++        add             x3, x3, w7, SXTX
++        subs            w5, w5, #1
++        b.gt            11b
++90:
++        ret
++endfunc
 diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
 index f341268c5d..f4b220fb60 100644
 --- a/libswscale/aarch64/yuv2rgb_neon.S
@@ -74564,12 +75080,643 @@ index aef0e7f82a..e855ad606a 100644
              if (eightbytes) {
                  output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
                  dest += 4;
+diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
+index a7300f3ba4..ba1db155b0 100644
+--- a/libswscale/rgb2rgb.c
++++ b/libswscale/rgb2rgb.c
+@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
+                        int width, int height,
+                        int lumStride, int chromStride, int srcStride,
+                        int32_t *rgb2yuv);
++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
++                       uint8_t *udst, uint8_t *vdst,
++                       int width, int height,
++                       int lumStride, int chromStride, int srcStride,
++                       int32_t *rgb2yuv);
++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
+ void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                  int srcStride, int dstStride);
+ void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
+index 48bba1586a..6329533f18 100644
+--- a/libswscale/rgb2rgb.h
++++ b/libswscale/rgb2rgb.h
+@@ -82,6 +82,9 @@ void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
+ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                       uint8_t *vdst, int width, int height, int lumStride,
+                       int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                      uint8_t *vdst, int width, int height, int lumStride,
++                      int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+ /**
+  * Height should be a multiple of 2 and width should be a multiple of 16.
+@@ -131,6 +134,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                               int width, int height,
+                               int lumStride, int chromStride, int srcStride,
+                               int32_t *rgb2yuv);
++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                              int width, int height,
++                              int lumStride, int chromStride, int srcStride,
++                              int32_t *rgb2yuv);
++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
+ extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                         int srcStride, int dstStride);
+ 
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index 42c69801ba..e711589e1e 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
+  * others are ignored in the C version.
+  * FIXME: Write HQ version.
+  */
+-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+-                   int chromStride, int srcStride, int32_t *rgb2yuv)
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
+ {
+-    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+-    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+-    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
+     int y;
+     const int chromWidth = width >> 1;
+ 
+@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
+         ydst += lumStride;
+         src  += srcStride;
+ 
+@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
++        udst += chromStride;
++        vdst += chromStride;
++        ydst += lumStride;
++        src  += srcStride;
++    }
++}
++
++static const uint8_t x_rgb[9] = {
++    RY_IDX, GY_IDX, BY_IDX,
++    RU_IDX, GU_IDX, BU_IDX,
++    RV_IDX, GV_IDX, BV_IDX,
++};
++
++static const uint8_t x_bgr[9] = {
++     BY_IDX, GY_IDX, RY_IDX,
++     BU_IDX, GU_IDX, RU_IDX,
++     BV_IDX, GV_IDX, RV_IDX,
++};
++
++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
++{
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
++    int y;
++    const int chromWidth = width >> 1;
++
++    for (y = 0; y < height; y += 2) {
++        int i;
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
++        ydst += lumStride;
++        src  += srcStride;
++
++        if (y+1 == height)
++            break;
++
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
+         udst += chromStride;
+         vdst += chromStride;
+         ydst += lumStride;
+@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+     }
+ }
+ 
++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++// As the general code does no SIMD-like ops simply adding 1 to the src address
++// will fix the ignored alpha position
++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++
+ static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride)
+@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void)
+     yuy2toyv12         = yuy2toyv12_c;
+     planar2x           = planar2x_c;
+     ff_rgb24toyv12     = ff_rgb24toyv12_c;
++    ff_bgr24toyv12     = ff_bgr24toyv12_c;
++    ff_rgbxtoyv12      = ff_rgbxtoyv12_c;
++    ff_bgrxtoyv12      = ff_bgrxtoyv12_c;
++    ff_xrgbtoyv12      = ff_xrgbtoyv12_c;
++    ff_xbgrtoyv12      = ff_xbgrtoyv12_c;
+     interleaveBytes    = interleaveBytes_c;
+     deinterleaveBytes  = deinterleaveBytes_c;
+     vu9_to_vu12        = vu9_to_vu12_c;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index c4dd8a4d83..da38d7f8ac 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -1655,6 +1655,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+     return srcSliceH;
+ }
+ 
++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                              int srcStride[], int srcSliceY, int srcSliceH,
++                              uint8_t *dst[], int dstStride[])
++{
++    ff_bgr24toyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_bgrxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_rgbxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xbgrtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xrgbtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
+ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+                              int srcStride[], int srcSliceY, int srcSliceH,
+                              uint8_t *dst[], int dstStride[])
+@@ -2035,6 +2120,32 @@ void ff_get_unscaled_swscale(SwsContext *c)
+         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+         !(flags & SWS_ACCURATE_RND))
+         c->swscale = bgr24ToYv12Wrapper;
++    /* rgb24toYV12 */
++    if (srcFormat == AV_PIX_FMT_RGB24 &&
++        (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = rgb24ToYv12Wrapper;
++
++    /* bgrxtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = bgrxToYv12Wrapper;
++    /* rgbx24toYV12 */
++    if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = rgbxToYv12Wrapper;
++    /* xbgrtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = xbgrToYv12Wrapper;
++    /* xrgb24toYV12 */
++    if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = xrgbToYv12Wrapper;
+ 
+     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
+index 6c38041ddb..12776ffec7 100644
+--- a/libswscale/tests/swscale.c
++++ b/libswscale/tests/swscale.c
+@@ -23,6 +23,7 @@
+ #include <string.h>
+ #include <inttypes.h>
+ #include <stdarg.h>
++#include <time.h>
+ 
+ #undef HAVE_AV_CONFIG_H
+ #include "libavutil/cpu.h"
+@@ -78,6 +79,15 @@ struct Results {
+     uint32_t crc;
+ };
+ 
++static int time_rep = 0;
++
++static uint64_t utime(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
++}
++
+ // test by ref -> src -> dst -> out & compare out against ref
+ // ref & out are YV12
+ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+         goto end;
+     }
+ 
+-    printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
++    printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
+            desc_src->name, srcW, srcH,
+            desc_dst->name, dstW, dstH,
+            flags);
+@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+ 
+     sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+ 
++    if (time_rep != 0)
++    {
++        const uint64_t now = utime();
++        uint64_t done;
++        for (i = 1; i != time_rep; ++i) {
++            sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
++        }
++        done = utime();
++        printf(" T=%7"PRId64"us ", done-now);
++    }
++
+     for (i = 0; i < 4 && dstStride[i]; i++)
+         crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+                      dstStride[i] * dstH);
+@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
+     return 0;
+ }
+ 
+-#define W 96
+-#define H 96
+-
+ int main(int argc, char **argv)
+ {
++    unsigned int W = 96;
++    unsigned int H = 96;
++    unsigned int W2;
++    unsigned int H2;
++    unsigned int S;
+     enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
+     enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
+-    uint8_t *rgb_data   = av_malloc(W * H * 4);
+-    const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
+-    int rgb_stride[4]   = { 4 * W, 0, 0, 0 };
+-    uint8_t *data       = av_malloc(4 * W * H);
+-    const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+-    int stride[4]       = { W, W, W, W };
+     int x, y;
+     struct SwsContext *sws;
+     AVLFG rand;
+     int res = -1;
+     int i;
+     FILE *fp = NULL;
+-
+-    if (!rgb_data || !data)
+-        return -1;
++    uint8_t *rgb_data;
++    uint8_t * rgb_src[4] = { NULL };
++    int rgb_stride[4]   = { 0 };
++    uint8_t *data;
++    uint8_t * src[4] = { NULL };
++    int stride[4]       = { 0 };
+ 
+     for (i = 1; i < argc; i += 2) {
++        const char * const arg2 = argv[i+1];
++
+         if (argv[i][0] != '-' || i + 1 == argc)
+             goto bad_option;
+         if (!strcmp(argv[i], "-ref")) {
+-            fp = fopen(argv[i + 1], "r");
++            fp = fopen(arg2, "r");
+             if (!fp) {
+-                fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
++                fprintf(stderr, "could not open '%s'\n", arg2);
+                 goto error;
+             }
+         } else if (!strcmp(argv[i], "-cpuflags")) {
+             unsigned flags = av_get_cpu_flags();
+-            int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
++            int ret = av_parse_cpu_caps(&flags, arg2);
+             if (ret < 0) {
+-                fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid cpu flags %s\n", arg2);
+                 return ret;
+             }
+             av_force_cpu_flags(flags);
+         } else if (!strcmp(argv[i], "-src")) {
+-            srcFormat = av_get_pix_fmt(argv[i + 1]);
++            srcFormat = av_get_pix_fmt(arg2);
+             if (srcFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
+                 return -1;
+             }
+         } else if (!strcmp(argv[i], "-dst")) {
+-            dstFormat = av_get_pix_fmt(argv[i + 1]);
++            dstFormat = av_get_pix_fmt(arg2);
+             if (dstFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-w")) {
++            char * p = NULL;
++            W = strtoul(arg2, &p, 0);
++            if (!W || *p) {
++                fprintf(stderr, "bad width %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-h")) {
++            char * p = NULL;
++            H = strtoul(arg2, &p, 0);
++            if (!H || *p) {
++                fprintf(stderr, "bad height '%s'\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-t")) {
++            char * p = NULL;
++            time_rep = (int)strtol(arg2, &p, 0);
++            if (*p) {
++                fprintf(stderr, "bad time repetitions '%s'\n", arg2);
+                 return -1;
+             }
+         } else {
+@@ -414,15 +457,34 @@ bad_option:
+         }
+     }
+ 
+-    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
++    S = (W + 15) & ~15;
++    rgb_data   = av_mallocz(S * H * 4);
++    rgb_src[0] = rgb_data;
++    rgb_stride[0]   = 4 * S;
++    data       = av_mallocz(4 * S * H);
++    src[0] = data;
++    src[1] = data + S * H;
++    src[2] = data + S * H * 2;
++    src[3] = data + S * H * 3;
++    stride[0] = S;
++    stride[1] = S;
++    stride[2] = S;
++    stride[3] = S;
++    H2 = H < 96 ? 8 : H / 12;
++    W2 = W < 96 ? 8 : W / 12;
++
++    if (!rgb_data || !data)
++        return -1;
++
++    sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
+                          AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+ 
+     av_lfg_init(&rand, 1);
+ 
+     for (y = 0; y < H; y++)
+         for (x = 0; x < W * 4; x++)
+-            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+-    res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
++            rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
++    res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
+     if (res < 0 || res != H) {
+         res = -1;
+         goto error;
+@@ -431,10 +493,10 @@ bad_option:
+     av_free(rgb_data);
+ 
+     if(fp) {
+-        res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
++        res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
+         fclose(fp);
+     } else {
+-        selfTest(src, stride, W, H, srcFormat, dstFormat);
++        selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
+         res = 0;
+     }
+ error:
 diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
 new file mode 100644
-index 0000000000..b050971f63
+index 0000000000..2b62d660c0
 --- /dev/null
 +++ b/pi-util/BUILD.txt
-@@ -0,0 +1,59 @@
+@@ -0,0 +1,67 @@
 +Building Pi FFmpeg
 +==================
 +
@@ -74596,6 +75743,8 @@ index 0000000000..b050971f63
 +         paths being confused and therefore running the wrong code,  Shared
 +         is what is needed, in most cases, when building for use by other
 +         programs.
++ --usr   Set install dir to /usr (i.e. system default) rather than in
++         <builddir>/install
 +
 +So for a static build
 +---------------------
@@ -74609,25 +75758,31 @@ index 0000000000..b050971f63
 +For a shared build
 +------------------
 +
++There are two choices here
++
 +$ pi-util/conf_native.sh
-+
-+You will normally want an install target if shared. Note that the script has
-+set this up to be generated in out/<builddir>/install, you don't have to worry
-+about overwriting your system libs.
-+
 +$ make -j8 -C out/<builddir> install
 +
++This sets the install prefix to <builddir>/install and is probably what you
++want if you don't want to overwrite the system files.
++
 +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
-+built or install the image on the system - you have to be careful to get rid
-+of all other ffmpeg libs or confusion may result.  There is a little script
-+that wipes all other versions - obviously use with care!
++built. You can copy the contents of <build dir>/install to /usr and that mostly
++works. The only downside is that paths in pkgconfig end up being set to the
++install directory in your build directory which may be less than ideal when
++building other packages.
 +
++The alternative if you just want to replace the system libs is:
++
++$ pi-util/conf_native.sh --usr
++$ make -j8 -C out/<builddir>
 +$ sudo pi-util/clean_usr_libs.sh
++$ sudo make -j8 -C out/<builddir> install
 +
-+Then simply copying from the install to /usr works
-+
-+$ sudo cp -r out/<builddir>/install/* /usr
-+
++The clean_usr_libs.sh step wipes any existing libs & includes (for all
++architectures) from the system which helps avoid confusion when running other
++progs as you can be sure you're not running old code which is unfortunately
++easy to do otherwise.
 +
 diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
 new file mode 100644
@@ -75397,10 +76552,10 @@ index 0000000000..fc14f2a3c2
 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
 new file mode 100755
-index 0000000000..a9e053801c
+index 0000000000..5fb69ccee2
 --- /dev/null
 +++ b/pi-util/conf_native.sh
-@@ -0,0 +1,107 @@
+@@ -0,0 +1,127 @@
 +echo "Configure for native build"
 +
 +FFSRC=`pwd`
@@ -75412,6 +76567,7 @@ index 0000000000..a9e053801c
 +
 +NOSHARED=
 +MMAL=
++USR_PREFIX=
 +
 +while [ "$1" != "" ] ; do
 +    case $1 in
@@ -75421,8 +76577,14 @@ index 0000000000..a9e053801c
 +	--mmal)
 +	    MMAL=1
 +	    ;;
++	--usr)
++	    USR_PREFIX=/usr
++	    ;;
 +	*)
-+	    echo "Usage $0: [--noshared] [--mmal]"
++	    echo "Usage $0: [--noshared] [--mmal] [--usr]"
++	    echo "  noshared  Build static libs and executable - good for testing"
++	    echo "  mmal      Build mmal decoders"
++	    echo "  usr       Set install prefix to /usr [default=<build-dir>/install]"
 +	    exit 1
 +	    ;;
 +    esac
@@ -75436,18 +76598,28 @@ index 0000000000..a9e053801c
 +RPI_DEFINES=
 +RPI_EXTRALIBS=
 +
-+if [ "$MC" == "arm64" ]; then
-+  echo "M/C aarch64"
-+  A=aarch64-linux-gnu
-+  B=arm64
-+elif [ "$MC" == "armhf" ]; then
-+  echo "M/C armv7"
-+  A=arm-linux-gnueabihf
-+  B=armv7
-+  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
-+  RPI_DEFINES=-mfpu=neon-vfpv4
++# uname -m gives kernel type which may not have the same
++# 32/64bitness as userspace :-( getconf shoudl provide the answer
++# but use uname to check we are on the right processor
++MC=`uname -m`
++LB=`getconf LONG_BIT`
++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then
++  if [ "$LB" == "32" ]; then
++    echo "M/C armv7"
++    A=arm-linux-gnueabihf
++    B=armv7
++    MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
++    RPI_DEFINES=-mfpu=neon-vfpv4
++  elif [ "$LB" == "64" ]; then
++    echo "M/C aarch64"
++    A=aarch64-linux-gnu
++    B=arm64
++  else
++    echo "Unknown LONG_BIT name: $LB"
++    exit 1
++  fi
 +else
-+  echo Unexpected architecture $MC
++  echo "Unknown machine name: $MC"
 +  exit 1
 +fi
 +
@@ -75475,7 +76647,9 @@ index 0000000000..a9e053801c
 +  OUT=$BUILDBASE/$B-$C-$V-shared-rel
 +fi
 +
-+USR_PREFIX=$OUT/install
++if [ ! $USR_PREFIX ]; then
++  USR_PREFIX=$OUT/install
++fi
 +LIB_PREFIX=$USR_PREFIX/lib/$A
 +INC_PREFIX=$USR_PREFIX/include/$A
 +
@@ -75505,6 +76679,7 @@ index 0000000000..a9e053801c
 + --extra-libs="$RPI_EXTRALIBS"\
 + --extra-version="rpi"
 +
++echo "Configured into $OUT"
 +
 +# gcc option for getting asm listing
 +# -Wa,-ahls
diff --git a/alarm/ffmpeg-rpi/PKGBUILD b/alarm/ffmpeg-rpi/PKGBUILD
index 228025715..96c55174d 100644
--- a/alarm/ffmpeg-rpi/PKGBUILD
+++ b/alarm/ffmpeg-rpi/PKGBUILD
@@ -7,7 +7,7 @@
 pkgbase=ffmpeg-rpi
 pkgname=($pkgbase $pkgbase-bin)
 pkgver=4.4.4
-pkgrel=1
+pkgrel=2
 arch=(aarch64)
 url=https://ffmpeg.org/
 license=(GPL3)
@@ -83,7 +83,7 @@ source=(https://ffmpeg.org/releases/${pkgname/-rpi}-$pkgver.tar.xz{,.asc}
 sha256sums=('e80b380d595c809060f66f96a5d849511ef4a76a26b76eacf5778b94c3570309'
             'SKIP'
             '2e8d885de789b461ddf63c10646cdb16ad5519b671efd1624bf5a8e7da43dbf3'
-            '316f5b7a2cad9efbc84cf0148d9a5b99e9a9e2543c62ebc7a02f0924da096542'
+            'c3db95417fbfdd9e7a96d63cb2a91ad1eee17ae233c0ef1cf1588f8c0eff90fa'
             '42f57e7a55f250811515571c870372d6ed0ed504f823b341d26f383c082ce0a0')
 validpgpkeys=('FCF986EA15E6E293A5644F10B4322F04D67658D8')