From f954f3383c91a51271af002295b9e50a8d5af1ab Mon Sep 17 00:00:00 2001 From: graysky Date: Tue, 9 May 2023 15:03:09 -0400 Subject: [PATCH] alarm/ffmpeg-rpi to 4.4.4-2 --- alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch | 1237 ++++++++++++++++- alarm/ffmpeg-rpi/PKGBUILD | 4 +- 2 files changed, 1208 insertions(+), 33 deletions(-) diff --git a/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch b/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch index fafa470e0..3abaa3c99 100644 --- a/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch +++ b/alarm/ffmpeg-rpi/0002-ffmpeg-4.4.4n-rpi.patch @@ -2,8 +2,8 @@ ffmpeg: jc-kynesim/test/4.4.1/main git diff-index --binary n4.4.4 -https://github.com/jc-kynesim/rpi-ffmpeg/commit/06605ea7f20102aa140632007aa07edd6bf86546 -21-Mar-2023 +https://github.com/jc-kynesim/rpi-ffmpeg/commit/4185270f334d006a108a878be8a62bab7dce38ee +05-May-2023 diff --git a/CREDITS b/CREDITS index f1aea93d6b..e29f0b853c 100644 @@ -70511,6 +70511,44 @@ index 10cde679f8..ad3659e936 100644 } // Read the index +diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c +index 38e4c65c4e..5e04c1df08 100644 +--- a/libavformat/rtpenc.c ++++ b/libavformat/rtpenc.c +@@ -19,6 +19,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "avc.h" + #include "avformat.h" + #include "mpegts.h" + #include "internal.h" +@@ -582,8 +583,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt) + ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0); + break; + case AV_CODEC_ID_H264: ++ { ++ uint8_t *side_data; ++ int side_data_size = 0; ++ ++ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &side_data_size); ++ ++ if (side_data_size != 0) { ++ int ps_size = side_data_size; ++ uint8_t * ps_buf = NULL; ++ ++ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size); ++ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size); ++ ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size); ++ av_free(ps_buf); ++ } + ff_rtp_send_h264_hevc(s1, pkt->data, size); + break; ++ } + case AV_CODEC_ID_H261: + ff_rtp_send_h261(s1, pkt->data, size); + break; diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c index fae3a371e0..25bdf475b3 100644 --- a/libavformat/rtsp.c @@ -74233,6 +74271,484 @@ index ea9b5097b8..c1cd452eee 100644 return LIBAVUTIL_VERSION_INT; } +diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c +index a9bf6ff9e0..6a0e2dcc09 100644 +--- a/libswscale/aarch64/rgb2rgb.c ++++ b/libswscale/aarch64/rgb2rgb.c +@@ -30,6 +30,12 @@ + void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride); ++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + av_cold void rgb2rgb_init_aarch64(void) + { +@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void) + + if (have_neon(cpu_flags)) { + interleaveBytes = ff_interleave_bytes_neon; ++ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64; ++ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64; + } + } +diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S +index d81110ec57..978ab443ea 100644 +--- a/libswscale/aarch64/rgb2rgb_neon.S ++++ b/libswscale/aarch64/rgb2rgb_neon.S +@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1 + 0: + ret + endfunc ++ ++// void ff_rgb24toyv12_aarch64( ++// const uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++function ff_rgb24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld1 {v3.s}[2], [x15], #4 ++ ld1 {v3.s}[1], [x15], #4 ++ ld1 {v3.s}[0], [x15], #4 ++ ld1 {v4.s}[2], [x15], #4 ++ ld1 {v4.s}[1], [x15], #4 ++ ld1 {v4.s}[0], [x15], #4 ++ ld1 {v5.s}[2], [x15], #4 ++ ld1 {v5.s}[1], [x15], #4 ++ ld1 {v5.s}[0], [x15] ++ b 99f ++endfunc ++ ++// void ff_bgr24toyv12_aarch64( ++// const uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++// regs ++// v0-2 Src bytes - reused as chroma src ++// v3-5 Coeffs (packed very inefficiently - could be squashed) ++// v6 128b ++// v7 128h ++// v8-15 Reserved ++// v16-18 Lo Src expanded as H ++// v19 - ++// v20-22 Hi Src expanded as H ++// v23 - ++// v24 U out ++// v25 U tmp ++// v26 Y out ++// v27-29 Y tmp ++// v30 V out ++// v31 V tmp ++ ++// Assumes Little Endian in tail stores & conversion matrix ++ ++function ff_bgr24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[2], [x15] ++99: ++ ldr w14, [sp, #0] ++ movi v7.8b, #128 ++ uxtl v6.8h, v7.8b ++ // Ensure if nothing to do then we do nothing ++ cmp w4, #0 ++ b.le 90f ++ cmp w5, #0 ++ b.le 90f ++ // If w % 16 != 0 then -16 so we do main loop 1 fewer times with ++ // the remainder done in the tail ++ tst w4, #15 ++ b.eq 1f ++ sub w4, w4, #16 ++1: ++ ++// -------------------- Even line body - YUV ++11: ++ subs w9, w4, #0 ++ mov x10, x0 ++ mov x11, x1 ++ mov x12, x2 ++ mov x13, x3 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ uqrshrn v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ uqrshrn2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ ++ b.gt 10b ++ ++// -------------------- Even line tail - YUV ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ uqrshrn v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ uqrshrn2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ cmp w9, #-16 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++ st1 {v24.s}[0], [x12], #4 ++ st1 {v30.s}[0], [x13], #4 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++ st1 {v24.h}[2], [x12], #2 ++ st1 {v30.h}[2], [x13], #2 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++ st1 {v24.b}[6], [x12], #1 ++ st1 {v30.b}[6], [x13], #1 ++1: tbz w9, #0, 1f ++ st1 {v26.b}[14], [x11] ++ st1 {v24.b}[7], [x12] ++ st1 {v30.b}[7], [x13] ++1: ++3: ++ ++// -------------------- Odd line body - Y only ++ ++ subs w5, w5, #1 ++ b.eq 90f ++ ++ subs w9, w4, #0 ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ mov x10, x0 ++ mov x11, x1 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ uqrshrn v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ uqrshrn2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ ++ b.gt 10b ++ ++// -------------------- Odd line tail - Y ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ cmp w9, #-16 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ uqrshrn v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ uqrshrn2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++1: tbz w9, #0, 1f ++ st1 {v26.b}[14], [x11] ++1: ++3: ++ ++// ------------------- Loop to start ++ ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ add x2, x2, w7, SXTX ++ add x3, x3, w7, SXTX ++ subs w5, w5, #1 ++ b.gt 11b ++90: ++ ret ++endfunc diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index f341268c5d..f4b220fb60 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S @@ -74564,12 +75080,643 @@ index aef0e7f82a..e855ad606a 100644 if (eightbytes) { output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14); dest += 4; +diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c +index a7300f3ba4..ba1db155b0 100644 +--- a/libswscale/rgb2rgb.c ++++ b/libswscale/rgb2rgb.c +@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, +diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h +index 48bba1586a..6329533f18 100644 +--- a/libswscale/rgb2rgb.h ++++ b/libswscale/rgb2rgb.h +@@ -82,6 +82,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); + void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + /** + * Height should be a multiple of 2 and width should be a multiple of 16. +@@ -131,6 +134,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + +diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c +index 42c69801ba..e711589e1e 100644 +--- a/libswscale/rgb2rgb_template.c ++++ b/libswscale/rgb2rgb_template.c +@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, + * others are ignored in the C version. + * FIXME: Write HQ version. + */ +-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, +- int chromStride, int srcStride, int32_t *rgb2yuv) ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) + { +- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; +- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; +- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; + int y; + const int chromWidth = width >> 1; + +@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } + ydst += lumStride; + src += srcStride; + +@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } ++ udst += chromStride; ++ vdst += chromStride; ++ ydst += lumStride; ++ src += srcStride; ++ } ++} ++ ++static const uint8_t x_rgb[9] = { ++ RY_IDX, GY_IDX, BY_IDX, ++ RU_IDX, GU_IDX, BU_IDX, ++ RV_IDX, GV_IDX, BV_IDX, ++}; ++ ++static const uint8_t x_bgr[9] = { ++ BY_IDX, GY_IDX, RY_IDX, ++ BU_IDX, GU_IDX, RU_IDX, ++ BV_IDX, GV_IDX, RV_IDX, ++}; ++ ++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) ++{ ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; ++ int y; ++ const int chromWidth = width >> 1; ++ ++ for (y = 0; y < height; y += 2) { ++ int i; ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if ((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } ++ ydst += lumStride; ++ src += srcStride; ++ ++ if (y+1 == height) ++ break; ++ ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if ((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } + udst += chromStride; + vdst += chromStride; + ydst += lumStride; +@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + } + } + ++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++// As the general code does no SIMD-like ops simply adding 1 to the src address ++// will fix the ignored alpha position ++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++ + static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride) +@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void) + yuy2toyv12 = yuy2toyv12_c; + planar2x = planar2x_c; + ff_rgb24toyv12 = ff_rgb24toyv12_c; ++ ff_bgr24toyv12 = ff_bgr24toyv12_c; ++ ff_rgbxtoyv12 = ff_rgbxtoyv12_c; ++ ff_bgrxtoyv12 = ff_bgrxtoyv12_c; ++ ff_xrgbtoyv12 = ff_xrgbtoyv12_c; ++ ff_xbgrtoyv12 = ff_xbgrtoyv12_c; + interleaveBytes = interleaveBytes_c; + deinterleaveBytes = deinterleaveBytes_c; + vu9_to_vu12 = vu9_to_vu12_c; +diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c +index c4dd8a4d83..da38d7f8ac 100644 +--- a/libswscale/swscale_unscaled.c ++++ b/libswscale/swscale_unscaled.c +@@ -1655,6 +1655,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + return srcSliceH; + } + ++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgr24toyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgrxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_rgbxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xbgrtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xrgbtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ + static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +@@ -2035,6 +2120,32 @@ void ff_get_unscaled_swscale(SwsContext *c) + (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && + !(flags & SWS_ACCURATE_RND)) + c->swscale = bgr24ToYv12Wrapper; ++ /* rgb24toYV12 */ ++ if (srcFormat == AV_PIX_FMT_RGB24 && ++ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgb24ToYv12Wrapper; ++ ++ /* bgrxtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = bgrxToYv12Wrapper; ++ /* rgbx24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgbxToYv12Wrapper; ++ /* xbgrtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xbgrToYv12Wrapper; ++ /* xrgb24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xrgbToYv12Wrapper; + + /* RGB/BGR -> RGB/BGR (no dither needed forms) */ + if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) +diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c +index 6c38041ddb..12776ffec7 100644 +--- a/libswscale/tests/swscale.c ++++ b/libswscale/tests/swscale.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #undef HAVE_AV_CONFIG_H + #include "libavutil/cpu.h" +@@ -78,6 +79,15 @@ struct Results { + uint32_t crc; + }; + ++static int time_rep = 0; ++ ++static uint64_t utime(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000; ++} ++ + // test by ref -> src -> dst -> out & compare out against ref + // ref & out are YV12 + static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, +@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + goto end; + } + +- printf(" %s %dx%d -> %s %3dx%3d flags=%2d", ++ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d", + desc_src->name, srcW, srcH, + desc_dst->name, dstW, dstH, + flags); +@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + + sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); + ++ if (time_rep != 0) ++ { ++ const uint64_t now = utime(); ++ uint64_t done; ++ for (i = 1; i != time_rep; ++i) { ++ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); ++ } ++ done = utime(); ++ printf(" T=%7"PRId64"us ", done-now); ++ } ++ + for (i = 0; i < 4 && dstStride[i]; i++) + crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], + dstStride[i] * dstH); +@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4], + return 0; + } + +-#define W 96 +-#define H 96 +- + int main(int argc, char **argv) + { ++ unsigned int W = 96; ++ unsigned int H = 96; ++ unsigned int W2; ++ unsigned int H2; ++ unsigned int S; + enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; + enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE; +- uint8_t *rgb_data = av_malloc(W * H * 4); +- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; +- int rgb_stride[4] = { 4 * W, 0, 0, 0 }; +- uint8_t *data = av_malloc(4 * W * H); +- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; +- int stride[4] = { W, W, W, W }; + int x, y; + struct SwsContext *sws; + AVLFG rand; + int res = -1; + int i; + FILE *fp = NULL; +- +- if (!rgb_data || !data) +- return -1; ++ uint8_t *rgb_data; ++ uint8_t * rgb_src[4] = { NULL }; ++ int rgb_stride[4] = { 0 }; ++ uint8_t *data; ++ uint8_t * src[4] = { NULL }; ++ int stride[4] = { 0 }; + + for (i = 1; i < argc; i += 2) { ++ const char * const arg2 = argv[i+1]; ++ + if (argv[i][0] != '-' || i + 1 == argc) + goto bad_option; + if (!strcmp(argv[i], "-ref")) { +- fp = fopen(argv[i + 1], "r"); ++ fp = fopen(arg2, "r"); + if (!fp) { +- fprintf(stderr, "could not open '%s'\n", argv[i + 1]); ++ fprintf(stderr, "could not open '%s'\n", arg2); + goto error; + } + } else if (!strcmp(argv[i], "-cpuflags")) { + unsigned flags = av_get_cpu_flags(); +- int ret = av_parse_cpu_caps(&flags, argv[i + 1]); ++ int ret = av_parse_cpu_caps(&flags, arg2); + if (ret < 0) { +- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid cpu flags %s\n", arg2); + return ret; + } + av_force_cpu_flags(flags); + } else if (!strcmp(argv[i], "-src")) { +- srcFormat = av_get_pix_fmt(argv[i + 1]); ++ srcFormat = av_get_pix_fmt(arg2); + if (srcFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-dst")) { +- dstFormat = av_get_pix_fmt(argv[i + 1]); ++ dstFormat = av_get_pix_fmt(arg2); + if (dstFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-w")) { ++ char * p = NULL; ++ W = strtoul(arg2, &p, 0); ++ if (!W || *p) { ++ fprintf(stderr, "bad width %s\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-h")) { ++ char * p = NULL; ++ H = strtoul(arg2, &p, 0); ++ if (!H || *p) { ++ fprintf(stderr, "bad height '%s'\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-t")) { ++ char * p = NULL; ++ time_rep = (int)strtol(arg2, &p, 0); ++ if (*p) { ++ fprintf(stderr, "bad time repetitions '%s'\n", arg2); + return -1; + } + } else { +@@ -414,15 +457,34 @@ bad_option: + } + } + +- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, ++ S = (W + 15) & ~15; ++ rgb_data = av_mallocz(S * H * 4); ++ rgb_src[0] = rgb_data; ++ rgb_stride[0] = 4 * S; ++ data = av_mallocz(4 * S * H); ++ src[0] = data; ++ src[1] = data + S * H; ++ src[2] = data + S * H * 2; ++ src[3] = data + S * H * 3; ++ stride[0] = S; ++ stride[1] = S; ++ stride[2] = S; ++ stride[3] = S; ++ H2 = H < 96 ? 8 : H / 12; ++ W2 = W < 96 ? 8 : W / 12; ++ ++ if (!rgb_data || !data) ++ return -1; ++ ++ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H, + AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); + + av_lfg_init(&rand, 1); + + for (y = 0; y < H; y++) + for (x = 0; x < W * 4; x++) +- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); +- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride); ++ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand); ++ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride); + if (res < 0 || res != H) { + res = -1; + goto error; +@@ -431,10 +493,10 @@ bad_option: + av_free(rgb_data); + + if(fp) { +- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); ++ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat); + fclose(fp); + } else { +- selfTest(src, stride, W, H, srcFormat, dstFormat); ++ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat); + res = 0; + } + error: diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000000..b050971f63 +index 0000000000..2b62d660c0 --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,59 @@ +@@ -0,0 +1,67 @@ +Building Pi FFmpeg +================== + @@ -74596,6 +75743,8 @@ index 0000000000..b050971f63 + paths being confused and therefore running the wrong code, Shared + is what is needed, in most cases, when building for use by other + programs. ++ --usr Set install dir to /usr (i.e. system default) rather than in ++ /install + +So for a static build +--------------------- @@ -74609,25 +75758,31 @@ index 0000000000..b050971f63 +For a shared build +------------------ + ++There are two choices here ++ +$ pi-util/conf_native.sh -+ -+You will normally want an install target if shared. Note that the script has -+set this up to be generated in out//install, you don't have to worry -+about overwriting your system libs. -+ +$ make -j8 -C out/ install + ++This sets the install prefix to /install and is probably what you ++want if you don't want to overwrite the system files. ++ +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was -+built or install the image on the system - you have to be careful to get rid -+of all other ffmpeg libs or confusion may result. There is a little script -+that wipes all other versions - obviously use with care! ++built. You can copy the contents of /install to /usr and that mostly ++works. The only downside is that paths in pkgconfig end up being set to the ++install directory in your build directory which may be less than ideal when ++building other packages. + ++The alternative if you just want to replace the system libs is: ++ ++$ pi-util/conf_native.sh --usr ++$ make -j8 -C out/ +$ sudo pi-util/clean_usr_libs.sh ++$ sudo make -j8 -C out/ install + -+Then simply copying from the install to /usr works -+ -+$ sudo cp -r out//install/* /usr -+ ++The clean_usr_libs.sh step wipes any existing libs & includes (for all ++architectures) from the system which helps avoid confusion when running other ++progs as you can be sure you're not running old code which is unfortunately ++easy to do otherwise. + diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt new file mode 100644 @@ -75397,10 +76552,10 @@ index 0000000000..fc14f2a3c2 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh new file mode 100755 -index 0000000000..a9e053801c +index 0000000000..5fb69ccee2 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,107 @@ +@@ -0,0 +1,127 @@ +echo "Configure for native build" + +FFSRC=`pwd` @@ -75412,6 +76567,7 @@ index 0000000000..a9e053801c + +NOSHARED= +MMAL= ++USR_PREFIX= + +while [ "$1" != "" ] ; do + case $1 in @@ -75421,8 +76577,14 @@ index 0000000000..a9e053801c + --mmal) + MMAL=1 + ;; ++ --usr) ++ USR_PREFIX=/usr ++ ;; + *) -+ echo "Usage $0: [--noshared] [--mmal]" ++ echo "Usage $0: [--noshared] [--mmal] [--usr]" ++ echo " noshared Build static libs and executable - good for testing" ++ echo " mmal Build mmal decoders" ++ echo " usr Set install prefix to /usr [default=/install]" + exit 1 + ;; + esac @@ -75436,18 +76598,28 @@ index 0000000000..a9e053801c +RPI_DEFINES= +RPI_EXTRALIBS= + -+if [ "$MC" == "arm64" ]; then -+ echo "M/C aarch64" -+ A=aarch64-linux-gnu -+ B=arm64 -+elif [ "$MC" == "armhf" ]; then -+ echo "M/C armv7" -+ A=arm-linux-gnueabihf -+ B=armv7 -+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" -+ RPI_DEFINES=-mfpu=neon-vfpv4 ++# uname -m gives kernel type which may not have the same ++# 32/64bitness as userspace :-( getconf shoudl provide the answer ++# but use uname to check we are on the right processor ++MC=`uname -m` ++LB=`getconf LONG_BIT` ++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then ++ if [ "$LB" == "32" ]; then ++ echo "M/C armv7" ++ A=arm-linux-gnueabihf ++ B=armv7 ++ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" ++ RPI_DEFINES=-mfpu=neon-vfpv4 ++ elif [ "$LB" == "64" ]; then ++ echo "M/C aarch64" ++ A=aarch64-linux-gnu ++ B=arm64 ++ else ++ echo "Unknown LONG_BIT name: $LB" ++ exit 1 ++ fi +else -+ echo Unexpected architecture $MC ++ echo "Unknown machine name: $MC" + exit 1 +fi + @@ -75475,7 +76647,9 @@ index 0000000000..a9e053801c + OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + -+USR_PREFIX=$OUT/install ++if [ ! $USR_PREFIX ]; then ++ USR_PREFIX=$OUT/install ++fi +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + @@ -75505,6 +76679,7 @@ index 0000000000..a9e053801c + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + ++echo "Configured into $OUT" + +# gcc option for getting asm listing +# -Wa,-ahls diff --git a/alarm/ffmpeg-rpi/PKGBUILD b/alarm/ffmpeg-rpi/PKGBUILD index 228025715..96c55174d 100644 --- a/alarm/ffmpeg-rpi/PKGBUILD +++ b/alarm/ffmpeg-rpi/PKGBUILD @@ -7,7 +7,7 @@ pkgbase=ffmpeg-rpi pkgname=($pkgbase $pkgbase-bin) pkgver=4.4.4 -pkgrel=1 +pkgrel=2 arch=(aarch64) url=https://ffmpeg.org/ license=(GPL3) @@ -83,7 +83,7 @@ source=(https://ffmpeg.org/releases/${pkgname/-rpi}-$pkgver.tar.xz{,.asc} sha256sums=('e80b380d595c809060f66f96a5d849511ef4a76a26b76eacf5778b94c3570309' 'SKIP' '2e8d885de789b461ddf63c10646cdb16ad5519b671efd1624bf5a8e7da43dbf3' - '316f5b7a2cad9efbc84cf0148d9a5b99e9a9e2543c62ebc7a02f0924da096542' + 'c3db95417fbfdd9e7a96d63cb2a91ad1eee17ae233c0ef1cf1588f8c0eff90fa' '42f57e7a55f250811515571c870372d6ed0ed504f823b341d26f383c082ce0a0') validpgpkeys=('FCF986EA15E6E293A5644F10B4322F04D67658D8')