checkasm --bench on a Raspberry Pi 5 Model B Rev 1.0:
yuv420p_gbrp_128_c: 1243.0
yuv420p_gbrp_128_neon: 453.5
yuv420p_gbrp_1920_c: 18165.5
yuv420p_gbrp_1920_neon: 6700.0
yuv422p_gbrp_128_c: 1463.5
yuv422p_gbrp_128_neon: 471.5
yuv422p_gbrp_1920_c: 21343.7
yuv422p_gbrp_1920_neon: 6743.5
---
libswscale/aarch64/swscale_unscaled.c | 58 +++++++++++++++++++++
libswscale/aarch64/yuv2rgb_neon.S | 73 ++++++++++++++++++++++-----
2 files changed, 118 insertions(+), 13 deletions(-)
diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index b3093bbc9d..5c4f6fee34 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -52,11 +52,41 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
c->yuv2rgb_y_coeff);
\
}
\
+#define DECLARE_FF_YUVX_TO_GBRP_FUNCS(ifmt, ofmt)                                               \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                                  \
+                                 uint8_t *dst, int linesize,    /* plane 0 (G for gbrp) */      \
+                                 const uint8_t *srcY, int linesizeY,                            \
+                                 const uint8_t *srcU, int linesizeU,                            \
+                                 const uint8_t *srcV, int linesizeV,                            \
+                                 const int16_t *table,                                          \
+                                 int y_offset,                                                  \
+                                 int y_coeff,                                                   \
+                                 uint8_t *dst1, int linesize1,  /* plane 1 (B for gbrp) */      \
+                                 uint8_t *dst2, int linesize2); /* plane 2 (R for gbrp) */      \
+                                                                                                \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],                 \
+                                           int srcStride[], int srcSliceY, int srcSliceH,       \
+                                           uint8_t *dst[], int dstStride[]) {                   \
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                       \
+                                                                                                \
+    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                     \
+                                        dst[0] + srcSliceY * dstStride[0], dstStride[0],        \
+                                        src[0], srcStride[0],                                   \
+                                        src[1], srcStride[1],                                   \
+                                        src[2], srcStride[2],                                   \
+                                        yuv2rgb_table,                                          \
+                                        c->yuv2rgb_y_offset >> 6,                               \
+                                        c->yuv2rgb_y_coeff,                                     \
+                                        dst[1] + srcSliceY * dstStride[1], dstStride[1],        \
+                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]);       \
+}                                                                                               \
+
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)
\
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)
\
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)
\
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)
\
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)
\
+DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp)
\
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
@@ -83,11 +113,38 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
c->yuv2rgb_y_coeff);
\
}
\
+#define DECLARE_FF_NVX_TO_GBRP_FUNCS(ifmt, ofmt)                                                \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                                  \
+                                 uint8_t *dst, int linesize,    /* plane 0 (G for gbrp) */      \
+                                 const uint8_t *srcY, int linesizeY,                            \
+                                 const uint8_t *srcC, int linesizeC,                            \
+                                 const int16_t *table,                                          \
+                                 int y_offset,                                                  \
+                                 int y_coeff,                                                   \
+                                 uint8_t *dst1, int linesize1,  /* plane 1 (B for gbrp) */      \
+                                 uint8_t *dst2, int linesize2); /* plane 2 (R for gbrp) */      \
+                                                                                                \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],                 \
+                                           int srcStride[], int srcSliceY, int srcSliceH,       \
+                                           uint8_t *dst[], int dstStride[]) {                   \
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                       \
+                                                                                                \
+    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                     \
+                                        dst[0] + srcSliceY * dstStride[0], dstStride[0],        \
+                                        src[0], srcStride[0], src[1], srcStride[1],             \
+                                        yuv2rgb_table,                                          \
+                                        c->yuv2rgb_y_offset >> 6,                               \
+                                        c->yuv2rgb_y_coeff,                                     \
+                                        dst[1] + srcSliceY * dstStride[1], dstStride[1],        \
+                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]);       \
+}                                                                                               \
+
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)
\
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)
\
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)
\
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)
\
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)
\
+DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp)
\
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
@@ -110,6 +167,7 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);
\
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);
\
+ SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd);
\
} while (0)
static void get_unscaled_swscale_neon(SwsContext *c) {
diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index 89d69e7f6c..b89eb2c781 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -30,23 +30,43 @@
#endif
.endm
-.macro load_args_nv12
+.macro load_dst1_dst2 dst1 linesize1 dst2 linesize2
+#if defined(__APPLE__)
+#define DST_OFFSET 8
+#else
+#define DST_OFFSET 0
+#endif
+        ldr             x10, [sp, #\dst1 - DST_OFFSET]              // x10 = dst1 (stack slot is DST_OFFSET bytes lower on Apple)
+        ldr             w12, [sp, #\linesize1 - DST_OFFSET]         // w12 = linesize1
+        ldr             x15, [sp, #\dst2 - DST_OFFSET]              // x15 = dst2
+        ldr             w16, [sp, #\linesize2 - DST_OFFSET]         // w16 = linesize2
+#undef DST_OFFSET
+        sub             w12, w12, w0                                // w12 = linesize1 - width (padding1)
+        sub             w16, w16, w0                                // w16 = linesize2 - width (padding2)
+.endm
+
+.macro load_args_nv12 ofmt
         ldr             x8, [sp]                                    // table
         load_yoff_ycoeff 8, 16                                      // y_offset, y_coeff
         ld1             {v1.1d}, [x8]
         dup             v0.8h, w10
         dup             v3.8h, w9
+.ifc \ofmt,gbrp
+        load_dst1_dst2  24, 32, 40, 48                              // stack offsets of dst1, linesize1, dst2, linesize2
+        sub             w3, w3, w0                                  // w3 = linesize - width (padding)
+.else
         sub             w3, w3, w0, lsl #2                          // w3 = linesize - width * 4 (padding)
+.endif
         sub             w5, w5, w0                                  // w5 = linesizeY - width (paddingY)
         sub             w7, w7, w0                                  // w7 = linesizeC - width (paddingC)
         neg             w11, w0
 .endm
-.macro load_args_nv21
- load_args_nv12
+.macro load_args_nv21 ofmt
+ load_args_nv12 \ofmt // nv21 reuses the nv12 argument loading; ofmt is passed through unchanged
.endm
-.macro load_args_yuv420p
+.macro load_args_yuv420p ofmt
ldr x13, [sp] // srcV
ldr w14, [sp, #8] //
linesizeV
ldr x8, [sp, #16] //
table
@@ -54,7 +74,12 @@
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
+.ifc \ofmt,gbrp
+ load_dst1_dst2 40, 48, 56, 64
+ sub w3, w3, w0 // w3
= linesize - width (padding)
+.else
sub w3, w3, w0, lsl #2 // w3
= linesize - width * 4 (padding)
+.endif
sub w5, w5, w0 // w5
= linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7
= linesizeU - width / 2 (paddingU)
sub w14, w14, w0, lsr #1 // w14
= linesizeV - width / 2 (paddingV)
@@ -62,7 +87,7 @@
neg w11, w11
.endm
-.macro load_args_yuv422p
+.macro load_args_yuv422p ofmt
ldr x13, [sp] // srcV
ldr w14, [sp, #8] //
linesizeV
ldr x8, [sp, #16] //
table
@@ -70,7 +95,12 @@
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
+.ifc \ofmt,gbrp
+ load_dst1_dst2 40, 48, 56, 64
+ sub w3, w3, w0 // w3
= linesize - width (padding)
+.else
sub w3, w3, w0, lsl #2 // w3
= linesize - width * 4 (padding)
+.endif
sub w5, w5, w0 // w5
= linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7
= linesizeU - width / 2 (paddingU)
sub w14, w14, w0, lsr #1 // w14
= linesizeV - width / 2 (paddingV)
@@ -100,9 +130,9 @@
.endm
.macro increment_nv12
- ands w15, w1, #1
- csel w16, w7, w11, ne //
incC = (h & 1) ? paddincC : -width
- add x6, x6, w16, sxtw //
srcC += incC
+ ands w17, w1, #1
+ csel w17, w7, w11, ne //
incC = (h & 1) ? paddincC : -width
+ add x6, x6, w17, sxtw //
srcC += incC
.endm
.macro increment_nv21
@@ -110,10 +140,10 @@
.endm
.macro increment_yuv420p
- ands w15, w1, #1
- csel w16, w7, w11, ne //
incU = (h & 1) ? paddincU : -width/2
+ ands w17, w1, #1
+ csel w17, w7, w11, ne //
incU = (h & 1) ? paddincU : -width/2
+ add x6, x6, w17, sxtw //
srcU += incU
csel w17, w14, w11, ne //
incV = (h & 1) ? paddincV : -width/2
- add x6, x6, w16, sxtw //
srcU += incU
add x13, x13, w17, sxtw //
srcV += incV
.endm
@@ -122,7 +152,7 @@
add x13, x13, w14, sxtw //
srcV += incV
.endm
-.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
+.macro compute_rgb r1 g1 b1 r2 g2 b2
add v20.8h, v26.8h, v20.8h // Y1
+ R1
add v21.8h, v27.8h, v21.8h // Y2
+ R2
add v22.8h, v26.8h, v22.8h // Y1
+ G1
@@ -135,13 +165,18 @@
sqrshrun \g2, v23.8h, #1 //
clip_u8((Y2 + G1) >> 1)
sqrshrun \b1, v24.8h, #1 //
clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8h, #1 //
clip_u8((Y2 + B1) >> 1)
+.endm
+
+.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
+ compute_rgb \r1, \g1, \b1, \r2, \g2, \b2 // colour channels come from compute_rgb; alpha is set to 255 below
movi \a1, #255
movi \a2, #255
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
- load_args_\ifmt
+ load_args_\ifmt \ofmt
+
mov w9, w1
1:
mov w8, w0 // w8
= width
@@ -185,11 +220,22 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
+.ifc \ofmt,gbrp
+ compute_rgb v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
+ st1 { v4.8b, v5.8b }, [x2], #16
+ st1 { v6.8b, v7.8b }, [x10], #16
+ st1 { v18.8b, v19.8b }, [x15], #16
+.else
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
+.endif
subs w8, w8, #16 //
width -= 16
b.gt 2b
add x2, x2, w3, sxtw // dst
+= padding
+.ifc \ofmt,gbrp
+ add x10, x10, w12, sxtw //
dst1 += padding1
+ add x15, x15, w16, sxtw //
dst2 += padding2
+.endif
add x4, x4, w5, sxtw //
srcY += paddingY
increment_\ifmt
subs w1, w1, #1 //
height -= 1
@@ -204,6 +250,7 @@ endfunc
declare_func \ifmt, rgba
declare_func \ifmt, abgr
declare_func \ifmt, bgra
+ declare_func \ifmt, gbrp
.endm
declare_rgb_funcs nv12
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".