> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <[email protected]> wrote:
>
>
>
> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <[email protected]
> <mailto:[email protected]>> a écrit :
>> From: Zhao Zhili <[email protected]>
>>
>> Test on Apple M1:
>>
>> rgb24_to_uv_1080_c: 7.2
>> rgb24_to_uv_1080_neon: 5.5
>> rgb24_to_uv_1280_c: 8.2
>> rgb24_to_uv_1280_neon: 6.2
>> rgb24_to_uv_1920_c: 12.5
>> rgb24_to_uv_1920_neon: 9.5
>>
>> rgb24_to_uv_half_540_c: 6.5
>> rgb24_to_uv_half_540_neon: 3.0
>> rgb24_to_uv_half_640_c: 7.5
>> rgb24_to_uv_half_640_neon: 3.2
>> rgb24_to_uv_half_960_c: 12.5
>> rgb24_to_uv_half_960_neon: 6.0
>>
>> rgb24_to_y_1080_c: 4.5
>> rgb24_to_y_1080_neon: 3.5
>> rgb24_to_y_1280_c: 5.2
>> rgb24_to_y_1280_neon: 4.2
>> rgb24_to_y_1920_c: 8.0
>> rgb24_to_y_1920_neon: 6.0
>>
>> Signed-off-by: Zhao Zhili <[email protected]>
>> ---
>> libswscale/aarch64/Makefile | 1 +
>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
>> libswscale/aarch64/swscale.c | 25 ++++
>> 3 files changed, 255 insertions(+)
>> create mode 100644 libswscale/aarch64/input.S
>>
>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>> index da1d909561..adfd90a1b6 100644
>> --- a/libswscale/aarch64/Makefile
>> +++ b/libswscale/aarch64/Makefile
>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
>> aarch64/swscale_unscaled.o \
>>
>> NEON-OBJS += aarch64/hscale.o \
>> + aarch64/input.o \
>> aarch64/output.o \
>> aarch64/rgb2rgb_neon.o \
>> aarch64/yuv2rgb_neon.o \
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> new file mode 100644
>> index 0000000000..ee0d223c6e
>> --- /dev/null
>> +++ b/libswscale/aarch64/input.S
>> @@ -0,0 +1,229 @@
>> +/*
>> + * Copyright (c) 2024 Zhao Zhili <[email protected]>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
>> USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +.macro rgb24_to_yuv_load_rgb, src
>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>> + ushll v19.8h, v16.8b, #0 // v19: r
>> + ushll v20.8h, v17.8b, #0 // v20: g
>> + ushll v21.8h, v18.8b, #0 // v21: b
>> + ushll2 v22.8h, v16.16b, #0 // v22: r
>> + ushll2 v23.8h, v17.16b, #0 // v23: g
>> + ushll2 v24.8h, v18.16b, #0 // v24: b
>> +.endm
>> +
>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2,
>> right_shift
>> + mov \dst1\().16b, v6.16b // dst1 =
>> const_offset
>> + mov \dst2\().16b, v6.16b // dst2 =
>> const_offset
>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 +=
>> rx * r
>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 +=
>> rx * r
>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 +=
>> gx * g
>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 +=
>> gx * g
>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 +=
>> bx * b
>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 +=
>> bx * b
>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift //
>> dst_lower_half = dst1 >> right_shift
>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift //
>> dst_higher_half = dst2 >> right_shift
>> +.endm
>> +
>> +function ff_rgb24ToY_neon, export=1
>> + cmp w4, #0 // check width > 0
>> + b.le 4f
>> +
>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy
>
> I don't think it affects anything on your OoO execution hardware, but you're
> using the result of this load right off the bat in the next instruction.
> Ditto below. This may hurt perfs on not-so-fancy CPUs.
Will do.
>
>> + dup v0.8h, w10
>> + dup v1.8h, w11
>> + ldr w12, [x5] // w12: by
>> + dup v2.8h, w12
>> +
>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT
>> - 7)
>> + movk w9, #8, lsl #16 // w9 += 32 <<
>> (RGB2YUV_SHIFT - 1)
>> + dup v6.4s, w9 // w9: const_offset
>> +
>> + mov x2, #0 // w2: i
>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>> + cbz w3, 3f
>> +1:
>> + rgb24_to_yuv_load_rgb x1
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> + stp q16, q17, [x0], #32 // store to dst
>> +
>> + add w2, w2, #16 // i += 16
>> + add x1, x1, #48 // src += 48
>> + cmp w2, w3 // i < (width / 16 * 16)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w13, [x1] // w13: r
>> + ldrb w14, [x1, #1] // w14: g
>> + ldrb w15, [x1, #2] // w15: b
>
> You can reorder instructions a little to use post-index and eliminate the
> ADD, though that won't make much difference.
>
> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
I have checked the assembly of the C version. The compiler has done pretty well at
loop unrolling and
vectorization in this simple case.
>
>> +
>> + smaddl x13, w13, w10, x9 // x13 = ry * r +
>> const_offset
>> + smaddl x13, w14, w11, x13 // x13 += gy * g
>> + smaddl x13, w15, w12, x13 // x13 += by * b
>> + asr w13, w13, #9 // x13 >>= 9
>> + strh w13, [x0], #2 // store to dst
>> +
>> + add w2, w2, #1 // i++
>> + add x1, x1, #3 // src += 3
>> +3:
>> + cmp w2, w4 // i < width
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> +
>> +.macro rgb24_load_uv_coeff half
>> + add x6, x6, #12
>> +
>> + ldp w10, w11, [x6], #8 // w10: ru, w11: gu
>> + dup v0.8h, w10
>> + dup v1.8h, w11
>> +
>> + ldp w12, w13, [x6], #8 // w12: bu, w13: rv
>> + dup v2.8h, w12
>> + dup v3.8h, w13
>> +
>> + ldp w14, w15, [x6], #8 // w14: gv, w15: bv
>> + dup v4.8h, w14
>> + dup v5.8h, w15
>> +
>> + .if \half
>> + mov w9, #512
>> + movk w9, #128, lsl #16 // w9: const_offset
>> + .else
>> + mov w9, #256
>> + movk w9, #64, lsl #16 // w9: const_offset
>> + .endif
>> + dup v6.4s, w9
>> +.endm
>> +
>> +function ff_rgb24ToUV_half_neon, export=1
>> + cmp w5, #0 // check width > 0
>> + b.le 4f
>> +
>> + rgb24_load_uv_coeff half=1
>> +
>> + mov x9, #0 // x9: i
>> + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
>> + cbz w7, 3f
>> +1:
>> + ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>> + uaddlp v19.8h, v16.16b // v19: r
>> + uaddlp v20.8h, v17.16b // v20: g
>> + uaddlp v21.8h, v18.16b // v21: b
>> +
>> + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> + str q16, [x0], #16 // store dst_u
>> + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> + str q17, [x1], #16 // store dst_v
>> +
>> + add w9, w9, #8 // i += 8
>> + add x3, x3, #48 // src += 48
>> + cmp w9, w7 // i < (width / 8 * 8)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w2, [x3] // w2: r1
>> + ldrb w4, [x3, #3] // w4: r2
>> + add w2, w2, w4 // w2 = r1 + r2
>> +
>> + ldrb w4, [x3, #1] // w4: g1
>> + ldrb w7, [x3, #4] // w7: g2
>> + add w4, w4, w7 // w4 = g1 + g2
>> +
>> + ldrb w7, [x3, #2] // w7: b1
>> + ldrb w8, [x3, #5] // w8: b2
>> + add w7, w7, w8 // w7 = b1 + b2
>> +
>> + umov w8, v6.s[0] // dst_u = const_offset
>> + smaddl x8, w2, w10, x8 // dst_u += ru * r
>> + smaddl x8, w4, w11, x8 // dst_u += gu * g
>> + smaddl x8, w7, w12, x8 // dst_u += bu * b
>> + asr x8, x8, #10 // dst_u >>= 10
>> + strh w8, [x0], #2 // store dst_u
>> +
>> + umov w8, v6.s[0] // dst_v = const_offset
>> + smaddl x8, w2, w13, x8 // dst_v += rv * r
>> + smaddl x8, w4, w14, x8 // dst_v += gv * g
>> + smaddl x8, w7, w15, x8 // dst_v += bv * b
>> + asr x8, x8, #10 // dst_v >>= 10
>> + strh w8, [x1], #2 // store dst_v
>> +
>> + add w9, w9, #1 // i++
>> + add x3, x3, #6 // src += 6
>> +3:
>> + cmp w9, w5
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> +
>> +function ff_rgb24ToUV_neon, export=1
>> + cmp w5, #0 // check width > 0
>> + b.le 4f
>> +
>> + rgb24_load_uv_coeff half=0
>> +
>> + mov x2, #0 // w2: i
>> + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
>> + cbz w4, 3f
>> +1:
>> + rgb24_to_yuv_load_rgb x3
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> + stp q16, q17, [x0], #32 // store to dst_u
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>> + stp q16, q17, [x1], #32 // store to dst_v
>> +
>> + add w2, w2, #16 // i += 16
>> + add x3, x3, #48 // src += 48
>> + cmp w2, w4 // i < (width / 16 * 16)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w16, [x3] // w16: r
>> + ldrb w17, [x3, #1] // w17: g
>> + ldrb w4, [x3, #2] // w4: b
>> +
>> + umov w7, v6.s[0] // w7 = const_offset
>> +
>> + smaddl x8, w16, w10, x7 // x8 = ru * r +
>> const_offset
>> + smaddl x8, w17, w11, x8 // x8 += gu * g
>> + smaddl x8, w4, w12, x8 // x8 += bu * b
>> + asr w8, w8, #9 // x8 >>= 9
>> + strh w8, [x0], #2 // store to dst_u
>> +
>> + smaddl x8, w16, w13, x7 // x8 = rv * r +
>> const_offset
>> + smaddl x8, w17, w14, x8 // x8 += gv * g
>> + smaddl x8, w4, w15, x8 // x8 += bv * b
>> + asr w8, w8, #9 // x8 >>= 9
>> + strh w8, [x1], #2 // store to dst_v
>> +
>> + add w2, w2, #1 // i++
>> + add x3, x3, #3 // src += 3
>> +3:
>> + cmp w2, w5 // i < width
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>> index bbd9719a44..4c4ea39dc1 100644
>> --- a/libswscale/aarch64/swscale.c
>> +++ b/libswscale/aarch64/swscale.c
>> @@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
>> default: break; \
>> }
>>
>> +void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t
>> *unused1,
>> + const uint8_t *unused2, int width,
>> + uint32_t *rgb2yuv, void *opq);
>> +
>> +void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t
>> *unused0,
>> + const uint8_t *src1,
>> + const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> + void *opq);
>> +
>> +void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t
>> *unused0,
>> + const uint8_t *src1,
>> + const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> + void *opq);
>> +
>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> {
>> int cpu_flags = av_get_cpu_flags();
>> @@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> if (c->dstBpc == 8) {
>> c->yuv2planeX = ff_yuv2planeX_8_neon;
>> }
>> + switch (c->srcFormat) {
>> + case AV_PIX_FMT_RGB24:
>> + c->lumToYV12 = ff_rgb24ToY_neon;
>> + if (c->chrSrcHSubSample)
>> + c->chrToYV12 = ff_rgb24ToUV_half_neon;
>> + else
>> + c->chrToYV12 = ff_rgb24ToUV_neon;
>> + break;
>> + default:
>> + break;
>> + }
>> }
>> }
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected] <mailto:[email protected]>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] <mailto:[email protected]> with
> subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".