> On Apr 25, 2025, at 16:25, Martin Storsjö <[email protected]> wrote:
>
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>
>> From: Zhao Zhili <[email protected]>
>>
>> int8_t[] is enough for offset_table of 8 bit streams.
>>
>> On rpi5:
>> Before After
>> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x)
>> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x)
>> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x)
>> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h | 4 +
>> libavcodec/aarch64/h26x/sao_neon.S | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +-
>> libavcodec/aarch64/vvc/dsp_init.c | 5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst,
>> const uint8_t *_src,
>> ptrdiff_t stride_dst, ptrdiff_t
>> stride_src,
>> const int16_t *sao_offset_val, int
>> sao_left_class,
>> int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t
>> *_src,
>> + ptrdiff_t stride_dst, ptrdiff_t
>> stride_src,
>> + const int16_t *sao_offset_val, int
>> sao_left_class,
>> + int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src,
>> ptrdiff_t stride_dst,
>> const int16_t *sao_offset_val, int
>> eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src,
>> ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S
>> b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> // int16_t *sao_offset_val, int sao_left_class,
>> // int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> - stp xzr, xzr, [sp, #-64]!
>> + stp xzr, xzr, [sp, #-32]!
>> stp xzr, xzr, [sp, #16]
>> - stp xzr, xzr, [sp, #32]
>> - stp xzr, xzr, [sp, #48]
>> mov w8, #4
>> -0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> - subs w8, w8, #1
>> - add w10, w8, w5 // k + sao_left_class
>> +0:
>> + ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> + subs w8, w8, #1
>> + add w10, w8, w5 // k + sao_left_class
>> and w10, w10, #0x1F
>> - strh w9, [sp, x10, lsl #1]
>> + strb w9, [sp, x10]
>> bne 0b
>> - add w6, w6, #7
>> - bic w6, w6, #7
>> - ld1 {v16.16b-v19.16b}, [sp], #64
>> - sub x2, x2, x6
>> - sub x3, x3, x6
>> - movi v20.8h, #1
>> -1: mov w8, w6 // beginning of line
>> -2: // Simple layout for accessing 16bit values
>> - // with 8bit LUT.
>> - //
>> - // 00 01 02 03 04 05 06 07
>> - // +----------------------------------->
>> - // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> - // +----------------------------------->
>> - // i-0 i-1 i-2 i-3
>> - ld1 {v2.8b}, [x1], #8 // dst[x] =
>> av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> - subs w8, w8, #8
>> - uxtl v0.8h, v2.8b // load src[x]
>> - ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
>> - shl v1.8h, v2.8h, #1 // low (x2, accessing
>> short)
>> - add v3.8h, v1.8h, v20.8h // +1 access upper short
>> - sli v1.8h, v3.8h, #8 // shift insert index to
>> upper byte
>> - tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> - add v1.8h, v0.8h, v2.8h // src[x] + table
>> - sqxtun v4.8b, v1.8h // clip + narrow
>> - st1 {v4.8b}, [x0], #8 // store
>> - // done 8 pixels
>> + ldp q16, q17, [sp], #32
>> +1:
>> + ld1 {v2.8b}, [x1], x3
>> + subs w7, w7, #1
>> + uxtl v0.8h, v2.8b
>> + ushr v3.8b, v2.8b, #3 // >> BIT_DEPTH - 3
>
> Nitpick: The comment on this line seems to be misaligned with the other
> comments below - please check.
Fixed before push.
>
>> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
>
> Is there any specific reason for preferring tbx over tbl here? (I know the
> existing code used tbx.) Without having studied cycle tables, I would expect
> tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx
> is faster)?
tbl can be faster. The result is quite impressive. Changed to tbl before push.
                           Before            tbx               tbl
hevc_sao_band_8_8_c:         252.3 ( 1.00x)    252.3 ( 1.00x)    252.3 ( 1.00x)
hevc_sao_band_8_8_neon:       95.8 ( 2.63x)     61.0 ( 4.14x)     61.0 ( 4.57x)
hevc_sao_band_16_8_c:        875.2 ( 1.00x)    864.9 ( 1.00x)    864.9 ( 1.00x)
hevc_sao_band_16_8_neon:     317.5 ( 2.76x)    150.0 ( 5.76x)    150.0 ( 6.26x)
hevc_sao_band_32_8_c:       3853.5 ( 1.00x)   3871.6 ( 1.00x)   3871.6 ( 1.00x)
hevc_sao_band_32_8_neon:    1222.3 ( 3.15x)    550.6 ( 7.03x)    550.6 ( 7.39x)
hevc_sao_band_48_8_c:       8203.6 ( 1.00x)   8182.6 ( 1.00x)   8182.6 ( 1.00x)
hevc_sao_band_48_8_neon:    2685.7 ( 3.05x)   1185.8 ( 6.90x)   1185.8 ( 7.36x)
hevc_sao_band_64_8_c:      14023.0 ( 1.00x)  14038.9 ( 1.00x)  14038.9 ( 1.00x)
hevc_sao_band_64_8_neon:    4783.2 ( 2.93x)   2078.4 ( 6.75x)   2078.4 ( 7.15x)
>
>
> Other than these comments, this patch looks good to me, thanks - feel free to
> push.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected] <mailto:[email protected]>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] <mailto:[email protected]> with
> subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".