PR #22338 opened by hezuoqiang URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22338 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22338.patch
Because NEON registers have a fixed width, the NEON version needs none of the long-int-width handling logic present in the C version, and the mask calculation is also simpler. The loop at label 1 in the assembly code is unrolled to process 32 elements of data per iteration, which delivers a slight performance gain compared to processing 16 elements per iteration. Benchmark Results (1024 iterations, Raspberry Pi 5 - Cortex-A76): - add_int16_128_c: 914.5 ( 1.00x) - add_int16_128_neon: 452.5 ( 2.02x) - add_int16_rnd_width_c: 914.4 ( 1.00x) - add_int16_rnd_width_neon: 455.6 ( 2.01x) From 2f0efc1640a3be345ad8b84c46e99b7b3eb275b7 Mon Sep 17 00:00:00 2001 From: Zuoqiang He <[email protected]> Date: Mon, 2 Mar 2026 00:37:57 +0800 Subject: [PATCH] libavcodec/huffyuvdsp: Add NEON optimization for the add_int16 function Because NEON registers have a fixed width, the NEON version needs none of the long-int-width handling logic present in the C version, and the mask calculation is also simpler. The loop at label 1 in the assembly code is unrolled to process 32 elements of data per iteration, which delivers a slight performance gain compared to processing 16 elements per iteration. 
Benchmark Results (1024 iterations, Raspberry Pi 5 - Cortex-A76): - add_int16_128_c: 914.5 ( 1.00x) - add_int16_128_neon: 452.5 ( 2.02x) - add_int16_rnd_width_c: 914.4 ( 1.00x) - add_int16_rnd_width_neon: 455.6 ( 2.01x) --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/huffyuvdsp_init_aarch64.c | 35 +++++++++ libavcodec/aarch64/huffyuvdsp_neon.S | 75 ++++++++++++++++++++ libavcodec/huffyuvdsp.c | 2 + libavcodec/huffyuvdsp.h | 2 + 5 files changed, 116 insertions(+) create mode 100644 libavcodec/aarch64/huffyuvdsp_init_aarch64.c create mode 100644 libavcodec/aarch64/huffyuvdsp_neon.S diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index e3abdbfd72..41ab0257b3 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -4,6 +4,7 @@ OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_init_aarch64.o OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o +OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o @@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +NEON-OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_neon.o NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ aarch64/simple_idct_neon.o NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o diff --git a/libavcodec/aarch64/huffyuvdsp_init_aarch64.c b/libavcodec/aarch64/huffyuvdsp_init_aarch64.c new file mode 100644 index 0000000000..210c58589a --- /dev/null +++ b/libavcodec/aarch64/huffyuvdsp_init_aarch64.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2025 + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/huffyuvdsp.h" + +void ff_add_int16_neon(uint16_t *dst, const uint16_t *src, unsigned mask, int w); + +av_cold void ff_huffyuvdsp_init_aarch64(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->add_int16 = ff_add_int16_neon; + } +} diff --git a/libavcodec/aarch64/huffyuvdsp_neon.S b/libavcodec/aarch64/huffyuvdsp_neon.S new file mode 100644 index 0000000000..5c0d970c7a --- /dev/null +++ b/libavcodec/aarch64/huffyuvdsp_neon.S @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2025 + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// Add int16 array with masking +// On entry: +// x0 -> destination array (uint16_t*) +// x1 -> source array (const uint16_t*) +// w2 = mask value +// w3 = number of elements +function ff_add_int16_neon, export=1 + dup v4.8h, w2 + + // Process 32 elements (64 bytes) if available +1: cmp w3, #32 + b.lt 2f + ld1 {v0.8h, v1.8h}, [x1], #32 + ld1 {v2.8h, v3.8h}, [x0] + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + st1 {v0.8h, v1.8h}, [x0], #32 + + ld1 {v0.8h, v1.8h}, [x1], #32 + ld1 {v2.8h, v3.8h}, [x0] + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + st1 {v0.8h, v1.8h}, [x0], #32 + sub w3, w3, #32 + b 1b + + // Process 8 elements (16 bytes) if available +2: cmp w3, #8 + b.lt 3f + ld1 {v0.8h}, [x1], #16 + ld1 {v1.8h}, [x0] + add v0.8h, v0.8h, v1.8h + and v0.16b, v0.16b, v4.16b + st1 {v0.8h}, [x0], #16 + sub w3, w3, #8 + b 2b + + // Scalar path for remaining elements +3: cbz w3, 4f + ldrh w5, [x1], #2 + ldrh w6, [x0] + add w5, w5, w6 + and w5, w5, w2 + strh w5, [x0], #2 + subs w3, w3, #1 + b.ne 3b + +4: ret +endfunc diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c index 1ae2f820d0..888b3e4474 100644 --- a/libavcodec/huffyuvdsp.c +++ b/libavcodec/huffyuvdsp.c @@ -89,6 +89,8 @@ av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt #if ARCH_RISCV ff_huffyuvdsp_init_riscv(c, pix_fmt); +#elif ARCH_AARCH64 + ff_huffyuvdsp_init_aarch64(c, pix_fmt); #elif ARCH_X86 && HAVE_X86ASM ff_huffyuvdsp_init_x86(c, pix_fmt); #endif diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h index 34bed58ef2..3f319b17c2 100644 --- a/libavcodec/huffyuvdsp.h +++ 
b/libavcodec/huffyuvdsp.h @@ -34,6 +34,8 @@ typedef struct HuffYUVDSPContext { } HuffYUVDSPContext; void ff_huffyuvdsp_init(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt); +void ff_huffyuvdsp_init_aarch64(HuffYUVDSPContext *c, + enum AVPixelFormat pix_fmt); void ff_huffyuvdsp_init_riscv(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt); void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt); -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
