PR #22288 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22288 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22288.patch
>From 47d966306d84a0fb91d8fc8bed22648d6d097f5c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 20:00:46 +0100 Subject: [PATCH 01/10] avcodec/huffyuvenc: Calculate mask only once Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/huffyuvenc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index b213d4dc95..370b383de5 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -54,7 +54,7 @@ typedef struct HYuvEncContext { int bitstream_bpp; int version; int bps; - int n; // 1<<bps + unsigned mask; // (1<<bps)-1 int vlc_n; // number of vlc codes (FFMIN(1<<bps, MAX_VLC_N)) int alpha; int chroma; @@ -84,7 +84,7 @@ static inline void diff_bytes(HYuvEncContext *s, uint8_t *dst, if (s->bps <= 8) { s->llvidencdsp.diff_bytes(dst, src0, src1, w); } else { - s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w); + s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->mask, w); } } @@ -114,7 +114,7 @@ static inline int sub_left_prediction(HYuvEncContext *s, uint8_t *dst, } if (w < 32) return left; - s->hencdsp.diff_int16(dst16 + 32, src16 + 32, src16 + 31, s->n - 1, w - 32); + s->hencdsp.diff_int16(dst16 + 32, src16 + 32, src16 + 31, s->mask, w - 32); return src16[w-1]; } } @@ -190,7 +190,8 @@ static void sub_median_prediction(HYuvEncContext *s, uint8_t *dst, if (s->bps <= 8) { s->llvidencdsp.sub_median_pred(dst, src1, src2, w , left, left_top); } else { - s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top); + s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, + (const uint16_t *)src2, s->mask, w, left, left_top); } } @@ -274,6 +275,9 @@ static av_cold int encode_init(AVCodecContext *avctx) s->chroma_h_shift = desc->log2_chroma_w; 
s->chroma_v_shift = desc->log2_chroma_h; + s->mask = (1 << s->bps) - 1; + s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N); + switch (avctx->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUV422P: @@ -335,8 +339,6 @@ static av_cold int encode_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_ERROR, "format not supported\n"); return AVERROR(EINVAL); } - s->n = 1<<s->bps; - s->vlc_n = FFMIN(s->n, MAX_VLC_N); avctx->bits_per_coded_sample = s->bitstream_bpp; s->decorrelate = s->bitstream_bpp >= 24 && !s->yuv && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR); @@ -587,7 +589,7 @@ do { \ if (s->bps <= 8) { ENCODE_PLANE(LOAD2, LOADEND, WRITE2, WRITEEND, STAT2, STATEND); } else if (s->bps <= 14) { - int mask = s->n - 1; + unsigned mask = s->mask; ENCODE_PLANE(LOAD2_14, LOADEND_14, WRITE2, WRITEEND, STAT2, STATEND); } else { -- 2.52.0 >From 82a0114e403764717637b30df7a685dd4d168015 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 20:13:50 +0100 Subject: [PATCH 02/10] avcodec/huffyuvenc: Mark unreachable code as such Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/huffyuvenc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index 370b383de5..8f320f0272 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -336,8 +336,7 @@ static av_cold int encode_init(AVCodecContext *avctx) s->bitstream_bpp = 24; break; default: - av_log(avctx, AV_LOG_ERROR, "format not supported\n"); - return AVERROR(EINVAL); + av_unreachable("Already checked via CODEC_PIXFMTS"); } avctx->bits_per_coded_sample = s->bitstream_bpp; -- 2.52.0 >From 397d53d9c8cb0388e00d104b4608dbaf23449469 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 20:33:04 +0100 Subject: [PATCH 03/10] avcodec/huffyuvencdsp: Pass bpp, not AVPixelFormat for init Avoids having to get a pixel format descriptor. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/huffyuvenc.c | 3 ++- libavcodec/huffyuvencdsp.c | 4 ++-- libavcodec/huffyuvencdsp.h | 6 ++---- libavcodec/x86/huffyuvencdsp_init.c | 6 ++---- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index 8f320f0272..0f2cf1791d 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -253,7 +253,6 @@ static av_cold int encode_init(AVCodecContext *avctx) s->flags = avctx->flags; ff_bswapdsp_init(&s->bdsp); - ff_huffyuvencdsp_init(&s->hencdsp, avctx->pix_fmt); ff_llvidencdsp_init(&s->llvidencdsp); avctx->extradata = av_mallocz(3*MAX_N + 4); @@ -278,6 +277,8 @@ static av_cold int encode_init(AVCodecContext *avctx) s->mask = (1 << s->bps) - 1; s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N); + ff_huffyuvencdsp_init(&s->hencdsp, s->bps); + switch (avctx->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUV422P: diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c index e332f678d4..dcae51f4f8 100644 --- a/libavcodec/huffyuvencdsp.c +++ b/libavcodec/huffyuvencdsp.c @@ -84,12 +84,12 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co *left_top = lt; } -av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt) +av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp) { c->diff_int16 = diff_int16_c; c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c; #if ARCH_X86 && HAVE_X86ASM - ff_huffyuvencdsp_init_x86(c, pix_fmt); + ff_huffyuvencdsp_init_x86(c, bpp); #endif } diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h index 779a51ac79..fae182add1 100644 --- a/libavcodec/huffyuvencdsp.h +++ b/libavcodec/huffyuvencdsp.h @@ -21,8 +21,6 @@ #include <stdint.h> -#include "libavutil/pixfmt.h" - typedef struct HuffYUVEncDSPContext { void (*diff_int16)(uint16_t *dst /* align 16 */, const uint16_t *src1 /* align 16 */, @@ -34,7 +32,7 @@ 
typedef struct HuffYUVEncDSPContext { int w, int *left, int *left_top); } HuffYUVEncDSPContext; -void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt); -void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt); +void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp); +void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp); #endif /* AVCODEC_HUFFYUVENCDSP_H */ diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index c9c33b75b4..fd54fdcc00 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -24,7 +24,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/pixdesc.h" #include "libavutil/x86/cpu.h" #include "libavcodec/huffyuvencdsp.h" @@ -35,12 +34,11 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); -av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt) +av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp) { av_unused int cpu_flags = av_get_cpu_flags(); - const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt); - if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) { + if (EXTERNAL_MMXEXT(cpu_flags) && bpp < 16) { c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; } -- 2.52.0 >From 12fbc3f8187da1f3b4a7fa82fa6ef6959757e263 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 21:38:30 +0100 Subject: [PATCH 04/10] tests/checkasm: Add huffyuvencdsp test Only covers sub_hfyu_median_pred_int16 for now. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/huffyuvencdsp.c | 89 ++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 95 insertions(+) create mode 100644 tests/checkasm/huffyuvencdsp.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 48de4d22a0..491bde2a7a 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -13,6 +13,7 @@ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o +AVCODECOBJS-$(CONFIG_HUFFYUVENCDSP) += huffyuvencdsp.o AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index bdaaa8695d..38bd1edce7 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -198,6 +198,9 @@ static const struct { #if CONFIG_HUFFYUV_DECODER { "huffyuvdsp", checkasm_check_huffyuvdsp }, #endif + #if CONFIG_HUFFYUVENCDSP + { "huffyuvencdsp", checkasm_check_huffyuvencdsp }, + #endif #if CONFIG_IDCTDSP { "idctdsp", checkasm_check_idctdsp }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 2a6c7e8ea6..db30ddb863 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -116,6 +116,7 @@ void checkasm_check_hevc_pel(void); void checkasm_check_hevc_sao(void); void checkasm_check_hpeldsp(void); void checkasm_check_huffyuvdsp(void); +void checkasm_check_huffyuvencdsp(void); void checkasm_check_idctdsp(void); void checkasm_check_idet(void); void checkasm_check_jpeg2000dsp(void); diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c new file mode 100644 index 0000000000..049a7d126b --- /dev/null +++ b/tests/checkasm/huffyuvencdsp.c @@ -0,0 +1,89 @@ 
+/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "checkasm.h" +#include "libavcodec/huffyuvencdsp.h" +#include "libavutil/cpu.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" + +enum { + MAX_WIDTH = 4096, ///< maximum test width, must be a power of two smaller than the maximum alignment +}; + +#define randomize_buffers(buf, size, mask) \ + do { \ + for (size_t j = 0; j < size; ++j) \ + buf[j] = rnd() & mask; \ + } while (0) + + +static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width) +{ + static const int bpps[] = { 9, 16, }; + HuffYUVEncDSPContext c; + + declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint16_t *dst, const uint16_t *src1, + const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); + + for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) { + const int bpp = bpps[i]; + + ff_huffyuvencdsp_init(&c, bpp); + + if (check_func(c.sub_hfyu_median_pred_int16, "sub_hfyu_median_pred_int16_%dbpp%s", bpp, aligned)) { + DECLARE_ALIGNED(32, uint16_t, dst0)[MAX_WIDTH]; + DECLARE_ALIGNED(32, uint16_t, dst1)[MAX_WIDTH]; + uint16_t src1[MAX_WIDTH]; + uint16_t src2[MAX_WIDTH]; + const unsigned mask = (1 << bpp) - 1; + int l1 = rnd() & mask, lt1 = 
rnd() & mask, l2 = l1, lt2 = lt1; + + randomize_buffers(src1, width, mask); + randomize_buffers(src2, width, mask); + + call_ref(dst0, src1, src2, mask, width, &l1, &lt1); + call_new(dst1, src1, src2, mask, width, &l2, &lt2); + if (l1 != l2 || lt1 != lt2 || memcmp(dst0, dst1, width * sizeof(dst0[0]))) + fail(); + bench_new(dst1, src1, src2, mask, width, &l2, &lt2); + } + } +} + +void checkasm_check_huffyuvencdsp(void) +{ + static unsigned width = 0; + + if (!width) { + width = rnd() % MAX_WIDTH; + width = width ? width : 1; + } + + const size_t align = av_cpu_max_align(); + + check_sub_hfyu_median_pred_int16("_aligned", FFALIGN(width, align / sizeof(uint16_t))); + report("sub_hfyu_median_pred_int16_aligned"); + + check_sub_hfyu_median_pred_int16("", width); + report("sub_hfyu_median_pred_int16"); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 16c6f1f775..b05dc61f67 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -33,6 +33,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-hevc_sao \ fate-checkasm-hpeldsp \ fate-checkasm-huffyuvdsp \ + fate-checkasm-huffyuvencdsp \ fate-checkasm-idctdsp \ fate-checkasm-jpeg2000dsp \ fate-checkasm-llauddsp \ -- 2.52.0 >From 42c44aae631e58f4c456e760fd36fd92ec62a449 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 21:49:31 +0100 Subject: [PATCH 05/10] tests/checkasm: Fix huffyuvdsp test criterion Use CONFIG_HUFFYUVDSP, not CONFIG_HUFFYUV_DECODER (although they are equivalent). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 491bde2a7a..a9b58f5d1d 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -13,6 +13,7 @@ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o +AVCODECOBJS-$(CONFIG_HUFFYUVDSP) += huffyuvdsp.o AVCODECOBJS-$(CONFIG_HUFFYUVENCDSP) += huffyuvencdsp.o AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o @@ -39,7 +40,6 @@ AVCODECOBJS-$(CONFIG_DCA_DECODER) += dcadsp.o synth_filter.o AVCODECOBJS-$(CONFIG_DIRAC_DECODER) += diracdsp.o AVCODECOBJS-$(CONFIG_EXR_DECODER) += exrdsp.o AVCODECOBJS-$(CONFIG_FLAC_DECODER) += flacdsp.o -AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 38bd1edce7..407267a4c3 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -195,7 +195,7 @@ static const struct { #if CONFIG_HPELDSP { "hpeldsp", checkasm_check_hpeldsp }, #endif - #if CONFIG_HUFFYUV_DECODER + #if CONFIG_HUFFYUVDSP { "huffyuvdsp", checkasm_check_huffyuvdsp }, #endif #if CONFIG_HUFFYUVENCDSP -- 2.52.0 >From f59de7aeb552df07979d51d9a53714e968639bb9 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Feb 2026 21:50:46 +0100 Subject: [PATCH 06/10] avcodec/huffyuvencdsp: Add width parameter to init This allows using certain functions that operate on wide registers only if there is enough work to do and if one can even read a whole register's width without overreading. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/huffyuvenc.c | 2 +- libavcodec/huffyuvencdsp.c | 4 ++-- libavcodec/huffyuvencdsp.h | 4 ++-- libavcodec/x86/huffyuvencdsp_init.c | 2 +- tests/checkasm/huffyuvencdsp.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index 0f2cf1791d..f7211d1ba0 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -277,7 +277,7 @@ static av_cold int encode_init(AVCodecContext *avctx) s->mask = (1 << s->bps) - 1; s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N); - ff_huffyuvencdsp_init(&s->hencdsp, s->bps); + ff_huffyuvencdsp_init(&s->hencdsp, s->bps, avctx->width >> s->chroma_h_shift); switch (avctx->pix_fmt) { case AV_PIX_FMT_YUV420P: diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c index dcae51f4f8..9dd84dbafe 100644 --- a/libavcodec/huffyuvencdsp.c +++ b/libavcodec/huffyuvencdsp.c @@ -84,12 +84,12 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co *left_top = lt; } -av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp) +av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp, int width) { c->diff_int16 = diff_int16_c; c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c; #if ARCH_X86 && HAVE_X86ASM - ff_huffyuvencdsp_init_x86(c, bpp); + ff_huffyuvencdsp_init_x86(c, bpp, width); #endif } diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h index fae182add1..173fbca08f 100644 --- a/libavcodec/huffyuvencdsp.h +++ b/libavcodec/huffyuvencdsp.h @@ -32,7 +32,7 @@ typedef struct HuffYUVEncDSPContext { int w, int *left, int *left_top); } HuffYUVEncDSPContext; -void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp); -void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp); +void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp, int width); +void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width); #endif 
/* AVCODEC_HUFFYUVENCDSP_H */ diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index fd54fdcc00..153edabf02 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -34,7 +34,7 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); -av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp) +av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width) { av_unused int cpu_flags = av_get_cpu_flags(); diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c index 049a7d126b..a74b4295d6 100644 --- a/tests/checkasm/huffyuvencdsp.c +++ b/tests/checkasm/huffyuvencdsp.c @@ -48,7 +48,7 @@ static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) { const int bpp = bpps[i]; - ff_huffyuvencdsp_init(&c, bpp); + ff_huffyuvencdsp_init(&c, bpp, width); if (check_func(c.sub_hfyu_median_pred_int16, "sub_hfyu_median_pred_int16_%dbpp%s", bpp, aligned)) { DECLARE_ALIGNED(32, uint16_t, dst0)[MAX_WIDTH]; -- 2.52.0 >From 0bbc5641732210564dd836166f7dabe56a1f8b2f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Feb 2026 00:43:09 +0100 Subject: [PATCH 07/10] avcodec/x86/huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16 Contrary to the MMXEXT version this version does not overread at all (the MMXEXT version processes the input of 2*w width in eight byte chunks and overreads by a further six bytes, because it loads the next left and left top values at the end of the loop, i.e. it reads FFALIGN(2*w,8)+6 bytes instead of 2*w). 
Benchmarks: sub_hfyu_median_pred_int16_9bpp_c: 12673.6 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_mmxext: 1947.7 ( 6.51x) sub_hfyu_median_pred_int16_9bpp_sse2: 993.9 (12.75x) sub_hfyu_median_pred_int16_9bpp_aligned_c: 12596.1 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_aligned_mmxext: 1956.1 ( 6.44x) sub_hfyu_median_pred_int16_9bpp_aligned_sse2: 989.4 (12.73x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/huffyuvencdsp.asm | 50 +++++++++++++++++++++++++++++ libavcodec/x86/huffyuvencdsp_init.c | 4 +++ 2 files changed, 54 insertions(+) diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 8bfd0face0..3d38931893 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -94,3 +94,53 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_ movzx maskd, word [src2q + wq - 2] mov [leftq], maskd RET + +INIT_XMM sse2 +cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top + movd m5, maskd + lea wd, [wd+wd-(mmsize-1)] + movu m0, [src1q] + movu m2, [src2q] + SPLATW m5, m5 + add dstq, wq + movd m1, [left_topq] + neg wq + movd m3, [leftq] + sub src1q, wq + sub src2q, wq + pslldq m0, 2 + pslldq m2, 2 + por m0, m1 + por m2, m3 + jmp .init + +.loop: + movu m0, [src1q + wq - 2] ; lt + movu m2, [src2q + wq - 2] ; l +.init: + movu m1, [src1q + wq] ; t + movu m3, [src2q + wq] + psubw m4, m2, m0 ; l - lt + pmaxsw m0, m1, m2 + paddw m4, m1 ; l - lt + t + pminsw m2, m1 + pand m4, m5 ; (l - lt + t)&mask + pminsw m4, m0 + pmaxsw m4, m2 ; pred + psubw m3, m4 ; l - pred + pand m3, m5 + movu [dstq + wq], m3 + add wq, 16 + js .loop + + cmp wd, mmsize-1 + jne .tail + + movzx src1d, word [src1q + (mmsize-1) - 2] + movzx src2d, word [src2q + (mmsize-1) - 2] + mov [left_topq], src1d + mov [leftq], src2d + RET +.tail: + mov wq, -1 + jmp .loop diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index 153edabf02..e32b7ea19d 
100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -33,6 +33,8 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src unsigned mask, int w); void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); +void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w, int *left, int *left_top); av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width) { @@ -44,6 +46,8 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid if (EXTERNAL_SSE2(cpu_flags)) { c->diff_int16 = ff_diff_int16_sse2; + if (bpp < 16 && width >= 8) + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { -- 2.52.0 >From 8851c0b73444fa979ba49d27b430cdd476f50207 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Feb 2026 02:37:48 +0100 Subject: [PATCH 08/10] avcodec/x86/huffyuvencdsp: Add AVX2 sub_hfyu_median_pred_int16 This version can also process 16bpp. 
Benchmarks: sub_hfyu_median_pred_int16_9bpp_c: 12667.7 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_mmxext: 1966.5 ( 6.44x) sub_hfyu_median_pred_int16_9bpp_sse2: 997.6 (12.70x) sub_hfyu_median_pred_int16_9bpp_avx2: 474.8 (26.68x) sub_hfyu_median_pred_int16_9bpp_aligned_c: 12604.6 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_aligned_mmxext: 1964.6 ( 6.42x) sub_hfyu_median_pred_int16_9bpp_aligned_sse2: 981.9 (12.84x) sub_hfyu_median_pred_int16_9bpp_aligned_avx2: 462.6 (27.25x) sub_hfyu_median_pred_int16_16bpp_c: 12592.5 ( 1.00x) sub_hfyu_median_pred_int16_16bpp_avx2: 465.6 (27.04x) sub_hfyu_median_pred_int16_16bpp_aligned_c: 12587.5 ( 1.00x) sub_hfyu_median_pred_int16_16bpp_aligned_avx2: 462.5 (27.22x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/huffyuvencdsp.asm | 50 ++++++++++++++++++++--------- libavcodec/x86/huffyuvencdsp_init.c | 4 +++ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 3d38931893..11f4b8c01f 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -95,23 +95,32 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_ mov [leftq], maskd RET -INIT_XMM sse2 +%macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top - movd m5, maskd + movd xm5, maskd lea wd, [wd+wd-(mmsize-1)] - movu m0, [src1q] - movu m2, [src2q] - SPLATW m5, m5 + movu xm0, [src1q] + movu xm2, [src2q] + SPLATW m5, xm5 add dstq, wq - movd m1, [left_topq] + movd xm1, [left_topq] neg wq - movd m3, [leftq] + movd xm3, [leftq] +%if mmsize >= 32 + movu xm4, [src1q+14] +%endif sub src1q, wq + pslldq xm0, 2 + pslldq xm2, 2 + por xm0, xm1 +%if mmsize >= 32 + vinserti128 m0, xm4, 1 +%endif + por xm2, xm3 +%if mmsize >= 32 + vinserti128 m2, [src2q+14], 1 +%endif sub src2q, wq - pslldq m0, 2 - pslldq m2, 2 - por m0, m1 - por m2, m3 jmp .init 
.loop: @@ -121,16 +130,16 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_ movu m1, [src1q + wq] ; t movu m3, [src2q + wq] psubw m4, m2, m0 ; l - lt - pmaxsw m0, m1, m2 + pmax%1w m0, m1, m2 paddw m4, m1 ; l - lt + t - pminsw m2, m1 + pmin%1w m2, m1 pand m4, m5 ; (l - lt + t)&mask - pminsw m4, m0 - pmaxsw m4, m2 ; pred + pmin%1w m4, m0 + pmax%1w m4, m2 ; pred psubw m3, m4 ; l - pred pand m3, m5 movu [dstq + wq], m3 - add wq, 16 + add wq, mmsize js .loop cmp wd, mmsize-1 @@ -144,3 +153,12 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_ .tail: mov wq, -1 jmp .loop +%endmacro + +INIT_XMM sse2 +SUB_HFYU_MEDIAN_PRED_INT16 s + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +SUB_HFYU_MEDIAN_PRED_INT16 u +%endif diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index e32b7ea19d..7289e94bc7 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -35,6 +35,8 @@ void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, c unsigned mask, int w, int *left, int *left_top); void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); +void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w, int *left, int *left_top); av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width) { @@ -52,5 +54,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid if (EXTERNAL_AVX2_FAST(cpu_flags)) { c->diff_int16 = ff_diff_int16_avx2; + if (width >= 16) + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_avx2; } } -- 2.52.0 >From af610f25b25214bab2ba2bb5b0679b93b9b68323 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Feb 2026 02:44:37 +0100 Subject: [PATCH 09/10] avcodec/x86/huffyuvencdsp: 
Remove MMX sub_hfyu_median_pred_int16 Superseded by SSE2 and AVX2. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/huffyuvenc.c | 2 -- libavcodec/x86/huffyuvencdsp.asm | 40 ----------------------------- libavcodec/x86/huffyuvencdsp_init.c | 6 ----- tests/checkasm/huffyuvencdsp.c | 4 +-- 4 files changed, 2 insertions(+), 50 deletions(-) diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index f7211d1ba0..706d65597a 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -39,7 +39,6 @@ #include "huffyuvencdsp.h" #include "lossless_videoencdsp.h" #include "put_bits.h" -#include "libavutil/emms.h" #include "libavutil/mem.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" @@ -939,7 +938,6 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, } else { av_log(avctx, AV_LOG_ERROR, "Format not supported!\n"); } - emms_c(); size += (put_bits_count(&s->pb) + 31) / 8; put_bits(&s->pb, 16, 0); diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 11f4b8c01f..e8e7a6469d 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -55,46 +55,6 @@ INIT_YMM avx2 DIFF_INT16 %endif -INIT_MMX mmxext -cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top - add wd, wd - movd mm7, maskd - SPLATW mm7, mm7 - movq mm0, [src1q] - movq mm2, [src2q] - psllq mm0, 16 - psllq mm2, 16 - movd mm6, [left_topq] - por mm0, mm6 - movd mm6, [leftq] - por mm2, mm6 - xor maskq, maskq -.loop: - movq mm1, [src1q + maskq] - movq mm3, [src2q + maskq] - movq mm4, mm2 - psubw mm2, mm0 - paddw mm2, mm1 - pand mm2, mm7 - movq mm5, mm4 - pmaxsw mm4, mm1 - pminsw mm1, mm5 - pminsw mm4, mm2 - pmaxsw mm4, mm1 - psubw mm3, mm4 - pand mm3, mm7 - movq [dstq + maskq], mm3 - add maskq, 8 - movq mm0, [src1q + maskq - 2] - movq mm2, [src2q + maskq - 2] - cmp maskq, wq - jb .loop - movzx maskd, word [src1q + wq - 2] - mov [left_topq], maskd - movzx maskd, word [src2q + 
wq - 2] - mov [leftq], maskd - RET - %macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top movd xm5, maskd diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index 7289e94bc7..c46be95cb9 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -31,8 +31,6 @@ void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src unsigned mask, int w); void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w); -void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, - unsigned mask, int w, int *left, int *left_top); void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, @@ -42,10 +40,6 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid { av_unused int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(cpu_flags) && bpp < 16) { - c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; - } - if (EXTERNAL_SSE2(cpu_flags)) { c->diff_int16 = ff_diff_int16_sse2; if (bpp < 16 && width >= 8) diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c index a74b4295d6..b5d02cda6d 100644 --- a/tests/checkasm/huffyuvencdsp.c +++ b/tests/checkasm/huffyuvencdsp.c @@ -42,8 +42,8 @@ static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width static const int bpps[] = { 9, 16, }; HuffYUVEncDSPContext c; - declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint16_t *dst, const uint16_t *src1, - const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); + declare_func(void, uint16_t *dst, const uint16_t *src1, + const uint16_t *src2, unsigned 
mask, int w, int *left, int *left_top); for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) { const int bpp = bpps[i]; -- 2.52.0 >From e7fd30a2dbe801b29fa7a9395f588870e98d2556 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Feb 2026 02:46:40 +0100 Subject: [PATCH 10/10] avcodec/x86/huffyuvencdsp_init: Remove pointless av_unused Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/huffyuvencdsp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index c46be95cb9..b4dd69bd28 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -38,7 +38,7 @@ void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, con av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width) { - av_unused int cpu_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_SSE2(cpu_flags)) { c->diff_int16 = ff_diff_int16_sse2; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
