PR #21014 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21014 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21014.patch
>From a68c465722f642d3b378107a002a04fe53a91da8 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 00:19:58 +0100 Subject: [PATCH 01/10] avcodec/arm/vp6dsp: Remove VP6 edge filter functions Forgotten in 160ebe0a8d780f6db7c18e824d8ec6f437da33a2. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/arm/Makefile | 2 - libavcodec/arm/vp6dsp_init_arm.c | 39 ---------- libavcodec/arm/vp6dsp_neon.S | 121 ------------------------------- libavcodec/vp56dsp.c | 4 +- libavcodec/vp56dsp.h | 1 - 5 files changed, 1 insertion(+), 166 deletions(-) delete mode 100644 libavcodec/arm/vp6dsp_init_arm.c delete mode 100644 libavcodec/arm/vp6dsp_neon.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 811b364195..e32a0bf49f 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -42,7 +42,6 @@ OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o -OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \ arm/vp9dsp_init_12bpp_arm.o \ arm/vp9dsp_init_arm.o @@ -139,7 +138,6 @@ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o NEON-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o -NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \ arm/vp9itxfm_neon.o \ arm/vp9lpf_16bpp_neon.o \ diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c deleted file mode 100644 index a59d61278c..0000000000 --- a/libavcodec/arm/vp6dsp_init_arm.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <[email protected]> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" - -#include "libavcodec/vp56dsp.h" - -void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, ptrdiff_t stride, int t); -void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, ptrdiff_t stride, int t); - -av_cold void ff_vp6dsp_init_arm(VP56DSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->edge_filter_hor = ff_vp6_edge_filter_hor_neon; - s->edge_filter_ver = ff_vp6_edge_filter_ver_neon; - } -} diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S deleted file mode 100644 index 03dd28d1cb..0000000000 --- a/libavcodec/arm/vp6dsp_neon.S +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <[email protected]> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro vp6_edge_filter - vdup.16 q3, r2 @ t - vmov.i16 q13, #1 - vsubl.u8 q0, d20, d18 @ p[ 0] - p[-s] - vsubl.u8 q1, d16, d22 @ p[-2*s] - p[ s] - vsubl.u8 q14, d21, d19 - vsubl.u8 q15, d17, d23 - vadd.i16 q2, q0, q0 @ 2*(p[0]-p[-s]) - vadd.i16 d29, d28, d28 - vadd.i16 q0, q0, q1 @ p[0]-p[-s] + p[-2*s]-p[s] - vadd.i16 d28, d28, d30 - vadd.i16 q0, q0, q2 @ 3*(p[0]-p[-s]) + p[-2*s]-p[s] - vadd.i16 d28, d28, d29 - vrshr.s16 q0, q0, #3 @ v - vrshr.s16 d28, d28, #3 - vsub.i16 q8, q3, q13 @ t-1 - vabs.s16 q1, q0 @ V - vshr.s16 q2, q0, #15 @ s - vabs.s16 d30, d28 - vshr.s16 d29, d28, #15 - vsub.i16 q12, q1, q3 @ V-t - vsub.i16 d31, d30, d6 - vsub.i16 q12, q12, q13 @ V-t-1 - vsub.i16 d31, d31, d26 - vcge.u16 q12, q12, q8 @ V-t-1 >= t-1 - vcge.u16 d31, d31, d16 - vadd.i16 q13, q3, q3 @ 2*t - vadd.i16 d16, d6, d6 - vsub.i16 q13, q13, q1 @ 2*t - V - vsub.i16 d16, d16, d30 - vadd.i16 q13, q13, q2 @ += s - vadd.i16 d16, d16, d29 - veor q13, q13, q2 @ ^= s - veor d16, d16, d29 - vbif q0, q13, q12 - vbif d28, d16, d31 - vmovl.u8 q1, d20 - vmovl.u8 q15, d21 - vaddw.u8 q2, q0, d18 - vaddw.u8 q3, q14, d19 - vsub.i16 q1, q1, q0 - vsub.i16 d30, d30, d28 - vqmovun.s16 d18, q2 - vqmovun.s16 d19, q3 - vqmovun.s16 d20, q1 - vqmovun.s16 d21, q15 -.endm - -function ff_vp6_edge_filter_ver_neon, export=1 - sub r0, r0, r1, lsl #1 - vld1.8 {q8}, [r0], r1 @ p[-2*s] - vld1.8 {q9}, [r0], r1 @ p[-s] - vld1.8 {q10}, [r0], r1 @ p[0] - vld1.8 {q11}, [r0] @ p[s] - vp6_edge_filter - sub r0, r0, 
r1, lsl #1 - sub r1, r1, #8 - vst1.8 {d18}, [r0]! - vst1.32 {d19[0]}, [r0], r1 - vst1.8 {d20}, [r0]! - vst1.32 {d21[0]}, [r0] - bx lr -endfunc - -function ff_vp6_edge_filter_hor_neon, export=1 - sub r3, r0, #1 - sub r0, r0, #2 - vld1.32 {d16[0]}, [r0], r1 - vld1.32 {d18[0]}, [r0], r1 - vld1.32 {d20[0]}, [r0], r1 - vld1.32 {d22[0]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d20[1]}, [r0], r1 - vld1.32 {d22[1]}, [r0], r1 - vld1.32 {d17[0]}, [r0], r1 - vld1.32 {d19[0]}, [r0], r1 - vld1.32 {d21[0]}, [r0], r1 - vld1.32 {d23[0]}, [r0], r1 - vtrn.8 q8, q9 - vtrn.8 q10, q11 - vtrn.16 q8, q10 - vtrn.16 q9, q11 - vp6_edge_filter - vtrn.8 q9, q10 - vst1.16 {d18[0]}, [r3], r1 - vst1.16 {d20[0]}, [r3], r1 - vst1.16 {d18[1]}, [r3], r1 - vst1.16 {d20[1]}, [r3], r1 - vst1.16 {d18[2]}, [r3], r1 - vst1.16 {d20[2]}, [r3], r1 - vst1.16 {d18[3]}, [r3], r1 - vst1.16 {d20[3]}, [r3], r1 - vst1.16 {d19[0]}, [r3], r1 - vst1.16 {d21[0]}, [r3], r1 - vst1.16 {d19[1]}, [r3], r1 - vst1.16 {d21[1]}, [r3], r1 - bx lr -endfunc diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp56dsp.c index a668712384..1ff67b1c87 100644 --- a/libavcodec/vp56dsp.c +++ b/libavcodec/vp56dsp.c @@ -77,9 +77,7 @@ av_cold void ff_vp6dsp_init(VP56DSPContext *s) { s->vp6_filter_diag4 = ff_vp6_filter_diag4_c; -#if ARCH_ARM - ff_vp6dsp_init_arm(s); -#elif ARCH_X86 +#if ARCH_X86 ff_vp6dsp_init_x86(s); #endif } diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h index e35e232ea3..f2cbb41a1e 100644 --- a/libavcodec/vp56dsp.h +++ b/libavcodec/vp56dsp.h @@ -38,7 +38,6 @@ void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_vp5dsp_init(VP56DSPContext *s); void ff_vp6dsp_init(VP56DSPContext *s); -void ff_vp6dsp_init_arm(VP56DSPContext *s); void ff_vp6dsp_init_x86(VP56DSPContext *s); #endif /* AVCODEC_VP56DSP_H */ -- 2.49.1 >From 9c7542c2575563398cb1d8a2b2356a8d7c495baf Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 
00:28:32 +0100 Subject: [PATCH 02/10] avcodec/vp56: Fix indentation Forgotten in 160ebe0a8d780f6db7c18e824d8ec6f437da33a2. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/vp56.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c index dc3ae70c66..0ddf7c985c 100644 --- a/libavcodec/vp56.c +++ b/libavcodec/vp56.c @@ -325,9 +325,9 @@ static void vp56_deblock_filter(VP56Context *s, uint8_t *yuv, ptrdiff_t stride, int dx, int dy) { if (s->avctx->codec->id == AV_CODEC_ID_VP5) { - int t = ff_vp56_filter_threshold[s->quantizer]; - if (dx) s->vp56dsp.edge_filter_hor(yuv + 10-dx , stride, t); - if (dy) s->vp56dsp.edge_filter_ver(yuv + stride*(10-dy), stride, t); + int t = ff_vp56_filter_threshold[s->quantizer]; + if (dx) s->vp56dsp.edge_filter_hor(yuv + 10-dx , stride, t); + if (dy) s->vp56dsp.edge_filter_ver(yuv + stride*(10-dy), stride, t); } else { int * bounding_values = s->bounding_values_array + 127; if (dx) -- 2.49.1 >From fc33633b4459e8cac10b76e95b685254e2dd6eab Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 00:44:47 +0100 Subject: [PATCH 03/10] avcodec/vp56dsp: Separate VP5DSP and VP6DSP They don't have anything in common since 160ebe0a8d780f6db7c18e824d8ec6f437da33a2. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- configure | 5 ++--- libavcodec/Makefile | 4 ++-- libavcodec/vp5.c | 2 +- libavcodec/vp56.c | 4 ++-- libavcodec/vp56.h | 5 ++++- libavcodec/vp56dsp.h | 15 +++++++-------- libavcodec/{vp56dsp.c => vp5dsp.c} | 17 +---------------- libavcodec/vp6.c | 4 ++-- libavcodec/vp6dsp.c | 13 +++++++++++-- libavcodec/x86/vp6dsp_init.c | 2 +- 10 files changed, 33 insertions(+), 38 deletions(-) rename libavcodec/{vp56dsp.c => vp5dsp.c} (87%) diff --git a/configure b/configure index 99734e9d03..c135ce3a75 100755 --- a/configure +++ b/configure @@ -2701,7 +2701,6 @@ CONFIG_EXTRA=" vc1dsp videodsp vp3dsp - vp56dsp vp8dsp vulkan_encode vvc_sei @@ -3197,8 +3196,8 @@ vc1image_decoder_select="vc1_decoder" vorbis_encoder_select="audio_frame_queue" vp3_decoder_select="hpeldsp vp3dsp videodsp" vp4_decoder_select="vp3_decoder" -vp5_decoder_select="h264chroma hpeldsp videodsp vp3dsp vp56dsp" -vp6_decoder_select="h264chroma hpeldsp huffman videodsp vp3dsp vp56dsp" +vp5_decoder_select="h264chroma hpeldsp videodsp vp3dsp" +vp6_decoder_select="h264chroma hpeldsp huffman videodsp vp3dsp" vp6a_decoder_select="vp6_decoder" vp6f_decoder_select="vp6_decoder" vp7_decoder_select="h264pred videodsp vp8dsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 0cd2408865..3ed3188a9a 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -182,7 +182,6 @@ OBJS-$(CONFIG_AV1_AMF_DECODER) += amfdec.o OBJS-$(CONFIG_VC1DSP) += vc1dsp.o OBJS-$(CONFIG_VIDEODSP) += videodsp.o OBJS-$(CONFIG_VP3DSP) += vp3dsp.o -OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o @@ -803,7 +802,8 @@ OBJS-$(CONFIG_VORBIS_DECODER) += vorbisdec.o vorbisdsp.o vorbis.o \ OBJS-$(CONFIG_VORBIS_ENCODER) += vorbisenc.o vorbis.o \ vorbis_data.o OBJS-$(CONFIG_VP3_DECODER) += vp3.o jpegquanttables.o -OBJS-$(CONFIG_VP5_DECODER) += vp5.o 
vp56.o vp56data.o vpx_rac.o +OBJS-$(CONFIG_VP5_DECODER) += vp5.o vp56.o vp56data.o \ + vp5dsp.o vpx_rac.o OBJS-$(CONFIG_VP6_DECODER) += vp6.o vp56.o vp56data.o \ vp6dsp.o vpx_rac.o OBJS-$(CONFIG_VP7_DECODER) += vp8.o vp8data.o vpx_rac.o diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c index 77b479471b..98b8cf41f2 100644 --- a/libavcodec/vp5.c +++ b/libavcodec/vp5.c @@ -285,7 +285,7 @@ static av_cold int vp5_decode_init(AVCodecContext *avctx) if ((ret = ff_vp56_init_context(avctx, s, 1, 0)) < 0) return ret; - ff_vp5dsp_init(&s->vp56dsp); + ff_vp5dsp_init(&s->vp5dsp); s->vp56_coord_div = vp5_coord_div; s->parse_vector_adjustment = vp5_parse_vector_adjustment; s->parse_coeff = vp5_parse_coeff; diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c index 0ddf7c985c..0d13d7a276 100644 --- a/libavcodec/vp56.c +++ b/libavcodec/vp56.c @@ -326,8 +326,8 @@ static void vp56_deblock_filter(VP56Context *s, uint8_t *yuv, { if (s->avctx->codec->id == AV_CODEC_ID_VP5) { int t = ff_vp56_filter_threshold[s->quantizer]; - if (dx) s->vp56dsp.edge_filter_hor(yuv + 10-dx , stride, t); - if (dy) s->vp56dsp.edge_filter_ver(yuv + stride*(10-dy), stride, t); + if (dx) s->vp5dsp.edge_filter_hor(yuv + 10-dx , stride, t); + if (dy) s->vp5dsp.edge_filter_ver(yuv + stride*(10-dy), stride, t); } else { int * bounding_values = s->bounding_values_array + 127; if (dx) diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h index af46e2f188..6610fc2892 100644 --- a/libavcodec/vp56.h +++ b/libavcodec/vp56.h @@ -118,7 +118,10 @@ struct vp56_context { HpelDSPContext hdsp; VideoDSPContext vdsp; VP3DSPContext vp3dsp; - VP56DSPContext vp56dsp; + union { + VP5DSPContext vp5dsp; + VP6DSPContext vp6dsp; + }; uint8_t idct_scantable[64]; AVFrame *frames[4]; uint8_t *edge_emu_buffer_alloc; diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h index f2cbb41a1e..692fd0c8ac 100644 --- a/libavcodec/vp56dsp.h +++ b/libavcodec/vp56dsp.h @@ -24,20 +24,19 @@ #include <stddef.h> #include <stdint.h> -typedef struct 
VP56DSPContext { +typedef struct VP5DSPContext { void (*edge_filter_hor)(uint8_t *yuv, ptrdiff_t stride, int t); void (*edge_filter_ver)(uint8_t *yuv, ptrdiff_t stride, int t); +} VP5DSPContext; +typedef struct VP6DSPContext { void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, ptrdiff_t stride, const int16_t *h_weights,const int16_t *v_weights); -} VP56DSPContext; +} VP6DSPContext; -void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - const int16_t *h_weights, const int16_t *v_weights); +void ff_vp5dsp_init(VP5DSPContext *s); -void ff_vp5dsp_init(VP56DSPContext *s); -void ff_vp6dsp_init(VP56DSPContext *s); - -void ff_vp6dsp_init_x86(VP56DSPContext *s); +void ff_vp6dsp_init(VP6DSPContext *s); +void ff_vp6dsp_init_x86(VP6DSPContext *s); #endif /* AVCODEC_VP56DSP_H */ diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp5dsp.c similarity index 87% rename from libavcodec/vp56dsp.c rename to libavcodec/vp5dsp.c index 1ff67b1c87..a06c2cfd5f 100644 --- a/libavcodec/vp56dsp.c +++ b/libavcodec/vp5dsp.c @@ -21,8 +21,6 @@ #include <stdint.h> -#include "config.h" -#include "config_components.h" #include "libavutil/attributes.h" #include "vp56dsp.h" #include "libavutil/common.h" @@ -43,7 +41,6 @@ static void pfx ## _edge_filter_ ## suf(uint8_t *yuv, ptrdiff_t stride, \ } \ } -#if CONFIG_VP5_DECODER /* Gives very similar result than the vp6 version except in a few cases */ static int vp5_adjust(int v, int t) { @@ -65,20 +62,8 @@ static int vp5_adjust(int v, int t) VP56_EDGE_FILTER(vp5, hor, 1, stride) VP56_EDGE_FILTER(vp5, ver, stride, 1) -av_cold void ff_vp5dsp_init(VP56DSPContext *s) +av_cold void ff_vp5dsp_init(VP5DSPContext *s) { s->edge_filter_hor = vp5_edge_filter_hor; s->edge_filter_ver = vp5_edge_filter_ver; } -#endif /* CONFIG_VP5_DECODER */ - -#if CONFIG_VP6_DECODER -av_cold void ff_vp6dsp_init(VP56DSPContext *s) -{ - s->vp6_filter_diag4 = ff_vp6_filter_diag4_c; - -#if ARCH_X86 - ff_vp6dsp_init_x86(s); -#endif -} -#endif /* CONFIG_VP6_DECODER */ 
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c index 48ff9da818..3f4bd42d07 100644 --- a/libavcodec/vp6.c +++ b/libavcodec/vp6.c @@ -641,7 +641,7 @@ static void vp6_filter(VP56Context *s, uint8_t *dst, uint8_t *src, vp6_filter_hv4(dst, src+offset1, stride, stride, vp6_block_copy_filter[select][y8]); } else { - s->vp56dsp.vp6_filter_diag4(dst, src+offset1+((mv.x^mv.y)>>31), stride, + s->vp6dsp.vp6_filter_diag4(dst, src+offset1+((mv.x^mv.y)>>31), stride, vp6_block_copy_filter[select][x8], vp6_block_copy_filter[select][y8]); } @@ -661,7 +661,7 @@ static av_cold int vp6_decode_init_context(AVCodecContext *avctx, if (ret < 0) return ret; - ff_vp6dsp_init(&s->vp56dsp); + ff_vp6dsp_init(&s->vp6dsp); s->deblock_filtering = 0; s->vp56_coord_div = vp6_coord_div; diff --git a/libavcodec/vp6dsp.c b/libavcodec/vp6dsp.c index f7f6856330..76c4983960 100644 --- a/libavcodec/vp6dsp.c +++ b/libavcodec/vp6dsp.c @@ -27,8 +27,8 @@ #include "vp56dsp.h" -void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - const int16_t *h_weights, const int16_t *v_weights) +static void vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + const int16_t *h_weights, const int16_t *v_weights) { int x, y; int tmp[8*11]; @@ -59,3 +59,12 @@ void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, t += 8; } } + +av_cold void ff_vp6dsp_init(VP6DSPContext *s) +{ + s->vp6_filter_diag4 = vp6_filter_diag4_c; + +#if ARCH_X86 + ff_vp6dsp_init_x86(s); +#endif +} diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c index 83d45ec36c..07e3becaec 100644 --- a/libavcodec/x86/vp6dsp_init.c +++ b/libavcodec/x86/vp6dsp_init.c @@ -28,7 +28,7 @@ void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, const int16_t *h_weights,const int16_t *v_weights); -av_cold void ff_vp6dsp_init_x86(VP56DSPContext *c) +av_cold void ff_vp6dsp_init_x86(VP6DSPContext *c) { int cpu_flags = av_get_cpu_flags(); -- 2.49.1 >From 
b2f692cf2f4dc05ed01b236a5b0df4511437c978 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 01:00:58 +0100 Subject: [PATCH 04/10] avcodec/vp6dsp: Constify source in vp6_filter_diag4 Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/vp56dsp.h | 2 +- libavcodec/vp6dsp.c | 2 +- libavcodec/x86/vp6dsp_init.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h index 692fd0c8ac..3981de4015 100644 --- a/libavcodec/vp56dsp.h +++ b/libavcodec/vp56dsp.h @@ -30,7 +30,7 @@ typedef struct VP5DSPContext { } VP5DSPContext; typedef struct VP6DSPContext { - void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void (*vp6_filter_diag4)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, const int16_t *h_weights,const int16_t *v_weights); } VP6DSPContext; diff --git a/libavcodec/vp6dsp.c b/libavcodec/vp6dsp.c index 76c4983960..bdaa054307 100644 --- a/libavcodec/vp6dsp.c +++ b/libavcodec/vp6dsp.c @@ -27,7 +27,7 @@ #include "vp56dsp.h" -static void vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +static void vp6_filter_diag4_c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, const int16_t *h_weights, const int16_t *v_weights) { int x, y; diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c index 07e3becaec..db9a95767e 100644 --- a/libavcodec/x86/vp6dsp_init.c +++ b/libavcodec/x86/vp6dsp_init.c @@ -25,7 +25,7 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/vp56dsp.h" -void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +void ff_vp6_filter_diag4_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, const int16_t *h_weights,const int16_t *v_weights); av_cold void ff_vp6dsp_init_x86(VP6DSPContext *c) -- 2.49.1 >From 7cf5a977d4079224f78533ff97f045fbb3c2c11c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 10:53:41 +0100 Subject: [PATCH 
05/10] tests/checkasm: Test VP6DSP Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vp6dsp.c | 93 +++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 99 insertions(+) create mode 100644 tests/checkasm/vp6dsp.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 6636bc7774..3762c0d83b 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -50,6 +50,7 @@ AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER) += vorbisdsp.o +AVCODECOBJS-$(CONFIG_VP6_DECODER) += vp6dsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_mc.o vvc_sao.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 20d8f19757..8c64684fa3 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -254,6 +254,9 @@ static const struct { #if CONFIG_VP3DSP { "vp3dsp", checkasm_check_vp3dsp }, #endif + #if CONFIG_VP6_DECODER + { "vp6dsp", checkasm_check_vp6dsp }, + #endif #if CONFIG_VP8DSP { "vp8dsp", checkasm_check_vp8dsp }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 45cd23cac4..bd33aba263 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -154,6 +154,7 @@ void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); void checkasm_check_vf_sobel(void); void checkasm_check_vp3dsp(void); +void checkasm_check_vp6dsp(void); void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); diff --git a/tests/checkasm/vp6dsp.c b/tests/checkasm/vp6dsp.c new file mode 100644 index 0000000000..a5f1c9c2fc --- /dev/null +++ b/tests/checkasm/vp6dsp.c @@ -0,0 +1,93 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <assert.h> +#include <stddef.h> +#include <string.h> + +#include "checkasm.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" +#include "libavcodec/vp6data.h" +#include "libavcodec/vp56dsp.h" + +#define randomize_buffer(buf) \ + do { \ + for (size_t k = 0; k < (sizeof(buf) & ~3); k += 4) \ + AV_WN32A(buf + k, rnd()); \ + for (size_t k = sizeof(buf) & ~3; k < sizeof(buf); ++k) \ + buf[k] = rnd(); \ + } while (0) + + +void checkasm_check_vp6dsp(void) +{ + enum { + BLOCK_SIZE_1D = 8, + SRC_ROWS_ABOVE = 1, + SRC_ROWS_BELOW = 2, + SRC_COLS_LEFT = 1, + SRC_COLS_RIGHT = 2, + SRC_ROWS = SRC_ROWS_ABOVE + BLOCK_SIZE_1D + SRC_ROWS_BELOW, + SRC_ROW_SIZE = SRC_COLS_LEFT + BLOCK_SIZE_1D + SRC_COLS_RIGHT, + MAX_STRIDE = 64, ///< arbitrary + SRC_BUF_SIZE = (SRC_ROWS - 1) * MAX_STRIDE + SRC_ROW_SIZE + 7 /* to vary misalignment */, + DST_BUF_SIZE = (BLOCK_SIZE_1D - 1) * MAX_STRIDE + BLOCK_SIZE_1D, + }; + VP6DSPContext vp6dsp; + + ff_vp6dsp_init(&vp6dsp); + + declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + const int16_t *h_weights, const int16_t *v_weights); + + if (check_func(vp6dsp.vp6_filter_diag4, "filter_diag4")) { + DECLARE_ALIGNED(8, uint8_t, dstbuf_ref)[DST_BUF_SIZE]; + 
DECLARE_ALIGNED(8, uint8_t, dstbuf_new)[DST_BUF_SIZE]; + DECLARE_ALIGNED(8, uint8_t, srcbuf)[SRC_BUF_SIZE]; + + randomize_buffer(dstbuf_ref); + randomize_buffer(srcbuf); + memcpy(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new)); + + ptrdiff_t stride = (rnd() % (MAX_STRIDE / 16) + 1) * 16; + const uint8_t *src = srcbuf + SRC_COLS_LEFT + rnd() % 8U; + uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref; + + if (rnd() & 1) { + dst_new += (BLOCK_SIZE_1D - 1) * stride; + dst_ref += (BLOCK_SIZE_1D - 1) * stride; + src += (SRC_ROWS - 1) * stride; + stride *= -1; + } + src += SRC_ROWS_ABOVE * stride; + + unsigned select = rnd() % FF_ARRAY_ELEMS(vp6_block_copy_filter); + unsigned x8 = 1 + rnd() % (FF_ARRAY_ELEMS(vp6_block_copy_filter[0]) - 1); + unsigned y8 = 1 + rnd() % (FF_ARRAY_ELEMS(vp6_block_copy_filter[0]) - 1); + const int16_t *h_weights = vp6_block_copy_filter[select][x8]; + const int16_t *v_weights = vp6_block_copy_filter[select][y8]; + + call_ref(dst_ref, src, stride, h_weights, v_weights); + call_new(dst_new, src, stride, h_weights, v_weights); + if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_new))) + fail(); + bench_new(dst_new, src, stride, h_weights, v_weights); + } +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 2be880c8db..f182efde46 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -76,6 +76,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-videodsp \ fate-checkasm-vorbisdsp \ fate-checkasm-vp3dsp \ + fate-checkasm-vp6dsp \ fate-checkasm-vp8dsp \ fate-checkasm-vp9dsp \ fate-checkasm-vvc_alf \ -- 2.49.1 >From 5b0d9eed695ba95421277845b664670b1c7ab9fd Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 10:57:39 +0100 Subject: [PATCH 06/10] avcodec/x86/vp6dsp: Fix outdated comment Forgotten in 6cb3ee80b3b58d692a722fb38ee05f170ae8b0d2. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp6dsp.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index 0106541734..61336f6465 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* MMX/SSE2-optimized functions for the VP6 decoder +;* SSE2-optimized functions for the VP6 decoder ;* Copyright (C) 2009 Sebastien Lucas <[email protected]> ;* Copyright (C) 2009 Zuxy Meng <[email protected]> ;* -- 2.49.1 >From 6dd59e3cdd55edce907d884230d1f172813b97b7 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 11:15:15 +0100 Subject: [PATCH 07/10] avcodec/x86/vp6dsp: Don't align the stack manually For most systems (particularly all x64), the stack is already guaranteed to be sufficiently aligned. So just use x86inc's stack feature which does the right thing. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp6dsp.asm | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index 61336f6465..a9340ed05b 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -62,11 +62,7 @@ SECTION .text ; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ; const int16_t h_weight[4], const int16_t v_weights[4]) INIT_XMM sse2 -cglobal vp6_filter_diag4, 5, 7, 8 - mov r5, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - sub rsp, 8*11 - +cglobal vp6_filter_diag4, 5, 6, 8, -8*11 sub r1, r2 pxor m7, m7 @@ -74,25 +70,24 @@ cglobal vp6_filter_diag4, 5, 7, 8 SPLAT4REGS mov r3, rsp - mov r6, 11 + mov r5d, 11 .nextrow: DIAG4 r1, -1, 0, 1, 2, r3 add r3, 8 add r1, r2 - dec r6 + dec r5d jnz .nextrow movq m3, [r4] SPLAT4REGS lea r3, [rsp+8] - mov r6, 8 + mov r1d, 8 .nextcol: DIAG4 r3, -8, 0, 8, 16, r0 add r3, 8 add r0, r2 - dec r6 + dec r1d jnz .nextcol - mov rsp, r5 ; restore stack pointer RET -- 2.49.1 >From 0e440771dacd71e918694456c02a3a3344c97264 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 11:22:45 +0100 Subject: [PATCH 08/10] avcodec/x86/vp6dsp: Simplify splatting Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp6dsp.asm | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index a9340ed05b..83b26d03cd 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -49,14 +49,11 @@ SECTION .text %endmacro %macro SPLAT4REGS 0 - pshuflw m4, m3, 0x0 - pshuflw m5, m3, 0x55 - pshuflw m6, m3, 0xAA - pshuflw m3, m3, 0xFF - punpcklqdq m4, m4 - punpcklqdq m5, m5 - punpcklqdq m6, m6 - punpcklqdq m3, m3 + punpcklwd m3, m3 + pshufd m4, m3, 0x0 + pshufd m5, m3, 0x55 + pshufd m6, m3, 0xAA + pshufd m3, m3, 0xFF %endmacro ; void 
ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, ptrdiff_t stride, -- 2.49.1 >From ca983cb934190a6bb6ac3a9ec056750b06514f37 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 11:27:16 +0100 Subject: [PATCH 09/10] avcodec/x86/vp6dsp: Avoid saturated addition Only the two middle coefficients are so huge that overflow can happen. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp6dsp.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index 83b26d03cd..b9b562f84f 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -38,11 +38,11 @@ SECTION .text movq m2, [%1+%5] punpcklbw m1, m7 punpcklbw m2, m7 + paddw m0, [pw_64] ; Add 64 pmullw m1, m6 ; src[x+8 ] * biweight [2] pmullw m2, m3 ; src[x+16] * biweight [3] paddw m1, m2 paddsw m0, m1 - paddsw m0, [pw_64] ; Add 64 psraw m0, 7 packuswb m0, m0 movq [%6], m0 -- 2.49.1 >From 5b1ffeb1bd7d5ecdd74d1ded43d79e0924133116 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 11:46:40 +0100 Subject: [PATCH 10/10] avcodec/x86/vp6dsp: Avoid packing+unpacking Store the intermediate values as words, clipped to the 0..255 range instead. 
Old benchmarks: filter_diag4_c: 353.4 ( 1.00x) filter_diag4_sse2: 57.5 ( 6.15x) New benchmarks: filter_diag4_c: 350.6 ( 1.00x) filter_diag4_sse2: 55.1 ( 6.36x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp6dsp.asm | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index b9b562f84f..1f7443db69 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -26,26 +26,41 @@ cextern pw_64 SECTION .text -%macro DIAG4 6 +%macro DIAG4 7 +%if %7 + mova m0, [%1+%2] + mova m1, [%1+%3] +%else movq m0, [%1+%2] movq m1, [%1+%3] punpcklbw m0, m7 punpcklbw m1, m7 +%endif pmullw m0, m4 ; src[x-8 ] * biweight [0] pmullw m1, m5 ; src[x ] * biweight [1] paddw m0, m1 +%if %7 + mova m1, [%1+%4] + mova m2, [%1+%5] +%else movq m1, [%1+%4] movq m2, [%1+%5] punpcklbw m1, m7 punpcklbw m2, m7 +%endif paddw m0, [pw_64] ; Add 64 pmullw m1, m6 ; src[x+8 ] * biweight [2] pmullw m2, m3 ; src[x+16] * biweight [3] paddw m1, m2 paddsw m0, m1 psraw m0, 7 +%if %7 packuswb m0, m0 movq [%6], m0 +%else + pmaxsw m0, m7 ; clip to 0-255 range + mova [%6], m0 +%endif %endmacro %macro SPLAT4REGS 0 @@ -59,7 +74,7 @@ SECTION .text ; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ; const int16_t h_weight[4], const int16_t v_weights[4]) INIT_XMM sse2 -cglobal vp6_filter_diag4, 5, 6, 8, -8*11 +cglobal vp6_filter_diag4, 5, 6, 8, -16*11 sub r1, r2 pxor m7, m7 @@ -69,8 +84,8 @@ cglobal vp6_filter_diag4, 5, 6, 8, -8*11 mov r3, rsp mov r5d, 11 .nextrow: - DIAG4 r1, -1, 0, 1, 2, r3 - add r3, 8 + DIAG4 r1, -1, 0, 1, 2, r3, 0 + add r3, 16 add r1, r2 dec r5d jnz .nextrow @@ -78,11 +93,11 @@ cglobal vp6_filter_diag4, 5, 6, 8, -8*11 movq m3, [r4] SPLAT4REGS - lea r3, [rsp+8] + lea r3, [rsp+16] mov r1d, 8 .nextcol: - DIAG4 r3, -8, 0, 8, 16, r0 - add r3, 8 + DIAG4 r3, -16, 0, 16, 32, r0, 1 + add r3, 16 add r0, r2 dec r1d jnz .nextcol -- 2.49.1 
_______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
