PR #22275 opened by Niklas Haas (haasn) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275.patch
Makes it so that the self-test now passes `ubsan` again. >From 0f194da38d3087fbf32070365da41c7e7f5350d1 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:21:11 +0100 Subject: [PATCH 1/8] swscale/ops_chain: properly mark unreachable branch By breaking to the `av_unreachable` below. This branch is unreachable because of the `if (entry->flexible)` branch further above. Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops_chain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c index d1ec1ef83d..e5fbc4f016 100644 --- a/libswscale/ops_chain.c +++ b/libswscale/ops_chain.c @@ -152,7 +152,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex case SWS_OP_LSHIFT: case SWS_OP_RSHIFT: av_assert1(entry->flexible); - return score; + break; case SWS_OP_SWIZZLE: for (int i = 0; i < 4; i++) { if (op->swizzle.in[i] != entry->swizzle.in[i] && !next.unused[i]) @@ -169,7 +169,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex case SWS_OP_MIN: case SWS_OP_MAX: av_assert1(entry->flexible); - return score; + break; case SWS_OP_LINEAR: /* All required elements must be present */ if (op->lin.mask & ~entry->linear_mask) -- 2.52.0 >From b2a6b54b7ee083cbe273cd598ad949f650759d3d Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:24:57 +0100 Subject: [PATCH 2/8] swscale/ops_chain: add ability to match fixed scale factor This is useful especially for the special case of scaling by common not-quite-power-of-two constants like 255 or 1023. 
Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops_chain.c | 2 +- libswscale/ops_chain.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c index e5fbc4f016..2445154186 100644 --- a/libswscale/ops_chain.c +++ b/libswscale/ops_chain.c @@ -184,7 +184,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex score += av_popcount(SWS_MASK_ALL ^ entry->linear_mask); return score; case SWS_OP_SCALE: - return score; + return av_cmp_q(op->c.q, entry->scale) ? 0 : score; case SWS_OP_TYPE_NB: break; } diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h index 2f5a31793e..0bc8c01283 100644 --- a/libswscale/ops_chain.h +++ b/libswscale/ops_chain.h @@ -111,6 +111,7 @@ typedef struct SwsOpEntry { uint32_t linear_mask; /* subset of SwsLinearOp */ int dither_size; /* subset of SwsDitherOp */ int clear_value; /* clear value for integer clears */ + AVRational scale; /* scale factor for SWS_OP_SCALE */ }; /* Kernel implementation */ -- 2.52.0 >From 6bf73d2044875201856ff38af827878b4547fe89 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 11:51:18 +0100 Subject: [PATCH 3/8] swscale/x86/ops: allow matching planar rw against 1-element packed fmt Otherwise, the x86 backend fails to serve e.g. rgb565le. 
For -src rgb565le: Before: Overall speedup=2.210x faster, min=0.256x max=60.465x After: Overall speedup=4.929x faster, min=0.638x max=181.260x Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 44dbe05b35..fadc1ce8c9 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -551,7 +551,7 @@ static bool op_is_type_invariant(const SwsOp *op) switch (op->op) { case SWS_OP_READ: case SWS_OP_WRITE: - return !op->rw.packed && !op->rw.frac; + return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac; case SWS_OP_SWIZZLE: case SWS_OP_CLEAR: return true; -- 2.52.0 >From 3c96548547e0d555a61465172b12bb149f914e79 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:02:18 +0100 Subject: [PATCH 4/8] swscale/x86/ops: add missing U32 <-> F32 conversions For -src x2rgb10le: Before: Overall speedup=1.634x faster, min=0.356x max=44.083x After: Overall speedup=4.662x faster, min=0.676x max=137.445x Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 4 ++++ libswscale/x86/ops_float.asm | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index fadc1ce8c9..e9598ba437 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -439,6 +439,8 @@ static const SwsOpTable ops16##EXT = { DECL_CONVERT(EXT, F32, U8) \ DECL_CONVERT(EXT, U16, F32) \ DECL_CONVERT(EXT, F32, U16) \ + DECL_CONVERT(EXT, U32, F32) \ + DECL_CONVERT(EXT, F32, U32) \ DECL_EXPAND(EXT, U8, U32) \ DECL_MIN_MAX(EXT) \ DECL_SCALE(EXT) \ @@ -489,6 +491,8 @@ static const SwsOpTable ops32##EXT = { REF_COMMON_PATTERNS(convert_F32_U8##EXT), \ REF_COMMON_PATTERNS(convert_U16_F32##EXT), \ REF_COMMON_PATTERNS(convert_F32_U16##EXT), \ + REF_COMMON_PATTERNS(convert_U32_F32##EXT), \ + REF_COMMON_PATTERNS(convert_F32_U32##EXT), \ 
REF_COMMON_PATTERNS(expand_U8_U32##EXT), \ REF_COMMON_PATTERNS(min##EXT), \ REF_COMMON_PATTERNS(max##EXT), \ diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm index 2863085a8e..5336adb50b 100644 --- a/libswscale/x86/ops_float.asm +++ b/libswscale/x86/ops_float.asm @@ -77,6 +77,20 @@ IF W, vcvtdq2ps mw2, mw2 CONTINUE tmp0q %endmacro +%macro conv32to32f 0 +op convert_U32_F32 + LOAD_CONT tmp0q +IF X, vcvtdq2ps mx, mx +IF Y, vcvtdq2ps my, my +IF Z, vcvtdq2ps mz, mz +IF W, vcvtdq2ps mw, mw +IF X, vcvtdq2ps mx2, mx2 +IF Y, vcvtdq2ps my2, my2 +IF Z, vcvtdq2ps mz2, mz2 +IF W, vcvtdq2ps mw2, mw2 + CONTINUE tmp0q +%endmacro + %macro conv32fto8 0 op convert_F32_U8 LOAD_CONT tmp0q @@ -130,6 +144,20 @@ IF W, vpermq mw, mw, q3120 CONTINUE tmp0q %endmacro +%macro conv32fto32 0 +op convert_F32_U32 + LOAD_CONT tmp0q +IF X, cvttps2dq mx, mx +IF Y, cvttps2dq my, my +IF Z, cvttps2dq mz, mz +IF W, cvttps2dq mw, mw +IF X, cvttps2dq mx2, mx2 +IF Y, cvttps2dq my2, my2 +IF Z, cvttps2dq mz2, mz2 +IF W, cvttps2dq mw2, mw2 + CONTINUE tmp0q +%endmacro + %macro min_max 0 op min IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0] @@ -375,8 +403,10 @@ op dot3 INIT_YMM avx2 decl_common_patterns conv8to32f decl_common_patterns conv16to32f +decl_common_patterns conv32to32f decl_common_patterns conv32fto8 decl_common_patterns conv32fto16 +decl_common_patterns conv32fto32 decl_common_patterns min_max decl_common_patterns scale decl_common_patterns dither_fns -- 2.52.0 >From 3032759f60e79239306822f20adc370d08a360f2 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:24:32 +0100 Subject: [PATCH 5/8] swscale/x86/ops: properly mark SWS_OP_SCALE as flexible --- libswscale/x86/ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index e9598ba437..0c6ec03a76 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -182,6 +182,7 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out) 
DECL_COMMON_PATTERNS(F32, scale##EXT, \ .op = SWS_OP_SCALE, \ .setup = ff_sws_setup_q, \ + .flexible = true, \ ); static int setup_dither(const SwsOp *op, SwsOpPriv *out) -- 2.52.0 >From b498e723623cdb0f844546d6b85e9fcad1592d20 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:41:38 +0100 Subject: [PATCH 6/8] swscale/x86/ops: add special case for expanding bits to bytes/words Not super useful but also not expensive to carry. monob -> gbrp: Before: time=84 us, ref=137 us, speedup=1.618x faster After: time=23 us, ref=185 us, speedup=7.773x faster monob -> gray16le: Before: time=75 us, ref=108 us, speedup=1.440x faster After: time=20 us, ref=108 us, speedup=5.192x faster Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 10 ++++++++++ libswscale/x86/ops_int.asm | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 0c6ec03a76..82e85635d6 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -185,6 +185,12 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out) .flexible = true, \ ); +#define DECL_EXPAND_BITS(EXT, BITS) \ + DECL_ASM(U##BITS, expand_bits##BITS##EXT, \ + .op = SWS_OP_SCALE, \ + .scale = Q((1 << (BITS)) - 1), \ + ); + static int setup_dither(const SwsOp *op, SwsOpPriv *out) { /* 1x1 matrix / single constant */ @@ -268,6 +274,7 @@ static int setup_linear(const SwsOp *op, SwsOpPriv *out) DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \ DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \ DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \ + DECL_EXPAND_BITS(EXT, 8) \ DECL_PACKED_RW(EXT, 8) \ DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \ DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \ @@ -336,6 +343,7 @@ static const SwsOpTable ops8##EXT = { &op_read_nibbles1##EXT, \ &op_read_bits1##EXT, \ &op_write_bits1##EXT, \ + &op_expand_bits8##EXT, \ &op_pack_1210##EXT, \ &op_pack_3320##EXT, \ &op_pack_2330##EXT, \ @@ -386,6 
+394,7 @@ static const SwsOpTable ops8##EXT = { #define DECL_FUNCS_16(SIZE, EXT, FLAG) \ DECL_PACKED_RW(EXT, 16) \ + DECL_EXPAND_BITS(EXT, 16) \ DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \ DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \ DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \ @@ -414,6 +423,7 @@ static const SwsOpTable ops16##EXT = { &op_unpack_4440##EXT, \ &op_unpack_5550##EXT, \ &op_unpack_5650##EXT, \ + &op_expand_bits16##EXT, \ REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \ REF_COMMON_PATTERNS(convert_U8_U16##EXT), \ REF_COMMON_PATTERNS(convert_U16_U8##EXT), \ diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm index 44af92a7da..bc9e43a098 100644 --- a/libswscale/x86/ops_int.asm +++ b/libswscale/x86/ops_int.asm @@ -52,6 +52,9 @@ mask2: times 32 db 0x03 mask3: times 32 db 0x07 mask4: times 32 db 0x0F +const1b equ mask1 +const1w: times 16 dw 0x01 + SECTION .text ;--------------------------------------------------------- @@ -456,7 +459,7 @@ IF V2, movd mx2, [in0q + 2] %endif mova m8, [bits_shuf] VBROADCASTI128 m9, [bits_mask] - VBROADCASTI128 m10, [mask1] + VBROADCASTI128 m10, [const1b] LOAD_CONT tmp0q add in0q, (mmsize >> 3) * (1 + V2) pshufb mx, m8 @@ -947,7 +950,7 @@ IF W, vpermq mw, mw, q3120 %endmacro ;--------------------------------------------------------- -; Shifting +; Shifting and scaling %macro lshift16 0 op lshift16 @@ -983,6 +986,16 @@ IF W, psrlw mw2, xm8 CONTINUE tmp0q %endmacro +; special cases for expanding bits to full range +%macro expand_bits 2 ; bits, suffix +op expand_bits%1 + mova m8, [const1%2] + LOAD_CONT tmp0q + pcmpeq%2 mx, m8 +IF V2, pcmpeq%2 mx2, m8 + CONTINUE tmp0q +%endmacro + ;--------------------------------------------------------- ; Macro instantiations for kernel functions @@ -1000,6 +1013,7 @@ IF W, psrlw mw2, xm8 read_nibbles read_bits write_bits + expand_bits 8, b pack_generic 1, 2, 1 pack_generic 3, 3, 2 @@ -1022,6 +1036,7 @@ IF W, psrlw mw2, xm8 %macro funcs_u16 0 rw_packed 16 + expand_bits 16, w pack_generic 4, 4, 4 
pack_generic 5, 5, 5 pack_generic 5, 6, 5 -- 2.52.0 >From 8f450150cb2877629f920d9d730a9139fc558262 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:54:27 +0100 Subject: [PATCH 7/8] swscale/ops_backend: avoid UB from incorrect function signature Annoying C-ism; we can't overload the function type even though they will always be pointers. We can't even get away with using (void *) in the function signature, despite casts to void * being technically valid. Avoid the issue altogether by moving the process loop into the type-specific template, and referring to the correct compiled process function at runtime. Hopefully, the compiler will be able to optimize these into a single implementation. Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops_backend.c | 36 ++++++++++++------------------------ libswscale/ops_tmpl_common.c | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c index a503139016..449ba8c975 100644 --- a/libswscale/ops_backend.c +++ b/libswscale/ops_backend.c @@ -48,29 +48,6 @@ typedef float f32block_t[SWS_BLOCK_SIZE]; # include "ops_tmpl_float.c" #undef BIT_DEPTH -static void process(const SwsOpExec *exec, const void *priv, - const int bx_start, const int y_start, int bx_end, int y_end) -{ - const SwsOpChain *chain = priv; - const SwsOpImpl *impl = chain->impl; - u32block_t x, y, z, w; /* allocate enough space for any intermediate */ - - SwsOpIter iterdata; - SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */ - - for (iter->y = y_start; iter->y < y_end; iter->y++) { - for (int i = 0; i < 4; i++) { - iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i]; - iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i]; - } - - for (int block = bx_start; block < bx_end; block++) { - iter->x = block * SWS_BLOCK_SIZE; - CONTINUE(u32block_t, x, y, z, w); } 
- } -} - static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) { int ret; @@ -79,6 +56,9 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) if (!chain) return AVERROR(ENOMEM); + av_assert0(ops->num_ops > 0); + const SwsPixelType read_type = ops->ops[0].type; + static const SwsOpTable *const tables[] = { &bitfn(op_table_int, u8), &bitfn(op_table_int, u16), @@ -96,12 +76,20 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) } *out = (SwsCompiledOp) { - .func = process, .block_size = SWS_BLOCK_SIZE, .cpu_flags = chain->cpu_flags, .priv = chain, .free = ff_sws_op_chain_free_cb, }; + + switch (read_type) { + case SWS_PIXEL_U8: out->func = process_u8; break; + case SWS_PIXEL_U16: out->func = process_u16; break; + case SWS_PIXEL_U32: out->func = process_u32; break; + case SWS_PIXEL_F32: out->func = process_f32; break; + default: return AVERROR(EINVAL); + } + return 0; } diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c index 7cfec4e3f6..c0e0d9f3fb 100644 --- a/libswscale/ops_tmpl_common.c +++ b/libswscale/ops_tmpl_common.c @@ -175,3 +175,27 @@ WRAP_COMMON_PATTERNS(scale, .setup = ff_sws_setup_q, .flexible = true, ); + +static void fn(process)(const SwsOpExec *exec, const void *priv, + const int bx_start, const int y_start, + int bx_end, int y_end) +{ + const SwsOpChain *chain = priv; + const SwsOpImpl *impl = chain->impl; + u32block_t x, y, z, w; /* allocate enough space for any intermediate */ + + SwsOpIter iterdata; + SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */ + + for (iter->y = y_start; iter->y < y_end; iter->y++) { + for (int i = 0; i < 4; i++) { + iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i]; + iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i]; + } + + for (int block = bx_start; block < bx_end; block++) { + iter->x = block * SWS_BLOCK_SIZE; + CONTINUE(block_t, (void *) x, (void *) y, (void *) z, (void *) w); + } + 
} +} -- 2.52.0 >From 95a3b32c2a204a792a20af95da276f3d4b786762 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 24 Feb 2026 12:58:30 +0100 Subject: [PATCH 8/8] swscale/ops: avoid UB in handle_tail() Adding an offset (even 0) to a NULL pointer is undefined behavior in C, so only advance the in/out plane pointers that are non-NULL. Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libswscale/ops.c b/libswscale/ops.c index 900077584a..cf5950aa7d 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -1028,9 +1028,9 @@ handle_tail(const SwsOpPass *p, SwsOpExec *exec, } for (int i = 0; i < 4; i++) { - if (!copy_in) + if (!copy_in && exec->in[i]) exec->in[i] += in.linesize[i]; - if (!copy_out) + if (!copy_out && exec->out[i]) exec->out[i] += out.linesize[i]; } } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
