[FFmpeg-devel] [PR] swscale/ops: minor fixes (some UB, some missing x86 functions) (PR #22275)

Niklas Haas via ffmpeg-devel Tue, 24 Feb 2026 04:05:52 -0800

PR #22275 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275.patch


Makes it so that the self-test now passes `ubsan` again.


>From 0f194da38d3087fbf32070365da41c7e7f5350d1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:21:11 +0100
Subject: [PATCH 1/8] swscale/ops_chain: properly mark unreachable branch

Break out of the switch so that control reaches the `av_unreachable` below,
instead of returning a score. This branch is unreachable because of the
`if (entry->flexible)` branch further above.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_chain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
index d1ec1ef83d..e5fbc4f016 100644
--- a/libswscale/ops_chain.c
+++ b/libswscale/ops_chain.c
@@ -152,7 +152,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
     case SWS_OP_LSHIFT:
     case SWS_OP_RSHIFT:
         av_assert1(entry->flexible);
-        return score;
+        break;
     case SWS_OP_SWIZZLE:
         for (int i = 0; i < 4; i++) {
             if (op->swizzle.in[i] != entry->swizzle.in[i] && !next.unused[i])
@@ -169,7 +169,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
     case SWS_OP_MIN:
     case SWS_OP_MAX:
         av_assert1(entry->flexible);
-        return score;
+        break;
     case SWS_OP_LINEAR:
         /* All required elements must be present */
         if (op->lin.mask & ~entry->linear_mask)
-- 
2.52.0


>From b2a6b54b7ee083cbe273cd598ad949f650759d3d Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:24:57 +0100
Subject: [PATCH 2/8] swscale/ops_chain: add ability to match fixed scale
 factor

This is useful especially for the special case of scaling by common
not-quite-power-of-two constants like 255 or 1023.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_chain.c | 2 +-
 libswscale/ops_chain.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
index e5fbc4f016..2445154186 100644
--- a/libswscale/ops_chain.c
+++ b/libswscale/ops_chain.c
@@ -184,7 +184,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
         score += av_popcount(SWS_MASK_ALL ^ entry->linear_mask);
         return score;
     case SWS_OP_SCALE:
-        return score;
+        return av_cmp_q(op->c.q, entry->scale) ? 0 : score;
     case SWS_OP_TYPE_NB:
         break;
     }
diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h
index 2f5a31793e..0bc8c01283 100644
--- a/libswscale/ops_chain.h
+++ b/libswscale/ops_chain.h
@@ -111,6 +111,7 @@ typedef struct SwsOpEntry {
         uint32_t       linear_mask; /* subset of SwsLinearOp */
         int            dither_size; /* subset of SwsDitherOp */
         int            clear_value; /* clear value for integer clears */
+        AVRational     scale;       /* scale factor for SWS_OP_SCALE */
     };
 
     /* Kernel implementation */
-- 
2.52.0


>From 6bf73d2044875201856ff38af827878b4547fe89 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 11:51:18 +0100
Subject: [PATCH 3/8] swscale/x86/ops: allow matching planar rw against
 1-element packed fmt

Otherwise, the x86 backend fails to serve e.g. rgb565le.

For -src rgb565le:
 Before: Overall speedup=2.210x faster, min=0.256x max=60.465x
 After:  Overall speedup=4.929x faster, min=0.638x max=181.260x

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 44dbe05b35..fadc1ce8c9 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -551,7 +551,7 @@ static bool op_is_type_invariant(const SwsOp *op)
     switch (op->op) {
     case SWS_OP_READ:
     case SWS_OP_WRITE:
-        return !op->rw.packed && !op->rw.frac;
+        return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac;
     case SWS_OP_SWIZZLE:
     case SWS_OP_CLEAR:
         return true;
-- 
2.52.0


>From 3c96548547e0d555a61465172b12bb149f914e79 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:02:18 +0100
Subject: [PATCH 4/8] swscale/x86/ops: add missing U32 <-> F32 conversions

For -src x2rgb10le:
 Before: Overall speedup=1.634x faster, min=0.356x max=44.083x
 After:  Overall speedup=4.662x faster, min=0.676x max=137.445x

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c         |  4 ++++
 libswscale/x86/ops_float.asm | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index fadc1ce8c9..e9598ba437 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -439,6 +439,8 @@ static const SwsOpTable ops16##EXT = {
     DECL_CONVERT(EXT, F32,  U8)                                                
 \
     DECL_CONVERT(EXT, U16, F32)                                                
 \
     DECL_CONVERT(EXT, F32, U16)                                                
 \
+    DECL_CONVERT(EXT, U32, F32)                                                
 \
+    DECL_CONVERT(EXT, F32, U32)                                                
 \
     DECL_EXPAND(EXT,   U8, U32)                                                
 \
     DECL_MIN_MAX(EXT)                                                          
 \
     DECL_SCALE(EXT)                                                            
 \
@@ -489,6 +491,8 @@ static const SwsOpTable ops32##EXT = {
         REF_COMMON_PATTERNS(convert_F32_U8##EXT),                              
 \
         REF_COMMON_PATTERNS(convert_U16_F32##EXT),                             
 \
         REF_COMMON_PATTERNS(convert_F32_U16##EXT),                             
 \
+        REF_COMMON_PATTERNS(convert_U32_F32##EXT),                             
 \
+        REF_COMMON_PATTERNS(convert_F32_U32##EXT),                             
 \
         REF_COMMON_PATTERNS(expand_U8_U32##EXT),                               
 \
         REF_COMMON_PATTERNS(min##EXT),                                         
 \
         REF_COMMON_PATTERNS(max##EXT),                                         
 \
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 2863085a8e..5336adb50b 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -77,6 +77,20 @@ IF W,   vcvtdq2ps mw2, mw2
         CONTINUE tmp0q
 %endmacro
 
+%macro conv32to32f 0
+op convert_U32_F32
+        LOAD_CONT tmp0q
+IF X,   vcvtdq2ps mx, mx
+IF Y,   vcvtdq2ps my, my
+IF Z,   vcvtdq2ps mz, mz
+IF W,   vcvtdq2ps mw, mw
+IF X,   vcvtdq2ps mx2, mx2
+IF Y,   vcvtdq2ps my2, my2
+IF Z,   vcvtdq2ps mz2, mz2
+IF W,   vcvtdq2ps mw2, mw2
+        CONTINUE tmp0q
+%endmacro
+
 %macro conv32fto8 0
 op convert_F32_U8
         LOAD_CONT tmp0q
@@ -130,6 +144,20 @@ IF W,   vpermq mw, mw, q3120
         CONTINUE tmp0q
 %endmacro
 
+%macro conv32fto32 0
+op convert_F32_U32
+        LOAD_CONT tmp0q
+IF X,   cvttps2dq mx, mx
+IF Y,   cvttps2dq my, my
+IF Z,   cvttps2dq mz, mz
+IF W,   cvttps2dq mw, mw
+IF X,   cvttps2dq mx2, mx2
+IF Y,   cvttps2dq my2, my2
+IF Z,   cvttps2dq mz2, mz2
+IF W,   cvttps2dq mw2, mw2
+        CONTINUE tmp0q
+%endmacro
+
 %macro min_max 0
 op min
 IF X,   vbroadcastss m8,  [implq + SwsOpImpl.priv + 0]
@@ -375,8 +403,10 @@ op dot3
 INIT_YMM avx2
 decl_common_patterns conv8to32f
 decl_common_patterns conv16to32f
+decl_common_patterns conv32to32f
 decl_common_patterns conv32fto8
 decl_common_patterns conv32fto16
+decl_common_patterns conv32fto32
 decl_common_patterns min_max
 decl_common_patterns scale
 decl_common_patterns dither_fns
-- 
2.52.0


>From 3032759f60e79239306822f20adc370d08a360f2 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:24:32 +0100
Subject: [PATCH 5/8] swscale/x86/ops: properly mark SWS_OP_SCALE as flexible

---
 libswscale/x86/ops.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index e9598ba437..0c6ec03a76 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -182,6 +182,7 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
     DECL_COMMON_PATTERNS(F32, scale##EXT,                                      
 \
         .op = SWS_OP_SCALE,                                                    
 \
         .setup = ff_sws_setup_q,                                               
 \
+        .flexible = true,                                                      
 \
     );
 
 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
-- 
2.52.0


>From b498e723623cdb0f844546d6b85e9fcad1592d20 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:41:38 +0100
Subject: [PATCH 6/8] swscale/x86/ops: add special case for expanding bits to
 bytes/words

Not super useful but also not expensive to carry.

monob -> gbrp:
 Before: time=84 us, ref=137 us, speedup=1.618x faster
 After:  time=23 us, ref=185 us, speedup=7.773x faster

monob -> gray16le:
 Before: time=75 us, ref=108 us, speedup=1.440x faster
 After:  time=20 us, ref=108 us, speedup=5.192x faster

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c       | 10 ++++++++++
 libswscale/x86/ops_int.asm | 19 +++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 0c6ec03a76..82e85635d6 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -185,6 +185,12 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
         .flexible = true,                                                      
 \
     );
 
+#define DECL_EXPAND_BITS(EXT, BITS)                                            
 \
+    DECL_ASM(U##BITS, expand_bits##BITS##EXT,                                  
 \
+        .op = SWS_OP_SCALE,                                                    
 \
+        .scale = Q((1 << (BITS)) - 1),                                         
 \
+    );
+
 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
 {
     /* 1x1 matrix / single constant */
@@ -268,6 +274,7 @@ static int setup_linear(const SwsOp *op, SwsOpPriv *out)
     DECL_RW(EXT, U8, read_nibbles,  READ,  1, false, 1)                        
 \
     DECL_RW(EXT, U8, read_bits,     READ,  1, false, 3)                        
 \
     DECL_RW(EXT, U8, write_bits,    WRITE, 1, false, 3)                        
 \
+    DECL_EXPAND_BITS(EXT, 8)                                                   
 \
     DECL_PACKED_RW(EXT, 8)                                                     
 \
     DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                      
 \
     DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                      
 \
@@ -336,6 +343,7 @@ static const SwsOpTable ops8##EXT = {
         &op_read_nibbles1##EXT,                                                
 \
         &op_read_bits1##EXT,                                                   
 \
         &op_write_bits1##EXT,                                                  
 \
+        &op_expand_bits8##EXT,                                                 
 \
         &op_pack_1210##EXT,                                                    
 \
         &op_pack_3320##EXT,                                                    
 \
         &op_pack_2330##EXT,                                                    
 \
@@ -386,6 +394,7 @@ static const SwsOpTable ops8##EXT = {
 
 #define DECL_FUNCS_16(SIZE, EXT, FLAG)                                         
 \
     DECL_PACKED_RW(EXT, 16)                                                    
 \
+    DECL_EXPAND_BITS(EXT, 16)                                                  
 \
     DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                     
 \
     DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                     
 \
     DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                     
 \
@@ -414,6 +423,7 @@ static const SwsOpTable ops16##EXT = {
         &op_unpack_4440##EXT,                                                  
 \
         &op_unpack_5550##EXT,                                                  
 \
         &op_unpack_5650##EXT,                                                  
 \
+        &op_expand_bits16##EXT,                                                
 \
         REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                              
 \
         REF_COMMON_PATTERNS(convert_U8_U16##EXT),                              
 \
         REF_COMMON_PATTERNS(convert_U16_U8##EXT),                              
 \
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index 44af92a7da..bc9e43a098 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -52,6 +52,9 @@ mask2: times 32 db 0x03
 mask3: times 32 db 0x07
 mask4: times 32 db 0x0F
 
+const1b equ mask1
+const1w: times 16 dw 0x01
+
 SECTION .text
 
 ;---------------------------------------------------------
@@ -456,7 +459,7 @@ IF V2,  movd mx2, [in0q + 2]
 %endif
         mova m8, [bits_shuf]
         VBROADCASTI128 m9,  [bits_mask]
-        VBROADCASTI128 m10, [mask1]
+        VBROADCASTI128 m10, [const1b]
         LOAD_CONT tmp0q
         add in0q, (mmsize >> 3) * (1 + V2)
         pshufb mx,  m8
@@ -947,7 +950,7 @@ IF W,   vpermq mw, mw, q3120
 %endmacro
 
 ;---------------------------------------------------------
-; Shifting
+; Shifting and scaling
 
 %macro lshift16 0
 op lshift16
@@ -983,6 +986,16 @@ IF W,   psrlw mw2, xm8
         CONTINUE tmp0q
 %endmacro
 
+; special cases for expanding bits to full range
+%macro expand_bits 2 ; bits, suffix
+op expand_bits%1
+        mova m8, [const1%2]
+        LOAD_CONT tmp0q
+        pcmpeq%2 mx, m8
+IF V2,  pcmpeq%2 mx2, m8
+        CONTINUE tmp0q
+%endmacro
+
 ;---------------------------------------------------------
 ; Macro instantiations for kernel functions
 
@@ -1000,6 +1013,7 @@ IF W,   psrlw mw2, xm8
     read_nibbles
     read_bits
     write_bits
+    expand_bits 8, b
 
     pack_generic 1, 2, 1
     pack_generic 3, 3, 2
@@ -1022,6 +1036,7 @@ IF W,   psrlw mw2, xm8
 
 %macro funcs_u16 0
     rw_packed 16
+    expand_bits 16, w
     pack_generic  4, 4, 4
     pack_generic  5, 5, 5
     pack_generic  5, 6, 5
-- 
2.52.0


>From 8f450150cb2877629f920d9d730a9139fc558262 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:54:27 +0100
Subject: [PATCH 7/8] swscale/ops_backend: avoid UB from incorrect function
 signature

Annoying C-ism; we can't overload the function type even though they will
always be pointers. We can't even get away with using (void *) in the
function signature, despite casts to void * being technically valid.

Avoid the issue altogether by moving the process loop into the type-specific
template, and just referring to the correctly typed compiled process function
at runtime. Hopefully, the compiler should be able to optimize these into a
single implementation.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_backend.c     | 36 ++++++++++++------------------------
 libswscale/ops_tmpl_common.c | 24 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
index a503139016..449ba8c975 100644
--- a/libswscale/ops_backend.c
+++ b/libswscale/ops_backend.c
@@ -48,29 +48,6 @@ typedef    float f32block_t[SWS_BLOCK_SIZE];
 # include "ops_tmpl_float.c"
 #undef BIT_DEPTH
 
-static void process(const SwsOpExec *exec, const void *priv,
-                    const int bx_start, const int y_start, int bx_end, int y_end)
-{
-    const SwsOpChain *chain = priv;
-    const SwsOpImpl *impl = chain->impl;
-    u32block_t x, y, z, w; /* allocate enough space for any intermediate */
-
-    SwsOpIter iterdata;
-    SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
-
-    for (iter->y = y_start; iter->y < y_end; iter->y++) {
-        for (int i = 0; i < 4; i++) {
-            iter->in[i]  = exec->in[i]  + (iter->y - y_start) * exec->in_stride[i];
-            iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
-        }
-
-        for (int block = bx_start; block < bx_end; block++) {
-            iter->x = block * SWS_BLOCK_SIZE;
-            CONTINUE(u32block_t, x, y, z, w);
-        }
-    }
-}
-
 static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
 {
     int ret;
@@ -79,6 +56,9 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
     if (!chain)
         return AVERROR(ENOMEM);
 
+    av_assert0(ops->num_ops > 0);
+    const SwsPixelType read_type = ops->ops[0].type;
+
     static const SwsOpTable *const tables[] = {
         &bitfn(op_table_int,    u8),
         &bitfn(op_table_int,   u16),
@@ -96,12 +76,20 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
     }
 
     *out = (SwsCompiledOp) {
-        .func       = process,
         .block_size = SWS_BLOCK_SIZE,
         .cpu_flags  = chain->cpu_flags,
         .priv       = chain,
         .free       = ff_sws_op_chain_free_cb,
     };
+
+    switch (read_type) {
+    case SWS_PIXEL_U8:  out->func = process_u8;  break;
+    case SWS_PIXEL_U16: out->func = process_u16; break;
+    case SWS_PIXEL_U32: out->func = process_u32; break;
+    case SWS_PIXEL_F32: out->func = process_f32; break;
+    default: return AVERROR(EINVAL);
+    }
+
     return 0;
 }
 
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
index 7cfec4e3f6..c0e0d9f3fb 100644
--- a/libswscale/ops_tmpl_common.c
+++ b/libswscale/ops_tmpl_common.c
@@ -175,3 +175,27 @@ WRAP_COMMON_PATTERNS(scale,
     .setup = ff_sws_setup_q,
     .flexible = true,
 );
+
+static void fn(process)(const SwsOpExec *exec, const void *priv,
+                        const int bx_start, const int y_start,
+                        int bx_end, int y_end)
+{
+    const SwsOpChain *chain = priv;
+    const SwsOpImpl *impl = chain->impl;
+    u32block_t x, y, z, w; /* allocate enough space for any intermediate */
+
+    SwsOpIter iterdata;
+    SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
+
+    for (iter->y = y_start; iter->y < y_end; iter->y++) {
+        for (int i = 0; i < 4; i++) {
+            iter->in[i]  = exec->in[i]  + (iter->y - y_start) * exec->in_stride[i];
+            iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
+        }
+
+        for (int block = bx_start; block < bx_end; block++) {
+            iter->x = block * SWS_BLOCK_SIZE;
+            CONTINUE(block_t, (void *) x, (void *) y, (void *) z, (void *) w);
+        }
+    }
+}
-- 
2.52.0


>From 95a3b32c2a204a792a20af95da276f3d4b786762 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 24 Feb 2026 12:58:30 +0100
Subject: [PATCH 8/8] swscale/ops: avoid UB in handle_tail()

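Stupid NULL + 0 rule: pointer arithmetic on a NULL pointer is undefined
behavior in C, even when the offset is zero, so skip advancing plane
pointers that are NULL.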

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 900077584a..cf5950aa7d 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1028,9 +1028,9 @@ handle_tail(const SwsOpPass *p, SwsOpExec *exec,
         }
 
         for (int i = 0; i < 4; i++) {
-            if (!copy_in)
+            if (!copy_in && exec->in[i])
                 exec->in[i] += in.linesize[i];
-            if (!copy_out)
+            if (!copy_out && exec->out[i])
                 exec->out[i] += out.linesize[i];
         }
     }
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/ops: minor fixes (some UB, some missing x86 functions) (PR #22275)

Reply via email to