[FFmpeg-devel] [PR] swscale: preliminary changes towards scaling and plane splitting (PR #21652)

Niklas Haas via ffmpeg-devel Thu, 05 Feb 2026 06:48:27 -0800

PR #21652 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21652
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21652.patch


I decided to split these changes, which mainly affect the optimizer, off from 
my WIP plane splitting branch, to make them easier to review.

The notable change is the introduction of an explicit plane order, which can be
used to swizzle input/output planes (for planar reads/writes) for free. This
will become especially important when splitting planes, but it already brings
some tangible benefits on its own.

Includes some other minor commits/fixes that don't warrant their own PR.


>From 21626b8442448e7a4cdac406ce1b7b0c2485d025 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 4 Feb 2026 19:33:50 +0100
Subject: [PATCH 01/18] swscale/ops: reset comp flags on SWS_OP_CLEAR

Even if we clear to a non-integer value.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 83eb8e162e..6991db0702 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -306,11 +306,11 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (op->c.q4[i].den) {
-                    if (op->c.q4[i].num == 0) {
-                        op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT;
-                    } else if (op->c.q4[i].den == 1) {
-                        op->comps.flags[i] = SWS_COMP_EXACT;
-                    }
+                    op->comps.flags[i] = 0;
+                    if (op->c.q4[i].num == 0)
+                        op->comps.flags[i] |= SWS_COMP_ZERO;
+                    if (op->c.q4[i].den == 1)
+                        op->comps.flags[i] |= SWS_COMP_EXACT;
                 } else {
                     op->comps.flags[i] = prev.flags[i];
                 }
-- 
2.52.0


>From 784d3f04da96ae2b52f586c70144b74b813f61ae Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 8 Jan 2026 15:10:30 +0100
Subject: [PATCH 02/18] swscale/ops: add input/output plane swizzle mask to
 SwsOpList

This can be used to have the execution code directly swizzle the plane
pointers, instead of swizzling the data via SWS_OP_SWIZZLE. This can be used
to, for example, extract a subset of the input/output planes for partial
processing of split graphs (e.g. subsampled chroma, or independent alpha),
or just to skip an SWS_OP_SWIZZLE operation.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 60 ++++++++++++++++++++++++++++++++++++------------
 libswscale/ops.h |  3 +++
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 6991db0702..b3e1e8dbb7 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -465,6 +465,7 @@ SwsOpList *ff_sws_op_list_alloc(void)
     if (!ops)
         return NULL;
 
+    ops->order_src = ops->order_dst = SWS_SWIZZLE(0, 1, 2, 3);
     ff_fmt_clear(&ops->src);
     ff_fmt_clear(&ops->dst);
     return ops;
@@ -826,6 +827,8 @@ typedef struct SwsOpPass {
     int planes_out;
     int pixel_bits_in;
     int pixel_bits_out;
+    int idx_in[4];
+    int idx_out[4];
     bool memcpy_in;
     bool memcpy_out;
 } SwsOpPass;
@@ -842,10 +845,27 @@ static void op_pass_free(void *ptr)
     av_free(p);
 }
 
-static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass)
+static inline SwsImg img_shift_idx(const SwsImg *base, const int y,
+                                   const int plane_idx[4])
 {
-    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->fmt);
-    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->fmt);
+    SwsImg img = *base;
+    for (int i = 0; i < 4; i++) {
+        const int idx = plane_idx[i];
+        if (idx >= 0) {
+            const int yshift = y >> ff_fmt_vshift(base->fmt, idx);
+            img.data[i] = base->data[idx] + yshift * base->linesize[idx];
+        } else {
+            img.data[i] = NULL;
+        }
+    }
+    return img;
+}
+
+static void op_pass_setup(const SwsImg *out_base, const SwsImg *in_base,
+                          const SwsPass *pass)
+{
+    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in_base->fmt);
+    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out_base->fmt);
 
     SwsOpPass *p = pass->priv;
     SwsOpExec *exec = &p->exec_base;
@@ -864,22 +884,27 @@ static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pa
     p->memcpy_in     = false;
     p->memcpy_out    = false;
 
+    const SwsImg in  = img_shift_idx(in_base,  0, p->idx_in);
+    const SwsImg out = img_shift_idx(out_base, 0, p->idx_out);
+
     for (int i = 0; i < p->planes_in; i++) {
-        const int sub_x      = (i == 1 || i == 2) ? indesc->log2_chroma_w : 0;
+        const int idx        = p->idx_in[i];
+        const int sub_x      = (idx == 1 || idx == 2) ? indesc->log2_chroma_w : 0;
         const int plane_w    = (aligned_w + sub_x) >> sub_x;
         const int plane_pad  = (comp->over_read + sub_x) >> sub_x;
         const int plane_size = plane_w * p->pixel_bits_in >> 3;
-        p->memcpy_in |= plane_size + plane_pad > in->linesize[i];
-        exec->in_stride[i] = in->linesize[i];
+        p->memcpy_in |= plane_size + plane_pad > in.linesize[i];
+        exec->in_stride[i] = in.linesize[i];
     }
 
     for (int i = 0; i < p->planes_out; i++) {
-        const int sub_x      = (i == 1 || i == 2) ? outdesc->log2_chroma_w : 0;
+        const int idx        = p->idx_out[i];
+        const int sub_x      = (idx == 1 || idx == 2) ? outdesc->log2_chroma_w : 0;
         const int plane_w    = (aligned_w + sub_x) >> sub_x;
         const int plane_pad  = (comp->over_write + sub_x) >> sub_x;
         const int plane_size = plane_w * p->pixel_bits_out >> 3;
-        p->memcpy_out |= plane_size + plane_pad > out->linesize[i];
-        exec->out_stride[i] = out->linesize[i];
+        p->memcpy_out |= plane_size + plane_pad > out.linesize[i];
+        exec->out_stride[i] = out.linesize[i];
     }
 
     /* Pre-fill pointer bump for the main section only; this value does not
@@ -887,8 +912,8 @@ static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pa
      * process a single line */
     const int blocks_main = p->num_blocks - p->memcpy_out;
     for (int i = 0; i < 4; i++) {
-        exec->in_bump[i]  = in->linesize[i]  - blocks_main * exec->block_size_in;
-        exec->out_bump[i] = out->linesize[i] - blocks_main * exec->block_size_out;
+        exec->in_bump[i]  = in.linesize[i]  - blocks_main * exec->block_size_in;
+        exec->out_bump[i] = out.linesize[i] - blocks_main * exec->block_size_out;
     }
 }
 
@@ -906,8 +931,8 @@ handle_tail(const SwsOpPass *p, SwsOpExec *exec,
     const int tail_size_out = p->tail_size_out;
     const int bx = p->num_blocks - 1;
 
-    SwsImg in  = ff_sws_img_shift(in_base,  y);
-    SwsImg out = ff_sws_img_shift(out_base, y);
+    SwsImg in  = img_shift_idx(in_base,  y, p->idx_in);
+    SwsImg out = img_shift_idx(out_base, y, p->idx_out);
     for (int i = 0; i < p->planes_in; i++) {
         in.data[i]  += p->tail_off_in;
         if (copy_in) {
@@ -961,8 +986,8 @@ static void op_pass_run(const SwsImg *out_base, const SwsImg *in_base,
 {
     const SwsOpPass *p = pass->priv;
     const SwsCompiledOp *comp = &p->comp;
-    const SwsImg in  = ff_sws_img_shift(in_base,  y);
-    const SwsImg out = ff_sws_img_shift(out_base, y);
+    const SwsImg in  = img_shift_idx(in_base,  y, p->idx_in);
+    const SwsImg out = img_shift_idx(out_base, y, p->idx_out);
 
     /* Fill exec metadata for this slice */
     DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
@@ -1075,6 +1100,11 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat ds
         .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
     };
 
+    for (int i = 0; i < 4; i++) {
+        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
+        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
+    }
+
     pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input,
                                  1, p, op_pass_run);
     if (!pass) {
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 0db92306a0..4497b9aa78 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -230,6 +230,9 @@ typedef struct SwsOpList {
     SwsOp *ops;
     int num_ops;
 
+    /* Input/output plane pointer swizzle mask */
+    SwsSwizzleOp order_src, order_dst;
+
     /* Purely informative metadata associated with this operation list */
     SwsFormat src, dst;
 } SwsOpList;
-- 
2.52.0


>From 15f3c848277e3e700cfa18da051fcdd4ad7b1e6e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 4 Feb 2026 14:47:49 +0100
Subject: [PATCH 03/18] swscale/ops: also print plane order when swizzled

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index b3e1e8dbb7..2b28da9282 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -19,6 +19,7 @@
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
 #include "libavutil/bswap.h"
 #include "libavutil/mem.h"
 #include "libavutil/rational.h"
@@ -621,6 +622,18 @@ static char describe_comp_flags(unsigned flags)
         return '.';
 }
 
+static const char *describe_order(SwsSwizzleOp order, int planes, char buf[32])
+{
+    if (order.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
+        return "";
+
+    av_strlcpy(buf, ", via {", 32);
+    for (int i = 0; i < planes; i++)
+        av_strlcatf(buf, 32, "%s%d", i ? ", " : "", order.in[i]);
+    av_strlcat(buf, "}", 32);
+    return buf;
+}
+
 static const char *print_q(const AVRational q, char buf[], int buf_len)
 {
     if (!q.den) {
@@ -665,11 +678,15 @@ void ff_sws_op_list_print(void *log, int lev, const SwsOpList *ops)
             break;
         case SWS_OP_READ:
         case SWS_OP_WRITE:
-            av_log(log, lev, "%-20s: %d elem(s) %s >> %d\n",
+            av_log(log, lev, "%-20s: %d elem(s) %s >> %d%s\n",
                    op->op == SWS_OP_READ ? "SWS_OP_READ"
                                          : "SWS_OP_WRITE",
                    op->rw.elems,  op->rw.packed ? "packed" : "planar",
-                   op->rw.frac);
+                   op->rw.frac,
+                   describe_order(op->op == SWS_OP_READ ? ops->order_src
+                                                        : ops->order_dst,
+                                  op->rw.packed ? 1 : op->rw.elems,
+                                  (char[32]) {0}));
             break;
         case SWS_OP_SWAP_BYTES:
             av_log(log, lev, "SWS_OP_SWAP_BYTES\n");
-- 
2.52.0


>From 6e9f9a408fbae4ee2a062d2bfce52fddb8d9fb91 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 9 Jan 2026 12:47:55 +0100
Subject: [PATCH 04/18] swscale/ops: add ff_sws_op_list_is_noop()

This helper function also takes into account the plane order, and only
returns true if the SwsOpList is a true no-op (i.e. the input image may be
exactly ref'd to the output, with no change in plane order, etc.)

Note that in the special case of e.g. yuva444p -> yuv444p, this check also
returns true, even though there is an (ignored) extra plane in the input.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 25 +++++++++++++++++++++++++
 libswscale/ops.h |  6 ++++++
 2 files changed, 31 insertions(+)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 2b28da9282..3114d52c3f 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -544,6 +544,31 @@ int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op)
     return ff_sws_op_list_insert_at(ops, ops->num_ops, op);
 }
 
+bool ff_sws_op_list_is_noop(const SwsOpList *ops)
+{
+    if (!ops->num_ops)
+        return true;
+
+    const SwsOp *read  = &ops->ops[0];
+    const SwsOp *write = &ops->ops[1];
+    if (ops->num_ops != 2 ||
+        read->op != SWS_OP_READ ||
+        write->op != SWS_OP_WRITE ||
+        read->type != write->type ||
+        read->rw.packed != write->rw.packed ||
+        read->rw.elems != write->rw.elems ||
+        read->rw.frac != write->rw.frac)
+        return false;
+
+    const int num_planes = read->rw.packed ? 1 : read->rw.elems;
+    for (int i = 0; i < num_planes; i++) {
+        if (ops->order_src.in[i] != ops->order_dst.in[i])
+            return false;
+    }
+
+    return true;
+}
+
 int ff_sws_op_list_max_size(const SwsOpList *ops)
 {
     int max_size = 0;
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 4497b9aa78..7de97fc3f6 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -245,6 +245,12 @@ void ff_sws_op_list_free(SwsOpList **ops);
  */
 SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops);
 
+/**
+ * Returns whether an op list represents a true no-op operation, i.e. may be
+ * eliminated entirely from an execution graph.
+ */
+bool ff_sws_op_list_is_noop(const SwsOpList *ops);
+
 /**
  * Returns the size of the largest pixel type used in `ops`.
  */
-- 
2.52.0


>From b407f9d0ad22baf0732a60f0764b91ae0238032e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 9 Jan 2026 12:51:05 +0100
Subject: [PATCH 05/18] swscale/ops: add no-op check to ff_sws_compile_pass()

And remove the now-redundant check from graph.c

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c | 8 --------
 libswscale/ops.c   | 6 ++++++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index 9d9ca53b00..e47b2d07f2 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -515,15 +515,7 @@ static int add_convert_pass(SwsGraph *graph, SwsFormat src, SwsFormat dst,
     av_log(ctx, AV_LOG_DEBUG, "Unoptimized operation list:\n");
     ff_sws_op_list_print(ctx, AV_LOG_DEBUG, ops);
     av_log(ctx, AV_LOG_DEBUG, "Optimized operation list:\n");
-
     ff_sws_op_list_optimize(ops);
-    if (ops->num_ops == 0) {
-        av_log(ctx, AV_LOG_VERBOSE, "  optimized into memcpy\n");
-        ff_sws_op_list_free(&ops);
-        *output = input;
-        return 0;
-    }
-
     ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, ops);
 
     ret = ff_sws_compile_pass(graph, ops, 0, dst, input, output);
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 3114d52c3f..38cc2c246e 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1107,6 +1107,12 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat ds
     SwsPass *pass;
     int ret;
 
+    /* Check if the whole operation graph is an end-to-end no-op */
+    if (ff_sws_op_list_is_noop(ops)) {
+        *output = input;
+        return 0;
+    }
+
     if (ops->num_ops < 2) {
         av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
         return AVERROR(EINVAL);
-- 
2.52.0


>From 93c23f4aa5292d78eaff44780576ba208a1b1105 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 9 Jan 2026 13:05:28 +0100
Subject: [PATCH 06/18] tests/sws_ops: explicitly skip no-op operation lists

These are not necessarily empty, as a result of the previous changes.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/tests/sws_ops.c  | 5 ++++-
 tests/ref/fate/sws-ops-list | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/libswscale/tests/sws_ops.c b/libswscale/tests/sws_ops.c
index 69852fc1e0..bf84128291 100644
--- a/libswscale/tests/sws_ops.c
+++ b/libswscale/tests/sws_ops.c
@@ -57,7 +57,10 @@ static int run_test(SwsContext *const ctx, AVFrame *frame,
            av_get_pix_fmt_name(src.format), av_get_pix_fmt_name(dst.format));
 
     ff_sws_op_list_optimize(ops);
-    ff_sws_op_list_print(NULL, AV_LOG_INFO, ops);
+    if (ff_sws_op_list_is_noop(ops))
+        av_log(NULL, AV_LOG_INFO, "  (no-op)\n");
+    else
+        ff_sws_op_list_print(NULL, AV_LOG_INFO, ops);
 
 fail:
     /* silently skip unsupported formats */
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 0210a0a1cc..bf2dacc154 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-d781f3ddfeed4590eb253814366d2d01
+78416673c15226c0cac62ce4eb24f883
-- 
2.52.0


>From ef8f5ffbbe6e9daa97fdb1a4757fa1078c76866f Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 13 Jan 2026 11:31:12 +0100
Subject: [PATCH 07/18] swscale/x86/ops: make the presence of a read op
 optional

Allows this backend to process op lists without a read, e.g. for pure clear
operations. I decided to change `write` to a pointer as well for symmetry.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index bc61266588..44dbe05b35 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -657,8 +657,8 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
         return mmsize;
 
     av_assert1(ops->num_ops > 0);
-    const SwsOp read = ops->ops[0];
-    const SwsOp write = ops->ops[ops->num_ops - 1];
+    const SwsOp *read = ops->ops[0].op == SWS_OP_READ ? &ops->ops[0] : NULL;
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
     int ret;
 
     /* Special fast path for in-place packed shuffle */
@@ -679,9 +679,9 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
     };
 
     /* 3-component reads/writes process one extra garbage word */
-    if (read.rw.packed && read.rw.elems == 3)
+    if (read && read->rw.packed && read->rw.elems == 3)
         out->over_read = sizeof(uint32_t);
-    if (write.rw.packed && write.rw.elems == 3)
+    if (write->rw.packed && write->rw.elems == 3)
         out->over_write = sizeof(uint32_t);
 
     static const SwsOpTable *const tables[] = {
@@ -722,8 +722,8 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
         out->func = NAME;                                       \
     } while (0)
 
-    const int read_planes  = read.rw.packed  ? 1 : read.rw.elems;
-    const int write_planes = write.rw.packed ? 1 : write.rw.elems;
+    const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
+    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
     switch (FFMAX(read_planes, write_planes)) {
     case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
     case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
-- 
2.52.0


>From 522288e600d4d8ff1bbead6c9879b9d5ccde0896 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 13 Jan 2026 15:22:31 +0100
Subject: [PATCH 08/18] swscale/ops_backend: add clear pattern for ya8 alpha

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_tmpl_common.c | 1 +
 libswscale/ops_tmpl_int.c    | 1 +
 libswscale/x86/ops_int.asm   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
index 9490be0313..7cfec4e3f6 100644
--- a/libswscale/ops_tmpl_common.c
+++ b/libswscale/ops_tmpl_common.c
@@ -93,6 +93,7 @@ DECL_ENTRY(clear##_##X##Y##Z##W,
 
 WRAP_CLEAR(1, 1, 1, 0) /* rgba alpha */
 WRAP_CLEAR(0, 1, 1, 1) /* argb alpha */
+WRAP_CLEAR(1, 0, 1, 1) /* ya alpha */
 
 WRAP_CLEAR(0, 0, 1, 1) /* vuya chroma */
 WRAP_CLEAR(1, 0, 0, 1) /* yuva chroma */
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
index da24c1985f..84596e2763 100644
--- a/libswscale/ops_tmpl_int.c
+++ b/libswscale/ops_tmpl_int.c
@@ -542,6 +542,7 @@ static const SwsOpTable fn(op_table_int) = {
         &fn(op_clear_1110),
         &fn(op_clear_0111),
         &fn(op_clear_0011),
+        &fn(op_clear_1011),
         &fn(op_clear_1001),
         &fn(op_clear_1100),
         &fn(op_clear_0101),
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index edfbabb60d..44af92a7da 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -622,6 +622,7 @@ op clear
         decl_pattern 1, 1, 1, 0, clear_generic
         decl_pattern 0, 1, 1, 1, clear_generic
         decl_pattern 0, 0, 1, 1, clear_generic
+        decl_pattern 1, 0, 1, 1, clear_generic
         decl_pattern 1, 0, 0, 1, clear_generic
         decl_pattern 1, 1, 0, 0, clear_generic
         decl_pattern 0, 1, 0, 1, clear_generic
-- 
2.52.0


>From 66a71d6c58cc0f4fc3c5b843a0255cb6810635c9 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 13 Jan 2026 17:20:55 +0100
Subject: [PATCH 09/18] swscale/ops_backend: allocate block storage up-front

Instead of in each read() function. Not only is this slightly faster, due
to promoting more tail calls, but it also allows us to have operation chains
that don't start with a read.

Also simplifies the implementations.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_backend.c  | 16 +++++++++-------
 libswscale/ops_backend.h  | 21 ++++++---------------
 libswscale/ops_tmpl_int.c | 10 +---------
 3 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
index 248a591fd2..a503139016 100644
--- a/libswscale/ops_backend.c
+++ b/libswscale/ops_backend.c
@@ -53,18 +53,20 @@ static void process(const SwsOpExec *exec, const void *priv,
 {
     const SwsOpChain *chain = priv;
     const SwsOpImpl *impl = chain->impl;
-    SwsOpIter iter;
+    u32block_t x, y, z, w; /* allocate enough space for any intermediate */
 
-    for (iter.y = y_start; iter.y < y_end; iter.y++) {
+    SwsOpIter iterdata;
+    SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
+
+    for (iter->y = y_start; iter->y < y_end; iter->y++) {
         for (int i = 0; i < 4; i++) {
-            iter.in[i]  = exec->in[i]  + (iter.y - y_start) * exec->in_stride[i];
-            iter.out[i] = exec->out[i] + (iter.y - y_start) * exec->out_stride[i];
+            iter->in[i]  = exec->in[i]  + (iter->y - y_start) * exec->in_stride[i];
+            iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
         }
 
         for (int block = bx_start; block < bx_end; block++) {
-            iter.x = block * SWS_BLOCK_SIZE;
-            ((void (*)(SwsOpIter *, const SwsOpImpl *)) impl->cont)
-                (&iter, &impl[1]);
+            iter->x = block * SWS_BLOCK_SIZE;
+            CONTINUE(u32block_t, x, y, z, w);
         }
     }
 }
diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
index 4a1794af8a..b1616f6b02 100644
--- a/libswscale/ops_backend.h
+++ b/libswscale/ops_backend.h
@@ -78,13 +78,9 @@ typedef struct SwsOpIter {
                                           __VA_ARGS__)
 
 #define DECL_READ(NAME, ...)                                                   
 \
-    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,            
 \
-                                          const SwsOpImpl *restrict impl,      
 \
-                                          const pixel_t *restrict in0,         
 \
-                                          const pixel_t *restrict in1,         
 \
-                                          const pixel_t *restrict in2,         
 \
-                                          const pixel_t *restrict in3,         
 \
-                                          __VA_ARGS__)
+    DECL_FUNC(NAME, const pixel_t *restrict in0, const pixel_t *restrict in1,  
 \
+                    const pixel_t *restrict in2, const pixel_t *restrict in3,  
 \
+                    __VA_ARGS__)
 
 #define DECL_WRITE(NAME, ...)                                                  
 \
     DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1,            
 \
@@ -96,10 +92,9 @@ typedef struct SwsOpIter {
     fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)
 
 #define CALL_READ(FUNC, ...)                                                   
 \
-    fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0],                        
 \
-                         (const pixel_t *) iter->in[1],                        
 \
-                         (const pixel_t *) iter->in[2],                        
 \
-                         (const pixel_t *) iter->in[3], __VA_ARGS__)
+    CALL(FUNC, (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1],   
 \
+               (const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3],   
 \
+               __VA_ARGS__)
 
 #define CALL_WRITE(FUNC, ...)                                                  
 \
     CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1],             
 \
@@ -112,10 +107,6 @@ typedef struct SwsOpIter {
                                   block_t x, block_t y,                        
 \
                                   block_t z, block_t w)
 
-#define DECL_IMPL_READ(NAME)                                                   
 \
-    static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter,                    
 \
-                                  const SwsOpImpl *restrict impl)
-
 /* Helper macro to call into the next continuation with a given type */
 #define CONTINUE(TYPE, ...)                                                    
 \
     ((void (*)(SwsOpIter *, const SwsOpImpl *,                                 
 \
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
index 84596e2763..d9870faf34 100644
--- a/libswscale/ops_tmpl_int.c
+++ b/libswscale/ops_tmpl_int.c
@@ -58,8 +58,6 @@
 
 DECL_READ(read_planar, const int elems)
 {
-    block_t x, y, z, w;
-
     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         x[i] = in0[i];
@@ -76,8 +74,6 @@ DECL_READ(read_planar, const int elems)
 
 DECL_READ(read_packed, const int elems)
 {
-    block_t x, y, z, w;
-
     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         x[i] = in0[elems * i + 0];
@@ -121,7 +117,7 @@ DECL_WRITE(write_packed, const int elems)
 }
 
 #define WRAP_READ(FUNC, ELEMS, FRAC, PACKED)                                   
 \
-DECL_IMPL_READ(FUNC##ELEMS)                                                    
 \
+DECL_IMPL(FUNC##ELEMS)                                                         
 \
 {                                                                              
 \
     CALL_READ(FUNC, ELEMS);                                                    
 \
     for (int i = 0; i < (PACKED ? 1 : ELEMS); i++)                             
 \
@@ -173,8 +169,6 @@ WRAP_WRITE(write_packed, 4, 0, true)
 #if BIT_DEPTH == 8
 DECL_READ(read_nibbles, const int elems)
 {
-    block_t x, y, z, w;
-
     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
         const pixel_t val = ((const pixel_t *) in0)[i >> 1];
@@ -187,8 +181,6 @@ DECL_READ(read_nibbles, const int elems)
 
 DECL_READ(read_bits, const int elems)
 {
-    block_t x, y, z, w;
-
     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
         const pixel_t val = ((const pixel_t *) in0)[i >> 3];
-- 
2.52.0


>From 6e20a3291f2900a8b9522746dbcea4859fc0c375 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 13 Jan 2026 21:02:12 +0100
Subject: [PATCH 10/18] swscale/optimizer: allow commuting CLEAR past
 SWAP_BYTES

This requires a bit of a manual check in the 32-bit integer case to
make sure we don't exceed the value range of AVRational; but it still allows
quite a number of optimizations despite that restriction.

e.g.

rgb24 -> yuva444p9be:
-  [u16 ...X -> ++++] SWS_OP_CLEAR        : {_ _ _ 511}
-  [u16 .... -> zzzz] SWS_OP_SWAP_BYTES
-  [u16 .... -> zzzz] SWS_OP_WRITE        : 4 elem(s) planar >> 0
+  [u16 ...X -> zzzX] SWS_OP_SWAP_BYTES
+  [u16 ...X -> zzz+] SWS_OP_CLEAR        : {_ _ _ 65281}
+  [u16 .... -> zzz+] SWS_OP_WRITE        : 4 elem(s) planar >> 0

gray -> yuv444p12be:
-  [u16 .XXX -> +++X] SWS_OP_CLEAR        : {_ 2048 2048 _}
-  [u16 ...X -> zzzX] SWS_OP_SWAP_BYTES
-  [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [u16 .XXX -> zzXX] SWS_OP_SWAP_BYTES
+  [u16 .XXX -> z++X] SWS_OP_CLEAR        : {_ 8 8 _}
+  [u16 ...X -> z++X] SWS_OP_WRITE        : 3 elem(s) planar >> 0

Ultimately, the benefit of this will only become relevant once we start
splitting apart planes, since then we can have planes with only CLEAR
operations.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 19 ++++++++++++++++++-
 tests/ref/fate/sws-ops-list |  2 +-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 11ee40e268..dd184b27e7 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -39,6 +39,8 @@
  */
 static bool op_commute_clear(SwsOp *op, SwsOp *next)
 {
+    SwsOp tmp;
+
     av_assert1(op->op == SWS_OP_CLEAR);
     switch (next->op) {
     case SWS_OP_CONVERT:
@@ -54,8 +56,23 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next)
     case SWS_OP_SWIZZLE:
         ff_sws_apply_op_q(next, op->c.q4);
         return true;
-    case SWS_OP_INVALID:
     case SWS_OP_SWAP_BYTES:
+        switch (next->type) {
+        case SWS_PIXEL_U16:
+            ff_sws_apply_op_q(next, op->c.q4); /* always works */
+            return true;
+        case SWS_PIXEL_U32:;
+            for (int i = 0; i < 4; i++) {
+                uint32_t v = av_bswap32(next->c.q4[i].num);
+                if (v > INT_MAX)
+                    return false; /* can't represent as AVRational anymore */
+                tmp.c.q4[i] = Q(v);
+            }
+            next->c = tmp.c;
+            return true;
+        default: return false;
+        }
+    case SWS_OP_INVALID:
     case SWS_OP_WRITE:
     case SWS_OP_LINEAR:
     case SWS_OP_PACK:
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index bf2dacc154..b69b1ab299 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-78416673c15226c0cac62ce4eb24f883
+8312bc72ff9e05a8a6ab8d1c394783d6
-- 
2.52.0


>From 227303dc1bf113ee3e87e00648ebb8d84ef3edc6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 21 Jan 2026 14:36:26 +0100
Subject: [PATCH 11/18] swscale/optimizer: fix unswizzle optimization

The way this code was written relied on the implicit assumption that no other
row was reading from the same column, which was true in practice so far but
not necessarily true in general. Fix it by precomputing the nonzero component
mask and then adding an explicit check.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index dd184b27e7..b67b94b422 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -244,21 +244,28 @@ static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_sw
     SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
     SwsLinearOp c = *op;
 
+    /* Find non-zero coefficients in the main 4x4 matrix */
+    uint32_t nonzero = 0;
     for (int i = 0; i < 4; i++) {
-        int idx = -1;
         for (int j = 0; j < 4; j++) {
             if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
                 continue;
-            if (idx >= 0)
-                return false; /* multiple inputs */
-            idx = j;
+            nonzero |= SWS_MASK(i, j);
         }
+    }
 
-        if (idx >= 0 && idx != i) {
-            /* Move coefficient to the diagonal */
-            c.m[i][i] = c.m[i][idx];
-            c.m[i][idx] = Q(0);
-            swiz.in[i] = idx;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            /* If this value is unique in its row and the target column is
+             * empty, move it there and update the input swizzle */
+            const bool unique_row = (nonzero & SWS_MASK_ROW(i)) == SWS_MASK(i, j);
+            const bool empty_col  = (nonzero & SWS_MASK_COL(i)) == 0;
+            if (unique_row && empty_col) {
+                /* Move coefficient to the diagonal */
+                c.m[i][i] = c.m[i][j];
+                c.m[i][j] = Q(0);
+                swiz.in[i] = j;
+            }
         }
     }
 
-- 
2.52.0


>From c03cff2f76e60a1d1662feffab7c198a24b62038 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 13 Jan 2026 11:19:00 +0100
Subject: [PATCH 12/18] swscale/optimizer: don't reject op lists without read

When splitting planes, some planes can end up without a read operation
altogether, e.g. when just clearing the alpha plane.

Just return ENOTSUP for such lists instead of EINVAL.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index b67b94b422..5175f0aa26 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -658,9 +658,8 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
     const int read_size = ff_sws_pixel_type_size(read.type);
     uint32_t mask[4] = {0};
 
-    if (!ops->num_ops || read.op != SWS_OP_READ)
-        return AVERROR(EINVAL);
-    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+    if (!ops->num_ops || read.op != SWS_OP_READ ||
+        read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
         return AVERROR(ENOTSUP);
 
     for (int i = 0; i < read.rw.elems; i++)
-- 
2.52.0


>From b55e661cc0ab7ed4e0e8a9d5e51ebc881c6e040c Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 9 Jan 2026 12:52:45 +0100
Subject: [PATCH 13/18] swscale/optimizer: don't assume op lists start with
 read

This was just a minor/pointless optimization in the first place. We keep
the skip on the last operation because it can never be commuted past the
end of the list.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 5175f0aa26..3d75d609e6 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -585,7 +585,7 @@ retry:
     }
 
     /* Push clears to the back to void any unused components */
-    for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */
+    for (int n = 0; n < ops->num_ops - 1; n++) {
         SwsOp *op = &ops->ops[n];
         SwsOp *next = &ops->ops[n + 1];
 
@@ -602,7 +602,7 @@ retry:
     /* Apply any remaining preferential re-ordering optimizations; do these
      * last because they are more likely to block other optimizations if done
      * too aggressively */
-    for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */
+    for (int n = 0; n < ops->num_ops - 1; n++) {
         SwsOp *op = &ops->ops[n];
         SwsOp *prev = &ops->ops[n - 1];
         SwsOp *next = &ops->ops[n + 1];
-- 
2.52.0


>From ee86270833120903ab69d568578d3ad1797e7738 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 4 Feb 2026 14:49:01 +0100
Subject: [PATCH 14/18] swscale/optimizer: remove read+write optimization

This optimization is lossy, since it removes important information about the
number of planes to be copied. Subsumed by the more correct
ff_sws_op_list_is_noop() check in ff_sws_compile_pass().

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 3d75d609e6..87e6ff9075 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -297,17 +297,6 @@ retry:
 
         switch (op->op) {
         case SWS_OP_READ:
-            /* Optimized further into refcopy / memcpy */
-            if (next->op == SWS_OP_WRITE &&
-                next->rw.elems == op->rw.elems &&
-                next->rw.packed == op->rw.packed &&
-                next->rw.frac == op->rw.frac)
-            {
-                ff_sws_op_list_remove_at(ops, n, 2);
-                av_assert1(ops->num_ops == 0);
-                return 0;
-            }
-
             /* Skip reading extra unneeded components */
             if (!op->rw.packed) {
                 int needed = op->rw.elems;
-- 
2.52.0


>From e1d5238abd9c277d679e1c548473bcd755376f90 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 8 Jan 2026 20:12:26 +0100
Subject: [PATCH 15/18] swscale/optimizer: promote component swizzles to plane
 swizzles

In some cases, we can just directly swizzle the order of input/output
planes, rather than applying a swizzle operation on the data itself.

This can eliminate some such swizzle operations entirely, for example
yuv444p -> vuya is now just a read, clear and write.

Results in a lot of simplifications like this:

 rgb24 -> gbrp:
   [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
-  [ u8 ...X -> +++X] SWS_OP_SWIZZLE      : 1203
-  [ u8 ...X -> +++X] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [ u8 ...X -> +++X] SWS_OP_WRITE        : 3 elem(s) planar >> 0, via {2, 0, 1}

 rgb24 -> gbrap16le:
   [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
   [ u8 ...X -> +++X] SWS_OP_CONVERT      : u8 -> u16 (expand)
-  [u16 ...X -> +++X] SWS_OP_SWIZZLE      : 1203
   [u16 ...X -> ++++] SWS_OP_CLEAR        : {_ _ _ 65535}
-  [u16 .... -> ++++] SWS_OP_WRITE        : 4 elem(s) planar >> 0
+  [u16 .... -> ++++] SWS_OP_WRITE        : 4 elem(s) planar >> 0, via {2, 0, 1, 3}

 yuv444p -> vuya:
-  [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) planar >> 0
-  [ u8 ...X -> +++X] SWS_OP_SWIZZLE      : 2103
+  [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) planar >> 0, via {2, 1, 0}
   [ u8 ...X -> ++++] SWS_OP_CLEAR        : {_ _ _ 255}
   [ u8 .... -> ++++] SWS_OP_WRITE        : 4 elem(s) packed >> 0

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 28 ++++++++++++++++++++++++++++
 tests/ref/fate/sws-ops-list |  2 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 87e6ff9075..a5cf07054e 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -403,6 +403,34 @@ retry:
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }
+
+            /* Swizzle planes instead of components, if possible */
+            if (prev->op == SWS_OP_READ && !prev->rw.packed) {
+                for (int dst = 0; dst < prev->rw.elems; dst++) {
+                    const int src = op->swizzle.in[dst];
+                    if (src > dst && src < prev->rw.elems) {
+                        FFSWAP(int, ops->order_src.in[dst], ops->order_src.in[src]);
+                        for (int i = dst; i < 4; i++) {
+                            if (op->swizzle.in[i] == dst)
+                                op->swizzle.in[i] = src;
+                            else if (op->swizzle.in[i] == src)
+                                op->swizzle.in[i] = dst;
+                        }
+                        goto retry;
+                    }
+                }
+            }
+
+            if (next->op == SWS_OP_WRITE && !next->rw.packed) {
+                for (int dst = 0; dst < next->rw.elems; dst++) {
+                    const int src = op->swizzle.in[dst];
+                    if (src > dst && src < next->rw.elems) {
+                        FFSWAP(int, ops->order_dst.in[dst], ops->order_dst.in[src]);
+                        FFSWAP(int, op->swizzle.in[dst], op->swizzle.in[src]);
+                        goto retry;
+                    }
+                }
+            }
             break;
 
         case SWS_OP_CONVERT:
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index b69b1ab299..429b46b371 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-8312bc72ff9e05a8a6ab8d1c394783d6
+30ceeaa73f093642f28c1f17b3ee4e3e
-- 
2.52.0


>From a97d08e835eb3ef0fa63cb6abc487b8426884ca9 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 4 Feb 2026 17:03:11 +0100
Subject: [PATCH 16/18] swscale/optimizer: try pushing all swizzles towards the
 output

Now that we can directly promote these to plane swizzles, we generally want
to try pushing them in one direction - ideally towards the output, since in
the case of split subpasses the output is guaranteed to be planar (and there
may not even be a read).

Results in a lot of diffs, ranging from the benign, e.g.:

 rgb24 -> bgr48be:
   [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
   [ u8 ...X -> +++X] SWS_OP_CONVERT      : u8 -> u16 (expand)
-  [u16 ...X -> +++X] SWS_OP_SWIZZLE      : 2103
   [u16 ...X -> zzzX] SWS_OP_SWAP_BYTES
+  [u16 ...X -> zzzX] SWS_OP_SWIZZLE      : 2103
   [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) packed >> 0

 rgb24 -> gbrp9be:
   [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
   [ u8 ...X -> +++X] SWS_OP_CONVERT      : u8 -> f32
   [f32 ...X -> ...X] SWS_OP_SCALE        : * 511/255
   [f32 ...X -> ...X] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 5}
   [f32 ...X -> ...X] SWS_OP_MIN          : x <= {511 511 511 _}
   [f32 ...X -> +++X] SWS_OP_CONVERT      : f32 -> u16
-  [u16 ...X -> +++X] SWS_OP_SWIZZLE      : 1203
   [u16 ...X -> zzzX] SWS_OP_SWAP_BYTES
-  [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) planar >> 0, via {2, 0, 1}

To the clear improvements, e.g.:

 bgr24 -> gbrp16be:
   [ u8 XXXX -> +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
-  [ u8 ...X -> +++X] SWS_OP_SWIZZLE      : 2103
   [ u8 ...X -> +++X] SWS_OP_CONVERT      : u8 -> u16 (expand)
-  [u16 ...X -> +++X] SWS_OP_SWIZZLE      : 1203
   [u16 ...X -> zzzX] SWS_OP_SWAP_BYTES
-  [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [u16 ...X -> zzzX] SWS_OP_WRITE        : 3 elem(s) planar >> 0, via {1, 0, 2}

The only case worth careful consideration is when there are swizzled inputs
that result in unusual plane patterns, e.g.:

 argb -> gbrp9be:
   [ u8 XXXX -> ++++] SWS_OP_READ         : 4 elem(s) packed >> 0
-  [ u8 X... -> ++++] SWS_OP_SWIZZLE      : 1230
-  [ u8 ...X -> ++++] SWS_OP_CONVERT      : u8 -> f32
-  [f32 ...X -> ....] SWS_OP_SCALE        : * 511/255
-  [f32 ...X -> ....] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 5}
-  [f32 ...X -> ....] SWS_OP_MIN          : x <= {511 511 511 _}
-  [f32 ...X -> ++++] SWS_OP_CONVERT      : f32 -> u16
-  [u16 ...X -> ++++] SWS_OP_SWIZZLE      : 1203
-  [u16 ...X -> zzzz] SWS_OP_SWAP_BYTES
-  [u16 ...X -> zzzz] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [ u8 X... -> ++++] SWS_OP_CONVERT      : u8 -> f32
+  [f32 X... -> ....] SWS_OP_SCALE        : * 511/255
+  [f32 X... -> ....] SWS_OP_DITHER       : 16x16 matrix + {0 0 3 2}
+  [f32 X... -> ....] SWS_OP_MIN          : x <= {511 511 511 511}
+  [f32 X... -> ++++] SWS_OP_CONVERT      : f32 -> u16
+  [u16 X... -> zzzz] SWS_OP_SWAP_BYTES
+  [u16 X... -> zzzz] SWS_OP_SWIZZLE      : 3120
+  [u16 ...X -> zzzz] SWS_OP_WRITE        : 3 elem(s) planar >> 0, via {1, 2, 0}
     (X = unused, z = byteswapped, + = exact, 0 = zero)

Observe the change from ...X to X..., which is a pattern that doesn't
necessarily have a fast path and would usually end up falling back to the
generic 4-component implementations (rather than the 3-component ones).

That said, this is not a big deal, since we can ultimately re-align the
set of implementations with what's actually needed once we're done with
plane splitting and so forth.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 22 ++--------------------
 tests/ref/fate/sws-ops-list |  2 +-
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index a5cf07054e..2cb297e0df 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -621,30 +621,12 @@ retry:
      * too aggressively */
     for (int n = 0; n < ops->num_ops - 1; n++) {
         SwsOp *op = &ops->ops[n];
-        SwsOp *prev = &ops->ops[n - 1];
         SwsOp *next = &ops->ops[n + 1];
 
         switch (op->op) {
         case SWS_OP_SWIZZLE: {
-            bool seen[4] = {0};
-            bool has_duplicates = false;
-            for (int i = 0; i < 4; i++) {
-                if (next->comps.unused[i])
-                    continue;
-                has_duplicates |= seen[op->swizzle.in[i]];
-                seen[op->swizzle.in[i]] = true;
-            }
-
-            /* Try to push swizzles with duplicates towards the output */
-            if (has_duplicates && op_commute_swizzle(op, next)) {
-                FFSWAP(SwsOp, *op, *next);
-                goto retry;
-            }
-
-            /* Move swizzle out of the way between two converts so that
-             * they may be merged */
-            if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) {
-                op->type = next->convert.to;
+            /* Try to push swizzles towards the output */
+            if (op_commute_swizzle(op, next)) {
                 FFSWAP(SwsOp, *op, *next);
                 goto retry;
             }
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 429b46b371..6111cc4cbd 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-30ceeaa73f093642f28c1f17b3ee4e3e
+1c8369d53a092dd41f88f333f6a8e426
-- 
2.52.0


>From 13fb36f427216987bc91b13891da92a531f5c43c Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 14 Jan 2026 15:59:42 +0100
Subject: [PATCH 17/18] swscale/optimizer: compress planar reads with unused
 planes

After plane splitting, we can end up with a situation where a subpass wants
to read only, say, the alpha plane. In this case, we should compress the
planar read by instead swizzling the alpha plane into the correct place
in the src plane order, and then reading only a single plane.

Results in a bunch of benign diffs like:

 yuva444p -> ya8:
-  [ u8 XXXX -> ++++] SWS_OP_READ         : 4 elem(s) planar >> 0
-  [ u8 .XX. -> ++++] SWS_OP_CONVERT      : u8 -> f32
-  [f32 .XX. -> .+++] SWS_OP_LINEAR       : luma [...]
-  [f32 .XX. -> .+++] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 5}
-  [f32 .XX. -> .+++] SWS_OP_MAX          : {0 0 0 0} <= x
-  [f32 .XX. -> .+++] SWS_OP_MIN          : x <= {255 _ _ 255}
-  [f32 .XX. -> ++++] SWS_OP_CONVERT      : f32 -> u8
-  [ u8 .XX. -> ++++] SWS_OP_SWIZZLE      : 0312
-  [ u8 ..XX -> ++++] SWS_OP_WRITE        : 2 elem(s) packed >> 0
+  [ u8 XXXX -> ++XX] SWS_OP_READ         : 2 elem(s) planar >> 0, via {0, 3}
+  [ u8 ..XX -> ++XX] SWS_OP_CONVERT      : u8 -> f32
+  [f32 ..XX -> +XX+] SWS_OP_SWIZZLE      : 0321
+  [f32 .XX. -> .XX+] SWS_OP_LINEAR       : luma [...]
+  [f32 .XX. -> .XX+] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 5}
+  [f32 .XX. -> .XX+] SWS_OP_MAX          : {0 0 0 0} <= x
+  [f32 .XX. -> .XX+] SWS_OP_MIN          : x <= {255 _ _ 255}
+  [f32 .XX. -> +XX+] SWS_OP_CONVERT      : f32 -> u8
+  [ u8 .XX. -> ++XX] SWS_OP_SWIZZLE      : 0312
+  [ u8 ..XX -> ++XX] SWS_OP_WRITE        : 2 elem(s) packed >> 0

This may seem noisy, but really is mostly a result of the fact that the unused
middle components are now marked as garbage instead of as valid data.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 28 ++++++++++++++++++++++------
 tests/ref/fate/sws-ops-list |  2 +-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 2cb297e0df..c2acba34b6 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -297,13 +297,29 @@ retry:
 
         switch (op->op) {
         case SWS_OP_READ:
-            /* Skip reading extra unneeded components */
+            /* "Compress" planar reads where not all components are needed */
             if (!op->rw.packed) {
-                int needed = op->rw.elems;
-                while (needed > 0 && next->comps.unused[needed - 1])
-                    needed--;
-                if (op->rw.elems != needed) {
-                    op->rw.elems = needed;
+                SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
+                int nb_planes = 0;
+                for (int i = 0; i < op->rw.elems; i++) {
+                    if (next->comps.unused[i]) {
+                        swiz.in[i] = 3 - (i - nb_planes); /* map to unused 
plane */
+                        continue;
+                    }
+
+                    const int idx = nb_planes++;
+                    av_assert1(idx <= i);
+                    ops->order_src.in[idx] = ops->order_src.in[i];
+                    swiz.in[i] = idx;
+                }
+
+                if (nb_planes < op->rw.elems) {
+                    op->rw.elems = nb_planes;
+                    RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
+                        .op = SWS_OP_SWIZZLE,
+                        .type = op->type,
+                        .swizzle = swiz,
+                    }));
                     goto retry;
                 }
             }
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 6111cc4cbd..af3cf2f4e9 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-1c8369d53a092dd41f88f333f6a8e426
+3505d38dc669faf6f14036516b6caf61
-- 
2.52.0


>From 42aa8f16c104f7d926686798bc56a537678165d4 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 14 Jan 2026 12:25:49 +0100
Subject: [PATCH 18/18] swscale/optimizer: eliminate completely unused
 operations

e.g. empty read when all components are eventually cleared

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index c2acba34b6..91a09589d8 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -295,6 +295,14 @@ retry:
         /* common helper variable */
         bool noop = true;
 
+        if (next->comps.unused[0] && next->comps.unused[1] &&
+            next->comps.unused[2] && next->comps.unused[3])
+        {
+            /* Remove completely unused operations */
+            ff_sws_op_list_remove_at(ops, n, 1);
+            goto retry;
+        }
+
         switch (op->op) {
         case SWS_OP_READ:
             /* "Compress" planar reads where not all components are needed */
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale: preliminary changes towards scaling and plane splitting (PR #21652)

Reply via email to