PR #21208 opened by Niklas Haas (haasn) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21208 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21208.patch
Turns out I already had the solution to this lying around on a very old development branch. I've properly rebased this, cleaned it up further, and ironed out the edge cases. >From 333c39464ff4d2ba6b713f03db5997549e300bdd Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 18:15:24 +0100 Subject: [PATCH 1/7] swscale/ops_optimizer: don't commute clear with itself These would normally be merged, not swapped. --- libswscale/ops_optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c index f317a84f9a..c4a75820d3 100644 --- a/libswscale/ops_optimizer.c +++ b/libswscale/ops_optimizer.c @@ -50,7 +50,6 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next) case SWS_OP_MIN: case SWS_OP_MAX: case SWS_OP_SCALE: - case SWS_OP_CLEAR: case SWS_OP_READ: case SWS_OP_SWIZZLE: ff_sws_apply_op_q(next, op->c.q4); @@ -61,6 +60,7 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next) case SWS_OP_LINEAR: case SWS_OP_PACK: case SWS_OP_UNPACK: + case SWS_OP_CLEAR: return false; case SWS_OP_TYPE_NB: break; -- 2.49.1 >From 7425eb43f2a686d5185f7d6a6ca4efcff875947a Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 17:36:54 +0100 Subject: [PATCH 2/7] swscale/ops_optimizer: apply optimizations in a more predictable order Instead of blindly interleaving re-ordering and minimizing opptimizations, separate this loop into several passes - the first pass will minimize the operation list in-place as much as possible, and the second pass will apply any desired re-orderings. (We also want to try pushing clear back before any other re-orderings, as this can trigger more phase 1 optimizations) This restructuring leads to significantly more predictable and stable behavior, especially when introducing more operation types going forwards. Does not actually affect the current results, but matters with some upcoming changes I have planned. --- libswscale/ops_optimizer.c | 102 +++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 37 deletions(-) diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c index c4a75820d3..3b57c76130 100644 --- a/libswscale/ops_optimizer.c +++ b/libswscale/ops_optimizer.c @@ -500,6 +500,7 @@ int ff_sws_op_list_optimize(SwsOpList *ops) retry: ff_sws_op_list_update_comps(ops); + /* Apply all in-place optimizations (that do not re-order the list) */ for (int n = 0; n < ops->num_ops;) { SwsOp dummy = {0}; SwsOp *op = &ops->ops[n]; @@ -604,25 +605,14 @@ retry: ff_sws_op_list_remove_at(ops, n + 1, 1); goto retry; } - - /* Prefer to clear as late as possible, to avoid doing - * redundant work */ - if (op_commute_clear(op, next)) { - FFSWAP(SwsOp, *op, *next); - goto retry; - } break; - case SWS_OP_SWIZZLE: { - bool seen[4] = {0}; - bool has_duplicates = false; + case SWS_OP_SWIZZLE: for (int i = 0; i < 4; i++) { if (next->comps.unused[i]) continue; if (op->swizzle.in[i] != i) noop = false; - has_duplicates |= seen[op->swizzle.in[i]]; - seen[op->swizzle.in[i]] = true; } /* Identity swizzle */ @@ -639,22 +629,7 @@ retry: ff_sws_op_list_remove_at(ops, n + 1, 1); goto retry; } - - /* Try to push swizzles with duplicates towards the output */ - if (has_duplicates && op_commute_swizzle(op, next)) { - FFSWAP(SwsOp, *op, *next); - goto retry; - } - - /* Move swizzle out of the way between two converts so that - * they may be merged */ - if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) { - op->type = next->convert.to; - FFSWAP(SwsOp, *op, *next); - goto retry; - } break; - } case SWS_OP_CONVERT: /* No-op conversion */ @@ -813,16 +788,6 @@ retry: goto retry; } - /* Scaling by integer before conversion to int */ - if (op->c.q.den == 1 && - next->op == SWS_OP_CONVERT && - ff_sws_pixel_type_is_int(next->convert.to)) - { - op->type = next->convert.to; - FFSWAP(SwsOp, *op, *next); - goto retry; - } - /* Scaling by exact power of two */ if (factor2 && ff_sws_pixel_type_is_int(op->type)) { op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT; @@ -837,6 +802,69 @@ retry: n++; } + /* Push clears to the back to void any unused components */ + for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */ + SwsOp *op = &ops->ops[n]; + SwsOp *next = &ops->ops[n + 1]; + + switch (op->op) { + case SWS_OP_CLEAR: + if (op_commute_clear(op, next)) { + FFSWAP(SwsOp, *op, *next); + goto retry; + } + break; + } + } + + /* Apply any remaining preferential re-ordering optimizations; do these + * last because they are more likely to block other optimizations if done + * too aggressively */ + for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */ + SwsOp *op = &ops->ops[n]; + SwsOp *prev = &ops->ops[n - 1]; + SwsOp *next = &ops->ops[n + 1]; + + switch (op->op) { + case SWS_OP_SWIZZLE: { + bool seen[4] = {0}; + bool has_duplicates = false; + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i]) + continue; + has_duplicates |= seen[op->swizzle.in[i]]; + seen[op->swizzle.in[i]] = true; + } + + /* Try to push swizzles with duplicates towards the output */ + if (has_duplicates && op_commute_swizzle(op, next)) { + FFSWAP(SwsOp, *op, *next); + goto retry; + } + + /* Move swizzle out of the way between two converts so that + * they may be merged */ + if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) { + op->type = next->convert.to; + FFSWAP(SwsOp, *op, *next); + goto retry; + } + break; + } + + case SWS_OP_SCALE: + /* Scaling by integer before conversion to int */ + if (op->c.q.den == 1 && next->op == SWS_OP_CONVERT && + ff_sws_pixel_type_is_int(next->convert.to)) + { + op->type = next->convert.to; + FFSWAP(SwsOp, *op, *next); + goto retry; + } + break; + } + } + return 0; } -- 2.49.1 >From 0be920d3309da6233319f43dc694c7d6f09115dd Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 17:57:22 +0100 Subject: [PATCH 3/7] swscale/ops_optimizer: simplify loop slightly (cosmetic) We always `goto retry` whenever an optimization case is hit, so we don't need to defer the increment of `n`. --- libswscale/ops_optimizer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c index 3b57c76130..9d668fee74 100644 --- a/libswscale/ops_optimizer.c +++ b/libswscale/ops_optimizer.c @@ -501,7 +501,7 @@ retry: ff_sws_op_list_update_comps(ops); /* Apply all in-place optimizations (that do not re-order the list) */ - for (int n = 0; n < ops->num_ops;) { + for (int n = 0; n < ops->num_ops; n++) { SwsOp dummy = {0}; SwsOp *op = &ops->ops[n]; SwsOp *prev = n ? &ops->ops[n - 1] : &dummy; @@ -797,9 +797,6 @@ retry: break; } } - - /* No optimization triggered, move on to next operation */ - n++; } /* Push clears to the back to void any unused components */ -- 2.49.1 >From 30247ad6263f4c03d5030f219bfad74d8e773054 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 17:58:31 +0100 Subject: [PATCH 4/7] swscale/ops: add SWS_OP_ASSUME The current behavior of assuming the value range implicitly on SWS_OP_READ has a number of serious drawbacks and shortcomings: - It is difficult to reason about when acted upon by SWS_OP_SWAP_BYTES, and the existing hack of simply ignoring SWAP_BYTES on the value range is not a very good solution here. - It ignored the effects of SWS_OP_RSHIFT, such as for p010 and related MSB-aligned formats. (This is actually a bug) - It adds a needless dependency on the "purely informative" src/dst fields inside SwsOpList. Instead, introduce a new pseudo-op, SWS_OP_ASSUME, whose only purpose is to add an assumption about the component value range for the purposes of optimizing. To avoid it from blocking any existing optimizations (e.g. when an ASSUME op is stuck between two SWAP_BYTES ops), it is stripped after the first optimization pass, but before any preferential re-ordering; to give the in-place optimizations enough time to act on the information. --- libswscale/ops.c | 8 ++++++++ libswscale/ops.h | 2 ++ libswscale/ops_optimizer.c | 22 ++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/libswscale/ops.c b/libswscale/ops.c index f33dd02a37..2990432e83 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -110,6 +110,7 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4]) int shift[4]; switch (op->op) { + case SWS_OP_ASSUME: case SWS_OP_READ: case SWS_OP_WRITE: return; @@ -417,6 +418,13 @@ void ff_sws_op_list_print(void *log, int lev, const SwsOpList *ops) case SWS_OP_INVALID: av_log(log, lev, "SWS_OP_INVALID\n"); break; + case SWS_OP_ASSUME: + av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_ASSUME", + op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_", + op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_", + op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_", + op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_"); + break; case SWS_OP_READ: case SWS_OP_WRITE: av_log(log, lev, "%-20s: %d elem(s) %s >> %d\n", diff --git a/libswscale/ops.h b/libswscale/ops.h index d1a15294d1..1a02346177 100644 --- a/libswscale/ops.h +++ b/libswscale/ops.h @@ -41,7 +41,9 @@ int ff_sws_pixel_type_size(SwsPixelType type) av_const; bool ff_sws_pixel_type_is_int(SwsPixelType type) av_const; typedef enum SwsOpType { + /* Pseudo-ops, will never be seen by backends */ SWS_OP_INVALID = 0, + SWS_OP_ASSUME, /* assume maximum value range of input (q4) */ /* Input/output handling */ SWS_OP_READ, /* gather raw pixels from planes */ diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c index 9d668fee74..09296cdbf9 100644 --- a/libswscale/ops_optimizer.c +++ b/libswscale/ops_optimizer.c @@ -44,6 +44,7 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next) case SWS_OP_CONVERT: op->type = next->convert.to; /* fall through */ + case SWS_OP_ASSUME: case SWS_OP_LSHIFT: case SWS_OP_RSHIFT: case SWS_OP_DITHER: @@ -100,6 +101,7 @@ static bool op_commute_swizzle(SwsOp *op, SwsOp *next) * NEXT {x, _, _, w} * SWIZZLE {0, 0, 0, 3} */ + case SWS_OP_ASSUME: case SWS_OP_MIN: case SWS_OP_MAX: { const SwsConst c = next->c; @@ -177,6 +179,13 @@ void ff_sws_op_list_update_comps(SwsOpList *ops) ff_sws_apply_op_q(op, op->comps.max); } + if (op->op == SWS_OP_ASSUME) { + for (int i = 0; i < 4; i++) { + if (av_cmp_q(op->comps.max[i], op->c.q4[i]) == 1) + op->comps.max[i] = op->c.q4[i]; + } + } + switch (op->op) { case SWS_OP_READ: for (int i = 0; i < op->rw.elems; i++) { @@ -207,6 +216,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops) for (int i = 0; i < op->rw.elems; i++) av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE)); /* fall through */ + case SWS_OP_ASSUME: case SWS_OP_SWAP_BYTES: case SWS_OP_LSHIFT: case SWS_OP_RSHIFT: @@ -324,6 +334,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops) for (int i = op->rw.elems; i < 4; i++) op->comps.unused[i] = next.unused[i]; break; + case SWS_OP_ASSUME: case SWS_OP_SWAP_BYTES: case SWS_OP_LSHIFT: case SWS_OP_RSHIFT: @@ -814,6 +825,17 @@ retry: } } + /* Strip any informational pseudo-ops. Do this after the main optimization + * passe have completed, to ensure that all redundant operations have been + * eliminated as a result of the information being stripped here. */ + for (int n = 0; n < ops->num_ops; n++) { + switch (ops->ops[n].op) { + case SWS_OP_ASSUME: + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + } + /* Apply any remaining preferential re-ordering optimizations; do these * last because they are more likely to block other optimizations if done * too aggressively */ -- 2.49.1 >From 762ba67d99b664e33ddc4231cf00dd8ebbd964f8 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 17:59:41 +0100 Subject: [PATCH 5/7] swscale/format: insert SWS_OP_ASSUME after ff_sws_decode_pixfmt This is the correct place to make an assumption about the value range, since it is the first place where the pixel values are actually properly normalized. Improves a large number of conversions involving xv36be, e.g.: xv36be -> yuv444p12be: [u16 XXXX -> ++++] SWS_OP_READ : 4 elem(s) packed >> 0 [u16 ...X -> ++++] SWS_OP_SWAP_BYTES [u16 ...X -> ++++] SWS_OP_SWIZZLE : 1023 [u16 ...X -> ++++] SWS_OP_RSHIFT : >> 4 - [u16 ...X -> ++++] SWS_OP_CONVERT : u16 -> f32 - [f32 ...X -> ++++] SWS_OP_MIN : x <= {4095 4095 4095 _} - [f32 ...X -> ++++] SWS_OP_CONVERT : f32 -> u16 [u16 ...X -> ++++] SWS_OP_SWAP_BYTES [u16 ...X -> ++++] SWS_OP_WRITE : 3 elem(s) planar >> 0 (X = unused, + = exact, 0 = zero) xv36be -> yuv444p14le: [u16 XXXX -> ++++] SWS_OP_READ : 4 elem(s) packed >> 0 [u16 ...X -> ++++] SWS_OP_SWAP_BYTES [u16 ...X -> ++++] SWS_OP_SWIZZLE : 1023 [u16 ...X -> ++++] SWS_OP_RSHIFT : >> 4 - [u16 ...X -> ++++] SWS_OP_CONVERT : u16 -> f32 - [f32 ...X -> ++++] SWS_OP_SCALE : * 4 - [f32 ...X -> ++++] SWS_OP_MIN : x <= {16383 16383 16383 _} - [f32 ...X -> ++++] SWS_OP_CONVERT : f32 -> u16 + [u16 ...X -> ++++] SWS_OP_LSHIFT : << 2 [u16 ...X -> ++++] SWS_OP_WRITE : 3 elem(s) planar >> 0 (X = unused, + = exact, 0 = zero) --- libswscale/format.c | 15 +++++++++++++++ tests/ref/fate/sws-ops-list | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/libswscale/format.c b/libswscale/format.c index be03adcdca..29682fcb0b 100644 --- a/libswscale/format.c +++ b/libswscale/format.c @@ -947,6 +947,21 @@ int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt) .c = fmt_clear(fmt), })); + if (!(desc->flags & AV_PIX_FMT_FLAG_FLOAT)) { + SwsConst range = {0}; + for (int c = 0; c < desc->nb_components; c++) { + const int bits = desc->comp[c].depth; + const int idx = desc->nb_components == 2 ? 3 * c : c; + range.q4[idx] = Q((1 << bits) - 1); + } + + RET(ff_sws_op_list_append(ops, &(SwsOp) { + .op = SWS_OP_ASSUME, + .type = pixel_type, + .c = range, + })); + } + return 0; } diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list index dbcad4052b..b49f944794 100644 --- a/tests/ref/fate/sws-ops-list +++ b/tests/ref/fate/sws-ops-list @@ -1 +1 @@ -708f10e4de79450729ab2120e4e44ec9 +e910ff7ceaeb64bfdbac3f652b67403f -- 2.49.1 >From 363659d6143820354ee618318b2c34e351088bc8 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Wed, 11 Jun 2025 13:20:32 +0200 Subject: [PATCH 6/7] swscale/optimizer: remove broken value range assumption hack Replaced by the new SWS_OP_ASSUME; and also buggy. Fixes a number of cases involving MSB-aligned formats, e.g.: yuv444p10msbbe -> rgb24: [u16 XXXX -> +++X] SWS_OP_READ : 3 elem(s) planar >> 0 [u16 ...X -> +++X] SWS_OP_SWAP_BYTES [u16 ...X -> +++X] SWS_OP_RSHIFT : >> 6 [u16 ...X -> +++X] SWS_OP_CONVERT : u16 -> f32 [f32 ...X -> ...X] SWS_OP_LINEAR : matrix3+off3 [...] [f32 ...X -> ...X] SWS_OP_DITHER : 16x16 matrix + {0 3 2 5} [f32 ...X -> ...X] SWS_OP_MAX : {0 0 0 0} <= x + [f32 ...X -> ...X] SWS_OP_MIN : x <= {255 255 255 _} [f32 ...X -> +++X] SWS_OP_CONVERT : f32 -> u8 [ u8 ...X -> +++X] SWS_OP_WRITE : 3 elem(s) packed >> 0 (X = unused, + = exact, 0 = zero) (This clamp is needed and was incorrectly optimized away before, because the `SWS_OP_RSHIFT` incorrectly distorted the value range assertion) --- libswscale/ops_optimizer.c | 13 ------------- tests/ref/fate/sws-ops-list | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c index 09296cdbf9..9345e06dcb 100644 --- a/libswscale/ops_optimizer.c +++ b/libswscale/ops_optimizer.c @@ -191,19 +191,6 @@ void ff_sws_op_list_update_comps(SwsOpList *ops) for (int i = 0; i < op->rw.elems; i++) { if (ff_sws_pixel_type_is_int(op->type)) { int bits = 8 * ff_sws_pixel_type_size(op->type) >> op->rw.frac; - if (!op->rw.packed && ops->src.desc) { - /* Use legal value range from pixdesc if available; - * we don't need to do this for packed formats because - * non-byte-aligned packed formats will necessarily go - * through SWS_OP_UNPACK anyway */ - for (int c = 0; c < 4; c++) { - if (ops->src.desc->comp[c].plane == i) { - bits = ops->src.desc->comp[c].depth; - break; - } - } - } - op->comps.flags[i] = SWS_COMP_EXACT; op->comps.min[i] = Q(0); op->comps.max[i] = Q((1ULL << bits) - 1); diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list index b49f944794..9877de5f8f 100644 --- a/tests/ref/fate/sws-ops-list +++ b/tests/ref/fate/sws-ops-list @@ -1 +1 @@ -e910ff7ceaeb64bfdbac3f652b67403f +59514a7a7c3ab7674885b9ac07045fc1 -- 2.49.1 >From 34c5a3e4c69ef852c1a709e48befc674a2737c82 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 15 Dec 2025 18:34:47 +0100 Subject: [PATCH 7/7] Revert "swscale/ops: clarify SwsOpList.src/dst semantics" This reverts commit c94e8afe5d195fb08c441fbe3f8c2295081fcf8b. These are now actually purely informational. --- libswscale/ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libswscale/ops.h b/libswscale/ops.h index 1a02346177..517d5c5bfb 100644 --- a/libswscale/ops.h +++ b/libswscale/ops.h @@ -212,8 +212,8 @@ typedef struct SwsOpList { SwsOp *ops; int num_ops; - /* Metadata associated with this operation list */ - SwsFormat src, dst; /* if set; may inform the optimizer about e.g value ranges */ + /* Purely informative metadata associated with this operation list */ + SwsFormat src, dst; } SwsOpList; SwsOpList *ff_sws_op_list_alloc(void); -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
