ops_dispatch: refactor tail handling

Niklas Haas via ffmpeg-cvslog Sat, 28 Mar 2026 11:27:26 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 78878b9daad3081ee0de58693c092dea551f5eec
Author:     Niklas Haas <[email protected]>
AuthorDate: Fri Mar 6 20:55:28 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Mar 28 18:50:14 2026 +0100

    swscale/ops_dispatch: refactor tail handling
    
    Rather than dispatching the compiled function for each line of the tail
    individually, with a memcpy to a shared buffer in between, this instead copies
    the entire tail region into a temporary intermediate buffer, processes it with
    a single dispatch call, and then copies the entire result back to the
    destination.
    
    The main benefit of this is that it enables scaling, subsampling or other
    quirky layouts to continue working, which may require accessing lines adjacent
    to the main input.
    
    It also arguably makes the code a bit simpler and easier to follow, but YMMV.
    
    One minor consequence of the change in logic is that we also no longer handle
    the last row of an unpadded input buffer separately - instead, if *any* row
    needs to be padded, *all* rows in the current slice will be padded. This is
    a bit less efficient but much more predictable, and as discussed, basically
    required for scaling/filtering anyways.
    
    While we could implement some sort of hybrid regime where we only use the new
    logic when scaling is needed, I really don't think this would gain us anything
    concrete enough to be worth the effort, especially since the performance is
    roughly the same across the board:
    
    16 threads:
      yuv444p 1920x1080 -> ayuv 1920x1080: speedup=1.000x slower (input memcpy)
      rgb24   1920x1080 -> argb 1920x1080: speedup=1.012x faster (output memcpy)
    
    1 thread:
      yuv444p 1920x1080 -> ayuv 1920x1080: speedup=1.062x faster (input memcpy)
      rgb24   1920x1080 -> argb 1920x1080: speedup=0.959x slower (output memcpy)
    
    Overall speedup is +/- 1% across the board, well within margin of error.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 216 ++++++++++++++++++++++++++--------------------
 1 file changed, 121 insertions(+), 95 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index e05a471b4e..f6dfddd57e 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -19,6 +19,7 @@
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/refstruct.h"
@@ -30,6 +31,7 @@
 typedef struct SwsOpPass {
     SwsCompiledOp comp;
     SwsOpExec exec_base;
+    SwsOpExec exec_tail;
     int num_blocks;
     int tail_off_in;
     int tail_off_out;
@@ -45,6 +47,8 @@ typedef struct SwsOpPass {
     bool memcpy_first;
     bool memcpy_last;
     bool memcpy_out;
+    uint8_t *tail_buf; /* extra memory for fixing unpadded tails */
+    unsigned int tail_buf_size;
 } SwsOpPass;
 
 int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
@@ -113,6 +117,7 @@ static void op_pass_free(void *ptr)
     ff_sws_compiled_op_unref(&p->comp);
     av_refstruct_unref(&p->offsets_y);
     av_free(p->exec_base.in_bump_y);
+    av_free(p->tail_buf);
     av_free(p);
 }
 
@@ -136,20 +141,15 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
     SwsOpPass *p = pass->priv;
     SwsOpExec *exec = &p->exec_base;
     const SwsCompiledOp *comp = &p->comp;
-    const int block_size = comp->block_size;
-    p->num_blocks = (pass->width + block_size - 1) / block_size;
 
     /* Set up main loop parameters */
-    const int aligned_w  = p->num_blocks * block_size;
-    const int safe_width = (p->num_blocks - 1) * block_size;
-    const int tail_size  = pass->width - safe_width;
-    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
-    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
-    p->tail_size_in  = (tail_size * p->pixel_bits_in  + 7) >> 3;
-    p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;
-    p->memcpy_first  = false;
-    p->memcpy_last   = false;
-    p->memcpy_out    = false;
+    const int block_size = comp->block_size;
+    const int num_blocks = (pass->width + block_size - 1) / block_size;
+    const int aligned_w  = num_blocks * block_size;
+    p->num_blocks   = num_blocks;
+    p->memcpy_first = false;
+    p->memcpy_last  = false;
+    p->memcpy_out   = false;
 
     for (int i = 0; i < p->planes_in; i++) {
         const int idx        = p->idx_in[i];
@@ -160,6 +160,7 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
         const int plane_pad  = (comp->over_read + sub_x) >> sub_x;
         const int plane_size = plane_w * p->pixel_bits_in >> 3;
         const int total_size = plane_size + plane_pad;
+        const int loop_size  = num_blocks * exec->block_size_in;
         if (in->linesize[idx] >= 0) {
             p->memcpy_last |= total_size > in->linesize[idx];
         } else {
@@ -167,6 +168,7 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
         }
         exec->in[i]        = in->data[idx];
         exec->in_stride[i] = in->linesize[idx];
+        exec->in_bump[i]   = in->linesize[idx] - loop_size;
         exec->in_sub_y[i]  = sub_y;
         exec->in_sub_x[i]  = sub_x;
     }
@@ -179,88 +181,76 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
         const int plane_w    = (aligned_w + sub_x) >> sub_x;
         const int plane_pad  = (comp->over_write + sub_x) >> sub_x;
         const int plane_size = plane_w * p->pixel_bits_out >> 3;
+        const int loop_size  = num_blocks * exec->block_size_out;
         p->memcpy_out |= plane_size + plane_pad > FFABS(out->linesize[idx]);
         exec->out[i]        = out->data[idx];
         exec->out_stride[i] = out->linesize[idx];
+        exec->out_bump[i]   = out->linesize[idx] - loop_size;
         exec->out_sub_y[i]  = sub_y;
         exec->out_sub_x[i]  = sub_x;
     }
 
-    /* Pre-fill pointer bump for the main section only; this value does not
-     * matter at all for the tail / last row handlers because they only ever
-     * process a single line */
-    const int blocks_main = p->num_blocks - p->memcpy_out;
-    for (int i = 0; i < 4; i++) {
-        exec->in_bump[i]  = exec->in_stride[i]  - blocks_main * 
exec->block_size_in;
-        exec->out_bump[i] = exec->out_stride[i] - blocks_main * 
exec->block_size_out;
-    }
-
-    return 0;
-}
+    const bool memcpy_in = p->memcpy_first || p->memcpy_last;
+    if (!memcpy_in && !p->memcpy_out)
+        return 0;
 
-/* Dispatch kernel over the last column of the image using memcpy */
-static av_always_inline void
-handle_tail(const SwsOpPass *p, SwsOpExec *exec,
-            const bool copy_out, const bool copy_in,
-            int y, const int h)
-{
-    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
+    /* Set-up tail section parameters and buffers */
+    SwsOpExec *tail = &p->exec_tail;
+    const int align = av_cpu_max_align();
+    size_t alloc_size = 0;
+    *tail = *exec;
 
-    const SwsOpExec *base = &p->exec_base;
-    const SwsCompiledOp *comp = &p->comp;
-    const int tail_size_in  = p->tail_size_in;
-    const int tail_size_out = p->tail_size_out;
-    const int bx = p->num_blocks - 1;
-
-    const uint8_t *in_data[4];
-    uint8_t *out_data[4];
-    get_row_data(p, y, in_data, out_data);
+    const int safe_width = (num_blocks - 1) * block_size;
+    const int tail_size  = pass->width - safe_width;
+    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
+    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
+    p->tail_size_in  = (tail_size * p->pixel_bits_in  + 7) >> 3;
+    p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;
 
-    for (int i = 0; i < p->planes_in; i++) {
-        in_data[i] += p->tail_off_in;
-        if (copy_in) {
-            exec->in[i] = (void *) tmp[0][i];
-            exec->in_stride[i] = sizeof(tmp[0][i]);
-        } else {
-            exec->in[i] = in_data[i];
-        }
+    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
+        size_t block_size = (comp->block_size * p->pixel_bits_in + 7) >> 3;
+        block_size += comp->over_read;
+        block_size = FFMAX(block_size, p->tail_size_in);
+        tail->in_stride[i] = FFALIGN(block_size, align);
+        tail->in_bump[i] = tail->in_stride[i] - exec->block_size_in;
+        alloc_size += tail->in_stride[i] * in->height;
     }
 
-    for (int i = 0; i < p->planes_out; i++) {
-        out_data[i] += p->tail_off_out;
-        if (copy_out) {
-            exec->out[i] = (void *) tmp[1][i];
-            exec->out_stride[i] = sizeof(tmp[1][i]);
-        } else {
-            exec->out[i] = out_data[i];
-        }
+    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
+        size_t block_size = (comp->block_size * p->pixel_bits_out + 7) >> 3;
+        block_size += comp->over_write;
+        block_size = FFMAX(block_size, p->tail_size_out);
+        tail->out_stride[i] = FFALIGN(block_size, align);
+        tail->out_bump[i] = tail->out_stride[i] - exec->block_size_out;
+        alloc_size += tail->out_stride[i] * out->height;
     }
 
-    for (int y_end = y + h; y < y_end; y++) {
-        if (copy_in) {
-            for (int i = 0; i < p->planes_in; i++) {
-                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
-                memcpy(tmp[0][i], in_data[i], tail_size_in);
-                in_data[i] += base->in_stride[i]; /* exec->in_stride was 
clobbered */
-            }
-        }
+    uint8_t *tail_buf = av_fast_realloc(p->tail_buf, &p->tail_buf_size, 
alloc_size);
+    if (!tail_buf)
+        return AVERROR(ENOMEM);
+    p->tail_buf = tail_buf;
 
-        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);
+    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
+        tail->in[i] = tail_buf;
+        tail_buf += tail->in_stride[i] * in->height;
+    }
 
-        if (copy_out) {
-            for (int i = 0; i < p->planes_out; i++) {
-                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
-                memcpy(out_data[i], tmp[1][i], tail_size_out);
-                out_data[i] += base->out_stride[i];
-            }
-        }
+    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
+        tail->out[i] = tail_buf;
+        tail_buf += tail->out_stride[i] * out->height;
+    }
 
-        for (int i = 0; i < 4; i++) {
-            if (!copy_in && exec->in[i])
-                exec->in[i] += exec->in_stride[i];
-            if (!copy_out && exec->out[i])
-                exec->out[i] += exec->out_stride[i];
-        }
+    return 0;
+}
+
+static void copy_lines(uint8_t *dst, const size_t dst_stride,
+                       const uint8_t *src, const size_t src_stride,
+                       const int h, const size_t bytes)
+{
+    for (int y = 0; y < h; y++) {
+        memcpy(dst, src, bytes);
+        dst += dst_stride;
+        src += src_stride;
     }
 }
 
@@ -287,35 +277,71 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
      * 2. We can overwrite the output, as long as we don't write more than the
      *    amount of pixels that fit into one linesize. So we always need to
      *    memcpy the last column on the output side if unpadded.
-     *
-     * 3. For the last row, we also need to memcpy the remainder of the input,
-     *    to avoid reading past the end of the buffer. Note that since we know
-     *    the run() function is called on stripes of the same buffer, we don't
-     *    need to worry about this for the end of a slice.
      */
 
     const bool memcpy_in  = p->memcpy_last && y + h == pass->height ||
                             p->memcpy_first && y == 0;
     const bool memcpy_out = p->memcpy_out;
     const int num_blocks  = p->num_blocks;
-    const int blocks_main = num_blocks - memcpy_out;
-    const int h_main      = h - memcpy_in;
 
-    /* Handle main section */
     get_row_data(p, y, exec.in, exec.out);
-    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);
+    if (!memcpy_in && !memcpy_out) {
+        /* Fast path (fully aligned/padded inputs and outputs) */
+        comp->func(&exec, comp->priv, 0, y, num_blocks, y + h);
+        return;
+    }
 
-    if (memcpy_in) {
-        /* Safe part of last row */
-        get_row_data(p, y + h_main, exec.in, exec.out);
-        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
+    /* Non-aligned case (slow path); process num_blocks - 1 main blocks and
+     * a separate tail (via memcpy into an appropriately padded buffer) */
+    for (int i = 0; i < 4; i++) {
+        /* We process one fewer block, so the in_bump needs to be increased
+         * to reflect that the plane pointers are left on the last block,
+         * not the end of the processed line, after each loop iteration */
+        exec.in_bump[i]  += exec.block_size_in;
+        exec.out_bump[i] += exec.block_size_out;
     }
 
-    /* Handle last column via memcpy, takes over `exec` so call these last */
-    if (memcpy_out)
-        handle_tail(p, &exec, true, false, y, h_main);
-    if (memcpy_in)
-        handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
+    comp->func(&exec, comp->priv, 0, y, num_blocks - 1, y + h);
+
+    DECLARE_ALIGNED_32(SwsOpExec, tail) = p->exec_tail;
+    tail.slice_y = y;
+    tail.slice_h = h;
+
+    for (int i = 0; i < p->planes_in; i++) {
+        exec.in[i] += p->tail_off_in;
+        tail.in[i] += y * tail.in_stride[i];
+    }
+    for (int i = 0; i < p->planes_out; i++) {
+        exec.out[i] += p->tail_off_out;
+        tail.out[i] += y * tail.out_stride[i];
+    }
+
+    for (int i = 0; i < p->planes_in; i++) {
+        if (memcpy_in) {
+            copy_lines((uint8_t *) tail.in[i], tail.in_stride[i],
+                       exec.in[i], exec.in_stride[i], h, p->tail_size_in);
+        } else {
+            /* Reuse input pointers directly */
+            tail.in[i]        = exec.in[i];
+            tail.in_stride[i] = exec.in_stride[i];
+            tail.in_bump[i]   = exec.in_stride[i] - exec.block_size_in;
+        }
+    }
+
+    for (int i = 0; !memcpy_out && i < p->planes_out; i++) {
+        /* Reuse output pointers directly */
+        tail.out[i]        = exec.out[i];
+        tail.out_stride[i] = exec.out_stride[i];
+        tail.out_bump[i]   = exec.out_stride[i] - exec.block_size_out;
+    }
+
+    /* Dispatch kernel over tail */
+    comp->func(&tail, comp->priv, num_blocks - 1, y, num_blocks, y + h);
+
+    for (int i = 0; memcpy_out && i < p->planes_out; i++) {
+        copy_lines(exec.out[i], exec.out_stride[i],
+                   tail.out[i], tail.out_stride[i], h, p->tail_size_out);
+    }
 }
 
 static int rw_planes(const SwsOp *op)

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 10/31: swscale/ops_dispatch: refactor tail handling

Reply via email to