This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit e3daeff9650e3fa8c72526358a730a1fa69f410a
Author:     Niklas Haas <[email protected]>
AuthorDate: Thu Mar 26 23:53:19 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Mar 28 18:50:14 2026 +0100

    swscale/ops_dispatch: compute input x offset map for SwsOpExec
    
    This is cheap to precompute and can be used as-is for gather-style horizontal
    filter implementations.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c     | 49 +++++++++++++++++++++++++++++++++++++++----
 libswscale/ops_dispatch.h     | 18 ++++++++++++++--
 libswscale/x86/ops_common.asm |  1 +
 3 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 774a7e5641..70f6f7d24d 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -44,6 +44,7 @@ typedef struct SwsOpPass {
     int idx_in[4];
     int idx_out[4];
     int *offsets_y;
+    int filter_size;
     bool memcpy_first;
     bool memcpy_last;
     bool memcpy_out;
@@ -117,6 +118,7 @@ static void op_pass_free(void *ptr)
     ff_sws_compiled_op_unref(&p->comp);
     av_refstruct_unref(&p->offsets_y);
     av_free(p->exec_base.in_bump_y);
+    av_free(p->exec_base.in_offset_x);
     av_free(p->tail_buf);
     av_free(p);
 }
@@ -202,11 +204,18 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
 
     const int safe_width = (num_blocks - 1) * block_size;
     const int tail_size  = pass->width - safe_width;
-    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
     p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
-    p->tail_size_in  = (tail_size * p->pixel_bits_in  + 7) >> 3;
     p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;
 
+    if (exec->in_offset_x) {
+        p->tail_off_in  = exec->in_offset_x[safe_width];
+        p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
+        p->tail_size_in += (p->filter_size * p->pixel_bits_in + 7) >> 3;
+    } else {
+        p->tail_off_in  = safe_width * p->pixel_bits_in >> 3;
+        p->tail_size_in = (tail_size * p->pixel_bits_in + 7) >> 3;
+    }
+
     for (int i = 0; memcpy_in && i < p->planes_in; i++) {
         size_t block_size = (comp->block_size * p->pixel_bits_in + 7) >> 3;
         block_size += comp->over_read;
@@ -225,6 +234,14 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
         alloc_size += tail->out_stride[i] * out->height;
     }
 
+    if (memcpy_in && exec->in_offset_x) {
+        /* `in_offset_x` is indexed relative to the line start, not the start
+         * of the section being processed; so we need to over-allocate this
+         * array to the full width of the image, even though we will only
+         * partially fill in the offsets relevant to the tail region */
+        alloc_size += aligned_w * sizeof(*exec->in_offset_x);
+    }
+
     uint8_t *tail_buf = av_fast_realloc(p->tail_buf, &p->tail_buf_size, alloc_size);
     if (!tail_buf)
         return AVERROR(ENOMEM);
@@ -240,6 +257,12 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
         tail_buf += tail->out_stride[i] * out->height;
     }
 
+    if (memcpy_in && exec->in_offset_x) {
+        tail->in_offset_x = (int32_t *) tail_buf;
+        for (int i = safe_width; i < aligned_w; i++)
+            tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
+    }
+
     return 0;
 }
 
@@ -308,7 +331,9 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
     tail.slice_h = h;
 
     for (int i = 0; i < p->planes_in; i++) {
-        exec.in[i] += p->tail_off_in;
+        /* Input offsets are relative to the base pointer */
+        if (!exec.in_offset_x || memcpy_in)
+            exec.in[i] += p->tail_off_in;
         tail.in[i] += y * tail.in_stride[i];
     }
     for (int i = 0; i < p->planes_out; i++) {
@@ -397,8 +422,8 @@ static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
         p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
     }
 
+    const SwsFilterWeights *filter = read->rw.kernel;
     if (read->rw.filter == SWS_OP_FILTER_V) {
-        const SwsFilterWeights *filter = read->rw.kernel;
         p->offsets_y = av_refstruct_ref(filter->offsets);
 
         /* Compute relative pointer bumps for each output line */
@@ -416,6 +441,22 @@ static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
         }
         bump[filter->dst_size - 1] = 0;
         p->exec_base.in_bump_y = bump;
+    } else if (read->rw.filter == SWS_OP_FILTER_H) {
+        /* Compute pixel offset map for each output line */
+        const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
+        int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
+        if (!offset) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (int x = 0; x < filter->dst_size; x++)
+            offset[x] = filter->offsets[x] * p->pixel_bits_in >> 3;
+        for (int x = filter->dst_size; x < pixels; x++)
+            offset[x] = offset[filter->dst_size - 1];
+        p->exec_base.in_offset_x = offset;
+        p->exec_base.block_size_in = 0; /* ptr does not advance */
+        p->filter_size = filter->filter_size;
     }
 
     return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
diff --git a/libswscale/ops_dispatch.h b/libswscale/ops_dispatch.h
index bca4b886ef..a35e9be907 100644
--- a/libswscale/ops_dispatch.h
+++ b/libswscale/ops_dispatch.h
@@ -41,7 +41,13 @@ typedef struct SwsOpExec {
     ptrdiff_t in_stride[4];
     ptrdiff_t out_stride[4];
 
-    /* Pointer bump, difference between stride and processed line size */
+    /**
+     * Pointer bump, difference between stride and processed line size.
+     *
+     * Assumes that each read kernel increments pointers by the processed
+     * block size, except when using horizontal filtering, in which case
+     * this is always equal to the input stride.
+     */
     ptrdiff_t in_bump[4];
     ptrdiff_t out_bump[4];
 
@@ -64,12 +70,20 @@ typedef struct SwsOpExec {
      * multiplied by the corresponding line stride.
      */
     int32_t *in_bump_y;
+
+    /**
+     * Pixel offset map; for horizontal scaling, in bytes. Indexed by the x
+     * coordinate of the output pixel. This is always aligned up to a multiple
+     * of the block size, so implementations may safely over-read up to the
+     * next block boundary.
+     */
+    int32_t *in_offset_x;
 } SwsOpExec;
 
 static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) +
                                    6  * sizeof(int32_t) +
                                    16 * sizeof(uint8_t) +
-                                   1  * sizeof(void *),
+                                   2  * sizeof(void *),
               "SwsOpExec layout mismatch");
 
 /**
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
index c7cc460447..8e4ea6e977 100644
--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -142,6 +142,7 @@ struc SwsOpExec
     .in_sub_x4 resb 4
     .out_sub_x4 resb 4
     .in_bump_y resq 1
+    .in_offset_x resq 1
 endstruc
 
 struc SwsOpImpl

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to