ctu: Put scratchbufs into union to save space

Andreas Rheinhardt via ffmpeg-cvslog Tue, 24 Mar 2026 10:47:08 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 92d06a8027649a2e823972115542418136a884bb
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Mar 20 07:06:43 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Mar 24 18:12:00 2026 +0100

    avcodec/vvc/ctu: Put scratchbufs into union to save space
    
    This reduces sizeof(VVCLocalContext) from 4580576B to
    3408032B here.
    
    Reviewed-by: Frank Plowman <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/vvc/ctu.h    | 35 +++++++++++++++++++++++++----------
 libavcodec/vvc/filter.c | 20 ++++++--------------
 libavcodec/vvc/inter.c  | 34 +++++++++++++++++-----------------
 3 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h
index b6f68d8432..59366a1756 100644
--- a/libavcodec/vvc/ctu.h
+++ b/libavcodec/vvc/ctu.h
@@ -87,6 +87,8 @@
 #define ALF_GRADIENT_SIZE       ((MAX_CU_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
 #define ALF_NUM_DIR             4
 
+#define ALF_MAX_BLOCKS_IN_CTU   (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE)
+#define ALF_MAX_FILTER_SIZE     (ALF_MAX_BLOCKS_IN_CTU * ALF_NUM_COEFF_LUMA)
 
 /**
  * Value of the luma sample at position (x, y) in the 2D array tab.
@@ -437,16 +439,29 @@ typedef struct VVCLocalContext {
 
     NeighbourAvailable na;
 
-    /* *2 for high bit depths */
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[EDGE_EMU_BUFFER_STRIDE * 
EDGE_EMU_BUFFER_STRIDE * 2];
-    DECLARE_ALIGNED(32, int16_t, tmp)[MAX_PB_SIZE * MAX_PB_SIZE];
-    DECLARE_ALIGNED(32, int16_t, tmp1)[MAX_PB_SIZE * MAX_PB_SIZE];
-    DECLARE_ALIGNED(32, int16_t, tmp2)[MAX_PB_SIZE * MAX_PB_SIZE];
-    DECLARE_ALIGNED(32, uint8_t, ciip_tmp)[MAX_PB_SIZE * MAX_PB_SIZE * 2];
-    DECLARE_ALIGNED(32, uint8_t, sao_buffer)[(MAX_CTU_SIZE + 2 * 
SAO_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
-    DECLARE_ALIGNED(32, uint8_t, alf_buffer_luma)[(MAX_CTU_SIZE + 2 * 
ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
-    DECLARE_ALIGNED(32, uint8_t, alf_buffer_chroma)[(MAX_CTU_SIZE + 2 * 
ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
-    DECLARE_ALIGNED(32, int32_t, alf_gradient_tmp)[ALF_GRADIENT_SIZE * 
ALF_GRADIENT_SIZE * ALF_NUM_DIR];
+    union {
+        struct {
+            /* *2 for high bit depths */
+            DECLARE_ALIGNED(32, uint8_t, 
edge_emu_buffer)[EDGE_EMU_BUFFER_STRIDE * EDGE_EMU_BUFFER_STRIDE * 2];
+            DECLARE_ALIGNED(32, int16_t, tmp)[MAX_PB_SIZE * MAX_PB_SIZE];
+            DECLARE_ALIGNED(32, int16_t, tmp1)[MAX_PB_SIZE * MAX_PB_SIZE];
+            union {
+                DECLARE_ALIGNED(32, int16_t, prof_tmp)[MAX_PB_SIZE * 
MAX_PB_SIZE];
+                DECLARE_ALIGNED(32, uint8_t, ciip_tmp)[MAX_PB_SIZE * 
MAX_PB_SIZE * 2];
+            };
+        } pred; ///< only accessed in ff_vvc_predict_inter() and ff_vvc_predict_ciip()
+                ///< during the inter and reconstruction stages
+        struct {
+            DECLARE_ALIGNED(32, uint8_t, buffer)[(MAX_CTU_SIZE + 2 * 
SAO_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
+        } sao; ///< only accessed in ff_vvc_sao_filter() during the sao processing stage
+        struct {
+            DECLARE_ALIGNED(32, uint8_t, buffer_luma)[(MAX_CTU_SIZE + 2 * 
ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
+            DECLARE_ALIGNED(32, uint8_t, buffer_chroma)[(MAX_CTU_SIZE + 2 * 
ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
+            DECLARE_ALIGNED(32, int32_t, gradient_tmp)[ALF_GRADIENT_SIZE * 
ALF_GRADIENT_SIZE * ALF_NUM_DIR];
+            DECLARE_ALIGNED(32, int16_t, coeff_tmp)[ALF_MAX_FILTER_SIZE];
+            DECLARE_ALIGNED(32, int16_t, clip_tmp)[ALF_MAX_FILTER_SIZE];
+        } alf; ///< only accessed in ff_vvc_alf_filter() during the alf processing stage
+    };
 } VVCLocalContext;
 
 typedef struct VVCAllowedSplit {
diff --git a/libavcodec/vvc/filter.c b/libavcodec/vvc/filter.c
index b99901e219..e3131cf808 100644
--- a/libavcodec/vvc/filter.c
+++ b/libavcodec/vvc/filter.c
@@ -20,8 +20,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
-
 #include "libavutil/frame.h"
 #include "libavutil/imgutils.h"
 
@@ -355,7 +353,7 @@ void ff_vvc_sao_filter(VVCLocalContext *lc, int x0, int y0)
             case SAO_EDGE:
             {
                 const ptrdiff_t dst_stride = 2 * MAX_PB_SIZE + 
AV_INPUT_BUFFER_PADDING_SIZE;
-                uint8_t *dst               = lc->sao_buffer + dst_stride + 
AV_INPUT_BUFFER_PADDING_SIZE;
+                uint8_t *dst               = lc->sao.buffer + dst_stride + 
AV_INPUT_BUFFER_PADDING_SIZE;
 
                 sao_extends_edges(dst, dst_stride, src, src_stride, width, 
height, fc, x0, y0, rx, ry, edges, c_idx);
 
@@ -990,9 +988,6 @@ static void alf_prepare_buffer(VVCFrameContext *fc, uint8_t 
*_dst, const uint8_t
     alf_fill_border_v(dst, dst_stride, src,  dst - (1 << ps), border_pixels, 
height, ps, edges, edges[RIGHT]);
 }
 
-#define ALF_MAX_BLOCKS_IN_CTU   (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE 
/ ALF_BLOCK_SIZE)
-#define ALF_MAX_FILTER_SIZE     (ALF_MAX_BLOCKS_IN_CTU * ALF_NUM_COEFF_LUMA)
-
 static void alf_get_coeff_and_clip(VVCLocalContext *lc, int16_t *coeff, 
int16_t *clip,
     const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, const ALFParams *alf)
 {
@@ -1018,7 +1013,7 @@ static void alf_get_coeff_and_clip(VVCLocalContext *lc, 
int16_t *coeff, int16_t
         class_to_filt     = ff_vvc_alf_aps_class_to_filt_map;
     }
     fc->vvcdsp.alf.classify(class_idx, transpose_idx, src, src_stride, width, 
height,
-        vb_pos, lc->alf_gradient_tmp);
+        vb_pos, lc->alf.gradient_tmp);
     fc->vvcdsp.alf.recon_coeff_and_clip(coeff, clip, class_idx, transpose_idx, 
size,
         coeff_set, clip_idx_set, class_to_filt);
 }
@@ -1029,11 +1024,8 @@ static void alf_filter_luma(VVCLocalContext *lc, uint8_t 
*dst, const uint8_t *sr
 {
     const VVCFrameContext *fc = lc->fc;
     int vb_pos                = _vb_pos - y0;
-    int16_t *coeff            = (int16_t*)lc->tmp;
-    int16_t *clip             = (int16_t *)lc->tmp1;
-
-    static_assert(ALF_MAX_FILTER_SIZE <= sizeof(lc->tmp), "VVCLocalContext.tmp 
too small");
-    static_assert(ALF_MAX_FILTER_SIZE * sizeof(int16_t) <= sizeof(lc->tmp1), 
"VVCLocalContext.tmp1 too small");
+    int16_t *coeff            = lc->alf.coeff_tmp;
+    int16_t *clip             = lc->alf.clip_tmp;
 
     alf_get_coeff_and_clip(lc, coeff, clip, src, src_stride, width, height, 
vb_pos, alf);
     fc->vvcdsp.alf.filter[LUMA](dst, dst_stride, src, src_stride, width, 
height, coeff, clip, vb_pos);
@@ -1217,7 +1209,7 @@ void ff_vvc_alf_filter(VVCLocalContext *lc, const int x0, 
const int y0)
             uint8_t *padded;
 
             if (alf->ctb_flag[c_idx] || (!c_idx && has_chroma && 
(alf->ctb_cc_idc[0] || alf->ctb_cc_idc[1]))) {
-                padded = (c_idx ? lc->alf_buffer_chroma : lc->alf_buffer_luma) 
+ padded_offset;
+                padded = (c_idx ? lc->alf.buffer_chroma : lc->alf.buffer_luma) 
+ padded_offset;
                 alf_prepare_buffer(fc, padded, src, x, y, rx, ry, width, 
height,
                     padded_stride, src_stride, c_idx, sb_edges[i]);
             }
@@ -1231,7 +1223,7 @@ void ff_vvc_alf_filter(VVCLocalContext *lc, const int x0, 
const int y0)
                 }
             }
             if (c_idx && alf->ctb_cc_idc[c_idx - 1]) {
-                padded = lc->alf_buffer_luma + padded_offset;
+                padded = lc->alf.buffer_luma + padded_offset;
                 alf_filter_cc(lc, src, padded, src_stride, padded_stride, 
c_idx,
                     width, height, hs, vs, ctu_end - sb->t - 
ALF_VB_POS_ABOVE_LUMA, alf);
             }
diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
index 7d6e79e49b..fe7ce722e5 100644
--- a/libavcodec/vvc/inter.c
+++ b/libavcodec/vvc/inter.c
@@ -273,7 +273,7 @@ static void mc(VVCLocalContext *lc, int16_t *dst, const 
VVCFrame *ref, const Mv
     x_off += mv->x >> (4 + hs);
     y_off += mv->y >> (4 + vs);
 
-    MC_EMULATED_EDGE(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, &src, &src_stride, x_off, 
y_off);
     fc->vvcdsp.inter.put[is_chroma][idx][!!my][!!mx](dst, src, src_stride, 
block_h, hf, vf, block_w);
 }
 
@@ -302,7 +302,7 @@ static void mc_uni(VVCLocalContext *lc, uint8_t *dst, const 
ptrdiff_t dst_stride
     x_off += mv->x >> (4 + hs);
     y_off += mv->y >> (4 + vs);
 
-    MC_EMULATED_EDGE(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, &src, &src_stride, x_off, 
y_off);
     if (derive_weight_uni(&denom, &wx, &ox, lc, mvf, c_idx)) {
         fc->vvcdsp.inter.put_uni_w[is_chroma][idx][!!my][!!mx](dst, 
dst_stride, src, src_stride,
             block_h, denom, wx, ox, hf, vf, block_w);
@@ -323,7 +323,7 @@ static void mc_bi(VVCLocalContext *lc, uint8_t *dst, const 
ptrdiff_t dst_stride,
     const int vs              = fc->ps.sps->vshift[c_idx];
     const int idx             = av_log2(block_w) - 1;
     const VVCFrame *refs[]    = { ref0, ref1 };
-    int16_t *tmp[]            = { lc->tmp + sb_bdof_flag * PROF_TEMP_OFFSET, 
lc->tmp1 + sb_bdof_flag * PROF_TEMP_OFFSET };
+    int16_t *tmp[]            = { lc->pred.tmp + sb_bdof_flag * 
PROF_TEMP_OFFSET, lc->pred.tmp1 + sb_bdof_flag * PROF_TEMP_OFFSET };
     const int is_chroma       = !!c_idx;
     const int hpel_if_idx     = is_chroma ? 0 : pu->mi.hpel_if_idx;
 
@@ -344,9 +344,9 @@ static void mc_bi(VVCLocalContext *lc, uint8_t *dst, const 
ptrdiff_t dst_stride,
             const int x_sb = x_off + (orig_mv->mv[i].x >> (4 + hs));
             const int y_sb = y_off + (orig_mv->mv[i].y >> (4 + vs));
 
-            MC_EMULATED_EDGE_DMVR(lc->edge_emu_buffer,  &src, &src_stride, 
x_sb, y_sb, ox, oy);
+            MC_EMULATED_EDGE_DMVR(lc->pred.edge_emu_buffer,  &src, 
&src_stride, x_sb, y_sb, ox, oy);
         } else {
-            MC_EMULATED_EDGE(lc->edge_emu_buffer, &src, &src_stride, ox, oy);
+            MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, &src, &src_stride, ox, 
oy);
         }
         fc->vvcdsp.inter.put[is_chroma][idx][!!my][!!mx](tmp[i],  src, 
src_stride, block_h, hf, vf, block_w);
         if (sb_bdof_flag)
@@ -415,7 +415,7 @@ static void emulated_edge_scaled(VVCLocalContext *lc, const 
uint8_t **src, ptrdi
     const int block_h       = *src_height = y_end - y_off + (y_end == y_last);
     const int wrap_enabled  = 0;
 
-    MC_EMULATED_EDGE(lc->edge_emu_buffer, src, src_stride, x_off, y_off);
+    MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, src, src_stride, x_off, y_off);
 }
 
 static void mc_scaled(VVCLocalContext *lc, int16_t *dst, const VVCRefPic 
*refp, const Mv *mv,
@@ -470,7 +470,7 @@ static void mc_bi_scaled(VVCLocalContext *lc, uint8_t *dst, 
const ptrdiff_t dst_
    const int x_off, const int y_off, const int block_w, const int block_h, 
const int c_idx)
 {
     const VVCRefPic *refps[]  = { refp0, refp1 };
-    int16_t *tmp[]            = { lc->tmp, lc->tmp1 };
+    int16_t *tmp[]            = { lc->pred.tmp, lc->pred.tmp1 };
 
     for (int i = L0; i <= L1; i++) {
         const Mv *mv          = mvf->mv + i;
@@ -492,7 +492,7 @@ static void luma_prof_uni(VVCLocalContext *lc, uint8_t 
*dst, const ptrdiff_t dst
     const VVCFrameContext *fc = lc->fc;
     const uint8_t *src        = ref->frame->data[LUMA];
     ptrdiff_t src_stride      = ref->frame->linesize[LUMA];
-    uint16_t *prof_tmp        = lc->tmp + PROF_TEMP_OFFSET;
+    uint16_t *prof_tmp        = lc->pred.tmp + PROF_TEMP_OFFSET;
     const int idx             = av_log2(block_w) - 1;
     const int lx              = mvf->pred_flag - PF_L0;
     const Mv *mv              = mvf->mv + lx;
@@ -508,7 +508,7 @@ static void luma_prof_uni(VVCLocalContext *lc, uint8_t 
*dst, const ptrdiff_t dst
     x_off += mv->x >> 4;
     y_off += mv->y >> 4;
 
-    MC_EMULATED_EDGE(lc->edge_emu_buffer, &src, &src_stride, x_off, y_off);
+    MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, &src, &src_stride, x_off, 
y_off);
     if (cb_prof_flag) {
         fc->vvcdsp.inter.put[LUMA][idx][!!my][!!mx](prof_tmp, src, src_stride, 
AFFINE_MIN_BLOCK_SIZE, hf, vf, AFFINE_MIN_BLOCK_SIZE);
         fc->vvcdsp.inter.fetch_samples(prof_tmp, src, src_stride, mx, my);
@@ -535,14 +535,14 @@ static void luma_prof(VVCLocalContext *lc, int16_t *dst, 
const VVCFrame *ref,
     const int oy              = y_off + (mv->y >> 4);
     const int idx             = av_log2(block_w) - 1;
     const int is_chroma       = 0;
-    uint16_t *prof_tmp        = lc->tmp2 + PROF_TEMP_OFFSET;
+    int16_t *prof_tmp         = lc->pred.prof_tmp + PROF_TEMP_OFFSET;
     ptrdiff_t src_stride      = ref->frame->linesize[LUMA];
     const uint8_t *src        = ref->frame->data[LUMA];
     const int8_t *hf          = 
ff_vvc_inter_luma_filters[VVC_INTER_LUMA_FILTER_TYPE_AFFINE][mx];
     const int8_t *vf          = 
ff_vvc_inter_luma_filters[VVC_INTER_LUMA_FILTER_TYPE_AFFINE][my];
     const int wrap_enabled    = fc->ps.pps->r->pps_ref_wraparound_enabled_flag;
 
-    MC_EMULATED_EDGE(lc->edge_emu_buffer, &src, &src_stride, ox, oy);
+    MC_EMULATED_EDGE(lc->pred.edge_emu_buffer, &src, &src_stride, ox, oy);
     if (!pu->cb_prof_flag[lx]) {
         fc->vvcdsp.inter.put[LUMA][idx][!!my][!!mx](dst, src, src_stride, 
block_h, hf, vf, block_w);
     } else {
@@ -557,7 +557,7 @@ static void luma_prof_bi(VVCLocalContext *lc, uint8_t *dst, 
const ptrdiff_t dst_
     const int block_w, const int block_h)
 {
     const VVCRefPic *refps[]  = { ref0, ref1 };
-    int16_t *tmp[]            = { lc->tmp, lc->tmp1 };
+    int16_t *tmp[]            = { lc->pred.tmp, lc->pred.tmp1 };
 
     for (int i = L0; i <= L1; i++) {
         const VVCRefPic *refp = refps[i];
@@ -608,7 +608,7 @@ static void pred_gpm_blk(VVCLocalContext *lc)
 
     const int c_end = fc->ps.sps->r->sps_chroma_format_idc ? 3 : 1;
 
-    int16_t *tmp[2] = {lc->tmp, lc->tmp1};
+    int16_t *tmp[2] = {lc->pred.tmp, lc->pred.tmp1};
 
     for (int c_idx = 0; c_idx < c_end; c_idx++) {
         const int hs         = fc->ps.sps->hshift[c_idx];
@@ -691,7 +691,7 @@ static void pred_regular(VVCLocalContext *lc, const MvField 
*mvf, const MvField
         const int h                  = sbh >> vs;
         const int is_luma            = !c_idx;
         const int do_ciip            = lc->cu->ciip_flag && (is_luma || (w > 
2));
-        uint8_t *inter               = do_ciip ? (uint8_t *)lc->ciip_tmp : dst;
+        uint8_t *inter               = do_ciip ? lc->pred.ciip_tmp : dst;
         const ptrdiff_t inter_stride = do_ciip ? (MAX_PB_SIZE * 
sizeof(uint16_t)) : dst_stride;
         const int do_bdof            = is_luma && sb_bdof_flag;
 
@@ -774,7 +774,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField 
*mvf, MvField *orig_mv,
     const VVCFrameContext *fc = lc->fc;
     const int sr_range        = 2;
     const VVCFrame *refs[]    = { ref0, ref1 };
-    int16_t *tmp[]            = { lc->tmp, lc->tmp1 };
+    int16_t *tmp[]            = { lc->pred.tmp, lc->pred.tmp1 };
     int sad[SAD_ARRAY_SIZE][SAD_ARRAY_SIZE];
     int min_dx, min_dy, min_sad, dx, dy;
 
@@ -794,7 +794,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField 
*mvf, MvField *orig_mv,
         const uint8_t *src      = ref->frame->data[LUMA];
         const int wrap_enabled  = 
fc->ps.pps->r->pps_ref_wraparound_enabled_flag;
 
-        MC_EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src, &src_stride, ox, 
oy);
+        MC_EMULATED_EDGE_BILINEAR(lc->pred.edge_emu_buffer, &src, &src_stride, 
ox, oy);
         fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, 
my, pred_w);
     }
 
@@ -808,7 +808,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField 
*mvf, MvField *orig_mv,
         for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) {
             for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) {
                 if (dx != sr_range || dy != sr_range) {
-                    sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, 
dy, block_w, block_h);
+                    sad[dy][dx] = fc->vvcdsp.inter.sad(lc->pred.tmp, 
lc->pred.tmp1, dx, dy, block_w, block_h);
                     if (sad[dy][dx] < min_sad) {
                         min_sad = sad[dy][dx];
                         min_dx = dx;

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/03: avcodec/vvc/ctu: Put scratchbufs into union to save space

Reply via email to