PR #22418 opened by indecisive_turtle
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22418
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22418.patch

Instead of loading individual 2-byte coefficients from global memory, load the
largest amount of data that most modern ISAs can fetch with a single instruction
(16 bytes, e.g. buffer_load_b128).
This provides a close to 5x improvement on my AMD 890M iGPU, with encode time
dropping from 3 minutes to 36 seconds.

The next step would be to preload coefficients into shared memory with
subgroup-coalesced loads, but this is an easier change for now.


>From 7afd7eda0e4a3f3a121ee516aa94758e43e4ef4c Mon Sep 17 00:00:00 2001
From: raphaelthegreat <[email protected]>
Date: Fri, 6 Mar 2026 10:14:25 +0200
Subject: [PATCH] lavc: Batch load coefficients with 16-byte loads

Provides a large improvement in memory-bound dispatches.
---
 .../vulkan/prores_ks_encode_slice.comp.glsl   | 30 +++++++++++++--
 .../vulkan/prores_ks_estimate_slice.comp.glsl | 37 +++++++++++++++----
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl 
b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
index 7105ad8dae..fcddd9408c 100644
--- a/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
+++ b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
@@ -38,6 +38,11 @@ struct SliceData {
     int16_t coeffs[4][8 * 256];
 };
 
+struct SliceDataX8 {
+    uint32_t mbs_per_slice;
+    i16vec4 coeffs[4][8 * 256 / 8][2];
+};
+
 struct SliceScore {
     ivec4 bits[16];
     ivec4 score[16];
@@ -56,6 +61,9 @@ layout(push_constant, scalar) uniform EncodeSliceInfo {
 layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
     SliceData slices[];
 };
+layout (set = 0, binding = 0, scalar) readonly buffer SliceBufferX8 {
+    SliceDataX8 slices_x8[];
+};
 layout (set = 0, binding = 1, scalar) readonly buffer SliceScores {
     SliceScore scores[];
 };
@@ -65,6 +73,21 @@ layout (set = 0, binding = 2, scalar) uniform 
ProresDataTables {
 
 #define CFACTOR_Y444 3
 
+i16vec4 batch[2];
+int batch_idx = 0;
+int idx = 8;
+
+int16_t load_next_coeff(uint slice, uint plane)
+{
+    if (idx >= 8) {
+        batch = slices_x8[slice].coeffs[plane][batch_idx++];
+        idx = 0;
+    }
+    int16_t coeff = batch[idx >> 2][idx & 3];
+    idx++;
+    return coeff;
+}
+
 void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
 {
     /* number of prefix bits to switch between Rice and expGolomb */
@@ -105,12 +128,12 @@ void encode_dcs(inout PutBitContext pb, bool is_chroma, 
int q)
     uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
     int codebook = 5;
     int scale = is_chroma ? qmat_chroma[q][0] : qmat[q][0];
-    int coeff = slices[slice].coeffs[plane][0];
+    int coeff = load_next_coeff(slice, plane);
     int prev_dc = (coeff - 0x4000) / scale;
     encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc));
     int sign = 0;
     for (int i = 1; i < blocks_per_slice; i++) {
-        coeff = slices[slice].coeffs[plane][i];
+        coeff = load_next_coeff(slice, plane);
         int dc = (coeff - 0x4000) / scale;
         int delta = dc - prev_dc;
         int new_sign = GET_SIGN(delta);
@@ -143,8 +166,7 @@ void encode_acs(inout PutBitContext pb, bool is_chroma, int 
q)
     for (uint i = 1; i < 64; i++) {
         int quant = is_chroma ? qmat_chroma[q][i] : qmat[q][i];
         for (uint j = 0; j < blocks_per_slice; j++) {
-            uint idx = i * blocks_per_slice + j;
-            int coeff = slices[slice].coeffs[plane][idx];
+            int coeff = load_next_coeff(slice, plane);
             int level = coeff / quant;
             if (level != 0) {
                 int abs_level = abs(level);
diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl 
b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
index fdaa1c810a..531ffe2636 100644
--- a/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
+++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
@@ -41,6 +41,11 @@ struct SliceData {
     int16_t coeffs[4][8 * 256];
 };
 
+struct SliceDataX8 {
+    uint32_t mbs_per_slice;
+    i16vec4 coeffs[4][8 * 256 / 8][2];
+};
+
 struct SliceScore {
     ivec4 bits[16];
     ivec4 score[16];
@@ -54,6 +59,9 @@ struct SliceScore {
 layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
     SliceData slices[];
 };
+layout (set = 0, binding = 0, scalar) readonly buffer SliceBufferX8 {
+    SliceDataX8 slices_x8[];
+};
 layout (set = 0, binding = 1, scalar) writeonly buffer SliceScores {
     SliceScore scores[];
 };
@@ -87,6 +95,21 @@ int estimate_vlc(uint codebook, int val)
 
 #define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits 
= 0
 
+i16vec4 batch[2];
+int batch_idx = 0;
+int idx = 8;
+
+int16_t load_next_coeff(uint slice, uint plane)
+{
+    if (idx >= 8) {
+        batch = slices_x8[slice].coeffs[plane][batch_idx++];
+        idx = 0;
+    }
+    int16_t coeff = batch[idx >> 2][idx & 3];
+    idx++;
+    return coeff;
+}
+
 int estimate_dcs(inout int error, uint slice, uint plane, uint q)
 {
     const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), 
U8(0x4D), U8(0x70), U8(0x70) };
@@ -94,14 +117,14 @@ int estimate_dcs(inout int error, uint slice, uint plane, 
uint q)
     uint blocks_per_mb = plane != 0 && chroma_factor != CFACTOR_Y444 ? 2 : 4;
     uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
     int codebook = 5;
-    int coeff = slices[slice].coeffs[plane][0];
+    int coeff = load_next_coeff(slice, plane);
     int scale = plane != 0 ? qmat_chroma[q][0] : qmat[q][0];
     int prev_dc = (coeff - 0x4000) / scale;
     int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc));
     int sign = 0;
 
     for (int i = 1; i < blocks_per_slice; ++i) {
-        coeff = slices[slice].coeffs[plane][i];
+        coeff = load_next_coeff(slice, plane);
         int dc = (coeff - 0x4000) / scale;
         error += abs(coeff - 0x4000) % scale;
         int delta = dc - prev_dc;
@@ -140,8 +163,7 @@ int estimate_acs(inout int error, uint slice, uint plane, 
uint q)
     for (uint i = 1; i < 64; i++) {
         int quant = plane != 0 ? qmat_chroma[q][i] : qmat[q][i];
         for (uint j = 0; j < blocks_per_slice; j++) {
-            uint idx = i * blocks_per_slice + j;
-            int coeff = slices[slice].coeffs[plane][idx];
+            int coeff = load_next_coeff(slice, plane);
             int level = coeff / quant;
             error += abs(coeff) % quant;
             if (level != 0) {
@@ -188,15 +210,16 @@ int estimate_alpha_plane(uint slice)
     const int mask  = (1 << alpha_bits) - 1;
     const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
     int prev = mask, cur;
-    int idx = 0;
+    int idx = 1;
     int run = 0;
     int bits;
 
-    cur = slices[slice].coeffs[3][idx++];
+    cur = load_next_coeff(slice, 3);
     bits = est_alpha_diff(cur, prev);
     prev = cur;
     do {
-        cur = slices[slice].coeffs[3][idx++];
+        ++idx;
+        cur = load_next_coeff(slice, 3);
         if (cur != prev) {
             if (run == 0)
                 bits++;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to