PR #21203 opened by averne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203.patch

This introduces more aggressive bitstream caching and saves the
codebook/scan tables in shared memory, to reduce the overhead of hitting
global memory. A standalone sketch of both ideas follows the numbers below.
It gives a nice speedup on AMD, but a less significant one on NVIDIA/Intel:

For a 4k, 422p10 file:
- AMD 6700XT: 18% (249 vs 211 fps)
- NVIDIA RTX 3050: 4% (98 vs 94 fps)
- Intel Tiger Lake GT2: 12% (38 vs 34 fps)
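
For readers who have not looked at the shader side, here is a minimal,
self-contained sketch of the two ideas combined. None of the names below are
the FFmpeg ones: it uses plain uvec4 instead of the u32vec4 typedef, and the
64-wide workgroup and 8-entry table are made up purely for illustration.

    #version 450

    layout(local_size_x = 64) in;

    layout(std430, set = 0, binding = 0) readonly buffer Bitstream { uvec4 words[]; } bs;
    layout(std430, set = 0, binding = 1) buffer Result { uint sink[]; } res;

    /* 1) Bitstream caching: each invocation stages WORDS_PER_INVOC 16-byte
     *    vectors into shared memory on refill, so per-symbol bit reads hit
     *    on-chip storage instead of global memory. */
    const uint WORDS_PER_INVOC = 4u;
    shared uvec4 bit_cache[gl_WorkGroupSize.x * WORDS_PER_INVOC];

    /* 2) Lookup tables: copied from compile-time constants into shared
     *    memory, so the per-coefficient table loads stay on-chip as well. */
    const uint k_table[8] = uint[8](1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u);
    shared uint table_smem[8];

    void main()
    {
        for (uint i = 0u; i < WORDS_PER_INVOC; ++i)
            bit_cache[gl_LocalInvocationIndex * WORDS_PER_INVOC + i] =
                bs.words[gl_GlobalInvocationID.x * WORDS_PER_INVOC + i];

        table_smem = k_table;

        /* A real decoder would now read bits from bit_cache and index
         * table_smem in its symbol loop; just touch both here. */
        res.sink[gl_GlobalInvocationID.x] =
            bit_cache[gl_LocalInvocationIndex * WORDS_PER_INVOC].x + table_smem[0];
    }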


From 540834ad991af1144d4b6d8d34737eff488fa303 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:01:45 +0100
Subject: [PATCH 1/3] lavc/vulkan/common: allow configurable bitstream caching
 in shared memory

---
 libavcodec/vulkan/common.comp | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index d50e629f06..1e34c9bab2 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -229,13 +229,14 @@ struct GetBitContext {
         gb.bits_valid += 32;                                      \
     }
 #else /* GET_BITS_SMEM */
-shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z];
+shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z*GET_BITS_SMEM];
 
-#define FILL_SMEM()                                      \
-    {                                                    \
-        u32vec4buf ptr = u32vec4buf(gb.buf);             \
-        gb_storage[gl_LocalInvocationIndex] = ptr[0].v;  \
-        gb.cur_smem_pos = 0;                             \
+#define FILL_SMEM()                                                         \
+    {                                                                       \
+        u32vec4buf ptr = u32vec4buf(gb.buf);                                \
+        [[unroll]] for (uint i = 0; i < GET_BITS_SMEM; ++i)                 \
+            gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM+i] = ptr[i].v; \
+        gb.cur_smem_pos = 0;                                                \
     }
 
 #define LOAD64()                                                    \
@@ -251,15 +252,15 @@ shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize
         FILL_SMEM();                                                \
     }
 
-#define RELOAD32()                                                         \
-    {                                                                      \
-        if (gb.cur_smem_pos >= 4)                                          \
-            FILL_SMEM();                                                   \
-        uint v = gb_storage[gl_LocalInvocationIndex][gb.cur_smem_pos];     \
-        gb.buf += 4;                                                       \
-        gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits; \
-        gb.bits_valid += 32;                                               \
-        gb.cur_smem_pos += 1;                                              \
+#define RELOAD32()                                                                   \
+    {                                                                                \
+        if (gb.cur_smem_pos >= 4*GET_BITS_SMEM)                                      \
+            FILL_SMEM();                                                             \
+        uint v = gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM][gb.cur_smem_pos]; \
+        gb.buf += 4;                                                                 \
+        gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits;           \
+        gb.bits_valid += 32;                                                         \
+        gb.cur_smem_pos += 1;                                                        \
     }
 #endif /* GET_BITS_SMEM */
 
-- 
2.49.1
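
In other words, GET_BITS_SMEM goes from a bare presence flag to a count of
u32vec4 slots per invocation: FILL_SMEM now stages GET_BITS_SMEM vectors per
refill, and RELOAD32 only falls back to it after 4*GET_BITS_SMEM 32-bit words
have been consumed. A standalone way to picture that cadence (illustrative
only: plain uvec4, made-up workgroup size and word count, and not the
decoder's actual indexing):

    #version 450

    layout(local_size_x = 32) in;

    /* Any positive count works; patch 2 below sets it to 4 for ProRes. */
    #define GET_BITS_SMEM 2

    layout(std430, set = 0, binding = 0) readonly buffer Src { uvec4 words[]; } src;
    layout(std430, set = 0, binding = 1) buffer Dst { uint data[]; } dst;

    shared uvec4 cache[gl_WorkGroupSize.x * GET_BITS_SMEM];

    void main()
    {
        const uint total = 64u;                      /* 32-bit words per invocation */
        uint base = gl_GlobalInvocationID.x * total; /* offset, in 32-bit words */

        for (uint consumed = 0u; consumed < total; ++consumed) {
            uint slot = consumed % (4u * GET_BITS_SMEM);
            if (slot == 0u) {
                /* Refill: one 16*GET_BITS_SMEM-byte burst from global memory */
                for (uint i = 0u; i < GET_BITS_SMEM; ++i)
                    cache[gl_LocalInvocationIndex * GET_BITS_SMEM + i] =
                        src.words[(base + consumed) / 4u + i];
            }
            /* All reads in between are served from shared memory */
            uint v = cache[gl_LocalInvocationIndex * GET_BITS_SMEM + slot / 4u][slot % 4u];
            dst.data[base + consumed] = v;           /* stand-in for "decode with v" */
        }
    }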


From 04f0ab1992e5b019a9bb4e6f0c9b6130339cc575 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:05:07 +0100
Subject: [PATCH 2/3] vulkan/prores: increase bitstream caching

Now caches 64 B of data when the reader hits the refill codepath.
---
 libavcodec/vulkan_prores.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index 338c09d46f..da10f93548 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -405,7 +405,7 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
                           local_size >> 16 & 0xff, local_size >> 8 & 0xff, local_size >> 0 & 0xff,
                           0));
 
-    av_bprintf(&shd->src, "#define GET_BITS_SMEM\n");
+    av_bprintf(&shd->src, "#define GET_BITS_SMEM %d\n", 4);
 
     if (interlaced)
         av_bprintf(&shd->src, "#define INTERLACED\n");
-- 
2.49.1
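
For reference, the 64 B figure is GET_BITS_SMEM * sizeof(u32vec4) = 4 * 16 B
per invocation. A compilable sketch of the resulting shared-memory footprint;
the 8x8 local size is only a placeholder, the real one comes from the packed
local_size value in init_shader():

    #version 450

    layout(local_size_x = 8, local_size_y = 8) in;

    #define GET_BITS_SMEM 4

    /* Per invocation: GET_BITS_SMEM * 16 B = 64 B.
     * Per 8x8 workgroup: 64 invocations * 64 B = 4 KiB of shared memory,
     * well below the 16 KiB minimum Vulkan guarantees for
     * maxComputeSharedMemorySize. */
    shared uvec4 gb_storage[gl_WorkGroupSize.x * gl_WorkGroupSize.y *
                            gl_WorkGroupSize.z * GET_BITS_SMEM];

    void main()
    {
        gb_storage[gl_LocalInvocationIndex * GET_BITS_SMEM] = uvec4(0u);
    }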


From d33674642084a857b348cc8a256ad5cfd4b67e42 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:13:11 +0100
Subject: [PATCH 3/3] vulkan/prores: copy constant tables to shared memory

The shader needs ~3 loads per DCT coeff.
This data was not observed to be cached efficiently
in the upper cache levels; loading it explicitly into
shared memory fixes that.
---
 libavcodec/vulkan/prores_vld.comp | 106 +++++++++++++++++-------------
 1 file changed, 59 insertions(+), 47 deletions(-)

diff --git a/libavcodec/vulkan/prores_vld.comp b/libavcodec/vulkan/prores_vld.comp
index 298a5baf4c..30d5dcb04d 100644
--- a/libavcodec/vulkan/prores_vld.comp
+++ b/libavcodec/vulkan/prores_vld.comp
@@ -19,6 +19,58 @@
 #define U8(x)  (uint8_t (x))
 #define U16(x) (uint16_t(x))
 
+/**
+ * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
+ * According to the SMPTE document, abs(prev_dc_diff) should be used
+ * to index the table, duplicating the entries removes the abs operation.
+ */
+const uint16_t k_dc_codebook[] = { U16(0x100),
+                                   U16(0x210), U16(0x210),
+                                   U16(0x321), U16(0x321),
+                                   U16(0x430), U16(0x430), };
+
+/* Table 10 */
+const uint16_t k_ac_run_codebook  [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
+                                         U16(0x100), U16(0x211), U16(0x211), U16(0x211),
+                                         U16(0x211), U16(0x210), U16(0x210), U16(0x210),
+                                         U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
+/* Table 11 */
+const uint16_t k_ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
+                                         U16(0x210), U16(0x210), U16(0x210), U16(0x210),
+                                         U16(0x320) };
+
+#ifndef INTERLACED
+    /* Figure 4, encoded as (x << 0) | (y << 4) */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
+        U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
+        U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
+        U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
+        U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
+        U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
+        U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
+        U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
+    };
+#else
+    /* Figure 5 */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
+        U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
+        U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
+        U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
+        U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
+        U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
+        U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
+        U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
+    };
+#endif
+
+shared uint16_t dc_codebook      [k_dc_codebook      .length()],
+                ac_run_codebook  [k_ac_run_codebook  .length()],
+                ac_level_codebook[k_ac_level_codebook.length()];
+
+shared uint8_t  scan_tbl[k_scan_tbl.length()];
+
 void put_px(uint tex_idx, ivec2 pos, uint v)
 {
 #ifndef INTERLACED
@@ -72,16 +124,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
         uint c = to_signed(decode_codeword(gb, 0x650));
         put_px(gid.z, base_pos, c);
 
-        /**
-         * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
-         * According to the SMPTE document, abs(prev_dc_diff) should be used
-         * to index the table, duplicating the entries removes the abs operation.
-         */
-        const uint16_t dc_codebook[] = { U16(0x100),
-                                         U16(0x210), U16(0x210),
-                                         U16(0x321), U16(0x321),
-                                         U16(0x430), U16(0x430), };
-
         uint cw = 5, prev_dc_diff = 0;
         for (int i = 1; i < num_blocks; ++i) {
             cw = decode_codeword(gb, dc_codebook[min(cw, 6)]);
@@ -95,43 +137,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
 
     /* 7.1.1.4 AC Coefficients */
     {
-        /* Table 10 */
-        const uint16_t ac_run_codebook  [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
-                                               U16(0x100), U16(0x211), U16(0x211), U16(0x211),
-                                               U16(0x211), U16(0x210), U16(0x210), U16(0x210),
-                                               U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
-
-        /* Table 11 */
-        const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
-                                               U16(0x210), U16(0x210), U16(0x210), U16(0x210),
-                                               U16(0x320) };
-
-#ifndef INTERLACED
-        /* Figure 4, encoded as (x << 0) | (y << 4) */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
-            U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
-            U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
-            U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
-            U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
-            U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
-            U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
-            U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
-        };
-#else
-        /* Figure 5 */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
-            U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
-            U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
-            U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
-            U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
-            U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
-            U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
-            U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
-        };
-#endif
-
         uint block_mask  = num_blocks - 1;
         uint block_shift = findLSB(num_blocks);
 
@@ -276,6 +281,13 @@ void main(void)
     if (left_bits(gb) == 0)
         return;
 
+    /* Copy constant tables to local memory */
+    dc_codebook       = k_dc_codebook;
+    ac_run_codebook   = k_ac_run_codebook;
+    ac_level_codebook = k_ac_level_codebook;
+
+    scan_tbl = k_scan_tbl;
+
     /**
      * 4 ProRes Frame Structure
      * ProRes tiles pictures into a grid of slices, whose size is determined
-- 
2.49.1
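
The "~3 loads per DCT coeff" are roughly one run-codebook, one level-codebook
and one scan-table lookup per coefficient, which is why keeping those arrays
on-chip pays off. A minimal standalone illustration of the copy-then-index
pattern, with hypothetical 4-entry tables standing in for the real ones:

    #version 450

    layout(local_size_x = 64) in;

    layout(std430, set = 0, binding = 0) buffer Coeffs { uint data[]; } coeffs;

    /* Hypothetical stand-ins for the run/level codebooks and the scan pattern */
    const uint k_run_tbl[4]   = uint[4](0x102u, 0x101u, 0x100u, 0x211u);
    const uint k_level_tbl[4] = uint[4](0x202u, 0x101u, 0x102u, 0x100u);
    const uint k_scan_tbl[4]  = uint[4](0x00u, 0x01u, 0x10u, 0x11u);

    shared uint run_tbl[4], level_tbl[4], scan_tbl[4];

    void main()
    {
        /* Whole-array copies into shared memory before the decode loop,
         * the same pattern the patch uses for its constant tables. */
        run_tbl   = k_run_tbl;
        level_tbl = k_level_tbl;
        scan_tbl  = k_scan_tbl;

        /* Per "coefficient": one run, one level and one scan lookup, now all
         * served from on-chip shared memory. */
        uint i = gl_GlobalInvocationID.x % 4u;
        coeffs.data[gl_GlobalInvocationID.x] = run_tbl[i] + level_tbl[i] + scan_tbl[i];
    }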
