PR #22418 opened by indecisive_turtle URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22418 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22418.patch
Instead of loading individual 2-byte coefficients from global memory, use the maximum amount of data that most modern ISAs can load with a single instruction (i.e. buffer_load_b128). This provides close to a 5x improvement on my AMD 890M iGPU, with encode time dropping from 3 minutes to 36 seconds. The next step would be to preload coefficients into shared memory with subgroup coalesced loads, but this is an easier change for now. >From 7afd7eda0e4a3f3a121ee516aa94758e43e4ef4c Mon Sep 17 00:00:00 2001 From: raphaelthegreat <[email protected]> Date: Fri, 6 Mar 2026 10:14:25 +0200 Subject: [PATCH] lavc: Batch load coefficients with 16-byte loads Provides large improvement in memory bound dispatches --- .../vulkan/prores_ks_encode_slice.comp.glsl | 30 +++++++++++++-- .../vulkan/prores_ks_estimate_slice.comp.glsl | 37 +++++++++++++++---- 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl index 7105ad8dae..fcddd9408c 100644 --- a/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl +++ b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl @@ -38,6 +38,11 @@ struct SliceData { int16_t coeffs[4][8 * 256]; }; +struct SliceDataX8 { + uint32_t mbs_per_slice; + i16vec4 coeffs[4][8 * 256 / 8][2]; +}; + struct SliceScore { ivec4 bits[16]; ivec4 score[16]; @@ -56,6 +61,9 @@ layout(push_constant, scalar) uniform EncodeSliceInfo { layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer { SliceData slices[]; }; +layout (set = 0, binding = 0, scalar) readonly buffer SliceBufferX8 { + SliceDataX8 slices_x8[]; +}; layout (set = 0, binding = 1, scalar) readonly buffer SliceScores { SliceScore scores[]; }; @@ -65,6 +73,21 @@ layout (set = 0, binding = 2, scalar) uniform ProresDataTables { #define CFACTOR_Y444 3 +i16vec4 batch[2]; +int batch_idx = 0; +int idx = 8; + +int16_t load_next_coeff(uint slice, uint plane) +{ + if (idx >= 8) { + batch = 
slices_x8[slice].coeffs[plane][batch_idx++]; + idx = 0; + } + int16_t coeff = batch[idx >> 2][idx & 3]; + idx++; + return coeff; +} + void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val) { /* number of prefix bits to switch between Rice and expGolomb */ @@ -105,12 +128,12 @@ void encode_dcs(inout PutBitContext pb, bool is_chroma, int q) uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; int codebook = 5; int scale = is_chroma ? qmat_chroma[q][0] : qmat[q][0]; - int coeff = slices[slice].coeffs[plane][0]; + int coeff = load_next_coeff(slice, plane); int prev_dc = (coeff - 0x4000) / scale; encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc)); int sign = 0; for (int i = 1; i < blocks_per_slice; i++) { - coeff = slices[slice].coeffs[plane][i]; + coeff = load_next_coeff(slice, plane); int dc = (coeff - 0x4000) / scale; int delta = dc - prev_dc; int new_sign = GET_SIGN(delta); @@ -143,8 +166,7 @@ void encode_acs(inout PutBitContext pb, bool is_chroma, int q) for (uint i = 1; i < 64; i++) { int quant = is_chroma ? 
qmat_chroma[q][i] : qmat[q][i]; for (uint j = 0; j < blocks_per_slice; j++) { - uint idx = i * blocks_per_slice + j; - int coeff = slices[slice].coeffs[plane][idx]; + int coeff = load_next_coeff(slice, plane); int level = coeff / quant; if (level != 0) { int abs_level = abs(level); diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl index fdaa1c810a..531ffe2636 100644 --- a/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl +++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl @@ -41,6 +41,11 @@ struct SliceData { int16_t coeffs[4][8 * 256]; }; +struct SliceDataX8 { + uint32_t mbs_per_slice; + i16vec4 coeffs[4][8 * 256 / 8][2]; +}; + struct SliceScore { ivec4 bits[16]; ivec4 score[16]; @@ -54,6 +59,9 @@ struct SliceScore { layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer { SliceData slices[]; }; +layout (set = 0, binding = 0, scalar) readonly buffer SliceBufferX8 { + SliceDataX8 slices_x8[]; +}; layout (set = 0, binding = 1, scalar) writeonly buffer SliceScores { SliceScore scores[]; }; @@ -87,6 +95,21 @@ int estimate_vlc(uint codebook, int val) #define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0 +i16vec4 batch[2]; +int batch_idx = 0; +int idx = 8; + +int16_t load_next_coeff(uint slice, uint plane) +{ + if (idx >= 8) { + batch = slices_x8[slice].coeffs[plane][batch_idx++]; + idx = 0; + } + int16_t coeff = batch[idx >> 2][idx & 3]; + idx++; + return coeff; +} + int estimate_dcs(inout int error, uint slice, uint plane, uint q) { const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), U8(0x4D), U8(0x70), U8(0x70) }; @@ -94,14 +117,14 @@ int estimate_dcs(inout int error, uint slice, uint plane, uint q) uint blocks_per_mb = plane != 0 && chroma_factor != CFACTOR_Y444 ? 
2 : 4; uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; int codebook = 5; - int coeff = slices[slice].coeffs[plane][0]; + int coeff = load_next_coeff(slice, plane); int scale = plane != 0 ? qmat_chroma[q][0] : qmat[q][0]; int prev_dc = (coeff - 0x4000) / scale; int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc)); int sign = 0; for (int i = 1; i < blocks_per_slice; ++i) { - coeff = slices[slice].coeffs[plane][i]; + coeff = load_next_coeff(slice, plane); int dc = (coeff - 0x4000) / scale; error += abs(coeff - 0x4000) % scale; int delta = dc - prev_dc; @@ -140,8 +163,7 @@ int estimate_acs(inout int error, uint slice, uint plane, uint q) for (uint i = 1; i < 64; i++) { int quant = plane != 0 ? qmat_chroma[q][i] : qmat[q][i]; for (uint j = 0; j < blocks_per_slice; j++) { - uint idx = i * blocks_per_slice + j; - int coeff = slices[slice].coeffs[plane][idx]; + int coeff = load_next_coeff(slice, plane); int level = coeff / quant; error += abs(coeff) % quant; if (level != 0) { @@ -188,15 +210,16 @@ int estimate_alpha_plane(uint slice) const int mask = (1 << alpha_bits) - 1; const int num_coeffs = int(slices[slice].mbs_per_slice) * 256; int prev = mask, cur; - int idx = 0; + int idx = 1; int run = 0; int bits; - cur = slices[slice].coeffs[3][idx++]; + cur = load_next_coeff(slice, 3); bits = est_alpha_diff(cur, prev); prev = cur; do { - cur = slices[slice].coeffs[3][idx++]; + ++idx; + cur = load_next_coeff(slice, 3); if (cur != prev) { if (run == 0) bits++; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
