From: IndecisiveTurtle <[email protected]>
Adds a Vulkan implementation of the reference prores_ks (Kostya) ProRes encoder.
Provides roughly a 3-4x speedup over the CPU code.
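
For reference, the encoder takes Vulkan hardware frames as input; a hypothetical
invocation (device index, pixel format and file names are placeholders, not taken
from this patch) would be along the lines of:

    ffmpeg -init_hw_device vulkan=vk:0 -filter_hw_device vk -i input.mov \
           -vf format=yuv422p10le,hwupload -c:v prores_ks_vulkan -profile:v hq output.mov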
---
configure | 1 +
libavcodec/Makefile | 1 +
libavcodec/allcodecs.c | 1 +
libavcodec/proresenc_kostya_vulkan.c | 1068 +++++++++++++++++
libavcodec/vulkan/Makefile | 7 +
libavcodec/vulkan/prores_ks_alpha_data.comp | 67 ++
libavcodec/vulkan/prores_ks_encode_slice.comp | 230 ++++
.../vulkan/prores_ks_estimate_slice.comp | 267 +++++
libavcodec/vulkan/prores_ks_slice_data.comp | 265 ++++
libavcodec/vulkan/prores_ks_trellis_node.comp | 177 +++
10 files changed, 2084 insertions(+)
create mode 100644 libavcodec/proresenc_kostya_vulkan.c
create mode 100644 libavcodec/vulkan/prores_ks_alpha_data.comp
create mode 100644 libavcodec/vulkan/prores_ks_encode_slice.comp
create mode 100644 libavcodec/vulkan/prores_ks_estimate_slice.comp
create mode 100644 libavcodec/vulkan/prores_ks_slice_data.comp
create mode 100644 libavcodec/vulkan/prores_ks_trellis_node.comp
diff --git a/configure b/configure
index 7ec4c3975b..4db8a7c581 100755
--- a/configure
+++ b/configure
@@ -3099,6 +3099,7 @@ prores_decoder_select="blockdsp idctdsp"
prores_encoder_select="fdctdsp"
prores_aw_encoder_select="fdctdsp"
prores_ks_encoder_select="fdctdsp"
+prores_ks_vulkan_encoder_select="vulkan spirv_compiler"
prores_raw_decoder_select="blockdsp idctdsp"
qcelp_decoder_select="lsp"
qdm2_decoder_select="mpegaudiodsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d8e1ac5a54..1964c787d7 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -639,6 +639,7 @@ OBJS-$(CONFIG_PRORES_DECODER) += proresdec.o proresdsp.o proresdata.o
OBJS-$(CONFIG_PRORES_ENCODER) += proresenc_anatoliy.o proresdata.o
OBJS-$(CONFIG_PRORES_AW_ENCODER) += proresenc_anatoliy.o proresdata.o
OBJS-$(CONFIG_PRORES_KS_ENCODER) += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o
OBJS-$(CONFIG_PRORES_RAW_DECODER) += prores_raw.o proresdsp.o proresdata.o
OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
OBJS-$(CONFIG_PROSUMER_DECODER) += prosumer.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index f5ec2e01e8..1b4a5f769c 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -269,6 +269,7 @@ extern const FFCodec ff_prores_encoder;
extern const FFCodec ff_prores_decoder;
extern const FFCodec ff_prores_aw_encoder;
extern const FFCodec ff_prores_ks_encoder;
+extern const FFCodec ff_prores_ks_vulkan_encoder;
extern const FFCodec ff_prores_raw_decoder;
extern const FFCodec ff_prosumer_decoder;
extern const FFCodec ff_psd_decoder;
diff --git a/libavcodec/proresenc_kostya_vulkan.c b/libavcodec/proresenc_kostya_vulkan.c
new file mode 100644
index 0000000000..6413b2f9d4
--- /dev/null
+++ b/libavcodec/proresenc_kostya_vulkan.c
@@ -0,0 +1,1068 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/buffer.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/vulkan_spirv.h"
+#include "libavutil/hwcontext_vulkan.h"
+#include "libavutil/vulkan_loader.h"
+#include "libavutil/vulkan.h"
+#include "avcodec.h"
+#include "codec.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "packet.h"
+#include "put_bits.h"
+#include "profiles.h"
+#include "bytestream.h"
+#include "proresdata.h"
+#include "proresenc_kostya_common.h"
+#include "hwconfig.h"
+
+#define DCTSIZE 8
+
+typedef struct ProresDataTables {
+ int16_t qmat[128][64];
+ int16_t qmat_chroma[128][64];
+ uint8_t scan[64];
+ uint8_t dc_codebook[7];
+ uint8_t run_to_cb[16];
+ uint8_t level_to_cb[10];
+} ProresDataTables;
+
+typedef struct SliceDataInfo {
+ int plane;
+ int pictures_per_frame;
+ int line_add;
+} SliceDataInfo;
+
+typedef struct EstimateSliceInfo {
+ int slices_per_picture;
+ int min_quant;
+ int max_quant;
+ int bits_per_mb;
+} EstimateSliceInfo;
+
+typedef struct EncodeSliceInfo {
+ VkDeviceAddress bytestream;
+ VkDeviceAddress seek_table;
+ int num_planes;
+ int slices_per_picture;
+ int max_quant;
+} EncodeSliceInfo;
+
+typedef struct TrellisNodeInfo {
+ int min_quant;
+ int max_quant;
+ int mbs_per_slice;
+ int bits_per_mb;
+} TrellisNodeInfo;
+
+#define TRELLIS_WIDTH 16
+#define SCORE_LIMIT (INT_MAX / 2)
+
+struct TrellisNode {
+ int prev_node;
+ int quant;
+ int bits;
+ int score;
+};
+
+typedef struct SliceData {
+ uint32_t mbs_per_slice;
+ int16_t rows[MAX_PLANES * MAX_MBS_PER_SLICE * 256];
+} SliceData;
+
+typedef struct SliceScore {
+ int bits[MAX_STORED_Q][4];
+ int error[MAX_STORED_Q][4];
+ int total_bits[MAX_STORED_Q];
+ int total_error[MAX_STORED_Q];
+ int overquant;
+ int buf_start;
+ int quant;
+} SliceScore;
+
+typedef struct VulkanEncodeProresFrameData {
+ /* Intermediate buffers */
+ AVBufferRef *out_data_ref[2];
+ AVBufferRef *slice_data_ref[2];
+ AVBufferRef *slice_score_ref[2];
+ AVBufferRef *frame_size_ref[2];
+
+ /* Copied from the source */
+ int64_t pts;
+ int64_t duration;
+ void *frame_opaque;
+ AVBufferRef *frame_opaque_ref;
+ enum AVColorTransferCharacteristic color_trc;
+ enum AVColorSpace colorspace;
+ enum AVColorPrimaries color_primaries;
+ int key_frame;
+ int flags;
+} VulkanEncodeProresFrameData;
+
+typedef struct ProresVulkanContext {
+ ProresContext ctx;
+
+ /* Vulkan state */
+ FFVulkanContext vkctx;
+ AVVulkanDeviceQueueFamily *qf;
+ FFVkExecPool e;
+ AVVulkanDeviceQueueFamily *transfer_qf;
+ FFVkExecPool transfer_exec_pool;
+ AVBufferPool *pkt_buf_pool;
+ AVBufferPool *slice_data_buf_pool;
+ AVBufferPool *slice_score_buf_pool;
+ AVBufferPool *frame_size_buf_pool;
+
+ FFVulkanShader alpha_data_shd;
+ FFVulkanShader slice_data_shd[2];
+ FFVulkanShader estimate_slice_shd;
+ FFVulkanShader encode_slice_shd;
+ FFVulkanShader trellis_node_shd;
+ FFVkBuffer prores_data_tables_buf;
+
+ int *slice_quants;
+ SliceScore *slice_scores;
+ ProresDataTables *tables;
+
+ int in_flight;
+ int async_depth;
+ AVFrame *frame;
+ VulkanEncodeProresFrameData *exec_ctx_info;
+} ProresVulkanContext;
+
+extern const char *ff_source_common_comp;
+extern const char *ff_source_prores_ks_alpha_data_comp;
+extern const char *ff_source_prores_ks_slice_data_comp;
+extern const char *ff_source_prores_ks_estimate_slice_comp;
+extern const char *ff_source_prores_ks_trellis_node_comp;
+extern const char *ff_source_prores_ks_encode_slice_comp;
+
+static int init_slice_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name,
+                                    int blocks_per_mb)
+{
+ int err = 0;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanDescriptorSetBinding *desc;
+
+ ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, DCTSIZE, blocks_per_mb, pv->ctx.mbs_per_slice, 0);
+
+ av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+ av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+ av_bprintf(&shd->src, "#define BLOCKS_PER_MB %d\n", blocks_per_mb);
+ av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; i16vec4 rows[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE][DCTSIZE / 4]; };\n");
+
+ desc = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "SliceBuffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceData slices[];",
+ },
+ {
+ .name = "planes",
+ .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .dimensions = 2,
+ .elems = 3,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "r16i",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(SliceDataInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+ GLSLD(ff_source_prores_ks_slice_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+ return err;
+}
+
+static int init_alpha_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+ FFVulkanShader* shd, const char* pl_name)
+{
+ int err = 0;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanDescriptorSetBinding *desc;
+
+ ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+ NULL, 0, DCTSIZE * 2, DCTSIZE * 2, 1, 0);
+
+ av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+ av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+ av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+ av_bprintf(&shd->src, "#define SLICES_PITCH %d\n", pv->ctx.slices_width);
+ av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+
+ desc = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "SliceBuffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceData slices[];",
+ },
+ {
+ .name = "plane",
+ .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .dimensions = 2,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "r16i",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(int), VK_SHADER_STAGE_COMPUTE_BIT);
+ GLSLD(ff_source_prores_ks_alpha_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+ return err;
+}
+
+static int init_estimate_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                        FFVulkanShader* shd, const char* pl_name)
+{
+ int err = 0;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanDescriptorSetBinding *desc;
+ int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+ int dim_x = pv->ctx.alpha_bits ? subgroup_size : (subgroup_size / 3) * 3;
+
+ ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+ NULL, 0, dim_x, 1, 1, 0);
+
+ av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+ av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+ av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+ av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+ av_bprintf(&shd->src, "#define MAX_STORED_Q %d\n", MAX_STORED_Q);
+ av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+ desc = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "SliceBuffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceData slices[];",
+ },
+ {
+ .name = "SliceScores",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceScore scores[];",
+ },
+ {
+ .name = "ProresDataTables",
+ .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EstimateSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+ GLSLD(ff_source_prores_ks_estimate_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+ return err;
+}
+
+static int init_trellis_node_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+ FFVulkanShader* shd, const char* pl_name)
+{
+ int err = 0;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanDescriptorSetBinding *desc;
+ int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+
+ ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+ NULL, 0, pv->ctx.mb_height, 1, 1, 0);
+
+ av_bprintf(&shd->src, "#define SLICES_WIDTH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define NUM_SUBGROUPS %d\n", FFALIGN(pv->ctx.mb_height, subgroup_size) / subgroup_size);
+ av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+ av_bprintf(&shd->src, "#define FORCE_QUANT %d\n", pv->ctx.force_quant);
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; int overquant; int buf_start; int quant; };\n");
+
+ desc = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "FrameSize",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "int frame_size;",
+ },
+ {
+ .name = "SliceScores",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceScore scores[];",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(TrellisNodeInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+ GLSLD(ff_source_prores_ks_trellis_node_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+ return err;
+}
+
+static int init_encode_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+ FFVulkanShader* shd, const char* pl_name)
+{
+ int err = 0;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanDescriptorSetBinding *desc;
+
+ ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+ NULL, 0, 64, 1, 1, 0);
+
+ av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+ av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+ av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+ av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+ desc = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "SliceBuffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceData slices[];",
+ },
+ {
+ .name = "SliceScores",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+ .buf_content = "SliceScore scores[];",
+ },
+ {
+ .name = "ProresDataTables",
+ .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .mem_layout = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EncodeSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+ av_bprintf(&shd->src, "#define PB_UNALIGNED\n");
+ av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference : require\n");
+ av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference2 : require\n");
+ GLSLD(ff_source_common_comp);
+ GLSLD(ff_source_prores_ks_encode_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+ return err;
+}
+
+static int vulkan_encode_prores_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
+ AVFrame *frame, int picture_idx)
+{
+ ProresVulkanContext *pv = avctx->priv_data;
+ ProresContext *ctx = &pv->ctx;
+ VulkanEncodeProresFrameData *pd = exec->opaque;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanFunctions *vk = &vkctx->vkfn;
+ int err = 0, nb_img_bar = 0, i, is_chroma;
+ int min_quant = ctx->profile_info->min_quant;
+ int max_quant = ctx->profile_info->max_quant;
+ int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    int estimate_dim_x = ctx->alpha_bits ? subgroup_size : (subgroup_size / 3) * 3;
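+    /* One estimation invocation handles one (slice, plane) pair; without an alpha
+     * plane the workgroup width is rounded down to a multiple of 3 so that all
+     * planes of a slice land in the same subgroup for the per-slice reduction. */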
+ int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+ VkImageView views[AV_NUM_DATA_POINTERS];
+ VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
+ FFVkBuffer *pkt_vk_buf, *slice_data_buf, *slice_score_buf, *frame_size_buf;
+ SliceDataInfo slice_data_info;
+ EstimateSliceInfo estimate_info;
+ TrellisNodeInfo trellis_node_info;
+ EncodeSliceInfo encode_info;
+ FFVulkanShader *shd;
+
+ /* Start recording */
+ ff_vk_exec_start(vkctx, exec);
+
+ /* Get a pooled buffer for writing output data */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->pkt_buf_pool, &pd->out_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->frame_size_upper_bound + FF_INPUT_BUFFER_MIN_SIZE,
+                                transfer_slices ? VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+                                                : (VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)));
+ pkt_vk_buf = (FFVkBuffer*)pd->out_data_ref[picture_idx]->data;
+ ff_vk_exec_add_dep_buf(vkctx, exec, &pd->out_data_ref[picture_idx], 1, 1);
+
+ /* Allocate buffer for writing slice data */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_data_buf_pool, &pd->slice_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceData),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_data_buf = (FFVkBuffer*)pd->slice_data_ref[picture_idx]->data;
+    ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_data_ref[picture_idx], 1, 1);
+
+ /* Allocate buffer for writing slice scores */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_score_buf_pool, &pd->slice_score_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceScore),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_score_buf = (FFVkBuffer*)pd->slice_score_ref[picture_idx]->data;
+    ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_score_ref[picture_idx], 1, 1);
+
+ /* Allocate buffer for writing frame size */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->frame_size_buf_pool, &pd->frame_size_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                sizeof(int),
+                                VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                VK_MEMORY_PROPERTY_HOST_COHERENT_BIT));
+    frame_size_buf = (FFVkBuffer*)pd->frame_size_ref[picture_idx]->data;
+    ff_vk_exec_add_dep_buf(vkctx, exec, &pd->frame_size_ref[picture_idx], 1, 1);
+
+ /* Generate barriers and image views for frame images. */
+ RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame,
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+ RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_INT));
+ ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar,
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ VK_ACCESS_SHADER_READ_BIT,
+ VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ /* Submit the image barriers. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+                                .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+                                .pImageMemoryBarriers = img_bar,
+                                .imageMemoryBarrierCount = nb_img_bar,
+                            });
+
+ /* Apply FDCT on input image data for future passes */
+ slice_data_info = (SliceDataInfo) {
+ .pictures_per_frame = ctx->pictures_per_frame,
+        .line_add = ctx->pictures_per_frame == 1 ? 0 : picture_idx ^ !(frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST),
+ };
+ for (i = 0; i < ctx->num_planes; i++) {
+ is_chroma = (i == 1 || i == 2);
+        shd = &pv->slice_data_shd[!is_chroma || ctx->chroma_factor == CFACTOR_Y444];
+ if (i < 3) {
+ slice_data_info.plane = i;
+            ff_vk_shader_update_desc_buffer(vkctx, exec, shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img_array(vkctx, exec, shd, frame, views, 0, 1,
+                                          VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+            ff_vk_exec_bind_shader(vkctx, exec, shd);
+            ff_vk_shader_update_push_const(vkctx, exec, shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                           0, sizeof(SliceDataInfo), &slice_data_info);
+            vk->CmdDispatch(exec->buf, ctx->slices_width, ctx->mb_height, 1);
+        } else {
+            ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->alpha_data_shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img(vkctx, exec, &pv->alpha_data_shd, 0, 1, 0, views[3],
+                                    VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+ ff_vk_exec_bind_shader(vkctx, exec, &pv->alpha_data_shd);
+ vk->CmdDispatch(exec->buf, ctx->mb_width, ctx->mb_height, 1);
+ }
+ }
+
+ /* Wait for writes to slice buffer. */
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .pNext = NULL,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = slice_data_buf->buf,
+ .offset = 0U,
+ .size = slice_data_buf->size,
+ },
+ .bufferMemoryBarrierCount = 1u,
+ });
+
+ /* Estimate slice bits and error for each quant */
+ estimate_info = (EstimateSliceInfo) {
+ .slices_per_picture = ctx->slices_per_picture,
+ .min_quant = ctx->force_quant ? ctx->force_quant : min_quant,
+ .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+ .bits_per_mb = ctx->bits_per_mb,
+ };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->estimate_slice_shd);
+
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->estimate_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(estimate_info),
+                                   &estimate_info);
+    vk->CmdDispatch(exec->buf, (ctx->slices_per_picture * ctx->num_planes + estimate_dim_x - 1) / estimate_dim_x,
+                    ctx->force_quant ? 1 : (max_quant - min_quant + 1), 1);
+
+ /* Wait for writes to score buffer. */
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .pNext = NULL,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = slice_score_buf->buf,
+ .offset = 0U,
+ .size = slice_score_buf->size,
+ },
+ .bufferMemoryBarrierCount = 1u,
+ });
+
+ /* Compute optimal quant value for each slice */
+ trellis_node_info = (TrellisNodeInfo) {
+ .min_quant = min_quant,
+ .max_quant = max_quant,
+ .bits_per_mb = ctx->bits_per_mb,
+ .mbs_per_slice = ctx->mbs_per_slice,
+ };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 0, 0,
+                                    frame_size_buf, 0, frame_size_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->trellis_node_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->trellis_node_shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(TrellisNodeInfo), &trellis_node_info);
+ vk->CmdDispatch(exec->buf, 1, 1, 1);
+
+ /* Wait for writes to quant buffer. */
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .pNext = NULL,
+ .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = frame_size_buf->buf,
+ .offset = 0U,
+ .size = frame_size_buf->size,
+ },
+ .bufferMemoryBarrierCount = 1u,
+ });
+
+ /* Encode slices. */
+ encode_info = (EncodeSliceInfo) {
+ .seek_table = pkt_vk_buf->address,
+ .bytestream = pkt_vk_buf->address + ctx->slices_per_picture * 2,
+ .num_planes = ctx->num_planes,
+ .slices_per_picture = ctx->slices_per_picture,
+ .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+ };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->encode_slice_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->encode_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(encode_info), &encode_info);
+ vk->CmdDispatch(exec->buf, FFALIGN(ctx->slices_per_picture, 64) / 64,
+ ctx->num_planes, 1);
+
+fail:
+ return err;
+}
+
+static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt)
+{
+ ProresVulkanContext *pv = avctx->priv_data;
+ ProresContext *ctx = &pv->ctx;
+ VulkanEncodeProresFrameData *pd = exec->opaque;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVulkanFunctions *vk = &vkctx->vkfn;
+ FFVkExecContext *transfer_exec;
+ uint8_t *orig_buf, *buf, *slice_sizes;
+ uint8_t *picture_size_pos;
+ int picture_idx, err = 0;
+ int frame_size, picture_size;
+ int pkt_size = ctx->frame_size_upper_bound;
+ int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+ FFVkBuffer *out_data_buf, *frame_size_buf;
+ VkMappedMemoryRange invalidate_data;
+ AVBufferRef *mapped_ref;
+ FFVkBuffer *mapped_buf;
+
+ /* Allocate packet */
+    RET(ff_get_encode_buffer(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE, 0));
+
+ /* Initialize packet. */
+ pkt->pts = pd->pts;
+ pkt->dts = pd->pts;
+ pkt->duration = pd->duration;
+ pkt->flags |= AV_PKT_FLAG_KEY * pd->key_frame;
+
+ if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+ pkt->opaque = pd->frame_opaque;
+ pkt->opaque_ref = pd->frame_opaque_ref;
+ pd->frame_opaque_ref = NULL;
+ }
+
+ /* Write frame atom */
+ orig_buf = pkt->data;
+    buf = ff_prores_kostya_write_frame_header(avctx, ctx, &orig_buf, pd->flags,
+ pd->color_primaries, pd->color_trc,
+ pd->colorspace);
+
+ /* Make sure encoding's done */
+ ff_vk_exec_wait(vkctx, exec);
+
+ /* Roll transfer execution context */
+ if (transfer_slices) {
+ RET(ff_vk_host_map_buffer(vkctx, &mapped_ref, pkt->data, pkt->buf,
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT));
+ mapped_buf = (FFVkBuffer *)mapped_ref->data;
+ transfer_exec = ff_vk_exec_get(vkctx, &pv->transfer_exec_pool);
+ ff_vk_exec_start(vkctx, transfer_exec);
+ }
+
+    for (picture_idx = 0; picture_idx < ctx->pictures_per_frame; picture_idx++) {
+ /* Fetch buffers for the current picture. */
+ out_data_buf = (FFVkBuffer *)pd->out_data_ref[picture_idx]->data;
+ frame_size_buf = (FFVkBuffer *)pd->frame_size_ref[picture_idx]->data;
+
+ /* Invalidate slice/output data if needed */
+ invalidate_data = (VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .offset = 0,
+ .size = VK_WHOLE_SIZE,
+ };
+ if (!(frame_size_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+ invalidate_data.memory = frame_size_buf->mem;
+            vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+ }
+
+ /* Write picture header */
+ picture_size_pos = buf + 1;
+ buf = ff_prores_kostya_write_picture_header(ctx, buf);
+
+ /* Skip over seek table */
+ slice_sizes = buf;
+ buf += ctx->slices_per_picture * 2;
+
+ /* Calculate final size */
+ buf += *(int*)frame_size_buf->mapped_mem;
+
+ if (transfer_slices) {
+ /* Perform host mapped transfer of slice data */
+            ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &pd->out_data_ref[picture_idx], 1, 0);
+            ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &mapped_ref, 1, 0);
+            vk->CmdCopyBuffer(transfer_exec->buf, out_data_buf->buf, mapped_buf->buf, 1, & (VkBufferCopy) {
+                .srcOffset = 0,
+                .dstOffset = mapped_buf->virtual_offset + slice_sizes - pkt->data,
+                .size = buf - slice_sizes,
+            });
+ } else {
+ /* Fallback to regular memcpy if transfer is not available */
+            if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+                invalidate_data.memory = out_data_buf->mem;
+                vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+            }
+ memcpy(slice_sizes, out_data_buf->mapped_mem, buf - slice_sizes);
+ av_buffer_unref(&pd->out_data_ref[picture_idx]);
+ }
+
+ /* Write picture size with header */
+ picture_size = buf - (picture_size_pos - 1);
+ bytestream_put_be32(&picture_size_pos, picture_size);
+
+ /* Slice output buffers no longer needed */
+ av_buffer_unref(&pd->slice_data_ref[picture_idx]);
+ av_buffer_unref(&pd->slice_score_ref[picture_idx]);
+ av_buffer_unref(&pd->frame_size_ref[picture_idx]);
+ }
+
+ /* Write frame size in header */
+ orig_buf -= 8;
+ frame_size = buf - orig_buf;
+ bytestream_put_be32(&orig_buf, frame_size);
+
+ av_shrink_packet(pkt, frame_size);
+    av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024));
+
+ /* Wait for slice transfer */
+ if (transfer_slices) {
+ RET(ff_vk_exec_submit(vkctx, transfer_exec));
+ ff_vk_exec_wait(vkctx, transfer_exec);
+ }
+
+fail:
+ return err;
+}
+
+static int vulkan_encode_prores_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+ int err;
+ ProresVulkanContext *pv = avctx->priv_data;
+ ProresContext *ctx = &pv->ctx;
+ VulkanEncodeProresFrameData *pd;
+ FFVkExecContext *exec;
+ AVFrame *frame;
+
+ while (1) {
+ /* Roll an execution context */
+ exec = ff_vk_exec_get(&pv->vkctx, &pv->e);
+
+ /* If it had a frame, immediately output it */
+ if (exec->had_submission) {
+ exec->had_submission = 0;
+ pv->in_flight--;
+ return get_packet(avctx, exec, pkt);
+ }
+
+ /* Get next frame to encode */
+ frame = pv->frame;
+ err = ff_encode_get_frame(avctx, frame);
+ if (err < 0 && err != AVERROR_EOF) {
+ return err;
+ } else if (err == AVERROR_EOF) {
+ if (!pv->in_flight)
+ return err;
+ continue;
+ }
+
+ /* Encode frame */
+ pd = exec->opaque;
+ pd->color_primaries = frame->color_primaries;
+ pd->color_trc = frame->color_trc;
+ pd->colorspace = frame->colorspace;
+ pd->pts = frame->pts;
+ pd->duration = frame->duration;
+ pd->flags = frame->flags;
+ if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+ pd->frame_opaque = frame->opaque;
+ pd->frame_opaque_ref = frame->opaque_ref;
+ frame->opaque_ref = NULL;
+ }
+
+ err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 0);
+ if (ctx->pictures_per_frame > 1)
+ vulkan_encode_prores_submit_frame(avctx, exec, frame, 1);
+
+ /* Submit execution context */
+ ff_vk_exec_submit(&pv->vkctx, exec);
+ av_frame_unref(frame);
+ if (err < 0)
+ return err;
+
+ pv->in_flight++;
+ if (pv->in_flight < pv->async_depth)
+ return AVERROR(EAGAIN);
+ }
+
+ return 0;
+}
+
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+ ProresVulkanContext *pv = avctx->priv_data;
+ ProresContext *ctx = &pv->ctx;
+ FFVulkanContext *vkctx = &pv->vkctx;
+
+ ff_vk_exec_pool_free(vkctx, &pv->e);
+ ff_vk_exec_pool_free(vkctx, &pv->transfer_exec_pool);
+
+ if (ctx->alpha_bits)
+ ff_vk_shader_free(vkctx, &pv->alpha_data_shd);
+
+ ff_vk_shader_free(vkctx, &pv->slice_data_shd[0]);
+ ff_vk_shader_free(vkctx, &pv->slice_data_shd[1]);
+ ff_vk_shader_free(vkctx, &pv->estimate_slice_shd);
+ ff_vk_shader_free(vkctx, &pv->encode_slice_shd);
+ ff_vk_shader_free(vkctx, &pv->trellis_node_shd);
+
+ ff_vk_free_buf(vkctx, &pv->prores_data_tables_buf);
+
+ av_buffer_pool_uninit(&pv->pkt_buf_pool);
+ av_buffer_pool_uninit(&pv->slice_data_buf_pool);
+ av_buffer_pool_uninit(&pv->slice_score_buf_pool);
+ av_buffer_pool_uninit(&pv->frame_size_buf_pool);
+
+ ff_vk_uninit(vkctx);
+
+ return 0;
+}
+
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+ ProresVulkanContext *pv = avctx->priv_data;
+ ProresContext *ctx = &pv->ctx;
+ int err = 0, i, q;
+ FFVulkanContext *vkctx = &pv->vkctx;
+ FFVkSPIRVCompiler *spv;
+
+ /* Init vulkan */
+ RET(ff_vk_init(vkctx, avctx, NULL, avctx->hw_frames_ctx));
+
+ pv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
+ if (!pv->qf) {
+ av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n");
+ return AVERROR(ENOTSUP);
+ }
+
+ spv = ff_vk_spirv_init();
+ if (!spv) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
+ return AVERROR_EXTERNAL;
+ }
+
+ RET(ff_vk_exec_pool_init(vkctx, pv->qf, &pv->e, 1, 0, 0, 0, NULL));
+
+ pv->transfer_qf = ff_vk_qf_find(vkctx, VK_QUEUE_TRANSFER_BIT, 0);
+ if (!pv->transfer_qf) {
+ av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n");
+        return AVERROR(ENOTSUP);
+ }
+
+    RET(ff_vk_exec_pool_init(vkctx, pv->transfer_qf, &pv->transfer_exec_pool, 1, 0, 0, 0, NULL));
+
+ /* Init common prores structures */
+ err = ff_prores_kostya_encode_init(avctx, ctx, vkctx->frames->sw_format);
+ if (err < 0)
+ return err;
+
+ /* Temporary frame */
+ pv->frame = av_frame_alloc();
+ if (!pv->frame)
+ return AVERROR(ENOMEM);
+
+ /* Async data pool */
+ pv->async_depth = pv->e.pool_size;
+ pv->exec_ctx_info = av_calloc(pv->async_depth, sizeof(*pv->exec_ctx_info));
+ if (!pv->exec_ctx_info)
+ return AVERROR(ENOMEM);
+ for (int i = 0; i < pv->async_depth; i++)
+ pv->e.contexts[i].opaque = &pv->exec_ctx_info[i];
+
+ /* Compile shaders used by encoder */
+    init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[0], "slice_data_blocks2", 2);
+    init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[1], "slice_data_blocks4", 4);
+    init_estimate_slice_pipeline(pv, spv, &pv->estimate_slice_shd, "estimate_slice");
+ init_trellis_node_pipeline(pv, spv, &pv->trellis_node_shd, "trellis_node");
+ init_encode_slice_pipeline(pv, spv, &pv->encode_slice_shd, "encode_slice");
+ if (ctx->alpha_bits) {
+ init_alpha_data_pipeline(pv, spv, &pv->alpha_data_shd, "alpha_data");
+ }
+
+ /* Create prores data tables uniform buffer. */
+ RET(ff_vk_create_buf(vkctx, &pv->prores_data_tables_buf,
+ sizeof(ProresDataTables), NULL, NULL,
+ VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+ VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+    RET(ff_vk_map_buffer(vkctx, &pv->prores_data_tables_buf, (void *)&pv->tables, 0));
+    memcpy(pv->tables->qmat, ctx->quants, sizeof(ctx->quants));
+    memcpy(pv->tables->qmat_chroma, ctx->quants_chroma, sizeof(ctx->quants_chroma));
+    memcpy(pv->tables->scan, ctx->scantable, sizeof(ff_prores_progressive_scan));
+    memcpy(pv->tables->dc_codebook, ff_prores_dc_codebook, sizeof(ff_prores_dc_codebook));
+    memcpy(pv->tables->run_to_cb, ff_prores_run_to_cb, sizeof(ff_prores_run_to_cb));
+    memcpy(pv->tables->level_to_cb, ff_prores_level_to_cb, sizeof(ff_prores_level_to_cb));
+
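+    /* The common init code only precomputes quantisation matrices up to MAX_STORED_Q;
+     * fill the remaining table entries by scaling the base matrices with the quantiser. */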
+ for (q = MAX_STORED_Q; q < 128; ++q) {
+ for (i = 0; i < 64; i++) {
+ pv->tables->qmat[q][i] = ctx->quant_mat[i] * q;
+ pv->tables->qmat_chroma[q][i] = ctx->quant_chroma_mat[i] * q;
+ }
+ }
+
+fail:
+ return err;
+}
+
+#define OFFSET(x) offsetof(ProresVulkanContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+static const AVOption options[] = {
+ { "mbs_per_slice", "macroblocks per slice", OFFSET(ctx.mbs_per_slice),
+ AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
+ { "profile", NULL, OFFSET(ctx.profile), AV_OPT_TYPE_INT,
+ { .i64 = PRORES_PROFILE_AUTO },
+ PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, .unit = "profile" },
+    { "auto",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, .unit = "profile" },
+    { "proxy",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
+        0, 0, VE, .unit = "profile" },
+    { "lt",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
+        0, 0, VE, .unit = "profile" },
+    { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_STANDARD },
+        0, 0, VE, .unit = "profile" },
+    { "hq",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_HQ },
+        0, 0, VE, .unit = "profile" },
+    { "4444",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 },
+        0, 0, VE, .unit = "profile" },
+    { "4444xq",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ },
+        0, 0, VE, .unit = "profile" },
+ { "vendor", "vendor ID", OFFSET(ctx.vendor),
+ AV_OPT_TYPE_STRING, { .str = "Lavc" }, 0, 0, VE },
+ { "bits_per_mb", "desired bits per macroblock", OFFSET(ctx.bits_per_mb),
+ AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8192, VE },
+ { "quant_mat", "quantiser matrix", OFFSET(ctx.quant_sel), AV_OPT_TYPE_INT,
+ { .i64 = -1 }, -1, QUANT_MAT_DEFAULT, VE, .unit = "quant_mat" },
+ { "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 },
+ 0, 0, VE, .unit = "quant_mat" },
+ { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_PROXY },
+ 0, 0, VE, .unit = "quant_mat" },
+ { "lt", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_LT },
+ 0, 0, VE, .unit = "quant_mat" },
+    { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_STANDARD },
+        0, 0, VE, .unit = "quant_mat" },
+    { "hq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_HQ },
+        0, 0, VE, .unit = "quant_mat" },
+    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_DEFAULT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "alpha_bits", "bits for alpha plane", OFFSET(ctx.alpha_bits), AV_OPT_TYPE_INT,
+        { .i64 = 16 }, 0, 16, VE },
+    { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
+        { .i64 = 1 }, 1, INT_MAX, VE },
+ { NULL }
+};
+
+static const AVClass proresenc_class = {
+ .class_name = "ProRes vulkan encoder",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecHWConfigInternal *const prores_ks_hw_configs[] = {
+ HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
+ HW_CONFIG_ENCODER_DEVICE(NONE, VULKAN),
+ NULL,
+};
+
+const FFCodec ff_prores_ks_vulkan_encoder = {
+ .p.name = "prores_ks_vulkan",
+ CODEC_LONG_NAME("Apple ProRes (iCodec Pro)"),
+ .p.type = AVMEDIA_TYPE_VIDEO,
+ .p.id = AV_CODEC_ID_PRORES,
+ .priv_data_size = sizeof(ProresVulkanContext),
+ .init = encode_init,
+ .close = encode_close,
+ FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_prores_receive_packet),
+ .p.capabilities = AV_CODEC_CAP_DELAY |
+ AV_CODEC_CAP_HARDWARE |
+ AV_CODEC_CAP_ENCODER_FLUSH |
+ AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+ CODEC_PIXFMTS(AV_PIX_FMT_VULKAN),
+ .hw_configs = prores_ks_hw_configs,
+ .color_ranges = AVCOL_RANGE_MPEG,
+ .p.priv_class = &proresenc_class,
+ .p.profiles = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+ .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH,
+};
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index d8e1471fa6..f69e430c33 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -9,6 +9,13 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \
                                       vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
                                       vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/common.o \
+ vulkan/prores_ks_alpha_data.o \
+ vulkan/prores_ks_slice_data.o \
+ vulkan/prores_ks_estimate_slice.o \
+ vulkan/prores_ks_encode_slice.o \
+ vulkan/prores_ks_trellis_node.o
+
OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \
vulkan/rangecoder.o vulkan/ffv1_vlc.o \
                                       vulkan/ffv1_common.o vulkan/ffv1_reset.o \
diff --git a/libavcodec/vulkan/prores_ks_alpha_data.comp b/libavcodec/vulkan/prores_ks_alpha_data.comp
new file mode 100644
index 0000000000..825ba28a4f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_alpha_data.comp
@@ -0,0 +1,67 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+/* Table of possible edge slice configurations */
+const uvec3 edge_mps_table[8] = uvec3[](
+ uvec3(0, 0, 0),
+ uvec3(1, 0, 0),
+ uvec3(2, 0, 0),
+ uvec3(2, 1, 0),
+ uvec3(4, 0, 0),
+ uvec3(4, 1, 0),
+ uvec3(4, 2, 0),
+ uvec3(4, 2, 1)
+);
+
+void main()
+{
+    ivec2 coord = min(ivec2(gl_GlobalInvocationID.xy), textureSize(plane, 0) - ivec2(1));
+ int alpha = texelFetch(plane, coord, 0).x;
+
+#if ALPHA_BITS == 8
+ alpha >>= 2;
+#else
+ alpha = (alpha << 6) | (alpha >> 4);
+#endif
+
+ uint mbs_per_slice = MAX_MBS_PER_SLICE;
+ uint slices_width = WIDTH_IN_MB / mbs_per_slice;
+ uint mb_width = slices_width * mbs_per_slice;
+ uint slice_x = gl_WorkGroupID.x / mbs_per_slice;
+ uint slice_y = gl_WorkGroupID.y;
+ uvec2 slice_base = uvec2(slice_x, slice_y) * (mbs_per_slice * 16u);
+
+ /* Handle slice macroblock size reduction on edge slices */
+ if (gl_WorkGroupID.x >= mb_width)
+ {
+ uint edge_mb = gl_WorkGroupID.x - mb_width;
+ uvec3 table = edge_mps_table[WIDTH_IN_MB - mb_width];
+ uvec3 base = uvec3(0, table.x, table.x + table.y);
+ uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 1 : 2);
+ slice_x += edge_slice;
+ slice_base += base[edge_slice] * (DCTSIZE * 2u);
+ mbs_per_slice = table[edge_slice];
+ }
+
+ uint slice = slice_y * SLICES_PITCH + slice_x;
+ uvec2 coeff_coord = uvec2(coord) - slice_base;
+ uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x;
+ slices[slice].coeffs[3][coeff] = int16_t(alpha);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp b/libavcodec/vulkan/prores_ks_encode_slice.comp
new file mode 100644
index 0000000000..2c06388a46
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_encode_slice.comp
@@ -0,0 +1,230 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CFACTOR_Y444 3
+
+layout(push_constant, scalar) uniform EncodeSliceInfo {
+ u8buf bytestream;
+ u8vec2buf seek_table;
+ int num_planes;
+ int slices_per_picture;
+ int max_quant;
+};
+
+int av_zero_extend(int a, uint p)
+{
+ return int(uint(a) & ((1U << p) - 1));
+}
+
+void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
+{
+ /* number of prefix bits to switch between Rice and expGolomb */
+ uint switch_bits = (codebook & 3) + 1;
+ uint rice_order = codebook >> 5; /* rice code order */
+ uint exp_order = (codebook >> 2) & 7; /* exp golomb code order */
+
+ uint switch_val = switch_bits << rice_order;
+
+ if (val >= switch_val) {
+ val -= int(switch_val - (1 << exp_order));
+ int exponent = findMSB(val);
+
+ put_bits(pb, exponent - exp_order + switch_bits, 0);
+ put_bits(pb, exponent + 1, val);
+ } else {
+ int exponent = val >> rice_order;
+ if (exponent != 0)
+ put_bits(pb, exponent, 0);
+ put_bits(pb, 1, 1);
+ if (rice_order != 0)
+ put_bits(pb, rice_order, av_zero_extend(val, rice_order));
+ }
+}
+
+#define GET_SIGN(x) ((x) >> 31)
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
+
+#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0
+
+void encode_dcs(inout PutBitContext pb, bool is_chroma, int q)
+{
+ uint slice = gl_GlobalInvocationID.x;
+ uint plane = gl_GlobalInvocationID.y;
+ uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+ uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+ int codebook = 5;
+ int scale = is_chroma ? qmat_chroma[q][0] : qmat[q][0];
+ int coeff = slices[slice].coeffs[plane][0];
+ int prev_dc = (coeff - 0x4000) / scale;
+ encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc));
+ int sign = 0;
+ for (int i = 1; i < blocks_per_slice; i++) {
+ coeff = slices[slice].coeffs[plane][i * 64];
+ int dc = (coeff - 0x4000) / scale;
+ int delta = dc - prev_dc;
+ int new_sign = GET_SIGN(delta);
+ delta = (delta ^ sign) - sign;
+ int code = MAKE_CODE(delta);
+ encode_vlc_codeword(pb, dc_codebook[codebook], code);
+ codebook = min(code, 6);
+ sign = new_sign;
+ prev_dc = dc;
+ }
+}
+
+void encode_acs(inout PutBitContext pb, bool is_chroma, int q)
+{
+ uint slice = gl_GlobalInvocationID.x;
+ uint plane = gl_GlobalInvocationID.y;
+ uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+ uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+ uint max_coeffs = blocks_per_slice << 6;
+ int prev_run = 4;
+ int prev_level = 2;
+ int run = 0;
+
+ for (int i = 1; i < 64; i++) {
+ for (int idx = scan[i]; idx < max_coeffs; idx += 64) {
+ int coeff = slices[slice].coeffs[plane][idx];
+            int level = coeff / (is_chroma ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]]);
+ if (level != 0) {
+ int abs_level = abs(level);
+ encode_vlc_codeword(pb, run_to_cb[prev_run], run);
+                encode_vlc_codeword(pb, level_to_cb[prev_level], abs_level - 1);
+ put_bits(pb, 1, av_zero_extend(GET_SIGN(level), 1));
+ prev_run = min(run, 15);
+ prev_level = min(abs_level, 9);
+ run = 0;
+ } else {
+ run++;
+ }
+ }
+ }
+}
+
+void encode_slice_plane(inout PutBitContext pb, int q)
+{
+ uint plane = gl_GlobalInvocationID.y;
+ bool is_chroma = plane == 1 || plane == 2;
+ encode_dcs(pb, is_chroma, q);
+ encode_acs(pb, is_chroma, q);
+}
+
+void put_alpha_diff(inout PutBitContext pb, int cur, int prev)
+{
+ const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+ const int dsize = 1 << dbits - 1;
+ int diff = cur - prev;
+
+ diff = av_zero_extend(diff, ALPHA_BITS);
+ if (diff >= (1 << ALPHA_BITS) - dsize)
+ diff -= 1 << ALPHA_BITS;
+ if (diff < -dsize || diff > dsize || diff == 0) {
+ put_bits(pb, 1, 1);
+ put_bits(pb, ALPHA_BITS, diff);
+ } else {
+ put_bits(pb, 1, 0);
+ put_bits(pb, dbits - 1, abs(diff) - 1);
+ put_bits(pb, 1, int(diff < 0));
+ }
+}
+
+void put_alpha_run(inout PutBitContext pb, int run)
+{
+ if (run != 0) {
+ put_bits(pb, 1, 0);
+ if (run < 0x10)
+ put_bits(pb, 4, run);
+ else
+ put_bits(pb, 15, run);
+ } else {
+ put_bits(pb, 1, 1);
+ }
+}
+
+void encode_alpha_plane(inout PutBitContext pb)
+{
+ uint slice = gl_GlobalInvocationID.x;
+ const int mask = (1 << ALPHA_BITS) - 1;
+ const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+ int prev = mask, cur;
+ int idx = 0;
+ int run = 0;
+
+ cur = slices[slice].coeffs[3][idx++];
+ put_alpha_diff(pb, cur, prev);
+ prev = cur;
+ do {
+ cur = slices[slice].coeffs[3][idx++];
+ if (cur != prev) {
+ put_alpha_run(pb, run);
+ put_alpha_diff(pb, cur, prev);
+ prev = cur;
+ run = 0;
+ } else {
+ run++;
+ }
+ } while (idx < num_coeffs);
+ put_alpha_run(pb, run);
+}
+
+u8vec2 byteswap16(int value)
+{
+ return unpack8(uint16_t(value)).yx;
+}
+
+void main()
+{
+ uint slice = gl_GlobalInvocationID.x;
+ if (slice >= slices_per_picture)
+ return;
+
+ uint plane = gl_GlobalInvocationID.y;
+ int q = scores[slice].quant;
+ int q_idx = min(q, max_quant + 1);
+ int slice_hdr_size = 2 * num_planes;
+ int slice_size = slice_hdr_size + (scores[slice].total_bits[q_idx] / 8);
+ u8buf buf = OFFBUF(u8buf, bytestream, scores[slice].buf_start);
+
+ /* Write slice header */
+ if (plane == 0)
+ {
+ buf[0].v = uint8_t(slice_hdr_size * 8);
+ buf[1].v = uint8_t(q);
+ u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2);
+ for (int i = 0; i < num_planes - 1; i++)
+ {
+ int bits = scores[slice].bits[q_idx][i] / 8;
+ slice_hdr[i].v = byteswap16(bits);
+ }
+ seek_table[slice].v = byteswap16(slice_size);
+ }
+
+ int plane_offset = 0;
+ for (int i = 0; i < plane; ++i)
+ plane_offset += scores[slice].bits[q_idx][i] / 8;
+
+ /* Encode slice plane */
+ PutBitContext pb;
+ init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0);
+ if (plane == 3)
+ encode_alpha_plane(pb);
+ else
+ encode_slice_plane(pb, q);
+ flush_put_bits(pb);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp b/libavcodec/vulkan/prores_ks_estimate_slice.comp
new file mode 100644
index 0000000000..5f9b39cd75
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp
@@ -0,0 +1,267 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+#define CFACTOR_Y444 3
+
+layout(push_constant, scalar) uniform EstimateSliceInfo {
+ uint slices_per_picture;
+ uint min_quant;
+ uint max_quant;
+ uint bits_per_mb;
+};
+
+int av_zero_extend(int a, uint p)
+{
+ return int(uint(a) & ((1U << p) - 1));
+}
+
+#define GET_SIGN(x) ((x) >> 31)
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
+
+int estimate_vlc(uint codebook, int val)
+{
+ /* number of prefix bits to switch between Rice and expGolomb */
+ uint switch_bits = (codebook & 3) + 1;
+ uint rice_order = codebook >> 5; /* rice code order */
+ uint exp_order = (codebook >> 2) & 7; /* exp golomb code order */
+
+ uint switch_val = switch_bits << rice_order;
+
+ if (val >= switch_val)
+ {
+ val -= int(switch_val - (1 << exp_order));
+ int exponent = findMSB(val);
+ return int(exponent * 2 - exp_order + switch_bits + 1);
+ }
+ else
+ {
+ return int((val >> rice_order) + rice_order + 1);
+ }
+}
+
+#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0
+
+int estimate_dcs(inout int error, uint slice, uint plane, uint q)
+{
+ uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+ uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+ int codebook = 5;
+ int coeff = slices[slice].coeffs[plane][0];
+ int scale = plane != 0 ? qmat_chroma[q][0] : qmat[q][0];
+ int prev_dc = (coeff - 0x4000) / scale;
+ int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc));
+ int sign = 0;
+ coeff = slices[slice].coeffs[plane][64];
+ error += abs(coeff - 0x4000) % scale;
+
+ for (int i = 1; i < blocks_per_slice; ++i) {
+ coeff = slices[slice].coeffs[plane][i * 64];
+ int dc = (coeff - 0x4000) / scale;
+ error += abs(coeff - 0x4000) % scale;
+ int delta = dc - prev_dc;
+ int new_sign = GET_SIGN(delta);
+ delta = (delta ^ sign) - sign;
+ int code = MAKE_CODE(delta);
+ bits += estimate_vlc(dc_codebook[codebook], code);
+ codebook = min(code, 6);
+ sign = new_sign;
+ prev_dc = dc;
+ }
+
+ return bits;
+}
+
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+#define SCORE_LIMIT 1073741823
+
+int estimate_acs(inout int error, uint slice, uint plane, uint q)
+{
+ uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+ uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+ uint max_coeffs = blocks_per_slice << 6;
+ int prev_run = 4;
+ int prev_level = 2;
+ int bits = 0;
+ int run = 0;
+
+ for (int i = 1; i < 64; i++) {
+ for (int idx = scan[i]; idx < max_coeffs; idx += 64) {
+ int coeff = slices[slice].coeffs[plane][idx];
+            int quant = plane != 0 ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]];
+ int level = coeff / quant;
+ error += abs(coeff) % quant;
+ if (level != 0) {
+ int abs_level = abs(level);
+ bits += estimate_vlc(run_to_cb[prev_run], run);
+                bits += estimate_vlc(level_to_cb[prev_level], abs_level - 1) + 1;
+ prev_run = min(run, 15);
+ prev_level = min(abs_level, 9);
+ run = 0;
+ } else {
+ run++;
+ }
+ }
+ }
+
+ return bits;
+}
+
+int estimate_slice_plane(inout int error, uint slice, uint plane, uint q)
+{
+ int bits = 0;
+ bits += estimate_dcs(error, slice, plane, q);
+ bits += estimate_acs(error, slice, plane, q);
+ return FFALIGN(bits, 8);
+}
+
+int est_alpha_diff(int cur, int prev)
+{
+ const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+ const int dsize = 1 << dbits - 1;
+ int diff = cur - prev;
+
+ diff = av_zero_extend(diff, ALPHA_BITS);
+ if (diff >= (1 << ALPHA_BITS) - dsize)
+ diff -= 1 << ALPHA_BITS;
+ if (diff < -dsize || diff > dsize || diff == 0)
+ return ALPHA_BITS + 1;
+ else
+ return dbits + 1;
+}
+
+int estimate_alpha_plane(uint slice)
+{
+ const int mask = (1 << ALPHA_BITS) - 1;
+ const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+ int prev = mask, cur;
+ int idx = 0;
+ int run = 0;
+ int bits;
+
+ cur = slices[slice].coeffs[3][idx++];
+ bits = est_alpha_diff(cur, prev);
+ prev = cur;
+ do {
+ cur = slices[slice].coeffs[3][idx++];
+ if (cur != prev) {
+ if (run == 0)
+ bits++;
+ else if (run < 0x10)
+ bits += 4;
+ else
+ bits += 15;
+ bits += est_alpha_diff(cur, prev);
+ prev = cur;
+ run = 0;
+ } else {
+ run++;
+ }
+ } while (idx < num_coeffs);
+
+ if (run != 0) {
+ if (run < 0x10)
+ bits += 4;
+ else
+ bits += 15;
+ }
+
+ return bits;
+}
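+
+/* Runs of identical alpha samples are only charged when the value changes
+ * (or at the end of the plane): an empty run costs 1 bit, runs shorter than
+ * 16 cost 4 bits and longer runs 15 bits, on top of the per-change cost from
+ * est_alpha_diff(). */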
+
+int sum_of_planes(int value)
+{
+#if NUM_PLANES == 3
+ uint base = (gl_SubgroupInvocationID / 3) * 3;
+ return subgroupShuffle(value, base) + subgroupShuffle(value, base + 1) + subgroupShuffle(value, base + 2);
+#else
+ return subgroupClusteredAdd(value, 4);
+#endif
+}
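+
+/* Assumes the dispatch packs the NUM_PLANES invocations of one slice into
+ * consecutive subgroup lanes: with 3 planes the per-slice total is gathered
+ * by shuffling from the lane triple, with 4 planes a clustered add over
+ * groups of 4 lanes does the same job. */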
+
+void main()
+{
+ uint slice = gl_GlobalInvocationID.x / NUM_PLANES;
+ uint plane = gl_LocalInvocationID.x % NUM_PLANES;
+ uint q = min_quant + gl_GlobalInvocationID.y;
+ if (slice >= slices_per_picture)
+ return;
+
+ /* Estimate slice bits and error for specified quantizer and plane */
+ int error = 0;
+ int bits = 0;
+ if (plane == 3)
+ bits = estimate_alpha_plane(slice);
+ else
+ bits = estimate_slice_plane(error, slice, plane, q);
+
+ /* Write results to score buffer */
+ scores[slice].bits[q][plane] = bits;
+ scores[slice].score[q][plane] = error;
+
+ /* Accumulate total bits and error of all planes */
+ int total_bits = sum_of_planes(bits);
+ int total_score = sum_of_planes(error);
+ if (total_bits > 65000 * 8)
+ total_score = SCORE_LIMIT;
+ scores[slice].total_bits[q] = total_bits;
+ scores[slice].total_score[q] = total_score;
+
+ if (q != max_quant)
+ return;
+
+ /* Threads that computed max_quant also run the overquant search if necessary */
+ uint mbs_per_slice = slices[slice].mbs_per_slice;
+ if (total_bits <= bits_per_mb * mbs_per_slice)
+ {
+ /* Overquant isn't needed for this slice */
+ scores[slice].total_bits[max_quant + 1] = total_bits;
+ scores[slice].total_score[max_quant + 1] = total_score + 1;
+ scores[slice].overquant = max_quant;
+ }
+ else
+ {
+ /* Keep searching until an encoding fits our budget */
+ for (q = max_quant + 1; q < 128; ++q)
+ {
+ /* Estimate slice bits and error for specified quantizer and plane */
+ error = 0;
+ bits = 0;
+ if (plane == 3)
+ bits = estimate_alpha_plane(slice);
+ else
+ bits = estimate_slice_plane(error, slice, plane, q);
+
+ /* Accumulate total bits and error of all planes */
+ total_bits = sum_of_planes(bits);
+ total_score = sum_of_planes(error);
+
+ /* If estimated bits fit within budget, we are done */
+ if (total_bits <= bits_per_mb * mbs_per_slice)
+ break;
+ }
+
+ scores[slice].bits[max_quant + 1][plane] = bits;
+ scores[slice].score[max_quant + 1][plane] = error;
+ scores[slice].total_bits[max_quant + 1] = total_bits;
+ scores[slice].total_score[max_quant + 1] = total_score;
+ scores[slice].overquant = q;
+ }
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_slice_data.comp b/libavcodec/vulkan/prores_ks_slice_data.comp
new file mode 100644
index 0000000000..6a943532c5
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_slice_data.comp
@@ -0,0 +1,265 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+layout(push_constant, scalar) uniform SliceDataInfo {
+ int plane;
+ int pictures_per_frame;
+ int line_add;
+};
+
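+/* Per-workgroup scratch for the two-pass DCT: each 8x8 block is stored as 8
+ * rows of two i16vec4 half-rows, written by the row pass and re-read
+ * column-wise by the column pass. */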
+shared i16vec4 coeffs[MAX_MBS_PER_SLICE][BLOCKS_PER_MB][DCTSIZE][DCTSIZE / 4];
+
+#define CONST_BITS 13
+#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
+#define OUT_SHIFT (PASS1_BITS + 1)
+
+#define FIX_0_541196100 4433 /* FIX(0.541196100) */
+#define FIX_0_765366865 6270 /* FIX(0.765366865) */
+#define FIX_1_847759065 15137 /* FIX(1.847759065) */
+#define FIX_1_175875602 9633 /* FIX(1.175875602) */
+#define FIX_0_298631336 2446 /* FIX(0.298631336) */
+#define FIX_3_072711026 25172 /* FIX(3.072711026) */
+#define FIX_1_501321110 12299 /* FIX(1.501321110) */
+#define FIX_0_899976223 7373 /* FIX(0.899976223) */
+#define FIX_1_961570560 16069 /* FIX(1.961570560) */
+#define FIX_2_053119869 16819 /* FIX(2.053119869) */
+#define FIX_2_562915447 20995 /* FIX(2.562915447) */
+#define FIX_0_390180644 3196 /* FIX(0.390180644) */
+
+#define MULTIPLY(type, var, cons) type(uint32_t(var) * uint32_t(cons))
+#define RIGHT_SHIFT(x, n) ((x) >> (n))
+#define DESCALE(x,n) RIGHT_SHIFT(int32_t(x) + (1 << ((n) - 1)), n)
+
+void row_fdct(i32vec4 data_lo, i32vec4 data_hi)
+{
+ uint row_idx = gl_LocalInvocationID.x;
+ uint block = gl_LocalInvocationID.y;
+ uint mb = gl_LocalInvocationID.z;
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ int32_t tmp0 = data_lo.x + data_hi.w;
+ int32_t tmp7 = data_lo.x - data_hi.w;
+ int32_t tmp1 = data_lo.y + data_hi.z;
+ int32_t tmp6 = data_lo.y - data_hi.z;
+ int32_t tmp2 = data_lo.z + data_hi.y;
+ int32_t tmp5 = data_lo.z - data_hi.y;
+ int32_t tmp3 = data_lo.w + data_hi.x;
+ int32_t tmp4 = data_lo.w - data_hi.x;
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+ int32_t tmp10 = tmp0 + tmp3;
+ int32_t tmp13 = tmp0 - tmp3;
+ int32_t tmp11 = tmp1 + tmp2;
+ int32_t tmp12 = tmp1 - tmp2;
+
+ data_lo.x = (tmp10 + tmp11) * (1 << PASS1_BITS);
+ data_hi.x = (tmp10 - tmp11) * (1 << PASS1_BITS);
+
+ uint32_t z1 = MULTIPLY(uint32_t, tmp12 + tmp13, FIX_0_541196100);
+ data_lo.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp13, FIX_0_765366865), CONST_BITS-PASS1_BITS);
+ data_hi.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp12, -FIX_1_847759065), CONST_BITS-PASS1_BITS);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * cK represents cos(K*pi/16).
+ * i0..i3 in the paper are tmp4..tmp7 here.
+ */
+ z1 = tmp4 + tmp7;
+ uint32_t z2 = tmp5 + tmp6;
+ uint32_t z3 = tmp4 + tmp6;
+ uint32_t z4 = tmp5 + tmp7;
+ uint32_t z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+ tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+ z3 += z5;
+ z4 += z5;
+
+ data_hi.w = DESCALE(uint32_t(tmp4) + z1 + z3, CONST_BITS - PASS1_BITS);
+ data_hi.y = DESCALE(uint32_t(tmp5) + z2 + z4, CONST_BITS - PASS1_BITS);
+ data_lo.w = DESCALE(uint32_t(tmp6) + z2 + z3, CONST_BITS - PASS1_BITS);
+ data_lo.y = DESCALE(uint32_t(tmp7) + z1 + z4, CONST_BITS - PASS1_BITS);
+
+ coeffs[mb][block][row_idx][0] = i16vec4(data_lo);
+ coeffs[mb][block][row_idx][1] = i16vec4(data_hi);
+}
+
+void ff_jpeg_fdct_islow_10()
+{
+ uint col_half = gl_LocalInvocationID.x / 4;
+ uint col = gl_LocalInvocationID.x & 3u;
+ uint block = gl_LocalInvocationID.y;
+ uint mb = gl_LocalInvocationID.z;
+
+ i16vec4 col_lo = i16vec4(coeffs[mb][block][0][col_half][col],
+ coeffs[mb][block][1][col_half][col],
+ coeffs[mb][block][2][col_half][col],
+ coeffs[mb][block][3][col_half][col]);
+ i16vec4 col_hi = i16vec4(coeffs[mb][block][4][col_half][col],
+ coeffs[mb][block][5][col_half][col],
+ coeffs[mb][block][6][col_half][col],
+ coeffs[mb][block][7][col_half][col]);
+ i32vec4 data_lo = i32vec4(col_lo);
+ i32vec4 data_hi = i32vec4(col_hi);
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+ int32_t tmp0 = data_lo.x + data_hi.w;
+ int32_t tmp7 = data_lo.x - data_hi.w;
+ int32_t tmp1 = data_lo.y + data_hi.z;
+ int32_t tmp6 = data_lo.y - data_hi.z;
+ int32_t tmp2 = data_lo.z + data_hi.y;
+ int32_t tmp5 = data_lo.z - data_hi.y;
+ int32_t tmp3 = data_lo.w + data_hi.x;
+ int32_t tmp4 = data_lo.w - data_hi.x;
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+ int32_t tmp10 = tmp0 + tmp3;
+ int32_t tmp13 = tmp0 - tmp3;
+ int32_t tmp11 = tmp1 + tmp2;
+ int32_t tmp12 = tmp1 - tmp2;
+
+ data_lo.x = DESCALE(tmp10 + tmp11, OUT_SHIFT);
+ data_hi.x = DESCALE(tmp10 - tmp11, OUT_SHIFT);
+
+ uint32_t z1 = uint32_t((tmp12 + tmp13) * FIX_0_541196100);
+ data_lo.z = DESCALE(z1 + uint32_t(tmp13 * FIX_0_765366865), CONST_BITS + OUT_SHIFT);
+ data_hi.z = DESCALE(z1 + uint32_t(tmp12 * (-FIX_1_847759065)), CONST_BITS + OUT_SHIFT);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * cK represents cos(K*pi/16).
+ * i0..i3 in the paper are tmp4..tmp7 here.
+ */
+ z1 = tmp4 + tmp7;
+ uint32_t z2 = tmp5 + tmp6;
+ uint32_t z3 = tmp4 + tmp6;
+ uint32_t z4 = tmp5 + tmp7;
+ uint32_t z5 = MULTIPLY(uint32_t, z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+ tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+ z3 += z5;
+ z4 += z5;
+
+ data_hi.w = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT);
+ data_hi.y = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT);
+ data_lo.w = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT);
+ data_lo.y = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT);
+
+ col_lo = i16vec4(data_lo);
+ col_hi = i16vec4(data_hi);
+ coeffs[mb][block][0][col_half][col] = col_lo.x;
+ coeffs[mb][block][1][col_half][col] = col_lo.y;
+ coeffs[mb][block][2][col_half][col] = col_lo.z;
+ coeffs[mb][block][3][col_half][col] = col_lo.w;
+ coeffs[mb][block][4][col_half][col] = col_hi.x;
+ coeffs[mb][block][5][col_half][col] = col_hi.y;
+ coeffs[mb][block][6][col_half][col] = col_hi.z;
+ coeffs[mb][block][7][col_half][col] = col_hi.w;
+}
+
+/* Table of possible edge slice configurations */
+const uvec3 edge_mps_table[8] = uvec3[](
+ uvec3(0, 0, 0),
+ uvec3(1, 0, 0),
+ uvec3(2, 0, 0),
+ uvec3(2, 1, 0),
+ uvec3(4, 0, 0),
+ uvec3(4, 1, 0),
+ uvec3(4, 2, 0),
+ uvec3(4, 2, 1)
+);
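+
+/* Illustrative example: with MAX_MBS_PER_SLICE = 8 and a 21-MB-wide picture,
+ * two full slices cover 16 MBs and the remaining 5 MBs index entry
+ * uvec3(4, 1, 0), i.e. two extra edge slices of 4 and 1 macroblocks. */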
+
+void main()
+{
+ bool is_chroma = plane == 1 || plane == 2;
+ uint row_idx = gl_LocalInvocationID.x;
+ uint block = gl_LocalInvocationID.y;
+ uint macroblock = gl_LocalInvocationID.z;
+ uint slice_x = gl_WorkGroupID.x;
+
+ /* Calculate the current thread coordinate in input plane */
+ uint mbs_per_slice = MAX_MBS_PER_SLICE;
+ uint mb_width = 4u * BLOCKS_PER_MB;
+ uint slices_width = WIDTH_IN_MB / MAX_MBS_PER_SLICE;
+ uvec2 slice_base = gl_WorkGroupID.xy * uvec2(MAX_MBS_PER_SLICE * mb_width, DCTSIZE * 2u);
+
+ /* Handle slice macroblock size reduction on edge slices */
+ if (slice_x >= slices_width)
+ {
+ uint edge_slice = slice_x - slices_width;
+ uvec3 table = edge_mps_table[WIDTH_IN_MB - slices_width * MAX_MBS_PER_SLICE];
+ uvec3 base = uvec3(0u, table.x, table.x + table.y);
+ slice_base.x = (MAX_MBS_PER_SLICE * slices_width + base[edge_slice]) * mb_width;
+ mbs_per_slice = table[edge_slice];
+ }
+
+ uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
+ uvec2 block_coord = is_chroma ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
+ ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row_idx));
+ ivec2 size = textureSize(planes[plane], 0);
+ coord.y = coord.y * pictures_per_frame + line_add;
+ coord = min(coord, size - ivec2(1));
+
+ /* Load coefficients from input planes */
+ i32vec4 row_lo;
+ row_lo.x = texelFetchOffset(planes[plane], coord, 0, ivec2(0, 0)).x;
+ row_lo.y = texelFetchOffset(planes[plane], coord, 0, ivec2(1, 0)).x;
+ row_lo.z = texelFetchOffset(planes[plane], coord, 0, ivec2(2, 0)).x;
+ row_lo.w = texelFetchOffset(planes[plane], coord, 0, ivec2(3, 0)).x;
+
+ i32vec4 row_hi;
+ row_hi.x = texelFetchOffset(planes[plane], coord, 0, ivec2(4, 0)).x;
+ row_hi.y = texelFetchOffset(planes[plane], coord, 0, ivec2(5, 0)).x;
+ row_hi.z = texelFetchOffset(planes[plane], coord, 0, ivec2(6, 0)).x;
+ row_hi.w = texelFetchOffset(planes[plane], coord, 0, ivec2(7, 0)).x;
+
+ /* Perform DCT on the coefficients: the column pass reads rows written by
+ other invocations of this workgroup, so synchronize shared memory
+ between the two passes */
+ row_fdct(row_lo, row_hi);
+ barrier();
+ ff_jpeg_fdct_islow_10();
+ barrier();
+
+ /* Store DCT result to slice buffer */
+ uint slice = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
+ uint slice_row = macroblock * BLOCKS_PER_MB * DCTSIZE + block * DCTSIZE + row_idx;
+ slices[slice].mbs_per_slice = mbs_per_slice;
+ slices[slice].rows[plane][slice_row] = coeffs[macroblock][block][row_idx];
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_trellis_node.comp b/libavcodec/vulkan/prores_ks_trellis_node.comp
new file mode 100644
index 0000000000..052e47ac5f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_trellis_node.comp
@@ -0,0 +1,177 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(push_constant, scalar) uniform TrellisNodeInfo {
+ int min_quant;
+ int max_quant;
+ int mbs_per_slice;
+ int bits_per_mb;
+};
+
+#define TRELLIS_WIDTH 16
+#define SCORE_LIMIT 1073741823
+
+struct TrellisNode {
+ int prev_node;
+ int quant;
+ int bits;
+ int score;
+};
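+
+/* Each invocation runs a small trellis over one row of slices: every slice
+ * has TRELLIS_WIDTH candidate quantizers (min_quant..max_quant plus the
+ * overquant entry), and each node keeps the cumulative bit count, cumulative
+ * error score and a link to its best predecessor so the chosen quantizers
+ * can be recovered by backtracking. */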
+
+shared int subgroup_sizes[NUM_SUBGROUPS];
+
+int slice_sizes[SLICES_WIDTH];
+
+TrellisNode nodes[(SLICES_WIDTH + 1) * TRELLIS_WIDTH];
+
+int find_slice_quant(int slice_x)
+{
+ int slice = int(gl_LocalInvocationID.x) * SLICES_WIDTH + slice_x;
+
+ int trellis_node = int(slice_x + 1) * TRELLIS_WIDTH;
+ for (int q = min_quant; q < max_quant + 2; q++)
+ {
+ nodes[trellis_node + q].prev_node = -1;
+ nodes[trellis_node + q].quant = q;
+ }
+
+ int mbs = int(slice_x + 1) * mbs_per_slice;
+ nodes[trellis_node + max_quant + 1].quant = scores[slice].overquant;
+
+ int bits_limit = mbs * bits_per_mb;
+ for (int pq = min_quant; pq < max_quant + 2; pq++)
+ {
+ int prev = trellis_node - TRELLIS_WIDTH + pq;
+ for (int q = min_quant; q < max_quant + 2; q++)
+ {
+ int cur = trellis_node + q;
+ int bits = nodes[prev].bits + scores[slice].total_bits[q];
+ int error = scores[slice].total_score[q];
+ if (bits > bits_limit)
+ error = SCORE_LIMIT;
+
+ int new_score;
+ if (nodes[prev].score < SCORE_LIMIT && error < SCORE_LIMIT)
+ new_score = nodes[prev].score + error;
+ else
+ new_score = SCORE_LIMIT;
+ if (nodes[cur].prev_node == -1 || nodes[cur].score >= new_score)
+ {
+ nodes[cur].bits = bits;
+ nodes[cur].score = new_score;
+ nodes[cur].prev_node = prev;
+ }
+ }
+ }
+
+ int error = nodes[trellis_node + min_quant].score;
+ int pq = trellis_node + min_quant;
+ for (int q = min_quant + 1; q < max_quant + 2; q++)
+ {
+ if (nodes[trellis_node + q].score <= error)
+ {
+ error = nodes[trellis_node + q].score;
+ pq = trellis_node + q;
+ }
+ }
+
+ return pq;
+}
+
+int find_slice_row_quants()
+{
+ for (int i = min_quant; i < max_quant + 2; i++)
+ {
+ nodes[i].prev_node = -1;
+ nodes[i].bits = 0;
+ nodes[i].score = 0;
+ }
+
+ int q = 0;
+ for (int slice_x = 0; slice_x < SLICES_WIDTH; ++slice_x)
+ {
+ q = find_slice_quant(slice_x);
+ }
+
+ int slice_hdr_size = 2 * NUM_PLANES;
+ int slice_row_size = slice_hdr_size * SLICES_WIDTH;
+ int y = int(gl_LocalInvocationID.x);
+ for (int x = SLICES_WIDTH - 1; x >= 0; x--)
+ {
+ int slice = x + y * SLICES_WIDTH;
+ int quant = nodes[q].quant;
+ int q_idx = min(quant, max_quant + 1);
+ slice_sizes[x] = scores[slice].total_bits[q_idx] / 8;
+ slice_row_size += slice_sizes[x];
+ scores[slice].quant = quant;
+ q = nodes[q].prev_node;
+ }
+
+ return slice_row_size;
+}
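+
+/* The row size sums the estimated payload of each slice (recovered by
+ * walking prev_node links back from the best final node) plus a fixed
+ * 2 * NUM_PLANES bytes of slice header, which presumably covers the header
+ * size/quant bytes and the per-plane size fields. */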
+
+int force_slice_row_quants()
+{
+ int slice_hdr_size = 2 * NUM_PLANES;
+ int slice_row_size = slice_hdr_size * SLICES_WIDTH;
+ int y = int(gl_LocalInvocationID.x);
+ for (int x = SLICES_WIDTH - 1; x >= 0; x--)
+ {
+ int slice = x + y * SLICES_WIDTH;
+ slice_sizes[x] = scores[slice].total_bits[FORCE_QUANT] / 8;
+ slice_row_size += slice_sizes[x];
+ scores[slice].quant = FORCE_QUANT;
+ }
+
+ return slice_row_size;
+}
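+
+/* In main() below the per-row byte sizes are turned into absolute buffer
+ * offsets with a subgroup exclusive prefix sum, the shared subgroup_sizes
+ * totals bridge across subgroups, and the last row's running offset becomes
+ * the total frame payload size. */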
+
+void main()
+{
+#if FORCE_QUANT == 0
+ int slice_row_size = find_slice_row_quants();
+#else
+ int slice_row_size = force_slice_row_quants();
+#endif
+
+ int subgroup_sum = subgroupAdd(slice_row_size);
+ subgroup_sizes[gl_SubgroupID] = subgroup_sum;
+ barrier();
+
+ int buf_start = subgroupExclusiveAdd(slice_row_size);
+ [[unroll]] for (int i = 0; i < NUM_SUBGROUPS; ++i)
+ {
+ if (i >= gl_SubgroupID)
+ break;
+ buf_start += subgroup_sizes[i];
+ }
+
+ int slice_hdr_size = 2 * NUM_PLANES;
+ int y = int(gl_LocalInvocationID.x);
+ [[unroll]] for (int x = 0; x < SLICES_WIDTH; ++x)
+ {
+ int slice = x + y * SLICES_WIDTH;
+ scores[slice].buf_start = buf_start;
+ buf_start += slice_hdr_size + slice_sizes[x];
+ }
+
+ if (y == gl_WorkGroupSize.x - 1)
+ frame_size = buf_start;
+}
\ No newline at end of file
--
2.50.1