From 4ff36d26f69ae9f72347b8e276e71925b2633fb6 Mon Sep 17 00:00:00 2001
From: Ben Chang <benc@nvidia.com>
Date: Thu, 22 Jun 2017 16:23:38 -0700
Subject: [PATCH] NVENC_Input_Surface_Reduction_Optimization

This patch aims to reduce the number of input/output surfaces
NVENC allocates per session. Previous default sets allocated surfaces to 32
(unless there is user specified param or lookahead involved). Having large
number of surfaces consumes extra video memory (esp for higher resolution
encoding). The patch changes the surfaces calculation for default, B-frames,
lookahead scenario respectively.

Change in location of surfaces and async_depth determination. Putting it prior
to rate control setup. Current code limits lookahead depth to be 27
(default surface (32) - P interval (1) - 4).

The other change involves surface selection. Previously, if a session
allocates x surfaces, only x-1 surfaces are used (due to combination
of output delay and lock toggle logic). To prevent unused surfaces,
changing surface rotation to using predefined fifo.

Miscellaneous -  add "auto" option in level.
---
 libavcodec/nvenc.c      | 70 ++++++++++++++++++++++++++++++++++++++-----------
 libavcodec/nvenc.h      |  3 +--
 libavcodec/nvenc_h264.c |  5 ++--
 libavcodec/nvenc_hevc.c |  5 ++--
 4 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index f16e509..386f8d9 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -956,6 +956,8 @@ static int nvenc_setup_encoder(AVCodecContext *avctx)
     ctx->initial_pts[0] = AV_NOPTS_VALUE;
     ctx->initial_pts[1] = AV_NOPTS_VALUE;
 
+    nvenc_recalc_surfaces(avctx);
+
     nvenc_setup_rate_control(avctx);
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
@@ -982,10 +984,50 @@ static int nvenc_setup_encoder(AVCodecContext *avctx)
     return 0;
 }
 
+static int nvenc_recalc_surfaces(AVCodecContext *avctx)
+{
+    NVENCContext *ctx = avctx->priv_data;
+    // default minimum of 4 surfaces
+    // multiply by 2 for number of NVENCs on gpu (hardcode to 2)
+    // another multiply by 2 to avoid blocking next PBB group
+    int nb_surfaces = FFMAX(4, ctx->config.frameIntervalP * 2 * 2);
+
+    // lookahead enabled
+    if (ctx->rc_lookahead > 0) {
+        // +1 is to account for lkd_bound calculation later
+        // +4 is to allow sufficient pipelining with lookahead
+        nb_surfaces = FFMAX(1, FFMAX(nb_surfaces, ctx->rc_lookahead + ctx->config.frameIntervalP + 1 + 4));
+        if (nb_surfaces > ctx->nb_surfaces && ctx->nb_surfaces > 0)
+        {
+            av_log(avctx, AV_LOG_WARNING,
+                "Defined rc_lookahead requires more surfaces, "
+                "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
+        }
+        ctx->nb_surfaces = FFMAX(nb_surfaces, ctx->nb_surfaces);
+    }
+    else {
+        if (ctx->config.frameIntervalP > 1 && ctx->nb_surfaces < nb_surfaces && ctx->nb_surfaces > 0)
+        {
+            av_log(avctx, AV_LOG_WARNING,
+                "Defined b-frame requires more surfaces, "
+                "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
+            ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, nb_surfaces);
+        }
+        else if (ctx->nb_surfaces <= 0)
+            ctx->nb_surfaces = nb_surfaces;
+        // otherwise use user specified value
+    }
+
+    ctx->nb_surfaces = FFMAX(1, FFMIN(MAX_REGISTERED_FRAMES, ctx->nb_surfaces));
+    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
+    return 0;
+}
+
 static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
 {
     NVENCContext *ctx               = avctx->priv_data;
     NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NVENCFrame* tmp_surface = &ctx->frames[idx];
     int ret;
     NV_ENC_CREATE_BITSTREAM_BUFFER out_buffer = { 0 };
 
@@ -1046,6 +1088,8 @@ static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
 
     ctx->frames[idx].out  = out_buffer.bitstreamBuffer;
 
+    av_fifo_generic_write(ctx->unused_surface_queue, &tmp_surface, sizeof(tmp_surface), NULL);
+
     return 0;
 }
 
@@ -1054,18 +1098,15 @@ static int nvenc_setup_surfaces(AVCodecContext *avctx)
     NVENCContext *ctx = avctx->priv_data;
     int i, ret;
 
-    ctx->nb_surfaces = FFMAX(4 + avctx->max_b_frames,
-                             ctx->nb_surfaces);
-    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
-
-
     ctx->frames = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->frames));
     if (!ctx->frames)
         return AVERROR(ENOMEM);
-
     ctx->timestamps = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
     if (!ctx->timestamps)
         return AVERROR(ENOMEM);
+    ctx->unused_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NVENCFrame*));
+    if (!ctx->unused_surface_queue)
+        return AVERROR(ENOMEM);
     ctx->pending = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
     if (!ctx->pending)
         return AVERROR(ENOMEM);
@@ -1123,6 +1164,7 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
     av_fifo_free(ctx->timestamps);
     av_fifo_free(ctx->pending);
     av_fifo_free(ctx->ready);
+    av_fifo_free(ctx->unused_surface_queue);
 
     if (ctx->frames) {
         for (i = 0; i < ctx->nb_surfaces; ++i) {
@@ -1201,16 +1243,14 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
 
 static NVENCFrame *get_free_frame(NVENCContext *ctx)
 {
-    int i;
+    NVENCFrame *tmp_surf;
 
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if (!ctx->frames[i].locked) {
-            ctx->frames[i].locked = 1;
-            return &ctx->frames[i];
-        }
-    }
+    if (!(av_fifo_size(ctx->unused_surface_queue) > 0))
+        // queue empty
+        return NULL;
 
-    return NULL;
+    av_fifo_generic_read(ctx->unused_surface_queue, &tmp_surf, sizeof(tmp_surf), NULL);
+    return tmp_surf;
 }
 
 static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
@@ -1510,7 +1550,7 @@ static int nvenc_get_output(AVCodecContext *avctx, AVPacket *pkt)
         frame->in = NULL;
     }
 
-    frame->locked = 0;
+    av_fifo_generic_write(ctx->unused_surface_queue, &frame, sizeof(frame), NULL);
 
     ret = nvenc_set_timestamp(avctx, &params, pkt);
     if (ret < 0)
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
index 3602f16..b42b930 100644
--- a/libavcodec/nvenc.h
+++ b/libavcodec/nvenc.h
@@ -56,7 +56,6 @@ typedef struct NVENCFrame {
 
     NV_ENC_OUTPUT_PTR out;
     NV_ENC_BUFFER_FORMAT format;
-    int locked;
 } NVENCFrame;
 
 typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
@@ -145,7 +144,7 @@ typedef struct NVENCContext {
     int nb_surfaces;
     NVENCFrame *frames;
     AVFifoBuffer *timestamps;
-    AVFifoBuffer *pending, *ready;
+    AVFifoBuffer *pending, *ready, *unused_surface_queue;
 
     struct {
         CUdeviceptr ptr;
diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
index f7e5cd0..bf98326 100644
--- a/libavcodec/nvenc_h264.c
+++ b/libavcodec/nvenc_h264.c
@@ -47,6 +47,7 @@ static const AVOption options[] = {
     { "high_444", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444 },            0, 0, VE, "profile" },
     { "constrained_high", "",                             0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH },    0, 0, VE, "profile" },
     { "level",    "Set the encoding level restriction",   OFFSET(level),       AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
+    { "auto",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
     { "1.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
     { "1.b",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
     { "1.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_11 }, 0, 0, VE,  "level" },
@@ -71,14 +72,14 @@ static const AVOption options[] = {
     { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
+    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 0 },                    0, MAX_REGISTERED_FRAMES, VE },
     { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
     { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
     { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
     { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
 #if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
     { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "b_adapt", "When lookahead is enabled, set this to 0 to disable adaptive B-frame decision", OFFSET(b_adapt), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
     { "spatial-aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
index 9102d8a..9140398 100644
--- a/libavcodec/nvenc_hevc.c
+++ b/libavcodec/nvenc_hevc.c
@@ -47,6 +47,7 @@ static const AVOption options[] = {
     { "rext",   "",                                      0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_HEVC_PROFILE_REXT }, 0, 0, VE, "profile" },
 #endif /* NVENCAPI_MAJOR_VERSION >= 7 */
     { "level",   "Set the encoding level restriction",   OFFSET(level),        AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
+    { "auto",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
     { "1.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
     { "2.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
     { "2.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_21 }, 0, 0, VE,  "level" },
@@ -71,14 +72,14 @@ static const AVOption options[] = {
     { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
+    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT, { .i64 = 0 },                       0, MAX_REGISTERED_FRAMES, VE },
     { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
     { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
     { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
     { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
 #if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
     { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "spatial_aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "zerolatency", "Set 1 to indicate zero latency operation (no reordering delay)", OFFSET(zerolatency), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-- 
2.9.1