PR #22362 opened by f1k2faeez
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22362
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22362.patch

CUDA-accelerated versions of hstack, vstack, and xstack that perform
video stacking on GPU without host memory round-trips.

Supported pixel formats: yuv420p, nv12, yuv444p, p010le, p016le,
yuv444p16le, rgb0, bgr0, rgba, bgra.

xstack_cuda supports fill color with automatic RGB to YUV conversion.

The filters share options with the software counterparts and reuse
the stack_internal.h framework. CUDA kernels are selected by element
size rather than per pixel format.

Signed-off-by: Faeez Kadiri <[email protected]>


>From c7586e5804cbf8c5db29a425e2935b9076b2c68b Mon Sep 17 00:00:00 2001
From: Faeez Kadiri <[email protected]>
Date: Sat, 24 May 2025 02:18:05 +0530
Subject: [PATCH] avfilter: add CUDA accelerated stack filters (hstack_cuda,
 vstack_cuda, xstack_cuda)

CUDA-accelerated versions of hstack, vstack, and xstack that perform
video stacking on GPU without host memory round-trips.

Supported pixel formats: yuv420p, nv12, yuv444p, p010le, p016le,
yuv444p16le, rgb0, bgr0, rgba, bgra.

xstack_cuda supports fill color with automatic RGB to YUV conversion.

The filters share options with the software counterparts and reuse
the stack_internal.h framework. CUDA kernels are selected by element
size rather than per pixel format.

Signed-off-by: Faeez Kadiri <[email protected]>
---
 Changelog                    |   1 +
 configure                    |   6 +
 doc/filters.texi             |  78 +++++
 libavfilter/Makefile         |   3 +
 libavfilter/allfilters.c     |   3 +
 libavfilter/version.h        |   2 +-
 libavfilter/vf_stack_cuda.c  | 619 +++++++++++++++++++++++++++++++++++
 libavfilter/vf_stack_cuda.cu | 229 +++++++++++++
 8 files changed, 940 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_stack_cuda.c
 create mode 100644 libavfilter/vf_stack_cuda.cu

diff --git a/Changelog b/Changelog
index ce49a5fff0..f3551c4faa 100644
--- a/Changelog
+++ b/Changelog
@@ -26,6 +26,7 @@ version <next>:
 - swscale Vulkan support
 - LCEVC metadata bitstream filter
 - Add vf_deinterlace_d3d12 filter
+- hstack_cuda, vstack_cuda and xstack_cuda filters
 
 
 version 8.0:
diff --git a/configure b/configure
index 5ad2e6787d..ac9f223d4d 100755
--- a/configure
+++ b/configure
@@ -3518,6 +3518,12 @@ overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 pad_cuda_filter_deps="ffnvcodec"
 pad_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
+hstack_cuda_filter_deps="ffnvcodec"
+hstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+vstack_cuda_filter_deps="ffnvcodec"
+vstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+xstack_cuda_filter_deps="ffnvcodec"
+xstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
 gfxcapture_filter_deps="cxx17 threads d3d11va IGraphicsCaptureItemInterop 
__x_ABI_CWindows_CGraphics_CCapture_CIGraphicsCaptureSession3"
diff --git a/doc/filters.texi b/doc/filters.texi
index e49dd9ef0d..f5dbe27f3b 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -27349,6 +27349,84 @@ Only deinterlace frames marked as interlaced.
 The default value is @code{all}.
 @end table
 
+@section hstack_cuda
+Stack input videos horizontally.
+
+This is the CUDA variant of the @ref{hstack} filter, each input stream may
+have different height, this filter will scale down/up each input stream while
+keeping the original aspect ratio.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{hstack}.
+
+@item shortest
+See @ref{hstack}.
+
+@item height
+Set height of output. If set to 0, this filter will set height of output to
+height of the first input stream. Default value is 0.
+@end table
+
+@section vstack_cuda
+Stack input videos vertically.
+
+This is the CUDA variant of the @ref{vstack} filter, each input stream may
+have different width, this filter will scale down/up each input stream while
+keeping the original aspect ratio.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{vstack}.
+
+@item shortest
+See @ref{vstack}.
+
+@item width
+Set width of output. If set to 0, this filter will set width of output to
+width of the first input stream. Default value is 0.
+@end table
+
+@section xstack_cuda
+Stack video inputs into custom layout.
+
+This is the CUDA variant of the @ref{xstack} filter, each input stream may
+have different size, this filter will scale down/up each input stream to the
+given output size, or the size of the first input stream.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{xstack}.
+
+@item shortest
+See @ref{xstack}.
+
+@item layout
+See @ref{xstack}.
+Moreover, this permits the user to supply output size for each input stream.
+@example
+xstack_cuda=inputs=4:layout=0_0_1920x1080|0_h0_1920x1080|w0_0_1920x1080|w0_h0_1920x1080
+@end example
+
+@item grid
+See @ref{xstack}.
+
+@item grid_tile_size
+Set output size for each input stream when @option{grid} is set. If this option
+is not set, this filter will set output size by default to the size of the
+first input stream. For the syntax of this option, check the
+@ref{video size syntax,,"Video size" section in the ffmpeg-utils 
manual,ffmpeg-utils}.
+
+@item fill
+See @ref{xstack}.
+@end table
+
 @anchor{CUDA NPP}
 @section CUDA NPP
 Below is a description of the currently available NVIDIA Performance 
Primitives (libnpp) video filters.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index a530cfae29..eb0af76457 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -596,6 +596,9 @@ OBJS-$(CONFIG_YAEPBLUR_FILTER)               += 
vf_yaepblur.o
 OBJS-$(CONFIG_ZMQ_FILTER)                    += f_zmq.o
 OBJS-$(CONFIG_ZOOMPAN_FILTER)                += vf_zoompan.o
 OBJS-$(CONFIG_ZSCALE_FILTER)                 += vf_zscale.o
+OBJS-$(CONFIG_HSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
+OBJS-$(CONFIG_VSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
+OBJS-$(CONFIG_XSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_HSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
 OBJS-$(CONFIG_VSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
 OBJS-$(CONFIG_XSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index e26859e159..58750aa05d 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -557,6 +557,9 @@ extern const FFFilter ff_vf_yaepblur;
 extern const FFFilter ff_vf_zmq;
 extern const FFFilter ff_vf_zoompan;
 extern const FFFilter ff_vf_zscale;
+extern const FFFilter ff_vf_hstack_cuda;
+extern const FFFilter ff_vf_vstack_cuda;
+extern const FFFilter ff_vf_xstack_cuda;
 extern const FFFilter ff_vf_hstack_vaapi;
 extern const FFFilter ff_vf_vstack_vaapi;
 extern const FFFilter ff_vf_xstack_vaapi;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 537df129cd..7642b670d1 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@
 
 #include "version_major.h"
 
-#define LIBAVFILTER_VERSION_MINOR  13
+#define LIBAVFILTER_VERSION_MINOR  14
 #define LIBAVFILTER_VERSION_MICRO 100
 
 
diff --git a/libavfilter/vf_stack_cuda.c b/libavfilter/vf_stack_cuda.c
new file mode 100644
index 0000000000..e74e9e97d2
--- /dev/null
+++ b/libavfilter/vf_stack_cuda.c
@@ -0,0 +1,619 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Hardware accelerated hstack, vstack and xstack filters based on CUDA
+ */
+
+#include "config_components.h"
+
+#include "libavutil/opt.h"
+#include "libavutil/common.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/eval.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/avstring.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/colorspace.h"
+#include "libavutil/mem.h"
+
+#include "filters.h"
+#include "formats.h"
+#include "video.h"
+
+#include "framesync.h"
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P016,
+    AV_PIX_FMT_YUV444P16,
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_BGR32,
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+typedef struct CUDAStackContext {
+    AVCUDADeviceContext *hwctx;
+    CudaFunctions *cuda_dl;
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUstream    cu_stream;
+
+    CUfunction  cu_func_copy;
+    CUfunction  cu_func_copy_uv;
+
+    CUfunction  cu_func_color;
+    CUfunction  cu_func_color_uv;
+
+    enum AVPixelFormat in_fmt;
+    const AVPixFmtDescriptor *in_desc;
+    int in_planes;
+    int in_plane_depths[4];
+    int in_plane_channels[4];
+
+    int fillcolor_yuv[4];
+} CUDAStackContext;
+
+#define HSTACK_NAME             "hstack_cuda"
+#define VSTACK_NAME             "vstack_cuda"
+#define XSTACK_NAME             "xstack_cuda"
+#define HWContext               CUDAStackContext
+#define StackHWContext          StackCudaContext
+#include "stack_internal.h"
+
+typedef struct StackCudaContext {
+    StackBaseContext base;
+    CUDAStackContext cuda;
+} StackCudaContext;
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static void rgb2yuv(float r, float g, float b, int *y, int *u, int *v, int 
depth)
+{
+    *y = ((0.21260*219.0/255.0) * r + (0.71520*219.0/255.0) * g +
+         (0.07220*219.0/255.0) * b) * ((1 << depth) - 1);
+    *u = (-(0.11457*224.0/255.0) * r - (0.38543*224.0/255.0) * g +
+         (0.50000*224.0/255.0) * b + 0.5) * ((1 << depth) - 1);
+    *v = ((0.50000*224.0/255.0) * r - (0.45415*224.0/255.0) * g -
+         (0.04585*224.0/255.0) * b + 0.5) * ((1 << depth) - 1);
+}
+
+static int conv_8to16(int val, int mask)
+{
+    return (val | (val << 8)) & mask;
+}
+
+static void get_func_name(char *buf, size_t buf_size,
+                          const char *prefix, int depth, int channels)
+{
+    const char *suffix;
+
+    if (channels == 4 && depth <= 8)
+        suffix = "uchar4";
+    else if (channels == 2 && depth <= 8)
+        suffix = "uchar2";
+    else if (channels == 2 && depth > 8)
+        suffix = "ushort2";
+    else if (depth > 8)
+        suffix = "ushort";
+    else
+        suffix = "uchar";
+
+    snprintf(buf, buf_size, "%s_%s", prefix, suffix);
+}
+
+static av_cold int cuda_stack_load_functions(AVFilterContext *ctx)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CudaFunctions *cu = s->cuda_dl;
+    int ret;
+    char buf[128];
+
+    extern const unsigned char ff_vf_stack_cuda_ptx_data[];
+    extern const unsigned int ff_vf_stack_cuda_ptx_len;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_stack_cuda_ptx_data, 
ff_vf_stack_cuda_ptx_len);
+    if (ret < 0)
+        goto fail;
+
+    get_func_name(buf, sizeof(buf), "StackCopy",
+                  s->in_plane_depths[0], s->in_plane_channels[0]);
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy, s->cu_module, 
buf));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load copy function: %s\n", buf);
+        ret = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    get_func_name(buf, sizeof(buf), "SetColor",
+                  s->in_plane_depths[0], s->in_plane_channels[0]);
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color, s->cu_module, 
buf));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load color function: %s\n", buf);
+        ret = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        if (s->in_plane_channels[1] > 1) {
+            get_func_name(buf, sizeof(buf), "StackCopy",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+        } else {
+            get_func_name(buf, sizeof(buf), "StackCopy",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+            av_strlcat(buf, "_uv", sizeof(buf));
+        }
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy_uv, 
s->cu_module, buf));
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_FATAL, "Failed to load copy UV function: %s\n", 
buf);
+            ret = AVERROR(ENOSYS);
+            goto fail;
+        }
+
+        if (s->in_plane_channels[1] > 1) {
+            get_func_name(buf, sizeof(buf), "SetColor",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+        } else {
+            get_func_name(buf, sizeof(buf), "SetColor",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+            av_strlcat(buf, "_uv", sizeof(buf));
+        }
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color_uv, 
s->cu_module, buf));
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_FATAL, "Failed to load color UV function: 
%s\n", buf);
+            ret = AVERROR(ENOSYS);
+            goto fail;
+        }
+    }
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int cuda_stack_color_kernel(AVFilterContext *ctx, CUfunction func,
+                            AVFrame *out_frame, const int *color,
+                            int width, int height,
+                            int dst_x, int dst_y,
+                            int dst_width, int dst_height, int dst_pitch)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+
+    CUdeviceptr dst_devptr[4] = {
+        (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1],
+        (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3]
+    };
+
+    void *args[] = {
+        &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
+        &width, &height, &dst_pitch,
+        &dst_x, &dst_y,
+        (void *)&color[0], (void *)&color[1], (void *)&color[2], (void 
*)&color[3],
+        &dst_width, &dst_height,
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(func,
+                                     DIV_UP(width, BLOCKX), DIV_UP(height, 
BLOCKY), 1,
+                                     BLOCKX, BLOCKY, 1,
+                                     0, s->cu_stream, args, NULL));
+}
+
+static int cuda_stack_copy_kernel(AVFilterContext *ctx, CUfunction func,
+                            CUtexObject src_tex[4],
+                            AVFrame *out_frame,
+                            int width, int height,
+                            int dst_x, int dst_y, int dst_pitch,
+                            int src_width, int src_height)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+
+    CUdeviceptr dst_devptr[4] = {
+        (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1],
+        (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3]
+    };
+
+    void *args[] = {
+        &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
+        &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
+        &width, &height, &dst_pitch,
+        &dst_x, &dst_y,
+        &src_width, &src_height,
+        &out_frame->width, &out_frame->height
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(func,
+                                     DIV_UP(width, BLOCKX), DIV_UP(height, 
BLOCKY), 1,
+                                     BLOCKX, BLOCKY, 1,
+                                     0, s->cu_stream, args, NULL));
+}
+
+static int cuda_stack_color_op(AVFilterContext *ctx, StackItemRegion *region,
+                               AVFrame *out, const int *color)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+    int ret = 0;
+    CUcontext dummy;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = cuda_stack_color_kernel(ctx, s->cu_func_color,
+                                out, color, region->width, region->height,
+                                region->x, region->y,
+                                out->width, out->height,
+                                out->linesize[0]);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Error during color operation: %d\n", ret);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        ret = cuda_stack_color_kernel(ctx, s->cu_func_color_uv,
+                                    out, color,
+                                    AV_CEIL_RSHIFT(region->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->height, 
s->in_desc->log2_chroma_h),
+                                    AV_CEIL_RSHIFT(region->x, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->y, 
s->in_desc->log2_chroma_h),
+                                    out->width, out->height,
+                                    out->linesize[1]);
+        if (ret < 0)
+            av_log(ctx, AV_LOG_ERROR, "Error during color UV operation: %d\n", 
ret);
+    }
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int cuda_stack_copy_op(AVFilterContext *ctx, StackItemRegion *region,
+                              AVFrame *in, AVFrame *out)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+    CUtexObject tex[4] = { 0, 0, 0, 0 };
+    int ret = 0;
+    int i;
+    CUcontext dummy;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; i < s->in_planes; i++) {
+        CUDA_TEXTURE_DESC tex_desc = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+
+        CUDA_RESOURCE_DESC res_desc = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = s->in_plane_depths[i] <= 8 ?
+                                  CU_AD_FORMAT_UNSIGNED_INT8 :
+                                  CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = s->in_plane_channels[i],
+            .res.pitch2D.pitchInBytes = in->linesize[i],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[i],
+        };
+
+        if (i == 1 || i == 2) {
+            res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, 
s->in_desc->log2_chroma_w);
+            res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, 
s->in_desc->log2_chroma_h);
+        } else {
+            res_desc.res.pitch2D.width = in->width;
+            res_desc.res.pitch2D.height = in->height;
+        }
+
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, 
NULL));
+        if (ret < 0)
+            goto fail;
+    }
+
+    ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy,
+                             tex, out, region->width, region->height,
+                                region->x, region->y, out->linesize[0],
+                                in->width, in->height);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Error during copy operation: %d\n", ret);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy_uv, tex, out,
+                                    AV_CEIL_RSHIFT(region->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->height, 
s->in_desc->log2_chroma_h),
+                                    AV_CEIL_RSHIFT(region->x, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->y, 
s->in_desc->log2_chroma_h),
+                                    out->linesize[1],
+                                    AV_CEIL_RSHIFT(in->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(in->height, 
s->in_desc->log2_chroma_h));
+        if (ret < 0)
+            av_log(ctx, AV_LOG_ERROR, "Error during copy UV operation: %d\n", 
ret);
+    }
+
+fail:
+    for (i = 0; i < FF_ARRAY_ELEMS(tex); i++)
+        if (tex[i])
+            CHECK_CU(cu->cuTexObjectDestroy(tex[i]));
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int process_frame(FFFrameSync *fs)
+{
+    AVFilterContext *ctx = fs->parent;
+    StackCudaContext *sctx = fs->opaque;
+    CUDAStackContext *s = &sctx->cuda;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *out_frame = NULL;
+    AVFrame *in_frame = NULL;
+    int ret = 0;
+
+    out_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!out_frame)
+        return AVERROR(ENOMEM);
+
+    if (sctx->base.fillcolor_enable) {
+        StackItemRegion full_region = {
+            .x = 0,
+            .y = 0,
+            .width = outlink->w,
+            .height = outlink->h
+        };
+
+        ret = cuda_stack_color_op(ctx, &full_region, out_frame, 
s->fillcolor_yuv);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to fill background color\n");
+            goto fail;
+        }
+    }
+
+    for (int i = 0; i < ctx->nb_inputs; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in_frame, 0);
+        if (ret)
+            goto fail;
+
+        if (!i) {
+            ret = av_frame_copy_props(out_frame, in_frame);
+            if (ret < 0)
+                goto fail;
+        }
+
+        ret = cuda_stack_copy_op(ctx, &sctx->base.regions[i], in_frame, 
out_frame);
+        if (ret < 0)
+            goto fail;
+    }
+
+    out_frame->pts = av_rescale_q(sctx->base.fs.pts, sctx->base.fs.time_base, 
outlink->time_base);
+    out_frame->sample_aspect_ratio = outlink->sample_aspect_ratio;
+
+    return ff_filter_frame(outlink, out_frame);
+
+fail:
+    av_frame_free(&out_frame);
+    return ret;
+}
+
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    AVFilterLink *inlink0 = ctx->inputs[0];
+    FilterLink      *inl0 = ff_filter_link(inlink0);
+    FilterLink      *outl = ff_filter_link(outlink);
+    enum AVPixelFormat in_format;
+    int ret;
+    AVHWFramesContext *in_frames_ctx;
+    AVBufferRef *hw_frames_ctx;
+    AVHWFramesContext *out_frames_ctx;
+
+    if (inlink0->format != AV_PIX_FMT_CUDA || !inl0->hw_frames_ctx || 
!inl0->hw_frames_ctx->data) {
+        av_log(ctx, AV_LOG_ERROR, "Software pixel format is not supported.\n");
+        return AVERROR(EINVAL);
+    }
+
+    in_frames_ctx = (AVHWFramesContext*)inl0->hw_frames_ctx->data;
+    in_format = in_frames_ctx->sw_format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+
+    s->in_fmt = in_format;
+    s->in_desc = av_pix_fmt_desc_get(s->in_fmt);
+    s->in_planes = av_pix_fmt_count_planes(s->in_fmt);
+
+    for (int i = 0; i < s->in_desc->nb_components; i++) {
+        int d = (s->in_desc->comp[i].depth + 7) / 8;
+        int p = s->in_desc->comp[i].plane;
+        s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], 
s->in_desc->comp[i].step / d);
+        s->in_plane_depths[p] = s->in_desc->comp[i].depth;
+    }
+
+    s->hwctx = in_frames_ctx->device_ctx->hwctx;
+    s->cuda_dl = s->hwctx->internal->cuda_dl;
+    s->cu_stream = s->hwctx->stream;
+
+    for (int i = 1; i < sctx->base.nb_inputs; i++) {
+        AVFilterLink *inlink = ctx->inputs[i];
+        FilterLink      *inl = ff_filter_link(inlink);
+        AVHWFramesContext *hwfc = NULL;
+
+        if (inlink->format != AV_PIX_FMT_CUDA || !inl->hw_frames_ctx || 
!inl->hw_frames_ctx->data) {
+            av_log(ctx, AV_LOG_ERROR, "Software pixel format is not 
supported.\n");
+            return AVERROR(EINVAL);
+        }
+
+        hwfc = (AVHWFramesContext *)inl->hw_frames_ctx->data;
+
+        if (in_frames_ctx->sw_format != hwfc->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "All inputs should have the same 
underlying software pixel format.\n");
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if (sctx->base.fillcolor_enable) {
+        if (s->in_desc->flags & AV_PIX_FMT_FLAG_RGB) {
+            s->fillcolor_yuv[0] = sctx->base.fillcolor[0];
+            s->fillcolor_yuv[1] = sctx->base.fillcolor[1];
+            s->fillcolor_yuv[2] = sctx->base.fillcolor[2];
+            s->fillcolor_yuv[3] = sctx->base.fillcolor[3];
+        } else {
+            int Y, U, V;
+
+            rgb2yuv(sctx->base.fillcolor[0] / 255.0, sctx->base.fillcolor[1] / 
255.0,
+                    sctx->base.fillcolor[2] / 255.0, &Y, &U, &V, 8);
+
+            if (s->in_plane_depths[0] > 8) {
+                int mask = (s->in_plane_depths[0] <= 10) ? 0xFFC0 : 0xFFFF;
+                s->fillcolor_yuv[0] = conv_8to16(Y, mask);
+                s->fillcolor_yuv[1] = conv_8to16(U, mask);
+                s->fillcolor_yuv[2] = conv_8to16(V, mask);
+            } else {
+                s->fillcolor_yuv[0] = Y;
+                s->fillcolor_yuv[1] = U;
+                s->fillcolor_yuv[2] = V;
+            }
+            s->fillcolor_yuv[3] = sctx->base.fillcolor[3];
+        }
+    }
+
+    ret = config_comm_output(outlink);
+    if (ret < 0)
+        return ret;
+
+    ret = cuda_stack_load_functions(ctx);
+    if (ret < 0)
+        return ret;
+
+    hw_frames_ctx = av_hwframe_ctx_alloc(in_frames_ctx->device_ref);
+    if (!hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    out_frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
+    out_frames_ctx->format = AV_PIX_FMT_CUDA;
+    out_frames_ctx->sw_format = in_format;
+    out_frames_ctx->width = outlink->w;
+    out_frames_ctx->height = outlink->h;
+
+    ret = av_hwframe_ctx_init(hw_frames_ctx);
+    if (ret < 0) {
+        av_buffer_unref(&hw_frames_ctx);
+        return ret;
+    }
+
+    av_buffer_unref(&outl->hw_frames_ctx);
+    outl->hw_frames_ctx = hw_frames_ctx;
+
+    return 0;
+}
+
+static av_cold int cuda_stack_init(AVFilterContext *ctx)
+{
+    return stack_init(ctx);
+}
+
+static av_cold void cuda_stack_uninit(AVFilterContext *ctx)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->cuda_dl;
+        CUcontext dummy;
+
+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    stack_uninit(ctx);
+}
+
+static const enum AVPixelFormat cuda_stack_pix_fmts[] = {
+    AV_PIX_FMT_CUDA,
+    AV_PIX_FMT_NONE,
+};
+
+#include "stack_internal.c"
+
+#if CONFIG_HSTACK_CUDA_FILTER
+
+DEFINE_HSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(hstack, cuda, "CUDA", 0);
+
+#endif
+
+#if CONFIG_VSTACK_CUDA_FILTER
+
+DEFINE_VSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(vstack, cuda, "CUDA", 0);
+
+#endif
+
+#if CONFIG_XSTACK_CUDA_FILTER
+
+DEFINE_XSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(xstack, cuda, "CUDA", 0);
+
+#endif
diff --git a/libavfilter/vf_stack_cuda.cu b/libavfilter/vf_stack_cuda.cu
new file mode 100644
index 0000000000..c2f4bc70b9
--- /dev/null
+++ b/libavfilter/vf_stack_cuda.cu
@@ -0,0 +1,229 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+#define FIXED_PITCH(T) \
+    (dst_pitch / sizeof(T))
+
+#define OFFSET_DST(n, T) \
+    dst_##n[(target_y) * FIXED_PITCH(T) + (target_x)]
+
+#define BOUNDS_CHECK() \
+    int xo = blockIdx.x * blockDim.x + threadIdx.x; \
+    int yo = blockIdx.y * blockDim.y + threadIdx.y; \
+    if (xo >= width || yo >= height) \
+        return; \
+    int target_x = xo + dst_x; \
+    int target_y = yo + dst_y; \
+    if (target_x < 0 || target_y < 0 || \
+        target_x >= frame_width || target_y >= frame_height) \
+        return;
+
+#define COPY_SCALE() \
+    float hscale = (float)src_width / (float)width; \
+    float vscale = (float)src_height / (float)height; \
+    float xi = (xo + 0.5f) * hscale; \
+    float yi = (yo + 0.5f) * vscale;
+
+extern "C" {
+
+// --- COLOR KERNELS ---
+
+__global__ void SetColor_uchar(
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, uchar) = (uchar)c0;
+}
+
+__global__ void SetColor_ushort(
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, ushort) = (ushort)c0;
+}
+
+__global__ void SetColor_uchar4(
+    uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, uchar4) = make_uchar4(c0, c1, c2, c3);
+}
+
+__global__ void SetColor_uchar_uv(
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, uchar) = (uchar)c1;
+    OFFSET_DST(2, uchar) = (uchar)c2;
+}
+
+__global__ void SetColor_ushort_uv(
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, ushort) = (ushort)c1;
+    OFFSET_DST(2, ushort) = (ushort)c2;
+}
+
+__global__ void SetColor_uchar2(
+    uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, uchar2) = make_uchar2(c1, c2);
+}
+
+__global__ void SetColor_ushort2(
+    ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, ushort2) = make_ushort2(c1, c2);
+}
+
+// --- COPY KERNELS ---
+
+__global__ void StackCopy_uchar(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, uchar) = tex2D<uchar>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_ushort(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, ushort) = tex2D<ushort>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_uchar4(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, uchar4) = tex2D<uchar4>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_uchar_uv(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, uchar) = tex2D<uchar>(src_tex_1, xi, yi);
+    OFFSET_DST(2, uchar) = tex2D<uchar>(src_tex_2, xi, yi);
+}
+
+__global__ void StackCopy_ushort_uv(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, ushort) = tex2D<ushort>(src_tex_1, xi, yi);
+    OFFSET_DST(2, ushort) = tex2D<ushort>(src_tex_2, xi, yi);
+}
+
+__global__ void StackCopy_uchar2(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, uchar2) = tex2D<uchar2>(src_tex_1, xi, yi);
+}
+
+__global__ void StackCopy_ushort2(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, ushort2) = tex2D<ushort2>(src_tex_1, xi, yi);
+}
+
+}
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to