Add hardware-accelerated stack filters for CUDA that provide equivalent
functionality to the software stack filters but with GPU acceleration.

Features:
- Support for hstack, vstack, and xstack operations
- Compatible pixel formats:
  yuv420p, nv12, yuv444p, p010le, p016le, yuv444p16le, rgb0, bgr0, rgba, bgra
- Fill color support with automatic RGB to YUV conversion for YUV formats
- Proper chroma subsampling handling for all supported formats
- Integration with existing stack filter infrastructure via stack_internal.h

The implementation follows the established CUDA filter pattern using PTX
modules for kernel execution and proper CUDA context management. Kernels
are organized by element size (uchar, ushort, uchar2, ushort2, uchar4)
rather than per pixel format, reducing code duplication. Copy operations
handle frame placement while color operations fill background areas.

This enables efficient video composition workflows entirely on GPU
without CPU-GPU memory transfers, significantly improving performance
for multi-input video processing pipelines.

Signed-off-by: Faeez Kadiri <[email protected]>
---
v2 -> v3:
- Changed vf_stack_cuda.c license from MIT to LGPL 2.1+
- Replaced per-pixel-format CUDA kernels with generic element-size
  based kernels (uchar, ushort, uchar2, ushort2, uchar4) for both
  copy and color operations, reducing code duplication
- Added get_func_name() helper to dynamically construct kernel names
  from element depth and channel count
- Moved 8-to-16-bit fill color conversion from CUDA kernels to C code
  (conv_8to16 helper), with proper P010 masking (0xFFC0)
- Changed fillcolor_yuv from uint8_t[4] to int[4] to hold pre-computed
  16-bit values; color kernels now accept int arguments
- Removed incorrect av_cold annotation from cuda_stack_color_kernel()
  and cuda_stack_copy_kernel() which are called during frame processing
- Added av_cold to cuda_stack_init()
- Simplified if (i == 0) to if (!i)
- Moved CUDA filter dependencies in configure to the dedicated CUDA
  filter section alongside other CUDA filters
- Moved Changelog entry from version 8.0 to version <next>

 Changelog                    |   1 +
 configure                    |   6 +
 doc/filters.texi             |  78 +++++
 libavfilter/Makefile         |   3 +
 libavfilter/allfilters.c     |   3 +
 libavfilter/vf_stack_cuda.c  | 630 +++++++++++++++++++++++++++++++++++
 libavfilter/vf_stack_cuda.cu | 235 +++++++++++++
 7 files changed, 956 insertions(+)
 create mode 100644 libavfilter/vf_stack_cuda.c
 create mode 100644 libavfilter/vf_stack_cuda.cu

diff --git a/Changelog b/Changelog
index 26416cb1d6..19a3bc39d8 100644
--- a/Changelog
+++ b/Changelog
@@ -24,6 +24,7 @@ version <next>:
 - Remove the old HLS protocol handler
 - Vulkan compute codec optimizations
 - swscale Vulkan support
+- hstack_cuda, vstack_cuda and xstack_cuda filters
 
 
 version 8.0:
diff --git a/configure b/configure
index 87a9c02686..b0f98ace57 100755
--- a/configure
+++ b/configure
@@ -3515,6 +3515,12 @@ overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 pad_cuda_filter_deps="ffnvcodec"
 pad_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 sharpen_npp_filter_deps="ffnvcodec libnpp"
+hstack_cuda_filter_deps="ffnvcodec"
+hstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+vstack_cuda_filter_deps="ffnvcodec"
+vstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+xstack_cuda_filter_deps="ffnvcodec"
+xstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
 gfxcapture_filter_deps="cxx17 threads d3d11va IGraphicsCaptureItemInterop 
__x_ABI_CWindows_CGraphics_CCapture_CIGraphicsCaptureSession3"
diff --git a/doc/filters.texi b/doc/filters.texi
index e49dd9ef0d..ab36df7c05 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -27349,6 +27349,84 @@ Only deinterlace frames marked as interlaced.
 The default value is @code{all}.
 @end table
 
+@section hstack_cuda
+Stack input videos horizontally.
+
+This is the CUDA variant of the @ref{hstack} filter, each input stream may
+have different height, this filter will scale down/up each input stream while
+keeping the original aspect ratio.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{hstack}.
+
+@item shortest
+See @ref{hstack}.
+
+@item height
+Set height of output. If set to 0, this filter will set height of output to
+height of the first input stream. Default value is 0.
+@end table
+
+@section vstack_cuda
+Stack input videos vertically.
+
+This is the CUDA variant of the @ref{vstack} filter, each input stream may
+have different width, this filter will scale down/up each input stream while
+keeping the original aspect ratio.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{vstack}.
+
+@item shortest
+See @ref{vstack}.
+
+@item width
+Set width of output. If set to 0, this filter will set width of output to
+width of the first input stream. Default value is 0.
+@end table
+
+@section xstack_cuda
+Stack video inputs into custom layout.
+
+This is the CUDA variant of the @ref{xstack} filter,  each input stream may
+have different size, this filter will scale down/up each input stream to the
+given output size, or the size of the first input stream.
+
+It accepts the following options:
+
+@table @option
+@item inputs
+See @ref{xstack}.
+
+@item shortest
+See @ref{xstack}.
+
+@item layout
+See @ref{xstack}.
+Moreover, this permits the user to supply output size for each input stream.
+@example
+xstack_cuda=inputs=4:layout=0_0_1920x1080|0_h0_1920x1080|w0_0_1920x1080|w0_h0_1920x1080
+@end example
+
+@item grid
+See @ref{xstack}.
+
+@item grid_tile_size
+Set output size for each input stream when @option{grid} is set. If this option
+is not set, this filter will set output size by default to the size of the
+first input stream. For the syntax of this option, check the
+@ref{video size syntax,,"Video size" section in the ffmpeg-utils 
manual,ffmpeg-utils}.
+
+@item fill
+See @ref{xstack}.
+@end table
+
 @anchor{CUDA NPP}
 @section CUDA NPP
 Below is a description of the currently available NVIDIA Performance 
Primitives (libnpp) video filters.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 6ecacc346b..df21d4154f 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -595,6 +595,9 @@ OBJS-$(CONFIG_YAEPBLUR_FILTER)               += 
vf_yaepblur.o
 OBJS-$(CONFIG_ZMQ_FILTER)                    += f_zmq.o
 OBJS-$(CONFIG_ZOOMPAN_FILTER)                += vf_zoompan.o
 OBJS-$(CONFIG_ZSCALE_FILTER)                 += vf_zscale.o
+OBJS-$(CONFIG_HSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
+OBJS-$(CONFIG_VSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
+OBJS-$(CONFIG_XSTACK_CUDA_FILTER)            += vf_stack_cuda.o framesync.o 
vf_stack_cuda.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_HSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
 OBJS-$(CONFIG_VSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
 OBJS-$(CONFIG_XSTACK_VAAPI_FILTER)           += vf_stack_vaapi.o framesync.o 
vaapi_vpp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 458f8c5373..a8b4369b29 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -556,6 +556,9 @@ extern const FFFilter ff_vf_yaepblur;
 extern const FFFilter ff_vf_zmq;
 extern const FFFilter ff_vf_zoompan;
 extern const FFFilter ff_vf_zscale;
+extern const FFFilter ff_vf_hstack_cuda;
+extern const FFFilter ff_vf_vstack_cuda;
+extern const FFFilter ff_vf_xstack_cuda;
 extern const FFFilter ff_vf_hstack_vaapi;
 extern const FFFilter ff_vf_vstack_vaapi;
 extern const FFFilter ff_vf_xstack_vaapi;
diff --git a/libavfilter/vf_stack_cuda.c b/libavfilter/vf_stack_cuda.c
new file mode 100644
index 0000000000..3273b83c3d
--- /dev/null
+++ b/libavfilter/vf_stack_cuda.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright (c) 2025, Faeez Kadiri < f1k2faeez at gmail dot com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Hardware accelerated hstack, vstack and xstack filters based on CUDA
+ */
+
+#include "config_components.h"
+
+#include "libavutil/opt.h"
+#include "libavutil/common.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/eval.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/avstring.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/colorspace.h"
+#include "libavutil/mem.h"
+
+#include "filters.h"
+#include "formats.h"
+#include "video.h"
+
+#include "framesync.h"
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P016,
+    AV_PIX_FMT_YUV444P16,
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_BGR32,
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+typedef struct CUDAStackContext {
+    AVCUDADeviceContext *hwctx;
+    CudaFunctions *cuda_dl;
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUstream    cu_stream;
+
+    // For copy operations
+    CUfunction  cu_func_copy;
+    CUfunction  cu_func_copy_uv;
+
+    // For color operations
+    CUfunction  cu_func_color;
+    CUfunction  cu_func_color_uv;
+
+    enum AVPixelFormat in_fmt;
+    const AVPixFmtDescriptor *in_desc;
+    int in_planes;
+    int in_plane_depths[4];
+    int in_plane_channels[4];
+
+    int fillcolor_yuv[4];
+} CUDAStackContext;
+
+#define HSTACK_NAME             "hstack_cuda"
+#define VSTACK_NAME             "vstack_cuda"
+#define XSTACK_NAME             "xstack_cuda"
+#define HWContext               CUDAStackContext
+#define StackHWContext          StackCudaContext
+#include "stack_internal.h"
+
+typedef struct StackCudaContext {
+    StackBaseContext base;
+    CUDAStackContext cuda;
+} StackCudaContext;
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static void rgb2yuv(float r, float g, float b, int *y, int *u, int *v, int 
depth)
+{
+    *y = ((0.21260*219.0/255.0) * r + (0.71520*219.0/255.0) * g +
+         (0.07220*219.0/255.0) * b) * ((1 << depth) - 1);
+    *u = (-(0.11457*224.0/255.0) * r - (0.38543*224.0/255.0) * g +
+         (0.50000*224.0/255.0) * b + 0.5) * ((1 << depth) - 1);
+    *v = ((0.50000*224.0/255.0) * r - (0.45415*224.0/255.0) * g -
+         (0.04585*224.0/255.0) * b + 0.5) * ((1 << depth) - 1);
+}
+
+static int conv_8to16(int val, int mask)
+{
+    return ((val) | ((val) << 8)) & mask;
+}
+
+static void get_func_name(char *buf, size_t buf_size,
+                          const char *prefix, int depth, int channels)
+{
+    const char *suffix;
+
+    if (channels == 4 && depth <= 8)
+        suffix = "uchar4";
+    else if (channels == 2 && depth <= 8)
+        suffix = "uchar2";
+    else if (channels == 2 && depth > 8)
+        suffix = "ushort2";
+    else if (depth > 8)
+        suffix = "ushort";
+    else
+        suffix = "uchar";
+
+    snprintf(buf, buf_size, "%s_%s", prefix, suffix);
+}
+
+static av_cold int cuda_stack_load_functions(AVFilterContext *ctx)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CudaFunctions *cu = s->cuda_dl;
+    int ret;
+    char buf[128];
+
+    extern const unsigned char ff_vf_stack_cuda_ptx_data[];
+    extern const unsigned int ff_vf_stack_cuda_ptx_len;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_stack_cuda_ptx_data, 
ff_vf_stack_cuda_ptx_len);
+    if (ret < 0)
+        goto fail;
+
+    get_func_name(buf, sizeof(buf), "StackCopy",
+                  s->in_plane_depths[0], s->in_plane_channels[0]);
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy, s->cu_module, 
buf));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load copy function: %s\n", buf);
+        ret = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    get_func_name(buf, sizeof(buf), "SetColor",
+                  s->in_plane_depths[0], s->in_plane_channels[0]);
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color, s->cu_module, 
buf));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load color function: %s\n", buf);
+        ret = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        if (s->in_plane_channels[1] > 1) {
+            get_func_name(buf, sizeof(buf), "StackCopy",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+        } else {
+            get_func_name(buf, sizeof(buf), "StackCopy",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+            av_strlcat(buf, "_uv", sizeof(buf));
+        }
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy_uv, 
s->cu_module, buf));
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_FATAL, "Failed to load copy UV function: %s\n", 
buf);
+            ret = AVERROR(ENOSYS);
+            goto fail;
+        }
+
+        if (s->in_plane_channels[1] > 1) {
+            get_func_name(buf, sizeof(buf), "SetColor",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+        } else {
+            get_func_name(buf, sizeof(buf), "SetColor",
+                          s->in_plane_depths[1], s->in_plane_channels[1]);
+            av_strlcat(buf, "_uv", sizeof(buf));
+        }
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color_uv, 
s->cu_module, buf));
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_FATAL, "Failed to load color UV function: 
%s\n", buf);
+            ret = AVERROR(ENOSYS);
+            goto fail;
+        }
+    }
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int cuda_stack_color_kernel(AVFilterContext *ctx, CUfunction func,
+                            AVFrame *out_frame, const int *color,
+                            int width, int height,
+                            int dst_x, int dst_y,
+                            int dst_width, int dst_height, int dst_pitch)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+
+    CUdeviceptr dst_devptr[4] = {
+        (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1],
+        (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3]
+    };
+
+    void *args[] = {
+        &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
+        &width, &height, &dst_pitch,
+        &dst_x, &dst_y,
+        (void *)&color[0], (void *)&color[1], (void *)&color[2], (void 
*)&color[3],
+        &dst_width, &dst_height,
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(func,
+                                     DIV_UP(width, BLOCKX), DIV_UP(height, 
BLOCKY), 1,
+                                     BLOCKX, BLOCKY, 1,
+                                     0, s->cu_stream, args, NULL));
+}
+
+static int cuda_stack_copy_kernel(AVFilterContext *ctx, CUfunction func,
+                            CUtexObject src_tex[4],
+                            AVFrame *out_frame,
+                            int width, int height,
+                            int dst_x, int dst_y, int dst_pitch,
+                            int src_width, int src_height)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+
+    CUdeviceptr dst_devptr[4] = {
+        (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1],
+        (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3]
+    };
+
+    void *args[] = {
+        &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
+        &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
+        &width, &height, &dst_pitch,
+        &dst_x, &dst_y,
+        &src_width, &src_height,
+        &out_frame->width, &out_frame->height
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(func,
+                                     DIV_UP(width, BLOCKX), DIV_UP(height, 
BLOCKY), 1,
+                                     BLOCKX, BLOCKY, 1,
+                                     0, s->cu_stream, args, NULL));
+}
+
+static int cuda_stack_color_op(AVFilterContext *ctx, StackItemRegion *region, 
AVFrame *out, const int *color) {
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+    int ret = 0;
+    CUcontext dummy;
+
+    // Push CUDA context
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = cuda_stack_color_kernel(ctx, s->cu_func_color,
+                                out, color, region->width, region->height,
+                                region->x, region->y,
+                                out->width, out->height,
+                                out->linesize[0]);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Error during color operation: %d\n", ret);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        ret = cuda_stack_color_kernel(ctx, s->cu_func_color_uv,
+                                    out, color,
+                                    AV_CEIL_RSHIFT(region->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->height, 
s->in_desc->log2_chroma_h),
+                                    AV_CEIL_RSHIFT(region->x, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->y, 
s->in_desc->log2_chroma_h),
+                                    out->width, out->height,
+                                    out->linesize[1]);
+        if (ret < 0)
+            av_log(ctx, AV_LOG_ERROR, "Error during color UV operation: %d\n", 
ret);
+    }
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int cuda_stack_copy_op(AVFilterContext *ctx, StackItemRegion *region, 
AVFrame *in, AVFrame *out) {
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    CudaFunctions *cu = s->cuda_dl;
+    CUtexObject tex[4] = { 0, 0, 0, 0 };
+    int ret = 0;
+    int i;
+    CUcontext dummy;
+
+    // Push CUDA context
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; i < s->in_planes; i++) {
+        CUDA_TEXTURE_DESC tex_desc = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+
+        CUDA_RESOURCE_DESC res_desc = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = s->in_plane_depths[i] <= 8 ?
+                                  CU_AD_FORMAT_UNSIGNED_INT8 :
+                                  CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = s->in_plane_channels[i],
+            .res.pitch2D.pitchInBytes = in->linesize[i],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[i],
+        };
+
+        if (i == 1 || i == 2) {
+            res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, 
s->in_desc->log2_chroma_w);
+            res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, 
s->in_desc->log2_chroma_h);
+        } else {
+            res_desc.res.pitch2D.width = in->width;
+            res_desc.res.pitch2D.height = in->height;
+        }
+
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, 
NULL));
+        if (ret < 0)
+            goto fail;
+    }
+
+    ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy,
+                             tex, out, region->width, region->height,
+                                region->x, region->y, out->linesize[0],
+                                in->width, in->height);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Error during copy operation: %d\n", ret);
+        goto fail;
+    }
+
+    if (s->in_planes > 1) {
+        ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy_uv, tex, out,
+                                    AV_CEIL_RSHIFT(region->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->height, 
s->in_desc->log2_chroma_h),
+                                    AV_CEIL_RSHIFT(region->x, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(region->y, 
s->in_desc->log2_chroma_h),
+                                    out->linesize[1],
+                                    AV_CEIL_RSHIFT(in->width, 
s->in_desc->log2_chroma_w),
+                                    AV_CEIL_RSHIFT(in->height, 
s->in_desc->log2_chroma_h));
+        if (ret < 0)
+            av_log(ctx, AV_LOG_ERROR, "Error during copy UV operation: %d\n", 
ret);
+    }
+
+fail:
+    for (i = 0; i < FF_ARRAY_ELEMS(tex); i++)
+        if (tex[i])
+            CHECK_CU(cu->cuTexObjectDestroy(tex[i]));
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int process_frame(FFFrameSync *fs)
+{
+    AVFilterContext *ctx = fs->parent;
+    StackCudaContext *sctx = fs->opaque;
+    CUDAStackContext *s = &sctx->cuda;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *out_frame = NULL;
+    AVFrame *in_frame = NULL;
+    int ret = 0;
+
+    out_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!out_frame)
+        return AVERROR(ENOMEM);
+
+    // Fill the entire output frame with fill color if enabled
+    if (sctx->base.fillcolor_enable) {
+        StackItemRegion full_region = {
+            .x = 0,
+            .y = 0,
+            .width = outlink->w,
+            .height = outlink->h
+        };
+
+        ret = cuda_stack_color_op(ctx, &full_region, out_frame, 
s->fillcolor_yuv);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to fill background color\n");
+            goto fail;
+        }
+    }
+
+    for (int i = 0; i < ctx->nb_inputs; i++) {
+        ret = ff_framesync_get_frame(fs, i, &in_frame, 0);
+        if (ret)
+            goto fail;
+
+        if (!i) {
+            ret = av_frame_copy_props(out_frame, in_frame);
+            if (ret < 0)
+                goto fail;
+        }
+
+        ret = cuda_stack_copy_op(ctx, &sctx->base.regions[i], in_frame, 
out_frame);
+        if (ret < 0)
+            goto fail;
+    }
+
+    out_frame->pts = av_rescale_q(sctx->base.fs.pts, sctx->base.fs.time_base, 
outlink->time_base);
+    out_frame->sample_aspect_ratio = outlink->sample_aspect_ratio;
+
+    return ff_filter_frame(outlink, out_frame);
+
+fail:
+    av_frame_free(&out_frame);
+    return ret;
+}
+
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+    AVFilterLink *inlink0 = ctx->inputs[0];
+    FilterLink      *inl0 = ff_filter_link(inlink0);
+    FilterLink      *outl = ff_filter_link(outlink);
+    enum AVPixelFormat in_format;
+    int ret;
+    AVHWFramesContext *in_frames_ctx;
+    AVBufferRef *hw_frames_ctx;
+    AVHWFramesContext *out_frames_ctx;
+
+    if (inlink0->format != AV_PIX_FMT_CUDA || !inl0->hw_frames_ctx || 
!inl0->hw_frames_ctx->data) {
+        av_log(ctx, AV_LOG_ERROR, "Software pixel format is not supported.\n");
+        return AVERROR(EINVAL);
+    }
+
+    in_frames_ctx = (AVHWFramesContext*)inl0->hw_frames_ctx->data;
+    in_format = in_frames_ctx->sw_format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+
+    s->in_fmt = in_format;
+    s->in_desc = av_pix_fmt_desc_get(s->in_fmt);
+    s->in_planes = av_pix_fmt_count_planes(s->in_fmt);
+
+    // Set up plane information
+    for (int i = 0; i < s->in_desc->nb_components; i++) {
+        int d = (s->in_desc->comp[i].depth + 7) / 8;
+        int p = s->in_desc->comp[i].plane;
+        s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], 
s->in_desc->comp[i].step / d);
+        s->in_plane_depths[p] = s->in_desc->comp[i].depth;
+    }
+
+    s->hwctx = in_frames_ctx->device_ctx->hwctx;
+    s->cuda_dl = s->hwctx->internal->cuda_dl;
+    s->cu_stream = s->hwctx->stream;
+
+    for (int i = 1; i < sctx->base.nb_inputs; i++) {
+        AVFilterLink *inlink = ctx->inputs[i];
+        FilterLink      *inl = ff_filter_link(inlink);
+        AVHWFramesContext *hwfc = NULL;
+
+        if (inlink->format != AV_PIX_FMT_CUDA || !inl->hw_frames_ctx || 
!inl->hw_frames_ctx->data) {
+            av_log(ctx, AV_LOG_ERROR, "Software pixel format is not 
supported.\n");
+            return AVERROR(EINVAL);
+        }
+
+        hwfc = (AVHWFramesContext *)inl->hw_frames_ctx->data;
+
+        if (in_frames_ctx->sw_format != hwfc->sw_format) {
+            av_log(ctx, AV_LOG_ERROR, "All inputs should have the same 
underlying software pixel format.\n");
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if (sctx->base.fillcolor_enable) {
+        if (s->in_desc->flags & AV_PIX_FMT_FLAG_RGB) {
+            s->fillcolor_yuv[0] = sctx->base.fillcolor[0];
+            s->fillcolor_yuv[1] = sctx->base.fillcolor[1];
+            s->fillcolor_yuv[2] = sctx->base.fillcolor[2];
+            s->fillcolor_yuv[3] = sctx->base.fillcolor[3];
+        } else {
+            int Y, U, V;
+
+            rgb2yuv(sctx->base.fillcolor[0] / 255.0, sctx->base.fillcolor[1] / 
255.0,
+                    sctx->base.fillcolor[2] / 255.0, &Y, &U, &V, 8);
+
+            if (s->in_plane_depths[0] > 8) {
+                int mask = (s->in_plane_depths[0] <= 10) ? 0xFFC0 : 0xFFFF;
+                s->fillcolor_yuv[0] = conv_8to16(Y, mask);
+                s->fillcolor_yuv[1] = conv_8to16(U, mask);
+                s->fillcolor_yuv[2] = conv_8to16(V, mask);
+            } else {
+                s->fillcolor_yuv[0] = Y;
+                s->fillcolor_yuv[1] = U;
+                s->fillcolor_yuv[2] = V;
+            }
+            s->fillcolor_yuv[3] = sctx->base.fillcolor[3];
+        }
+    }
+
+    ret = config_comm_output(outlink);
+    if (ret < 0)
+        return ret;
+
+    ret = cuda_stack_load_functions(ctx);
+    if (ret < 0)
+        return ret;
+
+    // Initialize hardware frames context for output
+    hw_frames_ctx = av_hwframe_ctx_alloc(in_frames_ctx->device_ref);
+    if (!hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    out_frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
+    out_frames_ctx->format = AV_PIX_FMT_CUDA;
+    out_frames_ctx->sw_format = in_format;
+    out_frames_ctx->width = outlink->w;
+    out_frames_ctx->height = outlink->h;
+
+    ret = av_hwframe_ctx_init(hw_frames_ctx);
+    if (ret < 0) {
+        av_buffer_unref(&hw_frames_ctx);
+        return ret;
+    }
+
+    av_buffer_unref(&outl->hw_frames_ctx);
+    outl->hw_frames_ctx = hw_frames_ctx;
+
+    return 0;
+}
+
+static av_cold int cuda_stack_init(AVFilterContext *ctx)
+{
+    int ret;
+
+    ret = stack_init(ctx);
+    if (ret)
+        return ret;
+
+    return 0;
+}
+
+static av_cold void cuda_stack_uninit(AVFilterContext *ctx)
+{
+    StackCudaContext *sctx = ctx->priv;
+    CUDAStackContext *s = &sctx->cuda;
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->cuda_dl;
+        CUcontext dummy;
+
+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    stack_uninit(ctx);
+}
+
+static const enum AVPixelFormat cuda_stack_pix_fmts[] = {
+    AV_PIX_FMT_CUDA,
+    AV_PIX_FMT_NONE,
+};
+
+#include "stack_internal.c"
+
+#if CONFIG_HSTACK_CUDA_FILTER
+
+DEFINE_HSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(hstack, cuda, "CUDA", 0);
+
+#endif
+
+#if CONFIG_VSTACK_CUDA_FILTER
+
+DEFINE_VSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(vstack, cuda, "CUDA", 0);
+
+#endif
+
+#if CONFIG_XSTACK_CUDA_FILTER
+
+DEFINE_XSTACK_OPTIONS(cuda);
+DEFINE_STACK_FILTER(xstack, cuda, "CUDA", 0);
+
+#endif
diff --git a/libavfilter/vf_stack_cuda.cu b/libavfilter/vf_stack_cuda.cu
new file mode 100644
index 0000000000..ea09e3c67d
--- /dev/null
+++ b/libavfilter/vf_stack_cuda.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2025, Faeez Kadiri < f1k2faeez at gmail dot com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+#define FIXED_PITCH(T) \
+    (dst_pitch / sizeof(T))
+
+#define OFFSET_DST(n, T) \
+    dst_##n[(target_y) * FIXED_PITCH(T) + (target_x)]
+
+#define BOUNDS_CHECK() \
+    int xo = blockIdx.x * blockDim.x + threadIdx.x; \
+    int yo = blockIdx.y * blockDim.y + threadIdx.y; \
+    if (xo >= width || yo >= height) \
+        return; \
+    int target_x = xo + dst_x; \
+    int target_y = yo + dst_y; \
+    if (target_x < 0 || target_y < 0 || \
+        target_x >= frame_width || target_y >= frame_height) \
+        return;
+
+#define COPY_SCALE() \
+    float hscale = (float)src_width / (float)width; \
+    float vscale = (float)src_height / (float)height; \
+    float xi = (xo + 0.5f) * hscale; \
+    float yi = (yo + 0.5f) * vscale;
+
+extern "C" {
+
+// --- COLOR KERNELS ---
+
+__global__ void SetColor_uchar(
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, uchar) = (uchar)c0;
+}
+
+__global__ void SetColor_ushort(
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, ushort) = (ushort)c0;
+}
+
+__global__ void SetColor_uchar4(
+    uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(0, uchar4) = make_uchar4(c0, c1, c2, c3);
+}
+
+__global__ void SetColor_uchar_uv(
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, uchar) = (uchar)c1;
+    OFFSET_DST(2, uchar) = (uchar)c2;
+}
+
+__global__ void SetColor_ushort_uv(
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, ushort) = (ushort)c1;
+    OFFSET_DST(2, ushort) = (ushort)c2;
+}
+
+__global__ void SetColor_uchar2(
+    uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, uchar2) = make_uchar2(c1, c2);
+}
+
+__global__ void SetColor_ushort2(
+    ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int c0, int c1, int c2, int c3,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    OFFSET_DST(1, ushort2) = make_ushort2(c1, c2);
+}
+
+// --- COPY KERNELS ---
+
+__global__ void StackCopy_uchar(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, uchar) = tex2D<uchar>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_ushort(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, ushort) = tex2D<ushort>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_uchar4(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(0, uchar4) = tex2D<uchar4>(src_tex_0, xi, yi);
+}
+
+__global__ void StackCopy_uchar_uv(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, uchar) = tex2D<uchar>(src_tex_1, xi, yi);
+    OFFSET_DST(2, uchar) = tex2D<uchar>(src_tex_2, xi, yi);
+}
+
+__global__ void StackCopy_ushort_uv(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, ushort) = tex2D<ushort>(src_tex_1, xi, yi);
+    OFFSET_DST(2, ushort) = tex2D<ushort>(src_tex_2, xi, yi);
+}
+
+__global__ void StackCopy_uchar2(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, uchar2) = tex2D<uchar2>(src_tex_1, xi, yi);
+}
+
+__global__ void StackCopy_ushort2(
+    cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1,
+    cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3,
+    ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3,
+    int width, int height, int dst_pitch,
+    int dst_x, int dst_y,
+    int src_width, int src_height,
+    int frame_width, int frame_height)
+{
+    BOUNDS_CHECK();
+    COPY_SCALE();
+    OFFSET_DST(1, ushort2) = tex2D<ushort2>(src_tex_1, xi, yi);
+}
+
+}
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to