PR #22362 opened by f1k2faeez URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22362 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22362.patch
CUDA-accelerated versions of hstack, vstack, and xstack that perform video stacking on GPU without host memory round-trips. Supported pixel formats: yuv420p, nv12, yuv444p, p010le, p016le, yuv444p16le, rgb0, bgr0, rgba, bgra. xstack_cuda supports fill color with automatic RGB to YUV conversion. The filters share options with the software counterparts and reuse the stack_internal.h framework. CUDA kernels are selected by element size rather than per pixel format. Signed-off-by: Faeez Kadiri <[email protected]> >From c7586e5804cbf8c5db29a425e2935b9076b2c68b Mon Sep 17 00:00:00 2001 From: Faeez Kadiri <[email protected]> Date: Sat, 24 May 2025 02:18:05 +0530 Subject: [PATCH] avfilter: add CUDA accelerated stack filters (hstack_cuda, vstack_cuda, xstack_cuda) CUDA-accelerated versions of hstack, vstack, and xstack that perform video stacking on GPU without host memory round-trips. Supported pixel formats: yuv420p, nv12, yuv444p, p010le, p016le, yuv444p16le, rgb0, bgr0, rgba, bgra. xstack_cuda supports fill color with automatic RGB to YUV conversion. The filters share options with the software counterparts and reuse the stack_internal.h framework. CUDA kernels are selected by element size rather than per pixel format. Signed-off-by: Faeez Kadiri <[email protected]> --- Changelog | 1 + configure | 6 + doc/filters.texi | 78 +++++ libavfilter/Makefile | 3 + libavfilter/allfilters.c | 3 + libavfilter/version.h | 2 +- libavfilter/vf_stack_cuda.c | 619 +++++++++++++++++++++++++++++++++++ libavfilter/vf_stack_cuda.cu | 229 +++++++++++++ 8 files changed, 940 insertions(+), 1 deletion(-) create mode 100644 libavfilter/vf_stack_cuda.c create mode 100644 libavfilter/vf_stack_cuda.cu diff --git a/Changelog b/Changelog index ce49a5fff0..f3551c4faa 100644 --- a/Changelog +++ b/Changelog @@ -26,6 +26,7 @@ version <next>: - swscale Vulkan support - LCEVC metadata bitstream filter - Add vf_deinterlace_d3d12 filter +- hstack_cuda, vstack_cuda and xstack_cuda filters version 8.0: diff --git a/configure b/configure index 5ad2e6787d..ac9f223d4d 100755 --- a/configure +++ b/configure @@ -3518,6 +3518,12 @@ overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" pad_cuda_filter_deps="ffnvcodec" pad_cuda_filter_deps_any="cuda_nvcc cuda_llvm" sharpen_npp_filter_deps="ffnvcodec libnpp" +hstack_cuda_filter_deps="ffnvcodec" +hstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +vstack_cuda_filter_deps="ffnvcodec" +vstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +xstack_cuda_filter_deps="ffnvcodec" +xstack_cuda_filter_deps_any="cuda_nvcc cuda_llvm" ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO" gfxcapture_filter_deps="cxx17 threads d3d11va IGraphicsCaptureItemInterop __x_ABI_CWindows_CGraphics_CCapture_CIGraphicsCaptureSession3" diff --git a/doc/filters.texi b/doc/filters.texi index e49dd9ef0d..f5dbe27f3b 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -27349,6 +27349,84 @@ Only deinterlace frames marked as interlaced. The default value is @code{all}. @end table +@section hstack_cuda +Stack input videos horizontally. + +This is the CUDA variant of the @ref{hstack} filter, each input stream may +have different height, this filter will scale down/up each input stream while +keeping the original aspect ratio. + +It accepts the following options: + +@table @option +@item inputs +See @ref{hstack}. + +@item shortest +See @ref{hstack}. + +@item height +Set height of output. If set to 0, this filter will set height of output to +height of the first input stream. Default value is 0. +@end table + +@section vstack_cuda +Stack input videos vertically. + +This is the CUDA variant of the @ref{vstack} filter, each input stream may +have different width, this filter will scale down/up each input stream while +keeping the original aspect ratio. + +It accepts the following options: + +@table @option +@item inputs +See @ref{vstack}. + +@item shortest +See @ref{vstack}. + +@item width +Set width of output. If set to 0, this filter will set width of output to +width of the first input stream. Default value is 0. +@end table + +@section xstack_cuda +Stack video inputs into custom layout. + +This is the CUDA variant of the @ref{xstack} filter, each input stream may +have different size, this filter will scale down/up each input stream to the +given output size, or the size of the first input stream. + +It accepts the following options: + +@table @option +@item inputs +See @ref{xstack}. + +@item shortest +See @ref{xstack}. + +@item layout +See @ref{xstack}. +Moreover, this permits the user to supply output size for each input stream. +@example +xstack_cuda=inputs=4:layout=0_0_1920x1080|0_h0_1920x1080|w0_0_1920x1080|w0_h0_1920x1080 +@end example + +@item grid +See @ref{xstack}. + +@item grid_tile_size +Set output size for each input stream when @option{grid} is set. If this option +is not set, this filter will set output size by default to the size of the +first input stream. For the syntax of this option, check the +@ref{video size syntax,,"Video size" section in the ffmpeg-utils manual,ffmpeg-utils}. + +@item fill +See @ref{xstack}. +@end table + @anchor{CUDA NPP} @section CUDA NPP Below is a description of the currently available NVIDIA Performance Primitives (libnpp) video filters. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index a530cfae29..eb0af76457 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -596,6 +596,9 @@ OBJS-$(CONFIG_YAEPBLUR_FILTER) += vf_yaepblur.o OBJS-$(CONFIG_ZMQ_FILTER) += f_zmq.o OBJS-$(CONFIG_ZOOMPAN_FILTER) += vf_zoompan.o OBJS-$(CONFIG_ZSCALE_FILTER) += vf_zscale.o +OBJS-$(CONFIG_HSTACK_CUDA_FILTER) += vf_stack_cuda.o framesync.o vf_stack_cuda.ptx.o cuda/load_helper.o +OBJS-$(CONFIG_VSTACK_CUDA_FILTER) += vf_stack_cuda.o framesync.o vf_stack_cuda.ptx.o cuda/load_helper.o +OBJS-$(CONFIG_XSTACK_CUDA_FILTER) += vf_stack_cuda.o framesync.o vf_stack_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_HSTACK_VAAPI_FILTER) += vf_stack_vaapi.o framesync.o vaapi_vpp.o OBJS-$(CONFIG_VSTACK_VAAPI_FILTER) += vf_stack_vaapi.o framesync.o vaapi_vpp.o OBJS-$(CONFIG_XSTACK_VAAPI_FILTER) += vf_stack_vaapi.o framesync.o vaapi_vpp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index e26859e159..58750aa05d 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -557,6 +557,9 @@ extern const FFFilter ff_vf_yaepblur; extern const FFFilter ff_vf_zmq; extern const FFFilter ff_vf_zoompan; extern const FFFilter ff_vf_zscale; +extern const FFFilter ff_vf_hstack_cuda; +extern const FFFilter ff_vf_vstack_cuda; +extern const FFFilter ff_vf_xstack_cuda; extern const FFFilter ff_vf_hstack_vaapi; extern const FFFilter ff_vf_vstack_vaapi; extern const FFFilter ff_vf_xstack_vaapi; diff --git a/libavfilter/version.h b/libavfilter/version.h index 537df129cd..7642b670d1 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #include "version_major.h" -#define LIBAVFILTER_VERSION_MINOR 13 +#define LIBAVFILTER_VERSION_MINOR 14 #define LIBAVFILTER_VERSION_MICRO 100 diff --git a/libavfilter/vf_stack_cuda.c b/libavfilter/vf_stack_cuda.c new file mode 100644 index 0000000000..e74e9e97d2 --- /dev/null +++ b/libavfilter/vf_stack_cuda.c @@ -0,0 +1,619 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Hardware accelerated hstack, vstack and xstack filters based on CUDA + */ + +#include "config_components.h" + +#include "libavutil/opt.h" +#include "libavutil/common.h" +#include "libavutil/pixdesc.h" +#include "libavutil/eval.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/avstring.h" +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "libavutil/mathematics.h" +#include "libavutil/parseutils.h" +#include "libavutil/colorspace.h" +#include "libavutil/mem.h" + +#include "filters.h" +#include "formats.h" +#include "video.h" + +#include "framesync.h" +#include "cuda/load_helper.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, + AV_PIX_FMT_YUV444P16, + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct CUDAStackContext { + AVCUDADeviceContext *hwctx; + CudaFunctions *cuda_dl; + + CUcontext cu_ctx; + CUmodule cu_module; + CUstream cu_stream; + + CUfunction cu_func_copy; + CUfunction cu_func_copy_uv; + + CUfunction cu_func_color; + CUfunction cu_func_color_uv; + + enum AVPixelFormat in_fmt; + const AVPixFmtDescriptor *in_desc; + int in_planes; + int in_plane_depths[4]; + int in_plane_channels[4]; + + int fillcolor_yuv[4]; +} CUDAStackContext; + +#define HSTACK_NAME "hstack_cuda" +#define VSTACK_NAME "vstack_cuda" +#define XSTACK_NAME "xstack_cuda" +#define HWContext CUDAStackContext +#define StackHWContext StackCudaContext +#include "stack_internal.h" + +typedef struct StackCudaContext { + StackBaseContext base; + CUDAStackContext cuda; +} StackCudaContext; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static void rgb2yuv(float r, float g, float b, int *y, int *u, int *v, int depth) +{ + *y = ((0.21260*219.0/255.0) * r + (0.71520*219.0/255.0) * g + + (0.07220*219.0/255.0) * b) * ((1 << depth) - 1); + *u = (-(0.11457*224.0/255.0) * r - (0.38543*224.0/255.0) * g + + (0.50000*224.0/255.0) * b + 0.5) * ((1 << depth) - 1); + *v = ((0.50000*224.0/255.0) * r - (0.45415*224.0/255.0) * g - + (0.04585*224.0/255.0) * b + 0.5) * ((1 << depth) - 1); +} + +static int conv_8to16(int val, int mask) +{ + return (val | (val << 8)) & mask; +} + +static void get_func_name(char *buf, size_t buf_size, + const char *prefix, int depth, int channels) +{ + const char *suffix; + + if (channels == 4 && depth <= 8) + suffix = "uchar4"; + else if (channels == 2 && depth <= 8) + suffix = "uchar2"; + else if (channels == 2 && depth > 8) + suffix = "ushort2"; + else if (depth > 8) + suffix = "ushort"; + else + suffix = "uchar"; + + snprintf(buf, buf_size, "%s_%s", prefix, suffix); +} + +static av_cold int cuda_stack_load_functions(AVFilterContext *ctx) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->cuda_dl; + int ret; + char buf[128]; + + extern const unsigned char ff_vf_stack_cuda_ptx_data[]; + extern const unsigned int ff_vf_stack_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_stack_cuda_ptx_data, ff_vf_stack_cuda_ptx_len); + if (ret < 0) + goto fail; + + get_func_name(buf, sizeof(buf), "StackCopy", + s->in_plane_depths[0], s->in_plane_channels[0]); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy, s->cu_module, buf)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load copy function: %s\n", buf); + ret = AVERROR(ENOSYS); + goto fail; + } + + get_func_name(buf, sizeof(buf), "SetColor", + s->in_plane_depths[0], s->in_plane_channels[0]); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color, s->cu_module, buf)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load color function: %s\n", buf); + ret = AVERROR(ENOSYS); + goto fail; + } + + if (s->in_planes > 1) { + if (s->in_plane_channels[1] > 1) { + get_func_name(buf, sizeof(buf), "StackCopy", + s->in_plane_depths[1], s->in_plane_channels[1]); + } else { + get_func_name(buf, sizeof(buf), "StackCopy", + s->in_plane_depths[1], s->in_plane_channels[1]); + av_strlcat(buf, "_uv", sizeof(buf)); + } + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_copy_uv, s->cu_module, buf)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load copy UV function: %s\n", buf); + ret = AVERROR(ENOSYS); + goto fail; + } + + if (s->in_plane_channels[1] > 1) { + get_func_name(buf, sizeof(buf), "SetColor", + s->in_plane_depths[1], s->in_plane_channels[1]); + } else { + get_func_name(buf, sizeof(buf), "SetColor", + s->in_plane_depths[1], s->in_plane_channels[1]); + av_strlcat(buf, "_uv", sizeof(buf)); + } + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_color_uv, s->cu_module, buf)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load color UV function: %s\n", buf); + ret = AVERROR(ENOSYS); + goto fail; + } + } + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + return ret; +} + +static int cuda_stack_color_kernel(AVFilterContext *ctx, CUfunction func, + AVFrame *out_frame, const int *color, + int width, int height, + int dst_x, int dst_y, + int dst_width, int dst_height, int dst_pitch) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + CudaFunctions *cu = s->cuda_dl; + + CUdeviceptr dst_devptr[4] = { + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], + (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3] + }; + + void *args[] = { + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3], + &width, &height, &dst_pitch, + &dst_x, &dst_y, + (void *)&color[0], (void *)&color[1], (void *)&color[2], (void *)&color[3], + &dst_width, &dst_height, + }; + + return CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); +} + +static int cuda_stack_copy_kernel(AVFilterContext *ctx, CUfunction func, + CUtexObject src_tex[4], + AVFrame *out_frame, + int width, int height, + int dst_x, int dst_y, int dst_pitch, + int src_width, int src_height) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + CudaFunctions *cu = s->cuda_dl; + + CUdeviceptr dst_devptr[4] = { + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], + (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3] + }; + + void *args[] = { + &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3], + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3], + &width, &height, &dst_pitch, + &dst_x, &dst_y, + &src_width, &src_height, + &out_frame->width, &out_frame->height + }; + + return CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, + 0, s->cu_stream, args, NULL)); +} + +static int cuda_stack_color_op(AVFilterContext *ctx, StackItemRegion *region, + AVFrame *out, const int *color) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + CudaFunctions *cu = s->cuda_dl; + int ret = 0; + CUcontext dummy; + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + ret = cuda_stack_color_kernel(ctx, s->cu_func_color, + out, color, region->width, region->height, + region->x, region->y, + out->width, out->height, + out->linesize[0]); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Error during color operation: %d\n", ret); + goto fail; + } + + if (s->in_planes > 1) { + ret = cuda_stack_color_kernel(ctx, s->cu_func_color_uv, + out, color, + AV_CEIL_RSHIFT(region->width, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(region->height, s->in_desc->log2_chroma_h), + AV_CEIL_RSHIFT(region->x, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(region->y, s->in_desc->log2_chroma_h), + out->width, out->height, + out->linesize[1]); + if (ret < 0) + av_log(ctx, AV_LOG_ERROR, "Error during color UV operation: %d\n", ret); + } + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + return ret; +} + +static int cuda_stack_copy_op(AVFilterContext *ctx, StackItemRegion *region, + AVFrame *in, AVFrame *out) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + CudaFunctions *cu = s->cuda_dl; + CUtexObject tex[4] = { 0, 0, 0, 0 }; + int ret = 0; + int i; + CUcontext dummy; + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + for (i = 0; i < s->in_planes; i++) { + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = s->in_plane_depths[i] <= 8 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = s->in_plane_channels[i], + .res.pitch2D.pitchInBytes = in->linesize[i], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], + }; + + if (i == 1 || i == 2) { + res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); + res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); + } else { + res_desc.res.pitch2D.width = in->width; + res_desc.res.pitch2D.height = in->height; + } + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto fail; + } + + ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy, + tex, out, region->width, region->height, + region->x, region->y, out->linesize[0], + in->width, in->height); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Error during copy operation: %d\n", ret); + goto fail; + } + + if (s->in_planes > 1) { + ret = cuda_stack_copy_kernel(ctx, s->cu_func_copy_uv, tex, out, + AV_CEIL_RSHIFT(region->width, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(region->height, s->in_desc->log2_chroma_h), + AV_CEIL_RSHIFT(region->x, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(region->y, s->in_desc->log2_chroma_h), + out->linesize[1], + AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h)); + if (ret < 0) + av_log(ctx, AV_LOG_ERROR, "Error during copy UV operation: %d\n", ret); + } + +fail: + for (i = 0; i < FF_ARRAY_ELEMS(tex); i++) + if (tex[i]) + CHECK_CU(cu->cuTexObjectDestroy(tex[i])); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + return ret; +} + +static int process_frame(FFFrameSync *fs) +{ + AVFilterContext *ctx = fs->parent; + StackCudaContext *sctx = fs->opaque; + CUDAStackContext *s = &sctx->cuda; + AVFilterLink *outlink = ctx->outputs[0]; + AVFrame *out_frame = NULL; + AVFrame *in_frame = NULL; + int ret = 0; + + out_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!out_frame) + return AVERROR(ENOMEM); + + if (sctx->base.fillcolor_enable) { + StackItemRegion full_region = { + .x = 0, + .y = 0, + .width = outlink->w, + .height = outlink->h + }; + + ret = cuda_stack_color_op(ctx, &full_region, out_frame, s->fillcolor_yuv); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Failed to fill background color\n"); + goto fail; + } + } + + for (int i = 0; i < ctx->nb_inputs; i++) { + ret = ff_framesync_get_frame(fs, i, &in_frame, 0); + if (ret) + goto fail; + + if (!i) { + ret = av_frame_copy_props(out_frame, in_frame); + if (ret < 0) + goto fail; + } + + ret = cuda_stack_copy_op(ctx, &sctx->base.regions[i], in_frame, out_frame); + if (ret < 0) + goto fail; + } + + out_frame->pts = av_rescale_q(sctx->base.fs.pts, sctx->base.fs.time_base, outlink->time_base); + out_frame->sample_aspect_ratio = outlink->sample_aspect_ratio; + + return ff_filter_frame(outlink, out_frame); + +fail: + av_frame_free(&out_frame); + return ret; +} + +static int config_output(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + AVFilterLink *inlink0 = ctx->inputs[0]; + FilterLink *inl0 = ff_filter_link(inlink0); + FilterLink *outl = ff_filter_link(outlink); + enum AVPixelFormat in_format; + int ret; + AVHWFramesContext *in_frames_ctx; + AVBufferRef *hw_frames_ctx; + AVHWFramesContext *out_frames_ctx; + + if (inlink0->format != AV_PIX_FMT_CUDA || !inl0->hw_frames_ctx || !inl0->hw_frames_ctx->data) { + av_log(ctx, AV_LOG_ERROR, "Software pixel format is not supported.\n"); + return AVERROR(EINVAL); + } + + in_frames_ctx = (AVHWFramesContext*)inl0->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->in_planes = av_pix_fmt_count_planes(s->in_fmt); + + for (int i = 0; i < s->in_desc->nb_components; i++) { + int d = (s->in_desc->comp[i].depth + 7) / 8; + int p = s->in_desc->comp[i].plane; + s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], s->in_desc->comp[i].step / d); + s->in_plane_depths[p] = s->in_desc->comp[i].depth; + } + + s->hwctx = in_frames_ctx->device_ctx->hwctx; + s->cuda_dl = s->hwctx->internal->cuda_dl; + s->cu_stream = s->hwctx->stream; + + for (int i = 1; i < sctx->base.nb_inputs; i++) { + AVFilterLink *inlink = ctx->inputs[i]; + FilterLink *inl = ff_filter_link(inlink); + AVHWFramesContext *hwfc = NULL; + + if (inlink->format != AV_PIX_FMT_CUDA || !inl->hw_frames_ctx || !inl->hw_frames_ctx->data) { + av_log(ctx, AV_LOG_ERROR, "Software pixel format is not supported.\n"); + return AVERROR(EINVAL); + } + + hwfc = (AVHWFramesContext *)inl->hw_frames_ctx->data; + + if (in_frames_ctx->sw_format != hwfc->sw_format) { + av_log(ctx, AV_LOG_ERROR, "All inputs should have the same underlying software pixel format.\n"); + return AVERROR(EINVAL); + } + } + + if (sctx->base.fillcolor_enable) { + if (s->in_desc->flags & AV_PIX_FMT_FLAG_RGB) { + s->fillcolor_yuv[0] = sctx->base.fillcolor[0]; + s->fillcolor_yuv[1] = sctx->base.fillcolor[1]; + s->fillcolor_yuv[2] = sctx->base.fillcolor[2]; + s->fillcolor_yuv[3] = sctx->base.fillcolor[3]; + } else { + int Y, U, V; + + rgb2yuv(sctx->base.fillcolor[0] / 255.0, sctx->base.fillcolor[1] / 255.0, + sctx->base.fillcolor[2] / 255.0, &Y, &U, &V, 8); + + if (s->in_plane_depths[0] > 8) { + int mask = (s->in_plane_depths[0] <= 10) ? 0xFFC0 : 0xFFFF; + s->fillcolor_yuv[0] = conv_8to16(Y, mask); + s->fillcolor_yuv[1] = conv_8to16(U, mask); + s->fillcolor_yuv[2] = conv_8to16(V, mask); + } else { + s->fillcolor_yuv[0] = Y; + s->fillcolor_yuv[1] = U; + s->fillcolor_yuv[2] = V; + } + s->fillcolor_yuv[3] = sctx->base.fillcolor[3]; + } + } + + ret = config_comm_output(outlink); + if (ret < 0) + return ret; + + ret = cuda_stack_load_functions(ctx); + if (ret < 0) + return ret; + + hw_frames_ctx = av_hwframe_ctx_alloc(in_frames_ctx->device_ref); + if (!hw_frames_ctx) + return AVERROR(ENOMEM); + + out_frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data; + out_frames_ctx->format = AV_PIX_FMT_CUDA; + out_frames_ctx->sw_format = in_format; + out_frames_ctx->width = outlink->w; + out_frames_ctx->height = outlink->h; + + ret = av_hwframe_ctx_init(hw_frames_ctx); + if (ret < 0) { + av_buffer_unref(&hw_frames_ctx); + return ret; + } + + av_buffer_unref(&outl->hw_frames_ctx); + outl->hw_frames_ctx = hw_frames_ctx; + + return 0; +} + +static av_cold int cuda_stack_init(AVFilterContext *ctx) +{ + return stack_init(ctx); +} + +static av_cold void cuda_stack_uninit(AVFilterContext *ctx) +{ + StackCudaContext *sctx = ctx->priv; + CUDAStackContext *s = &sctx->cuda; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + stack_uninit(ctx); +} + +static const enum AVPixelFormat cuda_stack_pix_fmts[] = { + AV_PIX_FMT_CUDA, + AV_PIX_FMT_NONE, +}; + +#include "stack_internal.c" + +#if CONFIG_HSTACK_CUDA_FILTER + +DEFINE_HSTACK_OPTIONS(cuda); +DEFINE_STACK_FILTER(hstack, cuda, "CUDA", 0); + +#endif + +#if CONFIG_VSTACK_CUDA_FILTER + +DEFINE_VSTACK_OPTIONS(cuda); +DEFINE_STACK_FILTER(vstack, cuda, "CUDA", 0); + +#endif + +#if CONFIG_XSTACK_CUDA_FILTER + +DEFINE_XSTACK_OPTIONS(cuda); +DEFINE_STACK_FILTER(xstack, cuda, "CUDA", 0); + +#endif diff --git a/libavfilter/vf_stack_cuda.cu b/libavfilter/vf_stack_cuda.cu new file mode 100644 index 0000000000..c2f4bc70b9 --- /dev/null +++ b/libavfilter/vf_stack_cuda.cu @@ -0,0 +1,229 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "cuda/vector_helpers.cuh" + +#define FIXED_PITCH(T) \ + (dst_pitch / sizeof(T)) + +#define OFFSET_DST(n, T) \ + dst_##n[(target_y) * FIXED_PITCH(T) + (target_x)] + +#define BOUNDS_CHECK() \ + int xo = blockIdx.x * blockDim.x + threadIdx.x; \ + int yo = blockIdx.y * blockDim.y + threadIdx.y; \ + if (xo >= width || yo >= height) \ + return; \ + int target_x = xo + dst_x; \ + int target_y = yo + dst_y; \ + if (target_x < 0 || target_y < 0 || \ + target_x >= frame_width || target_y >= frame_height) \ + return; + +#define COPY_SCALE() \ + float hscale = (float)src_width / (float)width; \ + float vscale = (float)src_height / (float)height; \ + float xi = (xo + 0.5f) * hscale; \ + float yi = (yo + 0.5f) * vscale; + +extern "C" { + +// --- COLOR KERNELS --- + +__global__ void SetColor_uchar( + uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(0, uchar) = (uchar)c0; +} + +__global__ void SetColor_ushort( + ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(0, ushort) = (ushort)c0; +} + +__global__ void SetColor_uchar4( + uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(0, uchar4) = make_uchar4(c0, c1, c2, c3); +} + +__global__ void SetColor_uchar_uv( + uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(1, uchar) = (uchar)c1; + OFFSET_DST(2, uchar) = (uchar)c2; +} + +__global__ void SetColor_ushort_uv( + ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(1, ushort) = (ushort)c1; + OFFSET_DST(2, ushort) = (ushort)c2; +} + +__global__ void SetColor_uchar2( + uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(1, uchar2) = make_uchar2(c1, c2); +} + +__global__ void SetColor_ushort2( + ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int c0, int c1, int c2, int c3, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + OFFSET_DST(1, ushort2) = make_ushort2(c1, c2); +} + +// --- COPY KERNELS --- + +__global__ void StackCopy_uchar( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(0, uchar) = tex2D<uchar>(src_tex_0, xi, yi); +} + +__global__ void StackCopy_ushort( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(0, ushort) = tex2D<ushort>(src_tex_0, xi, yi); +} + +__global__ void StackCopy_uchar4( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + uchar4 *dst_0, uchar4 *dst_1, uchar4 *dst_2, uchar4 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(0, uchar4) = tex2D<uchar4>(src_tex_0, xi, yi); +} + +__global__ void StackCopy_uchar_uv( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + uchar *dst_0, uchar *dst_1, uchar *dst_2, uchar *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(1, uchar) = tex2D<uchar>(src_tex_1, xi, yi); + OFFSET_DST(2, uchar) = tex2D<uchar>(src_tex_2, xi, yi); +} + +__global__ void StackCopy_ushort_uv( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + ushort *dst_0, ushort *dst_1, ushort *dst_2, ushort *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(1, ushort) = tex2D<ushort>(src_tex_1, xi, yi); + OFFSET_DST(2, ushort) = tex2D<ushort>(src_tex_2, xi, yi); +} + +__global__ void StackCopy_uchar2( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + uchar2 *dst_0, uchar2 *dst_1, uchar2 *dst_2, uchar2 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(1, uchar2) = tex2D<uchar2>(src_tex_1, xi, yi); +} + +__global__ void StackCopy_ushort2( + cudaTextureObject_t src_tex_0, cudaTextureObject_t src_tex_1, + cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, + ushort2 *dst_0, ushort2 *dst_1, ushort2 *dst_2, ushort2 *dst_3, + int width, int height, int dst_pitch, + int dst_x, int dst_y, + int src_width, int src_height, + int frame_width, int frame_height) +{ + BOUNDS_CHECK(); + COPY_SCALE(); + OFFSET_DST(1, ushort2) = tex2D<ushort2>(src_tex_1, xi, yi); +} + +} -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
