PR #22340 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22340
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22340.patch


>From 6d2e894f8d8c31d9eb4935d0e7075dcd25cd3ce5 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Mon, 2 Mar 2026 00:40:10 +0800
Subject: [PATCH 1/3] avfilter/vf_scale_cuda: add YUV to RGB color space
 conversion

When the input colorspace is unspecified, BT.601 (SMPTE 170M) is used
as the default, matching swscale behavior. Both limited and full range
inputs are handled correctly.

The conversion kernels use direct texture reads for luma and simple
coordinate mapping for chroma upsampling, avoiding unnecessary
Subsample_Nearest overhead since color conversion operates at the same
resolution. Simultaneous resize and color conversion is rejected
with an error directing users to chain separate scale_cuda instances.
---
 libavfilter/vf_scale_cuda.c  | 212 +++++++++++++++++++++++++++--------
 libavfilter/vf_scale_cuda.cu | 165 +++++++++++++++++++++++++++
 libavfilter/vf_scale_cuda.h  |   7 ++
 3 files changed, 336 insertions(+), 48 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 5fd757161b..2a4313cae1 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -21,9 +21,11 @@
 */
 
 #include <float.h>
+#include <stdbool.h>
 #include <stdio.h>
 
 #include "libavutil/common.h"
+#include "libavutil/csp.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_cuda_internal.h"
 #include "libavutil/cuda_check.h"
@@ -32,6 +34,7 @@
 #include "libavutil/pixdesc.h"
 
 #include "avfilter.h"
+#include "colorspace.h"
 #include "filters.h"
 #include "scale_eval.h"
 #include "video.h"
@@ -121,6 +124,11 @@ typedef struct CUDAScaleContext {
     int interp_as_integer;
 
     float param;
+
+    bool yuv2rgb;
+    enum AVColorSpace  colorspace;   ///< resolved colorspace for conversion
+    enum AVColorRange  color_range;  ///< resolved color range for conversion
+    CUDAScaleColorMatrix color_matrix;
 } CUDAScaleContext;
 
 static av_cold int cudascale_init(AVFilterContext *ctx)
@@ -210,6 +218,38 @@ static const char* get_format_name(enum AVPixelFormat fmt)
     return NULL;
 }
 
+static int compute_yuv2rgb_matrix(void *log_ctx, CUDAScaleColorMatrix *mat,
+                                  enum AVColorSpace colorspace,
+                                  bool limited_range)
+{
+    const AVLumaCoefficients *coeffs = 
av_csp_luma_coeffs_from_avcsp(colorspace);
+    float y_scale = 1.0f;
+    float uv_scale = 1.0f;
+    double rgb2yuv[3][3], yuv2rgb[3][3];
+
+    if (!coeffs) {
+        av_log(log_ctx, AV_LOG_ERROR,
+               "Unsupported colorspace %d for YUV->RGB conversion\n", 
colorspace);
+        return AVERROR(EINVAL);
+    }
+
+    ff_fill_rgb2yuv_table(coeffs, rgb2yuv);
+    ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
+
+    if (limited_range) {
+        y_scale  = 255.0f / (235.0f - 16.0f);
+        uv_scale = 255.0f / (240.0f - 16.0f);
+    }
+
+    for (int i = 0; i < 3; i++) {
+        mat->m[i][0] = (float)yuv2rgb[i][0] * y_scale;
+        mat->m[i][1] = (float)yuv2rgb[i][1] * uv_scale;
+        mat->m[i][2] = (float)yuv2rgb[i][2] * uv_scale;
+    }
+
+    return 0;
+}
+
 static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat 
in_format, enum AVPixelFormat out_format)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -271,6 +311,42 @@ static av_cold int init_processing_chain(AVFilterContext 
*ctx, int in_width, int
 
     set_format_info(ctx, in_format, out_format);
 
+    s->yuv2rgb = !(s->in_desc->flags & AV_PIX_FMT_FLAG_RGB) &&
+                  (s->out_desc->flags & AV_PIX_FMT_FLAG_RGB);
+    if (s->yuv2rgb) {
+        AVFilterLink *inlink = ctx->inputs[0];
+
+        if (in_width != out_width || in_height != out_height) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Resizing is not supported during YUV->RGB conversion, "
+                   "use a separate scale_cuda instance for resizing\n");
+            return AVERROR(ENOSYS);
+        }
+
+        enum AVColorSpace cs = inlink->colorspace;
+        bool limited_range;
+
+        /* When colorspace is unspecified, default to BT.601 (SMPTE 170M)
+         * to match swscale behavior (SWS_CS_DEFAULT = ITU601).
+         */
+        if (cs == AVCOL_SPC_UNSPECIFIED)
+            cs = AVCOL_SPC_SMPTE170M;
+
+        limited_range = (inlink->color_range != AVCOL_RANGE_JPEG);
+
+        ret = compute_yuv2rgb_matrix(ctx, &s->color_matrix, cs, limited_range);
+        if (ret < 0)
+            return ret;
+
+        s->colorspace  = cs;
+        s->color_range = limited_range ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+
+        av_log(ctx, AV_LOG_VERBOSE,
+               "YUV->RGB conversion enabled (%s, %s range)\n",
+               av_color_space_name(cs),
+               limited_range ? "limited" : "full");
+    }
+
     if (s->passthrough && in_width == out_width && in_height == out_height && 
in_format == out_format) {
         s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
         if (!s->frames_ctx)
@@ -310,31 +386,41 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
     extern const unsigned char ff_vf_scale_cuda_ptx_data[];
     extern const unsigned int ff_vf_scale_cuda_ptx_len;
 
-    switch(s->interp_algo) {
-    case INTERP_ALGO_NEAREST:
+    if (s->yuv2rgb) {
+        /* YUV->RGB only supports Nearest for now */
         function_infix = "Nearest";
         s->interp_use_linear = 0;
         s->interp_as_integer = 1;
-        break;
-    case INTERP_ALGO_BILINEAR:
-        function_infix = "Bilinear";
-        s->interp_use_linear = 1;
-        s->interp_as_integer = 1;
-        break;
-    case INTERP_ALGO_DEFAULT:
-    case INTERP_ALGO_BICUBIC:
-        function_infix = "Bicubic";
-        s->interp_use_linear = 0;
-        s->interp_as_integer = 0;
-        break;
-    case INTERP_ALGO_LANCZOS:
-        function_infix = "Lanczos";
-        s->interp_use_linear = 0;
-        s->interp_as_integer = 0;
-        break;
-    default:
-        av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n");
-        return AVERROR_BUG;
+        if (s->interp_algo != INTERP_ALGO_DEFAULT && s->interp_algo != 
INTERP_ALGO_NEAREST)
+            av_log(ctx, AV_LOG_WARNING,
+                   "YUV->RGB conversion only supports nearest interpolation, 
ignoring interp_algo\n");
+    } else {
+        switch(s->interp_algo) {
+            case INTERP_ALGO_NEAREST:
+                function_infix = "Nearest";
+                s->interp_use_linear = 0;
+                s->interp_as_integer = 1;
+                break;
+            case INTERP_ALGO_BILINEAR:
+                function_infix = "Bilinear";
+                s->interp_use_linear = 1;
+                s->interp_as_integer = 1;
+                break;
+            case INTERP_ALGO_DEFAULT:
+            case INTERP_ALGO_BICUBIC:
+                function_infix = "Bicubic";
+                s->interp_use_linear = 0;
+                s->interp_as_integer = 0;
+                break;
+            case INTERP_ALGO_LANCZOS:
+                function_infix = "Lanczos";
+                s->interp_use_linear = 0;
+                s->interp_as_integer = 0;
+                break;
+            default:
+                av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n");
+                return AVERROR_BUG;
+        }
     }
 
     ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
@@ -353,13 +439,18 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
         ret = AVERROR(ENOSYS);
         goto fail;
     }
-    av_log(ctx, AV_LOG_DEBUG, "Luma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
+    av_log(ctx, AV_LOG_DEBUG, "Main filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
 
-    snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, 
in_fmt_name, out_fmt_name);
-    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, buf));
-    if (ret < 0)
-        goto fail;
-    av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
+    if (s->yuv2rgb) {
+        s->cu_func_uv = NULL;
+        av_log(ctx, AV_LOG_DEBUG, "YUV->RGB mode: no separate chroma 
kernel\n");
+    } else {
+        snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, 
in_fmt_name, out_fmt_name);
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, 
buf));
+        if (ret < 0)
+            goto fail;
+        av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
+    }
 
 fail:
     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
@@ -457,9 +548,15 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
         .src_width = src_width,
         .src_height = src_height,
         .param = s->param,
-        .mpeg_range = mpeg_range
+        .mpeg_range = mpeg_range,
+        .color_matrix = s->color_matrix,
     };
 
+    if (s->yuv2rgb) {
+        params.log2_chroma_w = s->in_desc->log2_chroma_w;
+        params.log2_chroma_h = s->in_desc->log2_chroma_h;
+    }
+
     void *args[] = { &params };
 
     return CHECK_CU(cu->cuLaunchKernel(func,
@@ -474,7 +571,12 @@ static int scalecuda_resize(AVFilterContext *ctx,
     CudaFunctions *cu = s->hwctx->internal->cuda_dl;
     CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
     int i, ret;
-    int mpeg_range = in->color_range != AVCOL_RANGE_JPEG;
+
+    /* Color matrix was computed at config time; use the resolved color_range
+     * for yuv2rgb so mpeg_range matches the matrix. For plain scaling,
+     * read from the input frame. */
+    int mpeg_range = s->yuv2rgb ? (s->color_range != AVCOL_RANGE_JPEG)
+                                : (in->color_range != AVCOL_RANGE_JPEG);
 
     CUtexObject tex[4] = { 0, 0, 0, 0 };
 
@@ -516,26 +618,35 @@ static int scalecuda_resize(AVFilterContext *ctx,
             goto exit;
     }
 
-    // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
-    ret = call_resize_kernel(ctx, s->cu_func,
-                             tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
-                             out, out->width, out->height, out->linesize[0], 
mpeg_range);
-    if (ret < 0)
-        goto exit;
-
-    if (s->out_planes > 1) {
-        // scale UV plane. Scale function sets both U and V plane, or singular 
interleaved plane.
-        ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
-                                 AV_CEIL_RSHIFT(in->crop_left, 
s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(in->crop_top, 
s->in_desc->log2_chroma_h),
-                                 AV_CEIL_RSHIFT(crop_width, 
s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(crop_height, 
s->in_desc->log2_chroma_h),
-                                 out,
-                                 AV_CEIL_RSHIFT(out->width, 
s->out_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(out->height, 
s->out_desc->log2_chroma_h),
-                                 out->linesize[1], mpeg_range);
+    if (s->yuv2rgb) {
+        // YUV->RGB: single kernel call handles everything
+        ret = call_resize_kernel(ctx, s->cu_func,
+                                 tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
+                                 out, out->width, out->height, 
out->linesize[0], mpeg_range);
         if (ret < 0)
             goto exit;
+    } else {
+        // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
+        ret = call_resize_kernel(ctx, s->cu_func,
+                                 tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
+                                 out, out->width, out->height, 
out->linesize[0], mpeg_range);
+        if (ret < 0)
+            goto exit;
+
+        if (s->out_planes > 1) {
+            // scale UV plane. Scale function sets both U and V plane, or 
singular interleaved plane.
+            ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
+                                     AV_CEIL_RSHIFT(in->crop_left, 
s->in_desc->log2_chroma_w),
+                                     AV_CEIL_RSHIFT(in->crop_top, 
s->in_desc->log2_chroma_h),
+                                     AV_CEIL_RSHIFT(crop_width, 
s->in_desc->log2_chroma_w),
+                                     AV_CEIL_RSHIFT(crop_height, 
s->in_desc->log2_chroma_h),
+                                     out,
+                                     AV_CEIL_RSHIFT(out->width, 
s->out_desc->log2_chroma_w),
+                                     AV_CEIL_RSHIFT(out->height, 
s->out_desc->log2_chroma_h),
+                                     out->linesize[1], mpeg_range);
+            if (ret < 0)
+                goto exit;
+        }
     }
 
 exit:
@@ -574,6 +685,11 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     if (ret < 0)
         return ret;
 
+    if (s->yuv2rgb) {
+        out->colorspace  = AVCOL_SPC_RGB;
+        out->color_range = AVCOL_RANGE_JPEG;
+    }
+
     if (out->width != in->width || out->height != in->height) {
         av_frame_side_data_remove_by_props(&out->side_data, &out->nb_side_data,
                                            AV_SIDE_DATA_PROP_SIZE_DEPENDENT);
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index d674c0885a..0d40681461 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -1105,6 +1105,128 @@ __device__ static inline T 
Subsample_Nearest(cudaTextureObject_t tex,
     return tex2D<T>(tex, xi, yi);
 }
 
+__device__ static inline float clamp_f(float x, float lo, float hi)
+{
+    return x < lo ? lo : (x > hi ? hi : x);
+}
+
+// YUV to RGB CONVERSION
+//      R = m[0][0]*(Y-Yoff) + m[0][1]*(U-128) + m[0][2]*(V-128)
+//      G = m[1][0]*(Y-Yoff) + m[1][1]*(U-128) + m[1][2]*(V-128)
+//      B = m[2][0]*(Y-Yoff) + m[2][1]*(U-128) + m[2][2]*(V-128)
+// channel_order:
+//      0 = BGRA (B,G,R,A), 1 = RGBA (R,G,B,A),
+//      2 = BGR0 (B,G,R,0), 3 = RGB0 (R,G,B,0)
+__device__ static inline uchar4 yuv8_to_rgba_generic(
+    float fy, float fu, float fv,
+    const CUDAScaleColorMatrix &mat,
+    int channel_order)
+{
+    float r = mat.m[0][0] * fy + mat.m[0][1] * fu + mat.m[0][2] * fv;
+    float g = mat.m[1][0] * fy + mat.m[1][1] * fu + mat.m[1][2] * fv;
+    float b = mat.m[2][0] * fy + mat.m[2][1] * fu + mat.m[2][2] * fv;
+
+    uchar rc = (uchar)clamp_f(r, 0.0f, 255.0f);
+    uchar gc = (uchar)clamp_f(g, 0.0f, 255.0f);
+    uchar bc = (uchar)clamp_f(b, 0.0f, 255.0f);
+
+    switch (channel_order) {
+    case 0: return make_uchar4(bc, gc, rc, 0xFF); // BGRA
+    case 1: return make_uchar4(rc, gc, bc, 0xFF); // RGBA
+    case 2: return make_uchar4(bc, gc, rc, 0x00); // BGR0
+    case 3: return make_uchar4(rc, gc, bc, 0x00); // RGB0
+    default: return make_uchar4(bc, gc, rc, 0xFF);
+    }
+}
+
+// YUV to RGB kernel for semiplanar input (NV12, P010, NV16, P210)
+// No resize: src and dst have identical dimensions.
+template<typename in_y_T, typename in_uv_T, int channel_order>
+__device__ void YuvToRgb_semiplanar(CUDAScaleKernelParams params)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    if (yo >= params.dst_height || xo >= params.dst_width) return;
+
+    // Direct 1:1 Y plane read
+    in_y_T y_raw = tex2D<in_y_T>(
+        params.src_tex[0],
+        xo + params.src_left + 0.5f,
+        yo + params.src_top  + 0.5f);
+
+    // Chroma: map luma coords to chroma coords for subsampled formats
+    float cx = ((float)xo + 0.5f) / (float)(1 << params.log2_chroma_w)
+             + (float)(params.src_left >> params.log2_chroma_w);
+    float cy = ((float)yo + 0.5f) / (float)(1 << params.log2_chroma_h)
+             + (float)(params.src_top  >> params.log2_chroma_h);
+    in_uv_T uv_raw = tex2D<in_uv_T>(params.src_tex[1], cx, cy);
+
+    float fy, fu, fv;
+    if (sizeof(in_y_T) == 1) {
+        fy = (float)(int)y_raw;
+        fu = (float)(int)uv_raw.x;
+        fv = (float)(int)uv_raw.y;
+    } else {
+        // 10/16-bit semiplanar: normalize to 8-bit range by dividing by 256
+        fy = (float)(int)y_raw / 256.0f;
+        fu = (float)(int)uv_raw.x / 256.0f;
+        fv = (float)(int)uv_raw.y / 256.0f;
+    }
+
+    if (params.mpeg_range) fy -= 16.0f;
+    fu -= 128.0f;
+    fv -= 128.0f;
+
+    uchar4 *dst = (uchar4*)params.dst[0];
+    int dst_pitch = params.dst_pitch / sizeof(uchar4);
+    dst[yo * dst_pitch + xo] = yuv8_to_rgba_generic(fy, fu, fv, 
params.color_matrix, channel_order);
+}
+
+// YUV to RGB kernel for planar input (YUV420P, YUV422P, YUV444P, etc)
+// No resize: src and dst have identical dimensions.
+template<typename in_y_T, typename in_uv_T, int channel_order>
+__device__ void YuvToRgb_planar(CUDAScaleKernelParams params)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    if (yo >= params.dst_height || xo >= params.dst_width) return;
+
+    // Direct 1:1 Y plane read
+    in_y_T y_raw = tex2D<in_y_T>(
+        params.src_tex[0],
+        xo + params.src_left + 0.5f,
+        yo + params.src_top  + 0.5f);
+
+    // Chroma: map luma coords to chroma coords for subsampled formats
+    float cx = ((float)xo + 0.5f) / (float)(1 << params.log2_chroma_w)
+             + (float)(params.src_left >> params.log2_chroma_w);
+    float cy = ((float)yo + 0.5f) / (float)(1 << params.log2_chroma_h)
+             + (float)(params.src_top  >> params.log2_chroma_h);
+
+    in_uv_T u_raw = tex2D<in_uv_T>(params.src_tex[1], cx, cy);
+    in_uv_T v_raw = tex2D<in_uv_T>(params.src_tex[2], cx, cy);
+
+    float fy, fu, fv;
+    if (sizeof(in_y_T) == 1) {
+        fy = (float)(int)y_raw;
+        fu = (float)(int)u_raw;
+        fv = (float)(int)v_raw;
+    } else {
+        // planar10: 10-bit native values, divide by 4
+        fy = (float)(int)y_raw / 4.0f;
+        fu = (float)(int)u_raw / 4.0f;
+        fv = (float)(int)v_raw / 4.0f;
+    }
+
+    if (params.mpeg_range) fy -= 16.0f;
+    fu -= 128.0f;
+    fv -= 128.0f;
+
+    uchar4 *dst = (uchar4*)params.dst[0];
+    int dst_pitch = params.dst_pitch / sizeof(uchar4);
+    dst[yo * dst_pitch + xo] = yuv8_to_rgba_generic(fy, fu, fv, 
params.color_matrix, channel_order);
+}
+
 template<typename T>
 __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
                                               int xo, int yo,
@@ -1357,4 +1479,47 @@ LANCZOS_KERNELS_RGB(rgb0)
 LANCZOS_KERNELS_RGB(bgr0)
 LANCZOS_KERNELS_RGB(rgba)
 LANCZOS_KERNELS_RGB(bgra)
+
+// --- YUV to RGB kernel exports ---
+// Single-pass kernels (no resize, color conversion only)
+
+// semiplanar8 (NV12, NV16) -> RGB outputs
+__global__ void Subsample_Nearest_semiplanar8_bgra(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<uchar, uchar2, 0>(p); }
+__global__ void Subsample_Nearest_semiplanar8_rgba(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<uchar, uchar2, 1>(p); }
+__global__ void Subsample_Nearest_semiplanar8_bgr0(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<uchar, uchar2, 2>(p); }
+__global__ void Subsample_Nearest_semiplanar8_rgb0(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<uchar, uchar2, 3>(p); }
+
+// semiplanar10 (P010, P210) -> RGB outputs
+__global__ void Subsample_Nearest_semiplanar10_bgra(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<ushort, ushort2, 0>(p); }
+__global__ void Subsample_Nearest_semiplanar10_rgba(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<ushort, ushort2, 1>(p); }
+__global__ void Subsample_Nearest_semiplanar10_bgr0(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<ushort, ushort2, 2>(p); }
+__global__ void Subsample_Nearest_semiplanar10_rgb0(CUDAScaleKernelParams p)
+{ YuvToRgb_semiplanar<ushort, ushort2, 3>(p); }
+
+// planar8 (YUV420P, YUV422P, YUV444P) -> RGB outputs
+__global__ void Subsample_Nearest_planar8_bgra(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<uchar, uchar, 0>(p); }
+__global__ void Subsample_Nearest_planar8_rgba(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<uchar, uchar, 1>(p); }
+__global__ void Subsample_Nearest_planar8_bgr0(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<uchar, uchar, 2>(p); }
+__global__ void Subsample_Nearest_planar8_rgb0(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<uchar, uchar, 3>(p); }
+
+// planar10 (YUV420P10, YUV422P10, YUV444P10) -> RGB outputs
+__global__ void Subsample_Nearest_planar10_bgra(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<ushort, ushort, 0>(p); }
+__global__ void Subsample_Nearest_planar10_rgba(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<ushort, ushort, 1>(p); }
+__global__ void Subsample_Nearest_planar10_bgr0(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<ushort, ushort, 2>(p); }
+__global__ void Subsample_Nearest_planar10_rgb0(CUDAScaleKernelParams p)
+{ YuvToRgb_planar<ushort, ushort, 3>(p); }
 }
diff --git a/libavfilter/vf_scale_cuda.h b/libavfilter/vf_scale_cuda.h
index 81fd8061e3..2f698fc5ba 100644
--- a/libavfilter/vf_scale_cuda.h
+++ b/libavfilter/vf_scale_cuda.h
@@ -33,6 +33,10 @@ typedef uint8_t* CUdeviceptr;
 
 #define SCALE_CUDA_PARAM_DEFAULT 999999.0f
 
+typedef struct {
+    float m[3][3];
+} CUDAScaleColorMatrix;
+
 typedef struct {
     CUtexObject src_tex[4];
     CUdeviceptr dst[4];
@@ -45,6 +49,9 @@ typedef struct {
     int src_height;
     float param;
     int mpeg_range;
+    int log2_chroma_w;
+    int log2_chroma_h;
+    CUDAScaleColorMatrix color_matrix;
 } CUDAScaleKernelParams;
 
 #endif
-- 
2.52.0


>From 8c0b38cbb0bdc15e84f1450da0b5d435ad5ea4e6 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Mon, 2 Mar 2026 00:40:31 +0800
Subject: [PATCH 2/3] avfilter/vf_scale_cuda: add RGB to YUV color space
 conversion

The output color range is controllable via the out_range option
(default: auto, which falls back to limited/MPEG range when the
input does not dictate one). Color matrix selection follows the
same rules as the YUV to RGB path, defaulting to BT.601 when
colorspace is unspecified to match swscale behavior.
---
 libavfilter/vf_scale_cuda.c  | 140 +++++++++++++-
 libavfilter/vf_scale_cuda.cu | 350 +++++++++++++++++++++++++++++++++++
 2 files changed, 483 insertions(+), 7 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 2a4313cae1..f77fbec8ee 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -36,6 +36,7 @@
 #include "avfilter.h"
 #include "colorspace.h"
 #include "filters.h"
+#include "formats.h"
 #include "scale_eval.h"
 #include "video.h"
 
@@ -126,6 +127,8 @@ typedef struct CUDAScaleContext {
     float param;
 
     bool yuv2rgb;
+    bool rgb2yuv;
+    int out_range;                   ///< output color range for RGB->YUV 
(AVColorRange)
     enum AVColorSpace  colorspace;   ///< resolved colorspace for conversion
     enum AVColorRange  color_range;  ///< resolved color range for conversion
     CUDAScaleColorMatrix color_matrix;
@@ -250,6 +253,38 @@ static int compute_yuv2rgb_matrix(void *log_ctx, 
CUDAScaleColorMatrix *mat,
     return 0;
 }
 
+static int compute_rgb2yuv_matrix(void *log_ctx, CUDAScaleColorMatrix *mat,
+                                  enum AVColorSpace colorspace,
+                                  bool limited_range)
+{
+    const AVLumaCoefficients *coeffs = 
av_csp_luma_coeffs_from_avcsp(colorspace);
+    double rgb2yuv[3][3];
+    float y_scale = 1.0f;
+    float uv_scale = 1.0f;
+
+    if (!coeffs) {
+        av_log(log_ctx, AV_LOG_ERROR,
+               "Unsupported colorspace %d for RGB->YUV conversion\n", 
colorspace);
+        return AVERROR(EINVAL);
+    }
+
+    ff_fill_rgb2yuv_table(coeffs, rgb2yuv);
+
+    if (limited_range) {
+        y_scale  = (235.0f - 16.0f) / 255.0f;
+        uv_scale = (240.0f - 16.0f) / 255.0f;
+    }
+
+    for (int i = 0; i < 3; i++) {
+        float scale = (i == 0) ? y_scale : uv_scale;
+        mat->m[i][0] = (float)rgb2yuv[i][0] * scale;
+        mat->m[i][1] = (float)rgb2yuv[i][1] * scale;
+        mat->m[i][2] = (float)rgb2yuv[i][2] * scale;
+    }
+
+    return 0;
+}
+
 static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat 
in_format, enum AVPixelFormat out_format)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -346,6 +381,40 @@ static av_cold int init_processing_chain(AVFilterContext 
*ctx, int in_width, int
                av_color_space_name(cs),
                limited_range ? "limited" : "full");
     }
+    s->rgb2yuv = (s->in_desc->flags & AV_PIX_FMT_FLAG_RGB) &&
+                 !(s->out_desc->flags & AV_PIX_FMT_FLAG_RGB);
+    if (s->rgb2yuv) {
+        AVFilterLink *outlink = ctx->outputs[0];
+
+        if (in_width != out_width || in_height != out_height) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Resizing is not supported during RGB->YUV conversion, "
+                   "use a separate scale_cuda instance for resizing\n");
+            return AVERROR(ENOSYS);
+        }
+        enum AVColorSpace cs = ctx->inputs[0]->colorspace;
+        bool limited_range;
+
+        /* When colorspace is unspecified or RGB, default to BT.601
+         * (SMPTE 170M) to match swscale behavior.
+         */
+        if (cs == AVCOL_SPC_UNSPECIFIED || cs == AVCOL_SPC_RGB)
+            cs = AVCOL_SPC_SMPTE170M;
+
+        limited_range = (outlink->color_range != AVCOL_RANGE_JPEG);
+
+        ret = compute_rgb2yuv_matrix(ctx, &s->color_matrix, cs, limited_range);
+        if (ret < 0)
+            return ret;
+
+        s->colorspace  = cs;
+        s->color_range = outlink->color_range;
+
+        av_log(ctx, AV_LOG_VERBOSE,
+               "RGB->YUV conversion enabled (%s, %s range)\n",
+               av_color_space_name(cs),
+               limited_range ? "limited" : "full");
+    }
 
     if (s->passthrough && in_width == out_width && in_height == out_height && 
in_format == out_format) {
         s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
@@ -386,14 +455,14 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
     extern const unsigned char ff_vf_scale_cuda_ptx_data[];
     extern const unsigned int ff_vf_scale_cuda_ptx_len;
 
-    if (s->yuv2rgb) {
-        /* YUV->RGB only supports Nearest for now */
+    if (s->yuv2rgb || s->rgb2yuv) {
+        /* YUV<->RGB only supports Nearest for now */
         function_infix = "Nearest";
         s->interp_use_linear = 0;
         s->interp_as_integer = 1;
         if (s->interp_algo != INTERP_ALGO_DEFAULT && s->interp_algo != 
INTERP_ALGO_NEAREST)
             av_log(ctx, AV_LOG_WARNING,
-                   "YUV->RGB conversion only supports nearest interpolation, 
ignoring interp_algo\n");
+                   "YUV <-> RGB conversion only supports nearest 
interpolation, ignoring interp_algo\n");
     } else {
         switch(s->interp_algo) {
             case INTERP_ALGO_NEAREST:
@@ -444,6 +513,12 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
     if (s->yuv2rgb) {
         s->cu_func_uv = NULL;
         av_log(ctx, AV_LOG_DEBUG, "YUV->RGB mode: no separate chroma 
kernel\n");
+    } else if (s->rgb2yuv) {
+        snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, 
in_fmt_name, out_fmt_name);
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, 
buf));
+        if (ret < 0)
+            goto fail;
+        av_log(ctx, AV_LOG_DEBUG, "RGB->YUV chroma filter: %s\n", buf);
     } else {
         snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, 
in_fmt_name, out_fmt_name);
         ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, 
buf));
@@ -458,6 +533,27 @@ fail:
     return ret;
 }
 
+static int cudascale_query_formats(const AVFilterContext *ctx,
+                                   AVFilterFormatsConfig **cfg_in,
+                                   AVFilterFormatsConfig **cfg_out)
+{
+    const CUDAScaleContext *s = ctx->priv;
+    static const enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_CUDA, 
AV_PIX_FMT_NONE };
+    AVFilterFormats *formats;
+    int ret;
+
+    if ((ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, 
pix_fmts)) < 0)
+        return ret;
+
+    formats = s->out_range != AVCOL_RANGE_UNSPECIFIED
+                ? ff_make_formats_list_singleton(s->out_range)
+                : ff_all_color_ranges();
+    if ((ret = ff_formats_ref(formats, &cfg_out[0]->color_ranges)) < 0)
+        return ret;
+
+    return 0;
+}
+
 static av_cold int cudascale_config_props(AVFilterLink *outlink)
 {
     AVFilterContext *ctx = outlink->src;
@@ -555,6 +651,9 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
     if (s->yuv2rgb) {
         params.log2_chroma_w = s->in_desc->log2_chroma_w;
         params.log2_chroma_h = s->in_desc->log2_chroma_h;
+    } else if (s->rgb2yuv) {
+        params.log2_chroma_w = s->out_desc->log2_chroma_w;
+        params.log2_chroma_h = s->out_desc->log2_chroma_h;
     }
 
     void *args[] = { &params };
@@ -573,10 +672,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
     int i, ret;
 
     /* Color matrix was computed at config time; use the resolved color_range
-     * for yuv2rgb so mpeg_range matches the matrix. For plain scaling,
+     * for yuv2rgb/rgb2yuv so mpeg_range matches the matrix. For plain scaling,
      * read from the input frame. */
-    int mpeg_range = s->yuv2rgb ? (s->color_range != AVCOL_RANGE_JPEG)
-                                : (in->color_range != AVCOL_RANGE_JPEG);
+    int mpeg_range = (s->yuv2rgb || s->rgb2yuv) ? (s->color_range != 
AVCOL_RANGE_JPEG)
+                                                 : (in->color_range != 
AVCOL_RANGE_JPEG);
 
     CUtexObject tex[4] = { 0, 0, 0, 0 };
 
@@ -625,6 +724,22 @@ static int scalecuda_resize(AVFilterContext *ctx,
                                  out, out->width, out->height, 
out->linesize[0], mpeg_range);
         if (ret < 0)
             goto exit;
+    } else if (s->rgb2yuv) {
+        // RGB->YUV: two-pass (Y kernel at full res, UV kernel at chroma res)
+        ret = call_resize_kernel(ctx, s->cu_func,
+                                 tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
+                                 out, out->width, out->height, 
out->linesize[0], mpeg_range);
+        if (ret < 0)
+            goto exit;
+
+        ret = call_resize_kernel(ctx, s->cu_func_uv,
+                                 tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
+                                 out,
+                                 AV_CEIL_RSHIFT(out->width, 
s->out_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(out->height, 
s->out_desc->log2_chroma_h),
+                                 out->linesize[1], mpeg_range);
+        if (ret < 0)
+            goto exit;
     } else {
         // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
         ret = call_resize_kernel(ctx, s->cu_func,
@@ -688,6 +803,9 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     if (s->yuv2rgb) {
         out->colorspace  = AVCOL_SPC_RGB;
         out->color_range = AVCOL_RANGE_JPEG;
+    } else if (s->rgb2yuv) {
+        out->colorspace  = s->colorspace;
+        out->color_range = s->color_range;
     }
 
     if (out->width != in->width || out->height != in->height) {
@@ -773,6 +891,14 @@ static const AVOption options[] = {
         { "increase", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 
SCALE_FORCE_OAR_INCREASE }, 0, 0, FLAGS, .unit = "force_oar" },
     { "force_divisible_by", "enforce that the output resolution is divisible 
by a defined integer when force_original_aspect_ratio is used", 
OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 256, FLAGS },
     { "reset_sar", "reset SAR to 1 and scale to square pixels if scaling 
proportionally", OFFSET(reset_sar), AV_OPT_TYPE_BOOL, { .i64 = 0}, 0, 1, FLAGS 
},
+    { "out_range", "Output color range", OFFSET(out_range), AV_OPT_TYPE_INT, { 
.i64 = AVCOL_RANGE_UNSPECIFIED }, 0, AVCOL_RANGE_NB - 1, FLAGS, .unit = "range" 
},
+        { "auto",    "keep range from input or default to limited", 0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_UNSPECIFIED }, 0, 0, FLAGS, .unit = 
"range" },
+        { "full",    "full/JPEG range",                             0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG },        0, 0, FLAGS, .unit = 
"range" },
+        { "limited", "limited/MPEG range",                          0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG },        0, 0, FLAGS, .unit = 
"range" },
+        { "jpeg",    "full/JPEG range",                             0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG },        0, 0, FLAGS, .unit = 
"range" },
+        { "mpeg",    "limited/MPEG range",                          0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG },        0, 0, FLAGS, .unit = 
"range" },
+        { "tv",      "limited/MPEG range",                          0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG },        0, 0, FLAGS, .unit = 
"range" },
+        { "pc",      "full/JPEG range",                             0, 
AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG },        0, 0, FLAGS, .unit = 
"range" },
     { NULL },
 };
 
@@ -814,7 +940,7 @@ const FFFilter ff_vf_scale_cuda = {
     FILTER_INPUTS(cudascale_inputs),
     FILTER_OUTPUTS(cudascale_outputs),
 
-    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    FILTER_QUERY_FUNC2(cudascale_query_formats),
 
     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
 };
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index 0d40681461..9304df4fbe 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -1227,6 +1227,256 @@ __device__ void YuvToRgb_planar(CUDAScaleKernelParams 
params)
     dst[yo * dst_pitch + xo] = yuv8_to_rgba_generic(fy, fu, fv, 
params.color_matrix, channel_order);
 }
 
// RGB to YUV conversion:
//
//   Y = m[0][0]*R + m[0][1]*G + m[0][2]*B  (+ 16 for limited range)
//   U = m[1][0]*R + m[1][1]*G + m[1][2]*B  (+ 128)
//   V = m[2][0]*R + m[2][1]*G + m[2][2]*B  (+ 128)

// Split a packed uchar4 pixel into float R, G, B components.
// channel_order selects the memory layout:
//   0 = BGRA, 2 = BGR0  (x=B, y=G, z=R)
//   1 = RGBA, 3 = RGB0  (x=R, y=G, z=B)
__device__ static inline void extract_rgb(uchar4 pixel, int channel_order,
                                          float &r, float &g, float &b)
{
    if (channel_order == 0 || channel_order == 2) {
        // BGRA / BGR0 layouts
        b = (float)pixel.x;
        g = (float)pixel.y;
        r = (float)pixel.z;
    } else {
        // RGBA / RGB0 layouts (also the fallback for unknown orders)
        r = (float)pixel.x;
        g = (float)pixel.y;
        b = (float)pixel.z;
    }
}
+
// Compute the luma component via the row-0 matrix coefficients.
// For limited (MPEG) range the 16-unit black-level offset is added.
// The result is clamped to the 8-bit domain [0, 255].
__device__ static inline float rgb_to_y(float r, float g, float b,
                                        const CUDAScaleColorMatrix &mat, int mpeg_range)
{
    float luma = mat.m[0][0] * r + mat.m[0][1] * g + mat.m[0][2] * b;
    if (mpeg_range)
        luma += 16.0f;
    return clamp_f(luma, 0.0f, 255.0f);
}
+
// Compute the Cb (U) chroma component via the row-1 matrix coefficients,
// centered at 128 and clamped to the 8-bit domain [0, 255].
__device__ static inline float rgb_to_u(float r, float g, float b,
                                        const CUDAScaleColorMatrix &mat)
{
    float cb = mat.m[1][0] * r + mat.m[1][1] * g + mat.m[1][2] * b + 128.0f;
    return clamp_f(cb, 0.0f, 255.0f);
}
+
// Compute the Cr (V) chroma component via the row-2 matrix coefficients,
// centered at 128 and clamped to the 8-bit domain [0, 255].
__device__ static inline float rgb_to_v(float r, float g, float b,
                                        const CUDAScaleColorMatrix &mat)
{
    float cr = mat.m[2][0] * r + mat.m[2][1] * g + mat.m[2][2] * b + 128.0f;
    return clamp_f(cr, 0.0f, 255.0f);
}
+
// RGB->YUV Y kernel (8-bit output).
// One thread per destination pixel: read the packed RGB source through
// the texture unit (nearest sample, +0.5f centers on the texel), compute
// luma and store it into the Y plane.
//
// Fix: round to nearest on the float->integer store instead of
// truncating; plain truncation biases the Y plane down by up to 1 LSB.
template<typename out_y_T, int channel_order>
__device__ void RgbToYuv_Y(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    uchar4 pixel = tex2D<uchar4>(
        params.src_tex[0],
        xo + params.src_left + 0.5f,
        yo + params.src_top  + 0.5f);

    float r, g, b;
    extract_rgb(pixel, channel_order, r, g, b);
    float y = rgb_to_y(r, g, b, params.color_matrix, params.mpeg_range);

    out_y_T *dst_y = (out_y_T*)params.dst[0];
    int pitch = params.dst_pitch / sizeof(out_y_T);
    // y is already clamped to [0, 255]; +0.5f rounds to nearest on cast.
    dst_y[yo * pitch + xo] = (out_y_T)(y + 0.5f);
}
+
// RGB->YUV Y kernel for semiplanar10/16 output (P010/P210 style).
// The 8-bit-domain luma is scaled by 256 so the 10-bit value sits in the
// most-significant bits of the 16-bit sample, as those formats require.
//
// Fix: round to nearest (+0.5f) before the integer store instead of
// truncating, avoiding a systematic downward bias.
template<int channel_order>
__device__ void RgbToYuv_Y_semiplanar16(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    uchar4 pixel = tex2D<uchar4>(
        params.src_tex[0],
        xo + params.src_left + 0.5f,
        yo + params.src_top  + 0.5f);

    float r, g, b;
    extract_rgb(pixel, channel_order, r, g, b);
    float y = rgb_to_y(r, g, b, params.color_matrix, params.mpeg_range);

    ushort *dst_y = (ushort*)params.dst[0];
    int pitch = params.dst_pitch / sizeof(ushort);
    dst_y[yo * pitch + xo] = (ushort)clamp_f(y * 256.0f + 0.5f, 0.0f, 65535.0f);
}
+
// RGB->YUV Y kernel for planar10 output: stores a native 10-bit Y value
// (8-bit-domain luma * 4, LSB-aligned in the 16-bit sample).
//
// Fix: round to nearest (+0.5f) before the integer store instead of
// truncating, avoiding a systematic downward bias.
template<int channel_order>
__device__ void RgbToYuv_Y_planar10(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    uchar4 pixel = tex2D<uchar4>(
        params.src_tex[0],
        xo + params.src_left + 0.5f,
        yo + params.src_top  + 0.5f);

    float r, g, b;
    extract_rgb(pixel, channel_order, r, g, b);
    float y = rgb_to_y(r, g, b, params.color_matrix, params.mpeg_range);

    ushort *dst_y = (ushort*)params.dst[0];
    int pitch = params.dst_pitch / sizeof(ushort);
    dst_y[yo * pitch + xo] = (ushort)clamp_f(y * 4.0f + 0.5f, 0.0f, 1023.0f);
}
+
// RGB->YUV UV kernel for semiplanar output (NV12, NV16).
// One thread per chroma sample: averages U and V over the co-sited
// chroma_w x chroma_h box of full-resolution RGB pixels (simple box
// filter), then stores the interleaved UV pair.
//
// Fix: round the averaged chroma to nearest (+0.5f) before the integer
// store; the previous plain cast truncated, biasing chroma downward.
template<typename out_uv_T, int channel_order>
__device__ void RgbToYuv_semiplanar_UV(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    int chroma_w = 1 << params.log2_chroma_w;
    int chroma_h = 1 << params.log2_chroma_h;
    int full_w = params.dst_width << params.log2_chroma_w;
    int full_h = params.dst_height << params.log2_chroma_h;

    float u_acc = 0.0f, v_acc = 0.0f;
    int count = 0;  // >= 1: the (0,0) sample of the box is always in range
    for (int dy = 0; dy < chroma_h; dy++) {
        for (int dx = 0; dx < chroma_w; dx++) {
            int lx = xo * chroma_w + dx;
            int ly = yo * chroma_h + dy;
            if (lx >= full_w || ly >= full_h) continue;

            uchar4 pixel = tex2D<uchar4>(
                params.src_tex[0],
                lx + params.src_left + 0.5f,
                ly + params.src_top  + 0.5f);

            float r, g, b;
            extract_rgb(pixel, channel_order, r, g, b);
            u_acc += rgb_to_u(r, g, b, params.color_matrix);
            v_acc += rgb_to_v(r, g, b, params.color_matrix);
            count++;
        }
    }

    float u_avg = u_acc / (float)count;
    float v_avg = v_acc / (float)count;

    out_uv_T *dst_uv = (out_uv_T*)params.dst[1];
    int pitch = params.dst_pitch / sizeof(out_uv_T);
    // Averages are within [0, 255]; +0.5f rounds to nearest on cast.
    dst_uv[yo * pitch + xo] = make_uchar2((uchar)(u_avg + 0.5f),
                                          (uchar)(v_avg + 0.5f));
}
+
// RGB->YUV UV kernel for semiplanar10/16 output (P010/P210 style).
// Box-averages U and V over the co-sited chroma_w x chroma_h block of
// full-resolution RGB pixels, scales by 256 to MSB-align the 10-bit
// value in the 16-bit sample, and stores the interleaved UV pair.
//
// Fix: round to nearest (+0.5f) before the integer store instead of
// truncating, avoiding a systematic downward chroma bias.
template<int channel_order>
__device__ void RgbToYuv_semiplanar16_UV(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    int chroma_w = 1 << params.log2_chroma_w;
    int chroma_h = 1 << params.log2_chroma_h;
    int full_w = params.dst_width << params.log2_chroma_w;
    int full_h = params.dst_height << params.log2_chroma_h;

    float u_acc = 0.0f, v_acc = 0.0f;
    int count = 0;  // >= 1: the (0,0) sample of the box is always in range
    for (int dy = 0; dy < chroma_h; dy++) {
        for (int dx = 0; dx < chroma_w; dx++) {
            int lx = xo * chroma_w + dx;
            int ly = yo * chroma_h + dy;
            if (lx >= full_w || ly >= full_h) continue;

            uchar4 pixel = tex2D<uchar4>(
                params.src_tex[0],
                lx + params.src_left + 0.5f,
                ly + params.src_top  + 0.5f);

            float r, g, b;
            extract_rgb(pixel, channel_order, r, g, b);
            u_acc += rgb_to_u(r, g, b, params.color_matrix);
            v_acc += rgb_to_v(r, g, b, params.color_matrix);
            count++;
        }
    }

    float u_avg = u_acc / (float)count;
    float v_avg = v_acc / (float)count;

    ushort2 *dst_uv = (ushort2*)params.dst[1];
    int pitch = params.dst_pitch / sizeof(ushort2);
    ushort u_val = (ushort)clamp_f(u_avg * 256.0f + 0.5f, 0.0f, 65535.0f);
    ushort v_val = (ushort)clamp_f(v_avg * 256.0f + 0.5f, 0.0f, 65535.0f);
    dst_uv[yo * pitch + xo] = make_ushort2(u_val, v_val);
}
+
// RGB->YUV UV kernel for planar output (YUV420P, YUV422P, YUV444P,
// YUV420P10). Box-averages U and V over the co-sited chroma_w x chroma_h
// block of full-resolution RGB pixels, then writes the U and V planes
// separately. out_uv_T is uchar for 8-bit planar and ushort for 10-bit
// planar (native LSB-aligned 10-bit values).
//
// Fix: round to nearest (+0.5f) before the integer stores in both
// branches; the previous plain casts truncated, biasing chroma downward.
template<typename out_uv_T, int channel_order>
__device__ void RgbToYuv_planar_UV(CUDAScaleKernelParams params)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;
    if (yo >= params.dst_height || xo >= params.dst_width) return;

    int chroma_w = 1 << params.log2_chroma_w;
    int chroma_h = 1 << params.log2_chroma_h;
    int full_w = params.dst_width << params.log2_chroma_w;
    int full_h = params.dst_height << params.log2_chroma_h;

    float u_acc = 0.0f, v_acc = 0.0f;
    int count = 0;  // >= 1: the (0,0) sample of the box is always in range
    for (int dy = 0; dy < chroma_h; dy++) {
        for (int dx = 0; dx < chroma_w; dx++) {
            int lx = xo * chroma_w + dx;
            int ly = yo * chroma_h + dy;
            if (lx >= full_w || ly >= full_h) continue;

            uchar4 pixel = tex2D<uchar4>(
                params.src_tex[0],
                lx + params.src_left + 0.5f,
                ly + params.src_top  + 0.5f);

            float r, g, b;
            extract_rgb(pixel, channel_order, r, g, b);
            u_acc += rgb_to_u(r, g, b, params.color_matrix);
            v_acc += rgb_to_v(r, g, b, params.color_matrix);
            count++;
        }
    }

    float u_avg = u_acc / (float)count;
    float v_avg = v_acc / (float)count;

    out_uv_T *dst_u = (out_uv_T*)params.dst[1];
    out_uv_T *dst_v = (out_uv_T*)params.dst[2];
    int pitch = params.dst_pitch / sizeof(out_uv_T);

    if (sizeof(out_uv_T) == 1) {
        // Averages are within [0, 255]; +0.5f rounds to nearest on cast.
        dst_u[yo * pitch + xo] = (out_uv_T)(u_avg + 0.5f);
        dst_v[yo * pitch + xo] = (out_uv_T)(v_avg + 0.5f);
    } else {
        // 10-bit planar: native 10-bit value = 8-bit * 4, rounded.
        dst_u[yo * pitch + xo] = (out_uv_T)(clamp_f(u_avg * 4.0f + 0.5f, 0.0f, 1023.0f));
        dst_v[yo * pitch + xo] = (out_uv_T)(clamp_f(v_avg * 4.0f + 0.5f, 0.0f, 1023.0f));
    }
}
+
 template<typename T>
 __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
                                               int xo, int yo,
@@ -1522,4 +1772,104 @@ __global__ void 
Subsample_Nearest_planar10_bgr0(CUDAScaleKernelParams p)
 { YuvToRgb_planar<ushort, ushort, 2>(p); }
 __global__ void Subsample_Nearest_planar10_rgb0(CUDAScaleKernelParams p)
 { YuvToRgb_planar<ushort, ushort, 3>(p); }
+
// --- RGB to YUV kernel entry points ---
// Conversion is two-pass: the base kernel fills the luma plane at full
// resolution, and the matching *_uv kernel fills the chroma plane(s) at
// chroma resolution. Color conversion only; no resizing is performed.
// Template argument selects the channel order:
//   0 = bgra, 1 = rgba, 2 = bgr0, 3 = rgb0.

// Packed RGB -> semiplanar 8-bit (NV12, NV16)
__global__ void Subsample_Nearest_bgra_semiplanar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 0>(p); }
__global__ void Subsample_Nearest_bgra_semiplanar8_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar_UV<uchar2, 0>(p); }
__global__ void Subsample_Nearest_rgba_semiplanar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 1>(p); }
__global__ void Subsample_Nearest_rgba_semiplanar8_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar_UV<uchar2, 1>(p); }
__global__ void Subsample_Nearest_bgr0_semiplanar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 2>(p); }
__global__ void Subsample_Nearest_bgr0_semiplanar8_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar_UV<uchar2, 2>(p); }
__global__ void Subsample_Nearest_rgb0_semiplanar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 3>(p); }
__global__ void Subsample_Nearest_rgb0_semiplanar8_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar_UV<uchar2, 3>(p); }

// Packed RGB -> semiplanar 10-bit (P010, P210)
__global__ void Subsample_Nearest_bgra_semiplanar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_semiplanar16<0>(p); }
__global__ void Subsample_Nearest_bgra_semiplanar10_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar16_UV<0>(p); }
__global__ void Subsample_Nearest_rgba_semiplanar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_semiplanar16<1>(p); }
__global__ void Subsample_Nearest_rgba_semiplanar10_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar16_UV<1>(p); }
__global__ void Subsample_Nearest_bgr0_semiplanar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_semiplanar16<2>(p); }
__global__ void Subsample_Nearest_bgr0_semiplanar10_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar16_UV<2>(p); }
__global__ void Subsample_Nearest_rgb0_semiplanar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_semiplanar16<3>(p); }
__global__ void Subsample_Nearest_rgb0_semiplanar10_uv(CUDAScaleKernelParams p) { RgbToYuv_semiplanar16_UV<3>(p); }

// Packed RGB -> planar 8-bit (YUV420P, YUV422P, YUV444P)
__global__ void Subsample_Nearest_bgra_planar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 0>(p); }
__global__ void Subsample_Nearest_bgra_planar8_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<uchar, 0>(p); }
__global__ void Subsample_Nearest_rgba_planar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 1>(p); }
__global__ void Subsample_Nearest_rgba_planar8_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<uchar, 1>(p); }
__global__ void Subsample_Nearest_bgr0_planar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 2>(p); }
__global__ void Subsample_Nearest_bgr0_planar8_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<uchar, 2>(p); }
__global__ void Subsample_Nearest_rgb0_planar8(CUDAScaleKernelParams p)    { RgbToYuv_Y<uchar, 3>(p); }
__global__ void Subsample_Nearest_rgb0_planar8_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<uchar, 3>(p); }

// Packed RGB -> planar 10-bit (YUV420P10, YUV422P10, YUV444P10)
__global__ void Subsample_Nearest_bgra_planar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_planar10<0>(p); }
__global__ void Subsample_Nearest_bgra_planar10_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<ushort, 0>(p); }
__global__ void Subsample_Nearest_rgba_planar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_planar10<1>(p); }
__global__ void Subsample_Nearest_rgba_planar10_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<ushort, 1>(p); }
__global__ void Subsample_Nearest_bgr0_planar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_planar10<2>(p); }
__global__ void Subsample_Nearest_bgr0_planar10_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<ushort, 2>(p); }
__global__ void Subsample_Nearest_rgb0_planar10(CUDAScaleKernelParams p)    { RgbToYuv_Y_planar10<3>(p); }
__global__ void Subsample_Nearest_rgb0_planar10_uv(CUDAScaleKernelParams p) { RgbToYuv_planar_UV<ushort, 3>(p); }
 }
-- 
2.52.0


>From f798bc3e12ba1f8732cfdfee6ffbc20e683364a4 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Mon, 2 Mar 2026 00:41:04 +0800
Subject: [PATCH 3/3] avfilter/vf_scale_cuda: add FATE tests for YUV/RGB
 conversion

Add CUDA hardware-accelerated FATE tests for scale_cuda YUV<->RGB
color conversion. Tests require a CUDA GPU and are gated behind
HWACCEL=cuda:
  make fate-filter-scale_cuda HWACCEL=cuda

Test coverage:
- YUV->RGB: NV12, YUV420P, P010, YUV444P -> BGRA/RGBA
- RGB->YUV: BGRA/RGBA -> NV12, YUV420P, P010
- BT.709: NV12 -> BGRA with explicit colorspace=bt709 via setparams
- Full range: roundtrip via full-range NV12
- Roundtrip: NV12 -> BGRA -> NV12
- Regression: existing YUV->YUV scaling path unaffected
---
 tests/Makefile              | 10 ++++
 tests/fate/filter-video.mak | 91 +++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/tests/Makefile b/tests/Makefile
index 4b3fa6a54a..c67dbd04c1 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -139,6 +139,16 @@ CONFIG_LARGE_TESTS:=
 !CONFIG_LARGE_TESTS:=yes
 endif
 
+# Enable CUDA hardware-accelerated tests when HWACCEL=cuda is set on
+# the make command line (e.g. make fate-filter-scale_cuda HWACCEL=cuda).
+ifeq ($(HWACCEL), cuda)
+CONFIG_HWACCEL_CUDA_TESTS:=yes
+!CONFIG_HWACCEL_CUDA_TESTS:=
+else
+CONFIG_HWACCEL_CUDA_TESTS:=
+!CONFIG_HWACCEL_CUDA_TESTS:=yes
+endif
+
 include $(SRC_PATH)/$(APITESTSDIR)/Makefile
 
 include $(SRC_PATH)/tests/fate/acodec.mak
diff --git a/tests/fate/filter-video.mak b/tests/fate/filter-video.mak
index 07b8632c6f..407f4e7fd9 100644
--- a/tests/fate/filter-video.mak
+++ b/tests/fate/filter-video.mak
@@ -845,6 +845,97 @@ FATE_SAMPLES_FFMPEG += $(FATE_FILTER_SAMPLES-yes)
 FATE_FFPROBE += $(FATE_FILTER_FFPROBE-yes)
 FATE_FFMPEG += $(FATE_FILTER-yes)
 
+#
+# scale_cuda filter tests (YUV<->RGB color space conversion)
+# These require a CUDA GPU and are only run when HWACCEL=cuda is set:
+#   make fate-filter-scale_cuda HWACCEL=cuda
+#
+SCALE_CUDA_DEPS = SCALE_CUDA_FILTER HWUPLOAD_CUDA_FILTER HWDOWNLOAD_FILTER \
+                  FORMAT_FILTER TESTSRC2_FILTER HWACCEL_CUDA_TESTS
+
+SCALE_CUDA_ALLYES = $(call ALLYES, $(SCALE_CUDA_DEPS))
+
+# YUV -> RGB conversion tests
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-nv12-bgra
+fate-filter-scale_cuda-nv12-bgra: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=nv12,hwupload_cuda,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-nv12-bgra: CMP = oneline
+fate-filter-scale_cuda-nv12-bgra: REF = b42429ecd55f17a363462a1cefc091eb
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-yuv420p-bgra
+fate-filter-scale_cuda-yuv420p-bgra: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=yuv420p,hwupload_cuda,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-yuv420p-bgra: CMP = oneline
+fate-filter-scale_cuda-yuv420p-bgra: REF = b42429ecd55f17a363462a1cefc091eb
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-p010-bgra
+fate-filter-scale_cuda-p010-bgra: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=p010,hwupload_cuda,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-p010-bgra: CMP = oneline
+fate-filter-scale_cuda-p010-bgra: REF = 953e5995a4c063fefa9e0035cf3b47d0
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-yuv444p-bgra
+fate-filter-scale_cuda-yuv444p-bgra: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=yuv444p,hwupload_cuda,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-yuv444p-bgra: CMP = oneline
+fate-filter-scale_cuda-yuv444p-bgra: REF = 371e4f6f2024c65b1d748df8004a174e
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-nv12-rgba
+fate-filter-scale_cuda-nv12-rgba: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=nv12,hwupload_cuda,scale_cuda=format=rgba,hwdownload,format=rgba"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-nv12-rgba: CMP = oneline
+fate-filter-scale_cuda-nv12-rgba: REF = e96deb453261a480fbbcdaf9659551fb
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-nv12-bgra-bt709
+fate-filter-scale_cuda-nv12-bgra-bt709: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=nv12,setparams=colorspace=bt709:range=tv,hwupload_cuda,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-nv12-bgra-bt709: CMP = oneline
+fate-filter-scale_cuda-nv12-bgra-bt709: REF = e2879809be78787b6afbf3eb65fd9bd2
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-nv12-bgra-full
+fate-filter-scale_cuda-nv12-bgra-full: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=bgra,hwupload_cuda,scale_cuda=format=nv12:out_range=full,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-nv12-bgra-full: CMP = oneline
+fate-filter-scale_cuda-nv12-bgra-full: REF = 31d661661f9e8d0b095c2bfe2eb08b80
+
+# RGB -> YUV conversion tests
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-bgra-nv12
+fate-filter-scale_cuda-bgra-nv12: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=bgra,hwupload_cuda,scale_cuda=format=nv12:out_range=limited,hwdownload,format=nv12"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-bgra-nv12: CMP = oneline
+fate-filter-scale_cuda-bgra-nv12: REF = 5c173e99a466c8f073955cfed6130cf3
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-bgra-yuv420p
+fate-filter-scale_cuda-bgra-yuv420p: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=bgra,hwupload_cuda,scale_cuda=format=yuv420p:out_range=limited,hwdownload,format=yuv420p"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-bgra-yuv420p: CMP = oneline
+fate-filter-scale_cuda-bgra-yuv420p: REF = 498dc518e097fff20df638116805a3c9
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-bgra-p010
+fate-filter-scale_cuda-bgra-p010: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=bgra,hwupload_cuda,scale_cuda=format=p010:out_range=limited,hwdownload,format=p010"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-bgra-p010: CMP = oneline
+fate-filter-scale_cuda-bgra-p010: REF = 64cae4618e8cc9c36628e6cb258e0e74
+
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-rgba-nv12
+fate-filter-scale_cuda-rgba-nv12: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=rgba,hwupload_cuda,scale_cuda=format=nv12:out_range=limited,hwdownload,format=nv12"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-rgba-nv12: CMP = oneline
+fate-filter-scale_cuda-rgba-nv12: REF = 5c173e99a466c8f073955cfed6130cf3
+
+# Roundtrip test: NV12 -> BGRA -> NV12
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-roundtrip-nv12
+fate-filter-scale_cuda-roundtrip-nv12: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=nv12,hwupload_cuda,scale_cuda=format=bgra,scale_cuda=format=nv12:out_range=limited,hwdownload,format=nv12"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-roundtrip-nv12: CMP = oneline
+fate-filter-scale_cuda-roundtrip-nv12: REF = 922566d893120bcfa80879d3a584dd80
+
+# Roundtrip test: BGRA -> YUV444P -> BGRA (lossless chroma, tighter match)
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += 
fate-filter-scale_cuda-roundtrip-yuv444p
+fate-filter-scale_cuda-roundtrip-yuv444p: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=bgra,hwupload_cuda,scale_cuda=format=yuv444p:out_range=limited,scale_cuda=format=bgra,hwdownload,format=bgra"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-roundtrip-yuv444p: CMP = oneline
+fate-filter-scale_cuda-roundtrip-yuv444p: REF = 
4ef255bc06d7f437827c228d72943471
+
+# Regression: existing YUV->YUV path should be unaffected
+FATE_SCALE_CUDA-$(SCALE_CUDA_ALLYES) += fate-filter-scale_cuda-nv12-nv12-scale
+fate-filter-scale_cuda-nv12-nv12-scale: CMD = md5 -lavfi 
"testsrc2=s=160x120:d=1:r=1,format=nv12,hwupload_cuda,scale_cuda=w=320:h=240:format=nv12,hwdownload,format=nv12"
 -frames:v 5 -f rawvideo
+fate-filter-scale_cuda-nv12-nv12-scale: CMP = oneline
+fate-filter-scale_cuda-nv12-nv12-scale: REF = d2f76fd184bf7888305866566a21b3eb
+
+# Add scale_cuda tests to FATE_HW (not included in default fate runs)
+FATE_HW-yes += $(FATE_SCALE_CUDA-yes)
+
+# Convenience target for running only scale_cuda tests
+fate-filter-scale_cuda: $(FATE_SCALE_CUDA-yes)
+
 fate-vfilter: $(FATE_FILTER-yes) $(FATE_FILTER_SAMPLES-yes) 
$(FATE_FILTER_VSYNTH-yes)
 
 fate-filter: fate-afilter fate-vfilter $(FATE_METADATA_FILTER-yes) 
$(FATE_FILTER_FFPROBE-yes)
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to