Re: [FFmpeg-devel] [PATCH] Vulkan hwcontext and filters

Mark Thompson Tue, 21 Jan 2020 15:40:13 -0800

On 19/01/2020 13:11, Lynne wrote:
> Jan 18, 2020, 20:23 by [email protected]:
> 
>> On 10/01/2020 21:05, Lynne wrote:
>>
>> The CUDA parts look like they could be split off into a separate commit?  
>> (It's already huge.)
>>
> 
> I don't really want broken commits or commits with dead code.


Shouldn't be - you've got nice #ifdef markers surrounding exactly the right 
pieces to splice out :)

>>> @@ -3639,7 +3641,7 @@ avformat_deps="avcodec avutil"
>>>  avformat_suggest="libm network zlib"
>>>  avresample_deps="avutil"
>>>  avresample_suggest="libm"
>>> -avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 
>>> vaapi videotoolbox corefoundation corevideo coremedia bcrypt"
>>> +avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 
>>> vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt"
>>>  postproc_deps="avutil gpl"
>>>  postproc_suggest="libm"
>>>  swresample_deps="avutil"
>>> @@ -6626,6 +6628,9 @@ enabled vdpau &&
>>>  
>>>  enabled crystalhd && check_lib crystalhd "stdint.h 
>>> libcrystalhd/libcrystalhd_if.h" DtsCrystalHDVersion -lcrystalhd
>>>  
>>> +enabled vulkan &&
>>> +    require_pkg_config vulkan "vulkan >= 1.1.97" "vulkan/vulkan.h" 
>>> vkCreateInstance
>>>
>>
>> Presumably you have some specific requirement in mind which wants this 
>> version - can you note it somewhere?  (Either here or in the commit message.)
>>
> 
> The DRM export/import and I think the semaphore import API.
> Its not a new version, its been out for a long time.

About a year, which probably isn't long enough to assume it's everywhere.  I 
guess it doesn't really matter, though, because it will only ever increase when 
something extra is added.

>>> ...
>>> +static int cuda_device_derive(AVHWDeviceContext *device_ctx,
>>> +                              AVHWDeviceContext *src_ctx,
>>> +                              int flags) {
>>> +    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
>>> +    CudaFunctions *cu;
>>> +    const char *src_uuid = NULL;
>>> +    CUcontext dummy;
>>> +    int ret, i, device_count, dev_active = 0;
>>> +    unsigned int dev_flags = 0;
>>> +
>>> +    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;
>>> +
>>> +    switch (src_ctx->type) {
>>> +#if CONFIG_VULKAN
>>> +    case AV_HWDEVICE_TYPE_VULKAN: {
>>> +        AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
>>> +        src_uuid = vkctx->device_uuid;
>>> +        break;
>>> +    }
>>> +#endif
>>> +    default:
>>> +        return AVERROR(ENOSYS);
>>> +    }
>>> +
>>> +    if (!src_uuid) {
>>> +        av_log(device_ctx, AV_LOG_ERROR,
>>> +               "Failed to get UUID of source device.\n");
>>> +        goto error;
>>> +    }
>>> +
>>> +    if (cuda_device_init(device_ctx) < 0)
>>> +        goto error;
>>> +
>>> +    cu = hwctx->internal->cuda_dl;
>>> +
>>> +    ret = CHECK_CU(cu->cuInit(0));
>>> +    if (ret < 0)
>>> +        goto error;
>>> +
>>> +    ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
>>> +    if (ret < 0)
>>> +        goto error;
>>> +
>>> +    hwctx->internal->cuda_device = -1;
>>> +    for (i = 0; i < device_count; i++) {
>>> +        CUdevice dev;
>>> +        CUuuid uuid;
>>> +
>>> +        ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
>>> +            hwctx->internal->cuda_device = dev;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    if (hwctx->internal->cuda_device == -1) {
>>> +        av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA 
>>> device.\n");
>>>
>>
>> This error is maybe more like "Can't find the matching CUDA device to the 
>> supplied Vulkan device".
>>
> 
> Let's keep it that way, its not meant to be vulkan specific, though its only 
> used by vulkan currently.

Good point!  Fair enough.

>>> +        goto error;
>>> +    }
>>> +
>>> +    hwctx->internal->flags = flags;
>>> +
>>> +    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
>>> +        ret = 
>>> CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, 
>>> &dev_flags, &dev_active));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        if (dev_active && dev_flags != desired_flags) {
>>> +            av_log(device_ctx, AV_LOG_ERROR, "Primary context already 
>>> active with incompatible flags.\n");
>>> +            goto error;
>>> +        } else if (dev_flags != desired_flags) {
>>> +            ret = 
>>> CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, 
>>> desired_flags));
>>> +            if (ret < 0)
>>> +                goto error;
>>> +        }
>>> +
>>> +        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, 
>>> hwctx->internal->cuda_device));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +    } else {
>>> +        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, 
>>> hwctx->internal->cuda_device));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
>>> +    }
>>> +
>>> +    hwctx->internal->is_allocated = 1;
>>> +
>>> +    // Setting stream to NULL will make functions automatically use the 
>>> default CUstream
>>> +    hwctx->stream = NULL;
>>> +
>>> +    return 0;
>>> +
>>> +error:
>>> +    cuda_device_uninit(device_ctx);
>>> +    return AVERROR_UNKNOWN;
>>> +}
>>> +
>>>  const HWContextType ff_hwcontext_type_cuda = {
>>>  .type                 = AV_HWDEVICE_TYPE_CUDA,
>>>  .name                 = "CUDA",
>>> @@ -397,6 +517,7 @@ const HWContextType ff_hwcontext_type_cuda = {
>>>  .frames_priv_size     = sizeof(CUDAFramesContext),
>>>  
>>>  .device_create        = cuda_device_create,
>>> +    .device_derive        = cuda_device_derive,
>>>  .device_init          = cuda_device_init,
>>>  .device_uninit        = cuda_device_uninit,
>>>  .frames_get_constraints = cuda_frames_get_constraints,
>>> ...
>>> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
>>> new file mode 100644
>>> index 0000000000..d4eb8ffd35
>>> --- /dev/null
>>> +++ b/libavutil/hwcontext_vulkan.c
>>> @@ -0,0 +1,2804 @@
>>> ...
>>> +
>>> +static const struct {
>>> +    enum AVPixelFormat pixfmt;
>>> +    const VkFormat vkfmts[3];
>>> +} vk_pixfmt_map[] = {
>>> +    { AV_PIX_FMT_GRAY8,   { VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_GRAY16,  { VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_GRAYF32, { VK_FORMAT_R32_SFLOAT } },
>>> +
>>> +    { AV_PIX_FMT_NV12, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8G8_UNORM } },
>>> +    { AV_PIX_FMT_P010, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>>
>>
>> Is P010 still safe when the low bits might have any value?
>>
> 
> Our padding is in the top bits. Vulkan defines it in the bottom bits, hence 
> we can't use it.

?  Our padding is definitely in the bottom bits, since it's defined to be 
consistent with all the hardware using it.

> I can't see why it wouldn't be safe. Every code that deals with user-supplied 
> frames must rely they're junk.

Probably close enough for government work, but still slightly off.  If the only 
operation you can do is load it into a float as UNORM then a top value with 
zeroes would be slightly off-white and the colourspace-conversion nazis will 
hunt you down.

>>> +    { AV_PIX_FMT_P016, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_YUV420P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, 
>>> VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_YUV422P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, 
>>> VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_YUV444P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, 
>>> VK_FORMAT_R8_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_YUV420P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, 
>>> VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_YUV422P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, 
>>> VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_YUV444P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, 
>>> VK_FORMAT_R16_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_ABGR,   { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> +    { AV_PIX_FMT_BGRA,   { VK_FORMAT_B8G8R8A8_UNORM } },
>>> +    { AV_PIX_FMT_RGBA,   { VK_FORMAT_R8G8B8A8_UNORM } },
>>> +    { AV_PIX_FMT_RGB24,  { VK_FORMAT_R8G8B8_UNORM } },
>>> +    { AV_PIX_FMT_BGR24,  { VK_FORMAT_B8G8R8_UNORM } },
>>> +    { AV_PIX_FMT_RGB48,  { VK_FORMAT_R16G16B16_UNORM } },
>>> +    { AV_PIX_FMT_RGBA64, { VK_FORMAT_R16G16B16A16_UNORM } },
>>> +    { AV_PIX_FMT_RGB565, { VK_FORMAT_R5G6B5_UNORM_PACK16 } },
>>> +    { AV_PIX_FMT_BGR565, { VK_FORMAT_B5G6R5_UNORM_PACK16 } },
>>> +    { AV_PIX_FMT_BGR0,   { VK_FORMAT_B8G8R8A8_UNORM } },
>>> +    { AV_PIX_FMT_0BGR,   { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> +    { AV_PIX_FMT_RGB0,   { VK_FORMAT_R8G8B8A8_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_GBRPF32, { VK_FORMAT_R32_SFLOAT, VK_FORMAT_R32_SFLOAT, 
>>> VK_FORMAT_R32_SFLOAT } },
>>> +};
>>> +
>>> ...
>>> +static int vulkan_device_create(AVHWDeviceContext *ctx, const char *device,
>>> +                                AVDictionary *opts, int flags)
>>> +{
>>> +    VulkanDeviceSelection dev_select = { 0 };
>>> +    if (device && device[0]) {
>>> +        char *end = NULL;
>>> +        dev_select.index = strtol(device, &end, 10);
>>> +        if (end == device) {
>>> +            dev_select.index = 0;
>>> +            dev_select.name  = device;
>>> +        }
>>> +    }
>>>
>>
>> Is it worth making uuid=f00 work here as well?  (From opts rather than 
>> device: "-init_hw_device vulkan:,uuid=f00".)
>>
> 
> Would like to, but I don't think its worth the extra dependency (libuuid, its 
> widespread on linux but I'd rather not NIH parsing).

Ok.

>>> +
>>> +    return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
>>> +}
>>> ...
>>> +
>>> +static int vulkan_frames_init(AVHWFramesContext *hwfc)
>>> +{
>>> +    int err;
>>> +    AVVkFrame *f;
>>> +    AVVulkanFramesContext *hwctx = hwfc->hwctx;
>>> +    VulkanFramesPriv *fp = hwfc->internal->priv;
>>> +    AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
>>> +    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> +
>>> +    if (hwfc->pool)
>>> +        return 0;
>>> +
>>> +    /* Default pool flags */
>>> +    hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
>>> +                    VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
>>> +
>>> +    hwctx->usage |= DEFAULT_USAGE_FLAGS;
>>>
>>
>> Is it possible that this disallows some use-cases in a device where those 
>> default flags need not be not supported?  For example, some sort of magic 
>> image-writer like a video decoder where the output images can only ever be 
>> used as a source by non-magic operations.  Baking that into the API (as in 
>> the comment on usage in the header) seems bad if so.
>>
> 
> You need to support those flags to do anything useful with the images.
> This restricts read-only images since those don't support the STORAGE flag. 
> Such images are the planar images, which DO support the STORAGE flag but only 
> for their subimages (e.g. individual planes). This is in spec, regardless 
> what Intel says and disagrees with (they advise to alias memory instead). We 
> don't use them anyway, and if a user wanted to repackage received frames as 
> planar (or unpackage them) they still can.

I feel like devices with weird memory are still a problem here, but reading the 
code again I can see that the user-provided pool actually trumps the concern 
anyway - if anything funny is going on then the user can set it up however they 
want.

>>> ...
>>> +
>>> +static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, 
>>> AVVkFrame **frame,
>>> +                                          AVDRMFrameDescriptor *desc)
>>> +{
>>> +    int err = 0;
>>> +    VkResult ret;
>>> +    AVVkFrame *f;
>>> +    AVHWDeviceContext *ctx = hwfc->device_ctx;
>>> +    AVVulkanDeviceContext *hwctx = ctx->hwctx;
>>> +    VulkanDevicePriv *p = ctx->internal->priv;
>>> +    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> +    const AVPixFmtDescriptor *fmt_desc = 
>>> av_pix_fmt_desc_get(hwfc->sw_format);
>>> +    const int has_modifiers = p->extensions & EXT_DRM_MODIFIER_FLAGS;
>>> +    VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS];
>>> +    VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS];
>>> +    VkExternalMemoryHandleTypeFlagBits htype = 
>>> VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
>>> +
>>> +    VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdPropertiesKHR);
>>> +
>>> +    for (int i = 0; i < desc->nb_layers; i++) {
>>> +        if (desc->layers[i].nb_planes > 1) {
>>> +            av_log(ctx, AV_LOG_ERROR, "Cannot import DMABUFS with more 
>>> than 1 "
>>> +                                      "plane per layer!\n");
>>> +            return AVERROR(EINVAL);
>>> +        }
>>> +
>>> +        if (drm_to_vulkan_fmt(desc->layers[i].format) == 
>>> VK_FORMAT_UNDEFINED) {
>>> +            av_log(ctx, AV_LOG_ERROR, "Unsupported DMABUF layer 
>>> format!\n");
>>>
>>
>> Maybe say what the unsupported format is for someone reporting the message, 
>> since this is probably relatively easy to hit (e.g. YUYV).
>>
> 
> Sure, I'll just use the handy DRM API/define to translate DRM FOURCC uint32_t 
> to a string.
> Oh wait, I can't because there is no such API. I've had to manually go 
> through every single define in the past, relying on blind luck and 
> speculation to match those up. DRM devs want people to suffer, as if the 
> big-endian confusion-inducing FOURCC isn't evidence enough for it.
> Not willing to NIH something to pull the chars out of it yet. Next time I 
> have to look up a drm format id maybe.

Print it with %#08x.  I just think it should appear in the log so that it's 
possible for someone reading to log to see the format which failed; if they 
have to decode it a little bit themselves that isn't a disaster.

>>> +            return AVERROR(EINVAL);
>>> +        }
>>> +    }
>>> +
>>> +    if (!(f = av_mallocz(sizeof(*f)))) {
>>> +        av_log(ctx, AV_LOG_ERROR, "Unable to allocate memory for 
>>> AVVkFrame!\n");
>>> +        err = AVERROR(ENOMEM);
>>> +        goto fail;
>>> +    }
>>> +
>>> ...
>>> +
>>> +static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
>>> +                             const AVFrame *src, int flags)
>>> +{
>>> +    int err = 0;
>>> +    VkResult ret;
>>> +    AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> +    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> +    AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
>>> +    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> +    VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
>>> +    VkImageDrmFormatModifierPropertiesEXT drm_mod = {
>>> +        .sType = 
>>> VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
>>> +    };
>>>
>>
>> Do you need a sync here for any writing being finished, or is it implicit 
>> somehow below?
>>
> 
> There isn't. This would be where we export a semaphore to VAAPI.
> We could make a CPU sync by converting the image to read optimal and waiting 
> on the command queue's semaphore, but that would need another exec context.
> Should we?
> I don't have a way to test this ATM, my test program is gone, so if I do 
> touch it I'll need to write a new one, which is a lot of work. With ffmpeg I 
> can only test raw exporting without it actually being used by anything, since 
> everything complains about some missing contexts IIRC.

You often get away with this because some part of the next operation ends up 
queued on the same hardware submission ring.

I guess leave it unless you want to add the extra sync, but add a comment for 
the next person reading it.

>>> +
>>> +    AVDRMFrameDescriptor *drm_desc = av_mallocz(sizeof(*drm_desc));
>>> +    if (!drm_desc)
>>> +        return AVERROR(ENOMEM);
>>> +
>>> +    err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, 
>>> &vulkan_unmap_to_drm, drm_desc);
>>> +    if (err < 0)
>>> +        goto end;
>>> +
>>> +    if (p->extensions & EXT_DRM_MODIFIER_FLAGS) {
>>> +        VK_LOAD_PFN(hwctx->inst, vkGetImageDrmFormatModifierPropertiesEXT);
>>> +        ret = pfn_vkGetImageDrmFormatModifierPropertiesEXT(hwctx->act_dev, 
>>> f->img[0],
>>> +                                                           &drm_mod);
>>> +        if (ret != VK_SUCCESS) {
>>> +            av_log(hwfc, AV_LOG_ERROR, "Failed to retrieve DRM format 
>>> modifier!\n");
>>> +            err = AVERROR_EXTERNAL;
>>> +            goto end;
>>> +        }
>>> +    }
>>> +
>>> ...
>>> +
>>> +static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame 
>>> *dst,
>>> +                                       const AVFrame *src)
>>> +{
>>> +    int err = 0;
>>> +    AVFrame tmp;
>>> +    AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> +    AVHWDeviceContext *dev_ctx = hwfc->device_ctx;
>>> +    ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } };
>>> +    const int planes = av_pix_fmt_count_planes(dst->format);
>>> +    int log2_chroma = av_pix_fmt_desc_get(dst->format)->log2_chroma_h;
>>> +
>>> +    if (dst->width > hwfc->width || dst->height > hwfc->height)
>>> +        return AVERROR(EINVAL);
>>> +
>>> +    /* For linear, host visiable images */
>>> +    if (f->tiling == VK_IMAGE_TILING_LINEAR &&
>>> +        f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
>>>
>>
>> Is it generally expected that this is actually faster than the next option?  
>> (Because evil uncached memory is a thing.)
>>
> 
> Could be. On Intel it was faster, but now its as fast as tiled images. On 
> NVIDIA its slower. Its still faster on Intel for simple short filter chains. 
> Keeping it for cheap devices.

Yep, fair enough.

>>> +        AVFrame *map = av_frame_alloc();
>>> +        if (!map)
>>> +            return AVERROR(ENOMEM);
>>> +        map->format = dst->format;
>>> +
>>> +        err = vulkan_map_frame_to_mem(hwfc, map, src, AV_HWFRAME_MAP_READ);
>>> +        if (err)
>>> +            return err;
>>> +
>>> +        err = av_frame_copy(dst, map);
>>> +        av_frame_free(&map);
>>> +        return err;
>>> +    }
>>> ...
Thanks,

- Mark
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Vulkan hwcontext and filters

Reply via email to