On 19/01/2020 13:11, Lynne wrote:
> Jan 18, 2020, 20:23 by [email protected]:
>
>> On 10/01/2020 21:05, Lynne wrote:
>>
>> The CUDA parts look like they could be split off into a separate commit?
>> (It's already huge.)
>>
>
> I don't really want broken commits or commits with dead code.
Shouldn't be - you've got nice #ifdef markers surrounding exactly the right
pieces to splice out :)
>>> @@ -3639,7 +3641,7 @@ avformat_deps="avcodec avutil"
>>> avformat_suggest="libm network zlib"
>>> avresample_deps="avutil"
>>> avresample_suggest="libm"
>>> -avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32
>>> vaapi videotoolbox corefoundation corevideo coremedia bcrypt"
>>> +avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32
>>> vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt"
>>> postproc_deps="avutil gpl"
>>> postproc_suggest="libm"
>>> swresample_deps="avutil"
>>> @@ -6626,6 +6628,9 @@ enabled vdpau &&
>>>
>>> enabled crystalhd && check_lib crystalhd "stdint.h
>>> libcrystalhd/libcrystalhd_if.h" DtsCrystalHDVersion -lcrystalhd
>>>
>>> +enabled vulkan &&
>>> + require_pkg_config vulkan "vulkan >= 1.1.97" "vulkan/vulkan.h"
>>> vkCreateInstance
>>>
>>
>> Presumably you have some specific requirement in mind which wants this
>> version - can you note it somewhere? (Either here or in the commit message.)
>>
>
> The DRM export/import and I think the semaphore import API.
> Its not a new version, its been out for a long time.
About a year, which probably isn't long enough to assume it's everywhere. I
guess it doesn't really matter, though, because it will only ever increase when
something extra is added.
>>> ...
>>> +static int cuda_device_derive(AVHWDeviceContext *device_ctx,
>>> + AVHWDeviceContext *src_ctx,
>>> + int flags) {
>>> + AVCUDADeviceContext *hwctx = device_ctx->hwctx;
>>> + CudaFunctions *cu;
>>> + const char *src_uuid = NULL;
>>> + CUcontext dummy;
>>> + int ret, i, device_count, dev_active = 0;
>>> + unsigned int dev_flags = 0;
>>> +
>>> + const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;
>>> +
>>> + switch (src_ctx->type) {
>>> +#if CONFIG_VULKAN
>>> + case AV_HWDEVICE_TYPE_VULKAN: {
>>> + AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
>>> + src_uuid = vkctx->device_uuid;
>>> + break;
>>> + }
>>> +#endif
>>> + default:
>>> + return AVERROR(ENOSYS);
>>> + }
>>> +
>>> + if (!src_uuid) {
>>> + av_log(device_ctx, AV_LOG_ERROR,
>>> + "Failed to get UUID of source device.\n");
>>> + goto error;
>>> + }
>>> +
>>> + if (cuda_device_init(device_ctx) < 0)
>>> + goto error;
>>> +
>>> + cu = hwctx->internal->cuda_dl;
>>> +
>>> + ret = CHECK_CU(cu->cuInit(0));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + hwctx->internal->cuda_device = -1;
>>> + for (i = 0; i < device_count; i++) {
>>> + CUdevice dev;
>>> + CUuuid uuid;
>>> +
>>> + ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
>>> + hwctx->internal->cuda_device = dev;
>>> + break;
>>> + }
>>> + }
>>> +
>>> + if (hwctx->internal->cuda_device == -1) {
>>> + av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA
>>> device.\n");
>>>
>>
>> This error is maybe more like "Can't find the matching CUDA device to the
>> supplied Vulkan device".
>>
>
> Let's keep it that way, its not meant to be vulkan specific, though its only
> used by vulkan currently.
Good point! Fair enough.
>>> + goto error;
>>> + }
>>> +
>>> + hwctx->internal->flags = flags;
>>> +
>>> + if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
>>> + ret =
>>> CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
>>> &dev_flags, &dev_active));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + if (dev_active && dev_flags != desired_flags) {
>>> + av_log(device_ctx, AV_LOG_ERROR, "Primary context already
>>> active with incompatible flags.\n");
>>> + goto error;
>>> + } else if (dev_flags != desired_flags) {
>>> + ret =
>>> CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
>>> desired_flags));
>>> + if (ret < 0)
>>> + goto error;
>>> + }
>>> +
>>> + ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
>>> hwctx->internal->cuda_device));
>>> + if (ret < 0)
>>> + goto error;
>>> + } else {
>>> + ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
>>> hwctx->internal->cuda_device));
>>> + if (ret < 0)
>>> + goto error;
>>> +
>>> + CHECK_CU(cu->cuCtxPopCurrent(&dummy));
>>> + }
>>> +
>>> + hwctx->internal->is_allocated = 1;
>>> +
>>> + // Setting stream to NULL will make functions automatically use the
>>> default CUstream
>>> + hwctx->stream = NULL;
>>> +
>>> + return 0;
>>> +
>>> +error:
>>> + cuda_device_uninit(device_ctx);
>>> + return AVERROR_UNKNOWN;
>>> +}
>>> +
>>> const HWContextType ff_hwcontext_type_cuda = {
>>> .type = AV_HWDEVICE_TYPE_CUDA,
>>> .name = "CUDA",
>>> @@ -397,6 +517,7 @@ const HWContextType ff_hwcontext_type_cuda = {
>>> .frames_priv_size = sizeof(CUDAFramesContext),
>>>
>>> .device_create = cuda_device_create,
>>> + .device_derive = cuda_device_derive,
>>> .device_init = cuda_device_init,
>>> .device_uninit = cuda_device_uninit,
>>> .frames_get_constraints = cuda_frames_get_constraints,
>>> ...
>>> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
>>> new file mode 100644
>>> index 0000000000..d4eb8ffd35
>>> --- /dev/null
>>> +++ b/libavutil/hwcontext_vulkan.c
>>> @@ -0,0 +1,2804 @@
>>> ...
>>> +
>>> +static const struct {
>>> + enum AVPixelFormat pixfmt;
>>> + const VkFormat vkfmts[3];
>>> +} vk_pixfmt_map[] = {
>>> + { AV_PIX_FMT_GRAY8, { VK_FORMAT_R8_UNORM } },
>>> + { AV_PIX_FMT_GRAY16, { VK_FORMAT_R16_UNORM } },
>>> + { AV_PIX_FMT_GRAYF32, { VK_FORMAT_R32_SFLOAT } },
>>> +
>>> + { AV_PIX_FMT_NV12, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8G8_UNORM } },
>>> + { AV_PIX_FMT_P010, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>>
>>
>> Is P010 still safe when the low bits might have any value?
>>
>
> Our padding is in the top bits. Vulkan defines it in the bottom bits, hence
> we can't use it.
? Our padding is definitely in the bottom bits, since it's defined to be
consistent with all the hardware using it.
> I can't see why it wouldn't be safe. Every code that deals with user-supplied
> frames must rely they're junk.
Probably close enough for government work, but still slightly off. If the only
operation you can do is load it into a float as UNORM then a top value with
zeroes would be slightly off-white and the colourspace-conversion nazis will
hunt you down.
>>> + { AV_PIX_FMT_P016, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>> +
>>> + { AV_PIX_FMT_YUV420P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM,
>>> VK_FORMAT_R8_UNORM } },
>>> + { AV_PIX_FMT_YUV422P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM,
>>> VK_FORMAT_R8_UNORM } },
>>> + { AV_PIX_FMT_YUV444P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM,
>>> VK_FORMAT_R8_UNORM } },
>>> +
>>> + { AV_PIX_FMT_YUV420P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM,
>>> VK_FORMAT_R16_UNORM } },
>>> + { AV_PIX_FMT_YUV422P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM,
>>> VK_FORMAT_R16_UNORM } },
>>> + { AV_PIX_FMT_YUV444P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM,
>>> VK_FORMAT_R16_UNORM } },
>>> +
>>> + { AV_PIX_FMT_ABGR, { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> + { AV_PIX_FMT_BGRA, { VK_FORMAT_B8G8R8A8_UNORM } },
>>> + { AV_PIX_FMT_RGBA, { VK_FORMAT_R8G8B8A8_UNORM } },
>>> + { AV_PIX_FMT_RGB24, { VK_FORMAT_R8G8B8_UNORM } },
>>> + { AV_PIX_FMT_BGR24, { VK_FORMAT_B8G8R8_UNORM } },
>>> + { AV_PIX_FMT_RGB48, { VK_FORMAT_R16G16B16_UNORM } },
>>> + { AV_PIX_FMT_RGBA64, { VK_FORMAT_R16G16B16A16_UNORM } },
>>> + { AV_PIX_FMT_RGB565, { VK_FORMAT_R5G6B5_UNORM_PACK16 } },
>>> + { AV_PIX_FMT_BGR565, { VK_FORMAT_B5G6R5_UNORM_PACK16 } },
>>> + { AV_PIX_FMT_BGR0, { VK_FORMAT_B8G8R8A8_UNORM } },
>>> + { AV_PIX_FMT_0BGR, { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> + { AV_PIX_FMT_RGB0, { VK_FORMAT_R8G8B8A8_UNORM } },
>>> +
>>> + { AV_PIX_FMT_GBRPF32, { VK_FORMAT_R32_SFLOAT, VK_FORMAT_R32_SFLOAT,
>>> VK_FORMAT_R32_SFLOAT } },
>>> +};
>>> +
>>> ...
>>> +static int vulkan_device_create(AVHWDeviceContext *ctx, const char *device,
>>> + AVDictionary *opts, int flags)
>>> +{
>>> + VulkanDeviceSelection dev_select = { 0 };
>>> + if (device && device[0]) {
>>> + char *end = NULL;
>>> + dev_select.index = strtol(device, &end, 10);
>>> + if (end == device) {
>>> + dev_select.index = 0;
>>> + dev_select.name = device;
>>> + }
>>> + }
>>>
>>
>> Is it worth making uuid=f00 work here as well? (From opts rather than
>> device: "-init_hw_device vulkan:,uuid=f00".)
>>
>
> Would like to, but I don't think its worth the extra dependency (libuuid, its
> widespread on linux but I'd rather not NIH parsing).
Ok.
>>> +
>>> + return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
>>> +}
>>> ...
>>> +
>>> +static int vulkan_frames_init(AVHWFramesContext *hwfc)
>>> +{
>>> + int err;
>>> + AVVkFrame *f;
>>> + AVVulkanFramesContext *hwctx = hwfc->hwctx;
>>> + VulkanFramesPriv *fp = hwfc->internal->priv;
>>> + AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
>>> + VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> +
>>> + if (hwfc->pool)
>>> + return 0;
>>> +
>>> + /* Default pool flags */
>>> + hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
>>> + VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
>>> +
>>> + hwctx->usage |= DEFAULT_USAGE_FLAGS;
>>>
>>
>> Is it possible that this disallows some use-cases in a device where those
>> default flags need not be not supported? For example, some sort of magic
>> image-writer like a video decoder where the output images can only ever be
>> used as a source by non-magic operations. Baking that into the API (as in
>> the comment on usage in the header) seems bad if so.
>>
>
> You need to support those flags to do anything useful with the images.
> This restricts read-only images since those don't support the STORAGE flag.
> Such images are the planar images, which DO support the STORAGE flag but only
> for their subimages (e.g. individual planes). This is in spec, regardless
> what Intel says and disagrees with (they advise to alias memory instead). We
> don't use them anyway, and if a user wanted to repackage received frames as
> planar (or unpackage them) they still can.
I feel like devices with weird memory are still a problem here, but reading the
code again I can see that the user-provided pool actually trumps the concern
anyway - if anything funny is going on then the user can set it up however they
want.
>>> ...
>>> +
>>> +static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc,
>>> AVVkFrame **frame,
>>> + AVDRMFrameDescriptor *desc)
>>> +{
>>> + int err = 0;
>>> + VkResult ret;
>>> + AVVkFrame *f;
>>> + AVHWDeviceContext *ctx = hwfc->device_ctx;
>>> + AVVulkanDeviceContext *hwctx = ctx->hwctx;
>>> + VulkanDevicePriv *p = ctx->internal->priv;
>>> + const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> + const AVPixFmtDescriptor *fmt_desc =
>>> av_pix_fmt_desc_get(hwfc->sw_format);
>>> + const int has_modifiers = p->extensions & EXT_DRM_MODIFIER_FLAGS;
>>> + VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS];
>>> + VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS];
>>> + VkExternalMemoryHandleTypeFlagBits htype =
>>> VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
>>> +
>>> + VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdPropertiesKHR);
>>> +
>>> + for (int i = 0; i < desc->nb_layers; i++) {
>>> + if (desc->layers[i].nb_planes > 1) {
>>> + av_log(ctx, AV_LOG_ERROR, "Cannot import DMABUFS with more
>>> than 1 "
>>> + "plane per layer!\n");
>>> + return AVERROR(EINVAL);
>>> + }
>>> +
>>> + if (drm_to_vulkan_fmt(desc->layers[i].format) ==
>>> VK_FORMAT_UNDEFINED) {
>>> + av_log(ctx, AV_LOG_ERROR, "Unsupported DMABUF layer
>>> format!\n");
>>>
>>
>> Maybe say what the unsupported format is for someone reporting the message,
>> since this is probably relatively easy to hit (e.g. YUYV).
>>
>
> Sure, I'll just use the handy DRM API/define to translate DRM FOURCC uint32_t
> to a string.
> Oh wait, I can't because there is no such API. I've had to manually go
> through every single define in the past, relying on blind luck and
> speculation to match those up. DRM devs want people to suffer, as if the
> big-endian confusion-inducing FOURCC isn't evidence enough for it.
> Not willing to NIH something to pull the chars out of it yet. Next time I
> have to look up a drm format id maybe.
Print it with %#08x. I just think it should appear in the log so that it's
possible for someone reading to log to see the format which failed; if they
have to decode it a little bit themselves that isn't a disaster.
>>> + return AVERROR(EINVAL);
>>> + }
>>> + }
>>> +
>>> + if (!(f = av_mallocz(sizeof(*f)))) {
>>> + av_log(ctx, AV_LOG_ERROR, "Unable to allocate memory for
>>> AVVkFrame!\n");
>>> + err = AVERROR(ENOMEM);
>>> + goto fail;
>>> + }
>>> +
>>> ...
>>> +
>>> +static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
>>> + const AVFrame *src, int flags)
>>> +{
>>> + int err = 0;
>>> + VkResult ret;
>>> + AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> + VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> + AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
>>> + const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> + VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
>>> + VkImageDrmFormatModifierPropertiesEXT drm_mod = {
>>> + .sType =
>>> VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
>>> + };
>>>
>>
>> Do you need a sync here for any writing being finished, or is it implicit
>> somehow below?
>>
>
> There isn't. This would be where we export a semaphore to VAAPI.
> We could make a CPU sync by converting the image to read optimal and waiting
> on the command queue's semaphore, but that would need another exec context.
> Should we?
> I don't have a way to test this ATM, my test program is gone, so if I do
> touch it I'll need to write a new one, which is a lot of work. With ffmpeg I
> can only test raw exporting without it actually being used by anything, since
> everything complains about some missing contexts IIRC.
You often get away with this because some part of the next operation ends up
queued on the same hardware submission ring.
I guess leave it unless you want to add the extra sync, but add a comment for
the next person reading it.
>>> +
>>> + AVDRMFrameDescriptor *drm_desc = av_mallocz(sizeof(*drm_desc));
>>> + if (!drm_desc)
>>> + return AVERROR(ENOMEM);
>>> +
>>> + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
>>> &vulkan_unmap_to_drm, drm_desc);
>>> + if (err < 0)
>>> + goto end;
>>> +
>>> + if (p->extensions & EXT_DRM_MODIFIER_FLAGS) {
>>> + VK_LOAD_PFN(hwctx->inst, vkGetImageDrmFormatModifierPropertiesEXT);
>>> + ret = pfn_vkGetImageDrmFormatModifierPropertiesEXT(hwctx->act_dev,
>>> f->img[0],
>>> + &drm_mod);
>>> + if (ret != VK_SUCCESS) {
>>> + av_log(hwfc, AV_LOG_ERROR, "Failed to retrieve DRM format
>>> modifier!\n");
>>> + err = AVERROR_EXTERNAL;
>>> + goto end;
>>> + }
>>> + }
>>> +
>>> ...
>>> +
>>> +static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame
>>> *dst,
>>> + const AVFrame *src)
>>> +{
>>> + int err = 0;
>>> + AVFrame tmp;
>>> + AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> + AVHWDeviceContext *dev_ctx = hwfc->device_ctx;
>>> + ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } };
>>> + const int planes = av_pix_fmt_count_planes(dst->format);
>>> + int log2_chroma = av_pix_fmt_desc_get(dst->format)->log2_chroma_h;
>>> +
>>> + if (dst->width > hwfc->width || dst->height > hwfc->height)
>>> + return AVERROR(EINVAL);
>>> +
>>> + /* For linear, host visiable images */
>>> + if (f->tiling == VK_IMAGE_TILING_LINEAR &&
>>> + f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
>>>
>>
>> Is it generally expected that this is actually faster than the next option?
>> (Because evil uncached memory is a thing.)
>>
>
> Could be. On Intel it was faster, but now its as fast as tiled images. On
> NVIDIA its slower. Its still faster on Intel for simple short filter chains.
> Keeping it for cheap devices.
Yep, fair enough.
>>> + AVFrame *map = av_frame_alloc();
>>> + if (!map)
>>> + return AVERROR(ENOMEM);
>>> + map->format = dst->format;
>>> +
>>> + err = vulkan_map_frame_to_mem(hwfc, map, src, AV_HWFRAME_MAP_READ);
>>> + if (err)
>>> + return err;
>>> +
>>> + err = av_frame_copy(dst, map);
>>> + av_frame_free(&map);
>>> + return err;
>>> + }
>>> ...
Thanks,
- Mark
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".