On 10/11/2025 9:38 PM, Michał Winiarski wrote:
> Contiguous PF GGTT VMAs can be scarce after creating VFs.
> Increase the GuC buffer cache size to 8M for PF so that we can fit GuC
> migration data (which currently maxes out at just over 4M) and use the
> cache instead of allocating fresh BOs.
>
> Signed-off-by: Michał Winiarski <[email protected]>
> ---
> drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 54 +++++++------------
> drivers/gpu/drm/xe/xe_guc.c | 2 +-
> 2 files changed, 20 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> index 50f09994e2854..8b96eff8df93b 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> @@ -11,7 +11,7 @@
> #include "xe_gt_sriov_pf_helpers.h"
> #include "xe_gt_sriov_pf_migration.h"
> #include "xe_gt_sriov_printk.h"
> -#include "xe_guc.h"
> +#include "xe_guc_buf.h"
> #include "xe_guc_ct.h"
> #include "xe_sriov.h"
> #include "xe_sriov_pf_migration.h"
> @@ -57,73 +57,57 @@ static int pf_send_guc_query_vf_state_size(struct xe_gt
> *gt, unsigned int vfid)
>
> /* Return: number of state dwords saved or a negative error code on failure
> */
> static int pf_send_guc_save_vf_state(struct xe_gt *gt, unsigned int vfid,
> - void *buff, size_t size)
> + void *dst, size_t size)
> {
> const int ndwords = size / sizeof(u32);
> - struct xe_tile *tile = gt_to_tile(gt);
> - struct xe_device *xe = tile_to_xe(tile);
> struct xe_guc *guc = &gt->uc.guc;
> - struct xe_bo *bo;
> + CLASS(xe_guc_buf, buf)(&guc->buf, ndwords);
> int ret;
>
> xe_gt_assert(gt, size % sizeof(u32) == 0);
> xe_gt_assert(gt, size == ndwords * sizeof(u32));
>
> - bo = xe_bo_create_pin_map_novm(xe, tile,
> - ALIGN(size, PAGE_SIZE),
> - ttm_bo_type_kernel,
> - XE_BO_FLAG_SYSTEM |
> - XE_BO_FLAG_GGTT |
> - XE_BO_FLAG_GGTT_INVALIDATE, false);
> - if (IS_ERR(bo))
> - return PTR_ERR(bo);
> + if (!xe_guc_buf_is_valid(buf))
> + return -ENOBUFS;
> +
> + memset(xe_guc_buf_cpu_ptr(buf), 0, size);
Is this memset necessary? GuC will overwrite the buffer anyway.
>
> ret = guc_action_vf_save_restore(guc, vfid, GUC_PF_OPCODE_VF_SAVE,
> - xe_bo_ggtt_addr(bo), ndwords);
> - if (!ret)
> + xe_guc_buf_flush(buf), ndwords);
> + if (!ret) {
> ret = -ENODATA;
> - else if (ret > ndwords)
> + } else if (ret > ndwords) {
> ret = -EPROTO;
> - else if (ret > 0)
> - xe_map_memcpy_from(xe, buff, &bo->vmap, 0, ret * sizeof(u32));
> + } else if (ret > 0) {
> + xe_guc_buf_sync(buf);
> + memcpy(dst, xe_guc_buf_cpu_ptr(buf), ret * sizeof(u32));
with a small change suggested earlier, this could be just:
memcpy(dst, xe_guc_buf_sync(buf), ret * sizeof(u32));
> + }
>
> - xe_bo_unpin_map_no_vm(bo);
> return ret;
> }
>
> /* Return: number of state dwords restored or a negative error code on
> failure */
> static int pf_send_guc_restore_vf_state(struct xe_gt *gt, unsigned int vfid,
> - const void *buff, size_t size)
> + const void *src, size_t size)
> {
> const int ndwords = size / sizeof(u32);
> - struct xe_tile *tile = gt_to_tile(gt);
> - struct xe_device *xe = tile_to_xe(tile);
> struct xe_guc *guc = &gt->uc.guc;
> - struct xe_bo *bo;
> + CLASS(xe_guc_buf_from_data, buf)(&guc->buf, src, size);
> int ret;
>
> xe_gt_assert(gt, size % sizeof(u32) == 0);
> xe_gt_assert(gt, size == ndwords * sizeof(u32));
>
> - bo = xe_bo_create_pin_map_novm(xe, tile,
> - ALIGN(size, PAGE_SIZE),
> - ttm_bo_type_kernel,
> - XE_BO_FLAG_SYSTEM |
> - XE_BO_FLAG_GGTT |
> - XE_BO_FLAG_GGTT_INVALIDATE, false);
> - if (IS_ERR(bo))
> - return PTR_ERR(bo);
> -
> - xe_map_memcpy_to(xe, &bo->vmap, 0, buff, size);
> + if (!xe_guc_buf_is_valid(buf))
> + return -ENOBUFS;
>
> ret = guc_action_vf_save_restore(guc, vfid, GUC_PF_OPCODE_VF_RESTORE,
> - xe_bo_ggtt_addr(bo), ndwords);
> + xe_guc_buf_flush(buf), ndwords);
> if (!ret)
> ret = -ENODATA;
> else if (ret > ndwords)
> ret = -EPROTO;
>
> - xe_bo_unpin_map_no_vm(bo);
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index ccc7c60ae9b77..71ca06d1af62b 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -857,7 +857,7 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
> if (ret)
> return ret;
>
> - ret = xe_guc_buf_cache_init(&guc->buf, SZ_8K);
> + ret = xe_guc_buf_cache_init(&guc->buf, IS_SRIOV_PF(guc_to_xe(guc)) ?
> SZ_8M : SZ_8K);
Shouldn't we also check for xe_sriov_pf_migration_supported()?
Also, shouldn't we get this SZ_8M from somewhere in the PF code?
And maybe the PF could (one day) query that somehow from the GuC?
> if (ret)
> return ret;
>