[Public]

Is it possible to simply move KFD device initialization to a later point in 
the KGD initialization sequence?
From a brief look, it doesn't seem like the KFD would operate (nor would the 
KMS driver survive) if late IP initialization failed anyway.
Splitting the KFD topology setup into separate phases seems a bit awkward.

Jon

> -----Original Message-----
> From: Jesse.Zhang <[email protected]>
> Sent: Thursday, May 29, 2025 3:58 AM
> To: [email protected]
> Cc: Deucher, Alexander <[email protected]>; Kuehling, Felix
> <[email protected]>; Kim, Jonathan <[email protected]>; Zhang,
> Jesse(Jie) <[email protected]>
> Subject: [PATCH V2] drm/amdkfd: add late initialization support for amdkfd 
> device
>
> Add support for late initialization of KFD device capabilities that
> depend on information only available after IP blocks are fully initialized.
> This is particularly needed for SDMA queue reset capabilities which require
> sdma.supported_reset to be populated during AMDGPU IP late init.
>
> Key changes:
> 1. Added amdgpu_amdkfd_device_late_init() interface
> 2. Implemented kgd2kfd_device_late_init() in KFD
> 3. Added kfd_topology_update_capabilities() to update node properties
> 4. Integrated into amdgpu_device_ip_late_init() sequence
>
> v2: remove the include "kfd_priv.h"
>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  5 +++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  7 +++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    |  6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c  | 22 ++++++++++++++++++++++
>  6 files changed, 47 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 4cec3a873995..d80745f60873 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -232,6 +232,11 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device
> *adev)
>       }
>  }
>
> +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev)
> +{
> +     return kgd2kfd_device_late_init(adev->kfd.dev);
> +}
> +
>  void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev)
>  {
>       if (adev->kfd.dev) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b6ca41859b53..6c8bbcc7f177 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -160,6 +160,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device
> *adev,
>                       const void *ih_ring_entry);
>  void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
>  void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
> +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev);
>  void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev);
>  int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev);
>  void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev);
> @@ -410,6 +411,7 @@ void kgd2kfd_exit(void);
>  struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf);
>  bool kgd2kfd_device_init(struct kfd_dev *kfd,
>                        const struct kgd2kfd_shared_resources *gpu_resources);
> +int kgd2kfd_device_late_init(struct kfd_dev *kfd);
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
>  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> @@ -433,6 +435,11 @@ static inline int kgd2kfd_init(void)
>       return -ENOENT;
>  }
>
> +static inline int kgd2kfd_device_late_init(struct kfd_dev *kfd)
> +{
> +     return -ENOENT;
> +}
> +
>  static inline void kgd2kfd_exit(void)
>  {
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d9d8cd063829..b7c0281cb6ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3395,6 +3395,12 @@ static int amdgpu_device_ip_late_init(struct
> amdgpu_device *adev)
>               return r;
>       }
>
> +     amdgpu_amdkfd_device_late_init(adev);
> +     if (r) {
> +             DRM_ERROR("amdkfd late init failed %d", r);
> +             return r;
> +     }
> +
>       if (!amdgpu_reset_in_recovery(adev))
>               amdgpu_ras_set_error_query_ready(adev, true);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b9c82be6ce13..3aece03ad092 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -947,6 +947,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>       return kfd->init_complete;
>  }
>
> +int kgd2kfd_device_late_init(struct kfd_dev *kfd)
> +{
> +     kfd_topology_update_capabilities(kfd);
> +     return 0;
> +}
> +
>  void kgd2kfd_device_exit(struct kfd_dev *kfd)
>  {
>       if (kfd->init_complete) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index d221c58dccc3..1eee4d625ba2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1134,6 +1134,7 @@ int kfd_topology_init(void);
>  void kfd_topology_shutdown(void);
>  int kfd_topology_add_device(struct kfd_node *gpu);
>  int kfd_topology_remove_device(struct kfd_node *gpu);
> +void kfd_topology_update_capabilities(struct kfd_dev *kfd);
>  struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
>                                               uint32_t proximity_domain);
>  struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 09011d78f700..052215faff76 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -2026,6 +2026,28 @@ static void kfd_topology_set_capabilities(struct
> kfd_topology_device *dev)
>       kfd_topology_set_dbg_firmware_support(dev);
>  }
>
> +void kfd_topology_update_capabilities(struct kfd_dev *kfd)
> +{
> +     struct amdgpu_device *adev = kfd->adev;
> +     struct kfd_topology_device *kdev;
> +     struct amdgpu_device *node_adev;
> +
> +     list_for_each_entry(kdev, &topology_device_list, list) {
> +
> +             if (!kdev->gpu || !kdev->gpu->adev)
> +                     continue;
> +
> +             /* Compare the underlying adev pointers, not the top-level 
> structs
> directly */
> +             if (kdev->gpu->adev != adev)
> +                     continue;
> +
> +             node_adev = kdev->gpu->adev;
> +             if (KFD_GC_VERSION(kdev->gpu) < IP_VERSION(10, 0, 0) &&
> +                     (node_adev->sdma.supported_reset &
> AMDGPU_RESET_TYPE_PER_QUEUE))
> +                     kdev->node_props.capability2 |=
> HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED;
> +     }
> +}
> +
>  int kfd_topology_add_device(struct kfd_node *gpu)
>  {
>       uint32_t gpu_id;
> --
> 2.49.0

Reply via email to