[Public] Is it possible to just push KFD device initialization later during the KGD initialization? >From a brief look, it doesn't seem like the KFD would operate (let alone the >KMS driver not surviving) if late IP initialization failed anyway. Chunking KFD topology settings into separate phases seems a bit awkward.
Jon > -----Original Message----- > From: Jesse.Zhang <[email protected]> > Sent: Thursday, May 29, 2025 3:58 AM > To: [email protected] > Cc: Deucher, Alexander <[email protected]>; Kuehling, Felix > <[email protected]>; Kim, Jonathan <[email protected]>; Zhang, > Jesse(Jie) <[email protected]> > Subject: [PATCH V2] drm/amdkfd: add late initialization support for amdkfd > device > > Add support for late initialization of KFD device capabilities that > depend on information only available after IP blocks are fully initialized. > This is particularly needed for SDMA queue reset capabilities which require > sdma.supported_reset to be populated during AMDGPU IP late init. > > Key changes: > 1. Added amdgpu_amdkfd_device_late_init() interface > 2. Implemented kgd2kfd_device_late_init() in KFD > 3. Added kfd_topology_update_capabilities() to update node properties > 4. Integrated into amdgpu_device_ip_late_init() sequence > > v2: remove the include "kfd_priv.h" > > Signed-off-by: Jesse Zhang <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++ > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 6 ++++++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 ++++++++++++++++++++++ > 6 files changed, 47 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index 4cec3a873995..d80745f60873 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -232,6 +232,11 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device > *adev) > } > } > > +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev) > +{ > + return kgd2kfd_device_late_init(adev->kfd.dev); > +} > + > void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev) > { > if (adev->kfd.dev) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index b6ca41859b53..6c8bbcc7f177 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -160,6 +160,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device > *adev, > const void *ih_ring_entry); > void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); > void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); > +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev); > void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev); > int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev); > void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev); > @@ -410,6 +411,7 @@ void kgd2kfd_exit(void); > struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf); > bool kgd2kfd_device_init(struct kfd_dev *kfd, > const struct kgd2kfd_shared_resources *gpu_resources); > +int kgd2kfd_device_late_init(struct kfd_dev *kfd); > void kgd2kfd_device_exit(struct kfd_dev *kfd); > void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); > int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm); > @@ -433,6 +435,11 @@ static inline int kgd2kfd_init(void) > return -ENOENT; > } > > +static inline int kgd2kfd_device_late_init(struct kfd_dev *kfd) > +{ > + return -ENOENT; > +} > + > static inline void kgd2kfd_exit(void) > { > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index d9d8cd063829..b7c0281cb6ad 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3395,6 +3395,12 @@ static int amdgpu_device_ip_late_init(struct > amdgpu_device *adev) > return r; > } > > + amdgpu_amdkfd_device_late_init(adev); > + if (r) { > + DRM_ERROR("amdkfd late init failed %d", r); > + return r; > + } > + > if (!amdgpu_reset_in_recovery(adev)) > amdgpu_ras_set_error_query_ready(adev, true); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index b9c82be6ce13..3aece03ad092 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -947,6 +947,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, > return kfd->init_complete; > } > > +int kgd2kfd_device_late_init(struct kfd_dev *kfd) > +{ > + kfd_topology_update_capabilities(kfd); > + return 0; > +} > + > void kgd2kfd_device_exit(struct kfd_dev *kfd) > { > if (kfd->init_complete) { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index d221c58dccc3..1eee4d625ba2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1134,6 +1134,7 @@ int kfd_topology_init(void); > void kfd_topology_shutdown(void); > int kfd_topology_add_device(struct kfd_node *gpu); > int kfd_topology_remove_device(struct kfd_node *gpu); > +void kfd_topology_update_capabilities(struct kfd_dev *kfd); > struct kfd_topology_device *kfd_topology_device_by_proximity_domain( > uint32_t proximity_domain); > struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock( > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > index 09011d78f700..052215faff76 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > @@ -2026,6 +2026,28 @@ static void kfd_topology_set_capabilities(struct > kfd_topology_device *dev) > kfd_topology_set_dbg_firmware_support(dev); > } > > +void kfd_topology_update_capabilities(struct kfd_dev *kfd) > +{ > + struct amdgpu_device *adev = kfd->adev; > + struct kfd_topology_device *kdev; > + struct amdgpu_device *node_adev; > + > + list_for_each_entry(kdev, &topology_device_list, list) { > + > + if (!kdev->gpu || !kdev->gpu->adev) > + continue; > + > + /* Compare the underlying adev pointers, not the top-level > structs > directly */ > + if (kdev->gpu->adev != adev) > + continue; > + > + node_adev = kdev->gpu->adev; > + if (KFD_GC_VERSION(kdev->gpu) < IP_VERSION(10, 0, 0) && > + (node_adev->sdma.supported_reset & > AMDGPU_RESET_TYPE_PER_QUEUE)) > + kdev->node_props.capability2 |= > HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; > + } > +} > + > int kfd_topology_add_device(struct kfd_node *gpu) > { > uint32_t gpu_id; > -- > 2.49.0
