uvm_fault on unhibernating x395

Florian Obser Mon, 13 May 2024 11:11:06 -0700

OCR'ed and edited a bit, there might be mistakes.
Picture: https://dump.sha256.net/dump/unhibernating_panic.jpg


unhibernating @ block 50329599 Length 243MB
uvm_fault(0xffffffff826b2860, 0x38, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at ttm_resource_manager_evict_all+0x5e: cmpq %rbx, 0x38(%r14)
    TID PID UID PRFLAGS   PFLAGS CPU COMMAND
*   0   0   0   0x100000  0x20   0K  swapper
ttm_resource_manager_evict_all(ffff80000017f260,0,dba63e95861e671,ffff800000170000,ffff800000170058,2)
 at ttm_resource_manager_evict_all+0x5e
amdgpu_device_prepare(ffff800000170058, ffff800000170058, fac0345246af9871, 
ffff800000170058,0,2) at amdgpu_device_prepare+0x61
amdgpu_activate(ffff800000170000, 2, b6a78044d3a303c5,0, ffff80000014400, 
ffffffff8228acc8) at amdgpu_activate+0x55
config_activate_children(ffff800000144c00,2,172aac03cc1e75dd,0,ffff80000014a000,2)
 at config_activate_children+0x85
config_activate_children(ffff80000014a000,2,172aac03cc1e75dd,0,ffff800000144100,2)
 at config_activate_children+0x85
config_activate_children(ffff800000144100,2,172aac03cc1e75dd,0, 
ffff800000030280,2) at config_activate_children+0x85
config_activate_children(ffff800000030280,2,172aac03cc1e7256,2,ffff800000030280,0)
config_suspend_all(2,2,72519cb31f5203, ffffffff82a94a38,0,bfff50) at 
config_suspend_all+0x1ae
hibernate_resume(8c03129a1118d1c,ffffffff82a9460,ffff800000142200,0,0,0) at 
hibernate_resume+0x1b4
diskconf(25bada1afa9d6262,8, ffffffff82538360, ffffffff82a8008,400056f4b50,8) 
at diskconf+0x188
main(0,0,1001000, ffff800037c871f0,ffffffff81fda030,ffffffff82a94f40) at 
main+0x510

I've bisected it to this changeset:
https://codeberg.org/OpenBSD/src/commit/36668b1581688d40ad5fd6631f4f503e6d36091d

Suspend / resume seems to be unaffected by this; reverting the changeset
makes hibernate / unhibernate work again.

diff --git sys/dev/pci/drm/amd/amdgpu/amdgpu.h 
sys/dev/pci/drm/amd/amdgpu/amdgpu.h
index 38a424f16fb..afac024456e 100644
--- sys/dev/pci/drm/amd/amdgpu/amdgpu.h
+++ sys/dev/pci/drm/amd/amdgpu/amdgpu.h
@@ -1398,7 +1398,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
 void amdgpu_driver_release_kms(struct drm_device *dev);
 
 int amdgpu_device_ip_suspend(struct amdgpu_device *adev);
-int amdgpu_device_prepare(struct drm_device *dev);
 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon);
 int amdgpu_device_resume(struct drm_device *dev, bool fbcon);
 u32 amdgpu_get_vblank_counter_kms(struct drm_crtc *crtc);
diff --git sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c 
sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c
index 2d96609911e..7901aeb4dfd 100644
--- sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c
+++ sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c
@@ -1568,7 +1568,6 @@ static void amdgpu_switcheroo_set_state(struct pci_dev 
*pdev,
        } else {
                pr_info("switched off\n");
                dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
-               amdgpu_device_prepare(dev);
                amdgpu_device_suspend(dev, true);
                amdgpu_device_cache_pci_state(pdev);
                /* Shut down the device */
@@ -4206,43 +4205,6 @@ static int amdgpu_device_evict_resources(struct 
amdgpu_device *adev)
 /*
  * Suspend & resume.
  */
-/**
- * amdgpu_device_prepare - prepare for device suspend
- *
- * @dev: drm dev pointer
- *
- * Prepare to put the hw in the suspend state (all asics).
- * Returns 0 for success or an error on failure.
- * Called at driver suspend.
- */
-int amdgpu_device_prepare(struct drm_device *dev)
-{
-       struct amdgpu_device *adev = drm_to_adev(dev);
-       int i, r;
-
-       if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
-               return 0;
-
-       /* Evict the majority of BOs before starting suspend sequence */
-       r = amdgpu_device_evict_resources(adev);
-       if (r)
-               return r;
-
-       flush_delayed_work(&adev->gfx.gfx_off_delay_work);
-
-       for (i = 0; i < adev->num_ip_blocks; i++) {
-               if (!adev->ip_blocks[i].status.valid)
-                       continue;
-               if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
-                       continue;
-               r = adev->ip_blocks[i].version->funcs->prepare_suspend((void 
*)adev);
-               if (r)
-                       return r;
-       }
-
-       return 0;
-}
-
 /**
  * amdgpu_device_suspend - initiate device suspend
  *
@@ -4268,6 +4230,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
 
        adev->in_suspend = true;
 
+       /* Evict the majority of BOs before grabbing the full access */
+       r = amdgpu_device_evict_resources(adev);
+       if (r)
+               return r;
+
        if (amdgpu_sriov_vf(adev)) {
                amdgpu_virt_fini_data_exchange(adev);
                r = amdgpu_virt_request_full_gpu(adev, false);
diff --git sys/dev/pci/drm/amd/amdgpu/amdgpu_drv.c 
sys/dev/pci/drm/amd/amdgpu/amdgpu_drv.c
index 328f10f9a0d..3c0df8a235e 100644
--- sys/dev/pci/drm/amd/amdgpu/amdgpu_drv.c
+++ sys/dev/pci/drm/amd/amdgpu/amdgpu_drv.c
@@ -2393,9 +2393,8 @@ static int amdgpu_pmops_prepare(struct device *dev)
        /* Return a positive number here so
         * DPM_FLAG_SMART_SUSPEND works properly
         */
-       if (amdgpu_device_supports_boco(drm_dev) &&
-           pm_runtime_suspended(dev))
-               return 1;
+       if (amdgpu_device_supports_boco(drm_dev))
+               return pm_runtime_suspended(dev);
 
        /* if we will not support s3 or s2i for the device
         *  then skip suspend
@@ -2404,7 +2403,7 @@ static int amdgpu_pmops_prepare(struct device *dev)
            !amdgpu_acpi_is_s3_active(adev))
                return 1;
 
-       return amdgpu_device_prepare(drm_dev);
+       return 0;
 }
 
 static void amdgpu_pmops_complete(struct device *dev)
@@ -2606,9 +2605,6 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
        if (amdgpu_device_supports_boco(drm_dev))
                adev->mp1_state = PP_MP1_STATE_UNLOAD;
 
-       ret = amdgpu_device_prepare(drm_dev);
-       if (ret)
-               return ret;
        ret = amdgpu_device_suspend(drm_dev, false);
        if (ret) {
                adev->in_runpm = false;
@@ -3671,7 +3667,6 @@ amdgpu_activate(struct device *self, int act)
        switch (act) {
        case DVACT_QUIESCE:
                rv = config_activate_children(self, act);
-               amdgpu_device_prepare(dev);
                amdgpu_device_suspend(dev, true);
                break;
        case DVACT_SUSPEND:


-- 
In my defence, I have been left unsupervised.

uvm_fault on unhibernating x395

Reply via email to