See amdgpu_gpu_reset():

        /* block scheduler */
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];

                if (!ring)
                        continue;
                kthread_park(ring->sched.thread);
                amd_sched_hw_job_reset(&ring->sched);
        }

and

                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = adev->rings[i];
                        if (!ring)
                                continue;

amd_sched_job_recovery(&ring->sched);
                        kthread_unpark(ring->sched.thread);
                }

Regards,
Christian.

Am 21.04.2017 um 11:54 schrieb He, Hongbo:
During GPU reset, IOCTL still  submit command to GPU.
No it won't. We disable the GPU scheduler during GPU reset, so that isn't an 
issue.
           Could you point out the related code? Thanks.


Thanks
Roger(Hongbo.He)

-----Original Message-----
From: Christian König [mailto:[email protected]]
Sent: Friday, April 21, 2017 5:16 PM
To: He, Hongbo; [email protected]
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

During GPU reset, IOCTL still  submit command to GPU.
No it won't. We disable the GPU scheduler during GPU reset, so that isn't an 
issue.

Do you have a concrete problem (e.g. dmesg?) you ran into?

Regards,
Christian.

Am 21.04.2017 um 11:00 schrieb He, Hongbo:
Hi Christian:

During GPU reset, IOCTL still  submit command to GPU.
That will result in lots of strange problems when we test this feature.

I have no better idea to handle this case. At that time, I have tried with 
return error directly rather than waiting GPU reset completion.
But that doesn't work and it lead to other problems.

Thanks
Roger(Hongbo.He)

-----Original Message-----
From: Christian König [mailto:[email protected]]
Sent: Friday, April 21, 2017 4:27 PM
To: He, Hongbo; [email protected]
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

NAK, that is exactly what we wanted to avoid.

We used to have an exclusive lock for this and it cause a whole bunch of 
problems.

Please elaborate why that should be necessary.

Regards,
Christian.

Am 21.04.2017 um 09:08 schrieb Roger.He:
Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
Signed-off-by: Roger.He <[email protected]>
---
    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
    4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 71364f5..ab0ffa8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1526,6 +1526,7 @@ struct amdgpu_device {
        atomic64_t                      num_bytes_moved;
        atomic64_t                      num_evictions;
        atomic_t                        gpu_reset_counter;
+       atomic_t                        gpu_reset_state;
/* data for buffer migration throttling */
        struct {
@@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
    #define amdgpu_psp_check_fw_loading_status(adev, i)
(adev)->firmware.funcs->check_fw_loading_status((adev), (i))
/* Common functions */
+static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
+{
+       return atomic_read(&adev->gpu_reset_state);
+}
+
    int amdgpu_gpu_reset(struct amdgpu_device *adev);
    bool amdgpu_need_backup(struct amdgpu_device *adev);
    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f882496..0fb4716 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        mutex_init(&adev->grbm_idx_mutex);
        mutex_init(&adev->mn_lock);
        hash_init(adev->mn_hash);
+       atomic_set(&adev->gpu_reset_state, 0);
amdgpu_check_arguments(adev); @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
    }
/**
+ * amdgpu_device_set_reset_state - set gpu reset state
+ *
+ * @adev: amdgpu device pointer
+ * @state: true when start to reset gpu; false: reset done  */
+static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
+                                                       bool state)
+{
+       atomic_set(&adev->gpu_reset_state, state); }
+
+/**
     * amdgpu_gpu_reset - reset the asic
     *
     * @adev: amdgpu device pointer
@@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
        }
atomic_inc(&adev->gpu_reset_counter);
-
+       amdgpu_device_set_reset_state(adev, true);
        /* block TTM */
        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
        /* store modesetting */
@@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
                dev_info(adev->dev, "GPU reset failed\n");
        }
+ amdgpu_device_set_reset_state(adev, false);
        return r;
    }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..8cc14af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
        struct drm_file *file_priv = filp->private_data;
        struct drm_device *dev;
        long ret;
+
        dev = file_priv->minor->dev;
        ret = pm_runtime_get_sync(dev->dev);
        if (ret < 0)
                return ret;
+ while (amdgpu_device_is_reset(dev->dev_private))
+               msleep(100);
+
        ret = drm_ioctl(filp, cmd, arg);
pm_runtime_mark_last_busy(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
index 2648291..22b8059 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
@@ -36,10 +36,15 @@
    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned 
long arg)
    {
        unsigned int nr = DRM_IOCTL_NR(cmd);
+       struct drm_file *file_priv = filp->private_data;
+       struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
        int ret;
- if (nr < DRM_COMMAND_BASE)
+       if (nr < DRM_COMMAND_BASE) {
+               while (amdgpu_device_is_reset(adev))
+                       msleep(100);
                return drm_compat_ioctl(filp, cmd, arg);
+       }
ret = amdgpu_drm_ioctl(filp, cmd, arg);
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to