Refactor sdma related code to improve the way to manage irq reference
count. Originally amdgpu_irq_get() is called from ip_blocks[].late_init
and amdgpu_irq_put is called from ip_blocks[].hw_fini. The asymmetric
design may cause issues under certain conditions. So
1) introduce amdgpu_sdma_ras_early_fini() to undo work done by
   amdgpu_sdma_ras_late_init().
2) remove call of amdgpu_irq_put in xxxx_hw_fini().
3) call amdgpu_irq_get() in function sdma_v4_4_2_xcp_resume() to keep
   irq reference count balanced. Currently sdma_v4_4_2_xcp_resume()
   doesn't invoke ip_blocks[].late_init(amdgpu_irq_get), but
   sdma_v4_4_2_xcp_suspend() invokes amdgpu_irq_put(), thus causes
   unbalanced irq reference count. Fix it by calling amdgpu_irq_get()
   in function sdma_v4_4_2_xcp_resume().

Signed-off-by: Jiang Liu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 26 ++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   |  8 --------
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 23 ++++++++++++---------
 5 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index fa19c5391d8c..ff5907f2c544 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -383,7 +383,7 @@ enum amdgpu_marker {
        AMDGPU_MARKER_RAS_DEBUGFS       = 63,
 };
 
-#define AMDGPU_MARKER_INDEX_IRQ(idx)           (AMDGPU_MARKER_INDEX_IRQ0 + (idx))
+#define AMDGPU_MARKER_IRQ(idx)         (AMDGPU_MARKER_IRQ0 + (idx))
 
 struct amdgpu_ip_block_status {
        bool valid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 21938e858d55..799bcd9978da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -110,16 +110,35 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
                                AMDGPU_SDMA_IRQ_INSTANCE0 + i);
                        if (r)
                                goto late_fini;
+                       amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ(i));
                }
        }
 
        return 0;
 
 late_fini:
-       amdgpu_ras_block_early_fini(adev, ras_block);
+       amdgpu_sdma_ras_early_fini(adev, ras_block);
        return r;
 }
 
+void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev,
+                               struct ras_common_if *ras_block)
+{
+       int i;
+
+       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+               for (i = 0; i < adev->sdma.num_instances; i++) {
+                       if (amdgpu_ras_test_and_clear_marker(adev, ras_block,
+                           AMDGPU_MARKER_IRQ(i))) {
+                               amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+                                              AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+                       }
+               }
+       }
+
+       amdgpu_ras_block_early_fini(adev, ras_block);
+}
+
 int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
                void *err_data,
                struct amdgpu_iv_entry *entry)
@@ -334,8 +353,11 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
        adev->sdma.ras_if = &ras->ras_block.ras_comm;
 
        /* If not define special ras_late_init function, use default ras_late_init */
-       if (!ras->ras_block.ras_late_init)
+       if (!ras->ras_block.ras_late_init) {
+               WARN_ON(ras->ras_block.ras_early_fini);
                ras->ras_block.ras_late_init = amdgpu_sdma_ras_late_init;
+               ras->ras_block.ras_early_fini = amdgpu_sdma_ras_early_fini;
+       }
 
        /* If not defined special ras_cb function, use default ras_cb */
        if (!ras->ras_block.ras_cb)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..1915e6c9be63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -164,6 +164,8 @@ int amdgpu_sdma_get_index_from_ring(struct amdgpu_ring *ring, uint32_t *index);
 uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, unsigned vmid);
 int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
                              struct ras_common_if *ras_block);
+void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev,
+                               struct ras_common_if *ras_block);
 int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
                void *err_data,
                struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index ccf0d531776d..369d7094a3ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1968,18 +1968,10 @@ static int sdma_v4_0_hw_init(struct amdgpu_ip_block *ip_block)
 static int sdma_v4_0_hw_fini(struct amdgpu_ip_block *ip_block)
 {
        struct amdgpu_device *adev = ip_block->adev;
-       int i;
 
        if (amdgpu_sriov_vf(adev))
                return 0;
 
-       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
-               for (i = 0; i < adev->sdma.num_instances; i++) {
-                       amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-                                      AMDGPU_SDMA_IRQ_INSTANCE0 + i);
-               }
-       }
-
        sdma_v4_0_ctx_switch_enable(adev, false);
        sdma_v4_0_enable(adev, false);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 9c7cea0890c9..744569bbc1e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1486,19 +1486,11 @@ static int sdma_v4_4_2_hw_fini(struct amdgpu_ip_block *ip_block)
 {
        struct amdgpu_device *adev = ip_block->adev;
        uint32_t inst_mask;
-       int i;
 
        if (amdgpu_sriov_vf(adev))
                return 0;
 
        inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
-       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
-               for (i = 0; i < adev->sdma.num_instances; i++) {
-                       amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-                                      AMDGPU_SDMA_IRQ_INSTANCE0 + i);
-               }
-       }
-
        sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask);
        sdma_v4_4_2_inst_enable(adev, false, inst_mask);
 
@@ -2153,14 +2145,24 @@ const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
 static int sdma_v4_4_2_xcp_resume(void *handle, uint32_t inst_mask)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-       int r;
+       uint32_t tmp_mask = inst_mask;
+       int r, i;
 
        if (!amdgpu_sriov_vf(adev))
                sdma_v4_4_2_inst_init_golden_registers(adev, inst_mask);
 
        r = sdma_v4_4_2_inst_start(adev, inst_mask);
+       if (r)
+               return r;
 
-       return r;
+       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+               for_each_inst(i, tmp_mask) {
+                       amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
+                                      AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+               }
+       }
+
+       return 0;
 }
 
 static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask)
@@ -2366,6 +2368,7 @@ static struct amdgpu_sdma_ras sdma_v4_4_2_ras = {
        .ras_block = {
                .hw_ops = &sdma_v4_4_2_ras_hw_ops,
                .ras_late_init = sdma_v4_4_2_ras_late_init,
+               .ras_early_fini = amdgpu_sdma_ras_early_fini,
        },
 };
 
-- 
2.43.5

Reply via email to