v1:
- Add an ACA error query interface.
v2:
- Add a new helper function to determine whether to use ACA or MCA.

Signed-off-by: Yang Wang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c |   8 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 105 ++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  12 ++-
 4 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index a460cde20cf2..2ad06f87de8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -594,6 +594,9 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, 
struct aca_handle *handle,
        struct amdgpu_aca *aca = &adev->aca;
        int ret;
 
+       if (!amdgpu_aca_is_enabled(adev))
+               return 0;
+
        ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
        if (ret)
                return ret;
@@ -634,6 +637,11 @@ static void aca_manager_fini(struct aca_handle_manager 
*mgr)
                remove_aca(handle);
 }
 
+bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
+{
+       return adev->aca.is_enabled; /* plain read of the per-device ACA enable flag */
+}
+
 int amdgpu_aca_init(struct amdgpu_device *adev)
 {
        struct amdgpu_aca *aca = &adev->aca;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index bb0a3be72cc8..ada4f6c4660f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -175,6 +175,7 @@ struct aca_smu_funcs {
 struct amdgpu_aca {
        struct aca_handle_manager mgr;
        const struct aca_smu_funcs *smu_funcs;
+       bool is_enabled; /* value reported by amdgpu_aca_is_enabled() */
 };
 
 struct aca_info {
@@ -186,6 +187,7 @@ struct aca_info {
 int amdgpu_aca_init(struct amdgpu_device *adev);
 void amdgpu_aca_fini(struct amdgpu_device *adev);
 void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct 
aca_smu_funcs *smu_funcs);
+bool amdgpu_aca_is_enabled(struct amdgpu_device *adev);
 
 int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info);
 int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank 
*bank, int *err_codes, int size);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7048bf853cf6..602bfe2f682e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1183,6 +1183,53 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, 
struct device_attribute *a
                          "ce", info.ce_count);
 }
 
+static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk)
+{
+       struct ras_common_if head;
+
+       memset(&head, 0, sizeof(head)); /* zero every field so only .block is set in the lookup key */
+       head.block = blk;
+
+       return amdgpu_ras_find_obj(adev, &head); /* callers in this file treat a NULL result as "no such block" */
+}
+
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+                       const struct aca_info *aca_info, void *data)
+{
+       struct ras_manager *obj;
+
+       obj = get_ras_manager(adev, blk);
+       if (!obj)
+               return -EINVAL; /* no ras_manager found for this block */
+
+       return amdgpu_aca_add_handle(adev, &obj->aca_handle, 
ras_block_str(blk), aca_info, data); /* handle is named after the block; add_handle returns 0 early when ACA is disabled */
+}
+
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block 
blk)
+{
+       struct ras_manager *obj;
+
+       obj = get_ras_manager(adev, blk);
+       if (!obj)
+               return -EINVAL; /* no ras_manager found for this block */
+
+       amdgpu_aca_remove_handle(&obj->aca_handle); /* NOTE(review): not gated on amdgpu_aca_is_enabled(), unlike the add path; confirm removing a never-added handle is safe */
+
+       return 0;
+}
+
+static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk,
+                                        enum aca_error_type type, struct 
ras_err_data *err_data)
+{
+       struct ras_manager *obj;
+
+       obj = get_ras_manager(adev, blk);
+       if (!obj)
+               return -EINVAL; /* no ras_manager found for this block */
+
+       return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, 
err_data); /* query through this block's ACA handle and pass the result code up unchanged */
+}
+
 static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
                                                struct ras_query_if *info,
                                                struct ras_err_data *err_data,
@@ -1190,6 +1237,7 @@ static int amdgpu_ras_query_error_status_helper(struct 
amdgpu_device *adev,
 {
        enum amdgpu_ras_block blk = info ? info->head.block : 
AMDGPU_RAS_BLOCK_COUNT;
        struct amdgpu_ras_block_object *block_obj = NULL;
+       int ret;
 
        if (blk == AMDGPU_RAS_BLOCK_COUNT)
                return -EINVAL;
@@ -1219,9 +1267,19 @@ static int amdgpu_ras_query_error_status_helper(struct 
amdgpu_device *adev,
                        }
                }
        } else {
-               /* FIXME: add code to check return value later */
-               amdgpu_mca_smu_log_ras_error(adev, blk, 
AMDGPU_MCA_ERROR_TYPE_UE, err_data);
-               amdgpu_mca_smu_log_ras_error(adev, blk, 
AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+               if (amdgpu_aca_is_enabled(adev)) {
+                       ret = amdgpu_aca_log_ras_error_data(adev, blk, 
ACA_ERROR_TYPE_UE, err_data);
+                       if (ret)
+                               return ret;
+
+                       ret = amdgpu_aca_log_ras_error_data(adev, blk, 
ACA_ERROR_TYPE_CE, err_data);
+                       if (ret)
+                               return ret;
+               } else {
+                       /* FIXME: add code to check return value later */
+                       amdgpu_mca_smu_log_ras_error(adev, blk, 
AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+                       amdgpu_mca_smu_log_ras_error(adev, blk, 
AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+               }
        }
 
        return 0;
@@ -1270,6 +1328,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
        struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
        struct amdgpu_hive_info *hive;
        int hive_ras_recovery = 0;
 
@@ -1280,7 +1339,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
        }
 
        if (!amdgpu_ras_is_supported(adev, block) ||
-           !amdgpu_ras_get_mca_debug_mode(adev))
+           !amdgpu_ras_get_aca_debug_mode(adev))
                return -EOPNOTSUPP;
 
        hive = amdgpu_get_xgmi_hive(adev);
@@ -1292,7 +1351,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
        /* skip ras error reset in gpu reset */
        if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
            hive_ras_recovery) &&
-           mca_funcs && mca_funcs->mca_set_debug_mode)
+           ((smu_funcs && smu_funcs->set_debug_mode) ||
+            (mca_funcs && mca_funcs->mca_set_debug_mode)))
                return -EOPNOTSUPP;
 
        if (block_obj->hw_ops->reset_ras_error_count)
@@ -1788,7 +1848,10 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device 
*adev)
                }
        }
 
-       amdgpu_mca_smu_debugfs_init(adev, dir);
+       if (amdgpu_aca_is_enabled(adev))
+               amdgpu_aca_smu_debugfs_init(adev, dir);
+       else
+               amdgpu_mca_smu_debugfs_init(adev, dir);
 }
 
 /* debugfs end */
@@ -2769,6 +2832,9 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
 
        adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
                adev->ras_hw_enabled & amdgpu_ras_mask;
+
+       /* aca is disabled by default */
+       adev->aca.is_enabled = false;
 }
 
 static void amdgpu_ras_counte_dw(struct work_struct *work)
@@ -3153,7 +3219,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
        if (amdgpu_sriov_vf(adev))
                return 0;
 
-       amdgpu_ras_set_mca_debug_mode(adev, false);
+       if (amdgpu_aca_is_enabled(adev))
+               amdgpu_ras_set_aca_debug_mode(adev, false);
+       else
+               amdgpu_ras_set_mca_debug_mode(adev, false);
 
        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
                if (!node->ras_obj) {
@@ -3436,7 +3505,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device 
*adev, bool enable)
        if (con) {
                ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
                if (!ret)
-                       con->is_mca_debug_mode = enable;
+                       con->is_aca_debug_mode = enable;
        }
 
        return ret;
@@ -3448,24 +3517,29 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device 
*adev, bool enable)
        int ret = 0;
 
        if (con) {
-               ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
+               if (amdgpu_aca_is_enabled(adev))
+                       ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
+               else
+                       ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
                if (!ret)
-                       con->is_mca_debug_mode = enable;
+                       con->is_aca_debug_mode = enable;
        }
 
        return ret;
 }
 
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
        const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
        if (!con)
                return false;
 
-       if (mca_funcs && mca_funcs->mca_set_debug_mode)
-               return con->is_mca_debug_mode;
+       if ((amdgpu_aca_is_enabled(adev) && smu_funcs && 
smu_funcs->set_debug_mode) ||
+           (!amdgpu_aca_is_enabled(adev) && mca_funcs && 
mca_funcs->mca_set_debug_mode))
+               return con->is_aca_debug_mode;
        else
                return true;
 }
@@ -3475,15 +3549,16 @@ bool amdgpu_ras_get_error_query_mode(struct 
amdgpu_device *adev,
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
 
        if (!con) {
                *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
                return false;
        }
 
-       if (mca_funcs && mca_funcs->mca_set_debug_mode)
+       if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && 
mca_funcs->mca_set_debug_mode))
                *error_query_mode =
-                       (con->is_mca_debug_mode) ? 
AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+                       (con->is_aca_debug_mode) ? 
AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
        else
                *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 8c487f3bfbf1..4503cc6eec66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -442,7 +442,7 @@ struct amdgpu_ras {
        /* Indicates smu whether need update bad channel info */
        bool update_channel_flag;
        /* Record status of smu mca debug mode */
-       bool is_mca_debug_mode;
+       bool is_aca_debug_mode;
 
        /* Record special requirements of gpu reset caller */
        uint32_t  gpu_reset_flags;
@@ -530,6 +530,8 @@ struct ras_manager {
        struct ras_ih_data ih_data;
 
        struct ras_err_data err_data;
+
+       struct aca_handle aca_handle;
 };
 
 struct ras_badpage {
@@ -781,9 +783,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
-int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
 int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
                                     unsigned int *mode);
 
@@ -824,4 +826,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data 
*err_data,
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute 
*attr,
                                  struct aca_handle *handle, char *buf, void 
*data);
 
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+                              const struct aca_info *aca_info, void *data);
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block 
blk);
+
 #endif
-- 
2.34.1

Reply via email to