[AMD Official Use Only - AMD Internal Distribution Only] -----Original Message----- From: Sun, Ce(Overlord) <[email protected]> Sent: Tuesday, August 12, 2025 3:35 PM To: [email protected] Cc: Zhou1, Tao <[email protected]>; Yang, Stanley <[email protected]>; Zhang, Hawking <[email protected]>; Wang, Yang(Kevin) <[email protected]>; Chai, Thomas <[email protected]>; Sun, Ce(Overlord) <[email protected]> Subject: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info
Poll the ACA bank count to ensure that valid ACA bank register info can be obtained. Signed-off-by: Ce Sun <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 46 +++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 ---- 3 files changed, 13 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f00a9e0c9c47..ad8ad08f0f33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 50 //ms #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms @@ -3317,8 +3317,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) mutex_init(&ecc_log->lock); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3337,8 +3335,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_unlock(&ecc_log->lock); mutex_destroy(&ecc_log->lock); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; } static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3386,49 +3382,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, uint32_t poison_creation_count) { int ret = 0; - struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = 
poison_creation_count; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; + uint64_t prev_de_queried_count = 0; + uint64_t bank_count = 0; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; - ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; - de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; + bank_count = amdgpu_aca_get_bank_count(adev); [Thomas] Is bank_count used only for UMC deferred errors, or does it also include UMC CE, DE and other RAS block bank errors? The amdgpu_ras_poison_creation_handler function is used to handle UMC deferred errors only; it does not cover UMC CE or other RAS block bank errors. + if (bank_count) { + prev_de_queried_count = bank_count; + amdgpu_aca_clear_bank_count(adev); + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; } else { - new_detect_count = 0; - } - - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; - - if (timeout) { - if (!--timeout) - break; - msleep(1); - } + --timeout; + msleep(1); } - } while (total_detect_count < need_query_count); + } while (timeout); - if (total_detect_count) + if (prev_de_queried_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ff63020f9c6c..132b45a362c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,6 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t 
prev_de_queried_count; }; struct ras_critical_region { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index e590cbdd8de9..8dbffe4d22d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -581,17 +581,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { - if (ret == -EEXIST) - con->umc_ecc_log.de_queried_count++; - else - dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); - kfree(ecc_err); return ret; } - con->umc_ecc_log.de_queried_count++; - memset(page_pfn, 0, sizeof(page_pfn)); count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, pa_addr, -- 2.34.1
