One EEPROM record may not map to a full retire unit of bad pages; the
accurate bad page count is only known after the bad page addresses have
been checked.

Signed-off-by: Tao Zhou <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 43 +++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  3 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  6 +--
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1d6d4625abb3..ea3ab8c46115 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2568,6 +2568,9 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device 
*adev,
        }
 
        for (; i < data->count; i++) {
+               if (!data->bps[i].ts)
+                       continue;
+
                (*bps)[i] = (struct ras_badpage){
                        .bp = data->bps[i].retired_page,
                        .size = AMDGPU_GPU_PAGE_SIZE,
@@ -2581,7 +2584,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device 
*adev,
                        (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
        }
 
-       *count = data->count;
+       *count = con->bad_page_num;
 out:
        mutex_unlock(&con->recovery_lock);
        return ret;
@@ -2809,8 +2812,11 @@ static int __amdgpu_ras_restore_bad_pages(struct 
amdgpu_device *adev,
 
        for (j = 0; j < count; j++) {
                if (amdgpu_ras_check_bad_page_unlock(con,
-                       bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                       bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
+                       data->count++;
+                       data->space_left--;
                        continue;
+               }
 
                if (!data->space_left &&
                    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
@@ -2823,6 +2829,7 @@ static int __amdgpu_ras_restore_bad_pages(struct 
amdgpu_device *adev,
                                sizeof(struct eeprom_table_record));
                data->count++;
                data->space_left--;
+               con->bad_page_num++;
        }
 
        return 0;
@@ -2954,7 +2961,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                                                ret = 
__amdgpu_ras_convert_rec_array_from_rom(adev,
                                                                                
&bps[i], &err_data, nps);
                                                if (ret)
-                                                       
control->ras_num_bad_pages -= adev->umc.retire_unit;
+                                                       con->bad_page_num -= 
adev->umc.retire_unit;
                                                i += (adev->umc.retire_unit - 
1);
                                        } else {
                                                break;
@@ -2968,8 +2975,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        ret = __amdgpu_ras_convert_rec_from_rom(adev,
                                &bps[i], &err_data, nps);
                        if (ret)
-                               control->ras_num_bad_pages -= 
adev->umc.retire_unit;
+                               con->bad_page_num -= adev->umc.retire_unit;
                }
+
+               con->eh_data->count_saved = con->eh_data->count;
        } else {
                ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
        }
@@ -2992,7 +3001,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        struct amdgpu_ras_eeprom_control *control;
-       int save_count, unit_num, bad_page_num, i;
+       int unit_num, i;
 
        if (!con || !con->eh_data) {
                if (new_cnt)
@@ -3013,27 +3022,25 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device 
*adev,
        mutex_lock(&con->recovery_lock);
        control = &con->eeprom_control;
        data = con->eh_data;
-       bad_page_num = control->ras_num_bad_pages;
-       save_count = data->count - bad_page_num;
+       unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
        mutex_unlock(&con->recovery_lock);
 
-       unit_num = save_count / adev->umc.retire_unit;
        if (new_cnt)
                *new_cnt = unit_num;
 
        /* only new entries are saved */
-       if (save_count > 0) {
+       if (unit_num > 0) {
                /*old asics only save pa to eeprom like before*/
                if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
                        if (amdgpu_ras_eeprom_append(control,
-                                       &data->bps[bad_page_num], save_count)) {
+                                       &data->bps[data->count_saved], 
unit_num)) {
                                dev_err(adev->dev, "Failed to save EEPROM table 
data!");
                                return -EIO;
                        }
                } else {
                        for (i = 0; i < unit_num; i++) {
                                if (amdgpu_ras_eeprom_append(control,
-                                               &data->bps[bad_page_num +
+                                               &data->bps[data->count_saved +
                                                i * adev->umc.retire_unit], 1)) 
{
                                        dev_err(adev->dev, "Failed to save 
EEPROM table data!");
                                        return -EIO;
@@ -3041,7 +3048,9 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
                        }
                }
 
-               dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", 
save_count);
+               dev_info(adev->dev, "Saved %d pages to EEPROM table.\n",
+                               con->bad_page_num - control->ras_num_bad_pages);
+               data->count_saved = data->count;
        }
 
        return 0;
@@ -3096,17 +3105,17 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
                        }
                }
 
+               ret = amdgpu_ras_add_bad_pages(adev, bps, 
control->ras_num_recs, true);
+               if (ret)
+                       goto out;
+
                ret = amdgpu_ras_eeprom_check(control);
                if (ret)
                        goto out;
 
                /* HW not usable */
-               if (amdgpu_ras_is_rma(adev)) {
+               if (amdgpu_ras_is_rma(adev))
                        ret = -EHWPOISON;
-                       goto out;
-               }
-
-               ret = amdgpu_ras_add_bad_pages(adev, bps, 
control->ras_num_recs, true);
        }
 
 out:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..020245eb6aa0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -570,6 +570,8 @@ struct amdgpu_ras {
        struct ras_event_manager *event_mgr;
 
        uint64_t reserved_pages_in_bytes;
+
+       int bad_page_num;
 };
 
 struct ras_fs_data {
@@ -608,6 +610,7 @@ struct ras_err_handler_data {
        struct eeprom_table_record *bps;
        /* the count of entries */
        int count;
+       int count_saved;
        /* the space can place new entries */
        int space_left;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 54838746f97d..91e20e317cdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -743,8 +743,7 @@ amdgpu_ras_eeprom_append_table(struct 
amdgpu_ras_eeprom_control *control,
        else
                control->ras_num_mca_recs += num;
 
-       control->ras_num_bad_pages = control->ras_num_pa_recs +
-                               control->ras_num_mca_recs * 
adev->umc.retire_unit;
+       control->ras_num_bad_pages = con->bad_page_num;
 Out:
        kfree(buf);
        return res;
@@ -1457,8 +1456,7 @@ int amdgpu_ras_eeprom_check(struct 
amdgpu_ras_eeprom_control *control)
        if (!__get_eeprom_i2c_addr(adev, control))
                return -EINVAL;
 
-       control->ras_num_bad_pages = control->ras_num_pa_recs +
-                       control->ras_num_mca_recs * adev->umc.retire_unit;
+       control->ras_num_bad_pages = ras->bad_page_num;
 
        if (hdr->header == RAS_TABLE_HDR_VAL) {
                dev_dbg(adev->dev,
-- 
2.34.1

Reply via email to