summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorGuchun Chen <guchun.chen@amd.com>2020-07-22 10:00:27 +0800
committerAlex Deucher <alexander.deucher@amd.com>2020-08-04 17:25:58 -0400
commitc84d46707ebbfc303268e27d5aeb8bb4e6d5b1ff (patch)
treea81b4752c33d2278e1cf86f1b6e8587eacbfbacb /drivers/gpu/drm/amd/amdgpu
parentacc0204cdb8e60d2e57053ed766d8280eb01118b (diff)
drm/amdgpu: validate bad page threshold in ras(v3)
Bad page threshold value should be valid in the range between -1 and max records length of eeprom. It could determine when saved bad pages exceed threshold value, and proceed corresponding actions. v2: When using the default typical value, it should be min value between typical value and eeprom max records length. v3: drop the case of setting bad_page_cnt_threshold to be 0xFFFFFFFF, as it confuses user. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c48
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h2
4 files changed, 58 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..6660094a1063 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
+#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1699,6 +1702,47 @@ out:
return ret;
}
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+ uint32_t max_length)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ int tmp_threshold = amdgpu_bad_page_threshold;
+ u64 val;
+
+ /*
+ * Justification of value bad_page_cnt_threshold in ras structure
+ *
+ * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
+ * in eeprom, and introduce two scenarios accordingly.
+ *
+ * Bad page retirement enablement:
+ * - If amdgpu_bad_page_threshold = -1,
+ * bad_page_cnt_threshold = typical value by formula.
+ *
+ * - When the value from user is 0 < amdgpu_bad_page_threshold <
+ * max record length in eeprom, use it directly.
+ *
+ * Bad page retirement disablement:
+ * - If amdgpu_bad_page_threshold = 0, bad page retirement
+ * functionality is disabled, and bad_page_cnt_threshold will
+ * take no effect.
+ */
+
+ if (tmp_threshold < -1)
+ tmp_threshold = -1;
+ else if (tmp_threshold > max_length)
+ tmp_threshold = max_length;
+
+ if (tmp_threshold == -1) {
+ val = adev->gmc.mc_vram_size;
+ do_div(val, RAS_BAD_PAGE_RATE);
+ con->bad_page_cnt_threshold = min(lower_32_bits(val),
+ max_length);
+ } else {
+ con->bad_page_cnt_threshold = tmp_threshold;
+ }
+}
+
/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
@@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+ uint32_t max_eeprom_records_len = 0;
int ret;
if (con)
@@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;
+ max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+ amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
if (ret)
goto free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b2667342cf67..4672649a9293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -336,6 +336,9 @@ struct amdgpu_ras {
struct amdgpu_ras_eeprom_control eeprom_control;
bool error_query_ready;
+
+ /* bad page count threshold */
+ uint32_t bad_page_cnt_threshold;
};
struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c0096097bbcf..a2c982b1eac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -499,6 +499,11 @@ free_buff:
return ret == num ? 0 : -EIO;
}
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+{
+ return EEPROM_MAX_RECORD_NUM;
+}
+
/* Used for testing if bugs encountered */
#if 0
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 9e7d640920fb..e285e8cafd48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -84,6 +84,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
bool write,
int num);
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
+
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
#endif // _AMDGPU_RAS_EEPROM_H