rasdaemon: Add HBM Memory ACLS support for HiSilicon
When a hardware error occurs in a cell of the HBM memory, the internal SRAM of the memory controller is used to replace the faulty memory; this method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS RAS event, and rasdaemon records it and runs ACLS to replace the faulty memory. HBM ACLS can repair one cell (258-bit) of memory at a time. The HBM can check which HBM cell the physical address belongs to and filter invalid HBM addresses. Multiple RAS errors are reported if memory errors occur in different HBM cells. The feature depends on the Linux kernel options CONFIG_HISI_HBMDEV [1] and CONFIG_HWPOISON_INJECT [2]. [1]: https://gitee.com/openeuler/kernel/pulls/2757 [2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c Signed-off-by: Junhao He <hejunhao3@huawei.com>
This commit is contained in:
parent
610822a3e2
commit
156e08f2fd
287
0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
Normal file
287
0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
Normal file
@ -0,0 +1,287 @@
|
||||
From f37172d9fb8891f54d9e6cf218f9ff8482828b46 Mon Sep 17 00:00:00 2001
|
||||
From: Junhao He <hejunhao3@huawei.com>
|
||||
Date: Fri, 13 Oct 2023 18:10:16 +0800
|
||||
Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon
|
||||
|
||||
When a hardware error occurs in a cell of the HBM memory, the internal
|
||||
SRAM of the memory controller is used to replace the faulty memory; this
|
||||
method is ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS
|
||||
RAS, and the rasdaemon records it and runs the ACLS to replace the faulty
|
||||
memory.
|
||||
|
||||
HBM ACLS can repair one cell (258-bit) memory at a time. The HBM can
|
||||
check which HBM cell the physical address belongs to and filter invalid
|
||||
HBM addresses. Multiple RAS errors are reported if memory errors occur
|
||||
in different HBM cells.
|
||||
|
||||
The feature depends on the linux kernel CONFIG_HISI_HBMDEV [1] and
|
||||
CONFIG_HWPOISON_INJECT [2].
|
||||
|
||||
[1]: https://gitee.com/openeuler/kernel/pulls/2757
|
||||
[2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c
|
||||
|
||||
Signed-off-by: Junhao He <hejunhao3@huawei.com>
|
||||
---
|
||||
configure.ac | 11 ++++
|
||||
misc/rasdaemon.env | 7 ++-
|
||||
non-standard-hisilicon.c | 110 +++++++++++++++++++++++++++++++++++++
|
||||
ras-events.c | 3 +
|
||||
ras-non-standard-handler.c | 32 +++++++++++
|
||||
ras-non-standard-handler.h | 8 +++
|
||||
6 files changed, 170 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index c7af727..f1e1487 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x
|
||||
AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
|
||||
AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
|
||||
|
||||
+AC_ARG_ENABLE([hisi_hbm_memory_acls],
|
||||
+ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS]))
|
||||
+
|
||||
+AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [
|
||||
+ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS")
|
||||
+ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes])
|
||||
+AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"])
|
||||
+
|
||||
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||||
|
||||
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||||
@@ -212,4 +222,5 @@ compile time options summary
|
||||
Memory CE PFA : $USE_MEMORY_CE_PFA
|
||||
AMP RAS errors : $USE_AMP_NS_DECODE
|
||||
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
|
||||
+ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS
|
||||
EOF
|
||||
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
|
||||
index 7cb18e8..b20d527 100644
|
||||
--- a/misc/rasdaemon.env
|
||||
+++ b/misc/rasdaemon.env
|
||||
@@ -43,4 +43,9 @@ CPU_CE_THRESHOLD="18"
|
||||
CPU_ISOLATION_CYCLE="24h"
|
||||
|
||||
# Prevent excessive isolation from causing an avalanche effect
|
||||
-CPU_ISOLATION_LIMIT="10"
|
||||
\ No newline at end of file
|
||||
+CPU_ISOLATION_LIMIT="10"
|
||||
+
|
||||
+# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no).
|
||||
+HISI_HBM_MEMORY_ACLS="no"
|
||||
+# Whether rasdaemon isolates the error page that HiSilicon HBM ACLS fails to repair (yes|no).
|
||||
+HISI_HBM_ISOLATION_PAGE="no"
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 756adf8..62a8386 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -19,6 +19,17 @@
|
||||
#define HISI_BUF_LEN 2048
|
||||
#define HISI_PCIE_INFO_BUF_LEN 256
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+# define HISI_TYPE_UINT32_WIDTH 32 /* shift that places the high address word above the low one */
|
||||
+/* Specify the Hisilicon HBMC HBM error type */
|
||||
+# define HISI_HBM_ERR_TYPE 0 /* reg_array index of the error-type word */
|
||||
+# define HISI_HBM_ERR_ACLS BIT(0) /* flag tested in the error-type word to select ACLS handling */
|
||||
+# define HISI_HBM_ACLS_ADDL 1 /* reg_array index of the low 32 bits of the physical address */
|
||||
+# define HISI_HBM_ACLS_ADDH 2 /* reg_array index of the high 32 bits of the physical address */
|
||||
+# define HISI_HBM_ACLS_ARRAY_SIZE 12 /* minimum reg_array_size accepted before the address words are read */
|
||||
+# define HISI_SUBMOD_HBMC_HBM 6 /* submodule_id required for ACLS handling */
|
||||
+#endif
|
||||
+
|
||||
struct hisi_common_error_section {
|
||||
uint32_t val_bits;
|
||||
uint8_t version;
|
||||
@@ -360,6 +371,100 @@ static int add_hisi_common_table(struct ras_events *ras,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+#include <errno.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+/*
+ * Write @value, formatted as "0x<hex>\n", to the file "/sys/kernel/<name>".
+ *
+ * @name:  path relative to /sys/kernel/.
+ * @value: number to write.
+ *
+ * Returns the number of characters formatted (> 0) on success, or a negative
+ * errno value when the path is too long, the file is not writable, it cannot
+ * be opened, or the write fails. Every failure is logged as a warning.
+ */
+static int write_file(const char *name, unsigned long long value)
+{
+	char fname[MAX_PATH + 1];
+	FILE *file;
+	int saved_errno;
+	int len;
+	int ret;
+
+	/* Bounded formatting: strcat() would overrun fname for a long @name. */
+	len = snprintf(fname, sizeof(fname), "/sys/kernel/%s", name);
+	if (len < 0 || (size_t)len >= sizeof(fname)) {
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot access '/sys/kernel/%s': %s\n",
+		    name, strerror(ENAMETOOLONG));
+		return -ENAMETOOLONG;
+	}
+
+	if (access(fname, W_OK)) {
+		/* Capture errno before logging, which may overwrite it. */
+		saved_errno = errno;
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot access '%s': %s\n",
+		    fname, strerror(saved_errno));
+		return -saved_errno;
+	}
+
+	file = fopen(fname, "w");
+	if (!file) {
+		saved_errno = errno;
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
+		    fname, strerror(saved_errno));
+		return -saved_errno;
+	}
+
+	ret = fprintf(file, "0x%llx\n", value);
+	saved_errno = (ret < 0) ? errno : 0;
+
+	/*
+	 * stdio buffers the data, so the underlying write is normally issued
+	 * (and can be rejected) only when fclose() flushes the stream. Its
+	 * result must be checked or a failed write is reported as success.
+	 */
+	if (fclose(file) == EOF && ret >= 0) {
+		saved_errno = errno;
+		ret = -1;
+	}
+
+	if (ret < 0) {
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n",
+		    fname, value, strerror(saved_errno));
+		return saved_errno ? -saved_errno : -EIO;
+	}
+
+	return ret;
+}
|
||||
+
|
||||
+/*
+ * Run the HBM ACLS sequence for the physical address carried in @err.
+ *
+ * The address is assembled from reg_array[HISI_HBM_ACLS_ADDH] (high word) and
+ * reg_array[HISI_HBM_ACLS_ADDL] (low word). The steps, each a write under
+ * /sys/kernel/, are:
+ *   1. hbm_memory/acls/acls_query   <- physical address
+ *   2. debug/hwpoison/corrupt-pfn   <- page frame number
+ *   3. hbm_memory/acls/acls_repair  <- physical address
+ *   4. debug/hwpoison/unpoison-pfn  <- page frame number
+ * Step 4 is skipped, leaving the page poisoned, when step 3 fails and page
+ * isolation is enabled.
+ *
+ * @err:       error section holding the register array.
+ * @page_size: page size in bytes, used to derive the page frame number.
+ *
+ * Returns 0 when every step succeeds, a negative value otherwise.
+ */
+static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err,
+			      int page_size)
+{
+	unsigned long long paddr;
+	unsigned long long pfn;
+	int repair_ret;
+	int ret;
+
+	if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: No valid address array length (%d)\n",
+		    err->reg_array_size);
+		return -1;
+	}
+
+	/*
+	 * Zero would divide by zero; a negative value would be converted to a
+	 * huge unsigned divisor and yield a bogus page frame number.
+	 */
+	if (page_size <= 0)
+		return -1;
+
+	paddr = err->reg_array[HISI_HBM_ACLS_ADDH];
+	paddr <<= HISI_TYPE_UINT32_WIDTH;
+	paddr += err->reg_array[HISI_HBM_ACLS_ADDL];
+	pfn = paddr / (unsigned long long)page_size;
+
+	ret = write_file("hbm_memory/acls/acls_query", paddr);
+	if (ret < 0)
+		return ret;
+
+	ret = write_file("debug/hwpoison/corrupt-pfn", pfn);
+	if (ret < 0)
+		return ret;
+
+	repair_ret = write_file("hbm_memory/acls/acls_repair", paddr);
+	if (repair_ret < 0 && ras_ns_hisi_hbm_isolation_page_enabled()) {
+		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep the pfn (0x%llx) offline\n",
+		    pfn);
+		return repair_ret;
+	}
+
+	ret = write_file("debug/hwpoison/unpoison-pfn", pfn);
+	if (ret < 0)
+		return ret;
+
+	/* A failed repair must not be reported as success after unpoisoning. */
+	return (repair_ret < 0) ? repair_ret : 0;
+}
|
||||
+
|
||||
+/*
+ * Filter @err and, when it qualifies, run the HBM ACLS sequence on it.
+ *
+ * A record qualifies when its module name is "HBMC", its submodule id is
+ * HISI_SUBMOD_HBMC_HBM, val_bits carries HISI_COMMON_VALID_REG_ARRAY_SIZE and
+ * the error-type word has HISI_HBM_ERR_ACLS set. A warning is logged when the
+ * sequence returns non-zero.
+ *
+ * @err:       decoded HiSilicon common error section.
+ * @page_size: page size in bytes, passed through to hisi_hbmc_hbm_acls().
+ */
+static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err,
+				  int page_size)
+{
+	/* Module check first, then submodule, matching the original order. */
+	if (strcmp(module_name[err->module_id], "HBMC") != 0)
+		return;
+	if (err->submodule_id != HISI_SUBMOD_HBMC_HBM)
+		return;
+
+	/* Skip records that do not flag the register array size as valid. */
+	if ((err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)) == 0)
+		return;
+
+	/* Only the ACLS error type is handled here. */
+	if ((err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS) == 0)
+		return;
+
+	if (hisi_hbmc_hbm_acls(err, page_size) != 0)
+		log(TERM, LOG_WARNING, "Failed to handler HiSilicon HBM ACLS\n");
+}
|
||||
+#endif
|
||||
+
|
||||
static int decode_hisi_common_section(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
@@ -395,6 +500,11 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||||
}
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+ if (ras_ns_hisi_hbm_acls_enabled())
|
||||
+ hisi_hbm_acls_handler(err, ras->page_size);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 9093954..3b10525 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -901,6 +901,9 @@ int handle_ras_events(int record_events)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NON_STANDARD
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+ ras_ns_hisi_hbm_param_init();
|
||||
+#endif
|
||||
rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
|
||||
ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT);
|
||||
if (!rc)
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index 20d514b..3ed0900 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -24,6 +24,38 @@
|
||||
|
||||
static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+static bool ras_ns_hisi_hbm_acls;
|
||||
+static bool ras_ns_hisi_hbm_isolation_page;
|
||||
+
|
||||
+/*
+ * Load the HBM ACLS switches from the environment.
+ *
+ * HISI_HBM_MEMORY_ACLS turns on ACLS handling and HISI_HBM_ISOLATION_PAGE
+ * turns on page isolation; each takes effect only when its value is "yes"
+ * (compared case-insensitively). Anything else leaves the flag unchanged.
+ */
+void ras_ns_hisi_hbm_param_init(void)
+{
+	const char *value;
+
+	/*
+	 * Each variable is fully handled before the next getenv() call, since
+	 * a later call may invalidate the previously returned string.
+	 */
+	value = getenv("HISI_HBM_MEMORY_ACLS");
+	if (value != NULL && !strcasecmp(value, "yes")) {
+		ras_ns_hisi_hbm_acls = true;
+		log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n");
+	}
+
+	value = getenv("HISI_HBM_ISOLATION_PAGE");
+	if (value != NULL && !strcasecmp(value, "yes")) {
+		ras_ns_hisi_hbm_isolation_page = true;
+		log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n");
+	}
+}
|
||||
+
|
||||
+/* Return the ACLS flag; it is true only after ras_ns_hisi_hbm_param_init() saw HISI_HBM_MEMORY_ACLS set to "yes". */
+bool ras_ns_hisi_hbm_acls_enabled(void)
|
||||
+{
|
||||
+ return ras_ns_hisi_hbm_acls;
|
||||
+}
|
||||
+
|
||||
+/* Return the page-isolation flag; it is true only after ras_ns_hisi_hbm_param_init() saw HISI_HBM_ISOLATION_PAGE set to "yes". */
+bool ras_ns_hisi_hbm_isolation_page_enabled(void)
|
||||
+{
|
||||
+ return ras_ns_hisi_hbm_isolation_page;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
|
||||
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
|
||||
}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
index 341206a..1c2a6e7 100644
|
||||
--- a/ras-non-standard-handler.h
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -46,4 +46,12 @@ void ras_ns_finalize_vendor_tables(void);
|
||||
static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+#include <stdbool.h>
|
||||
+
|
||||
+void ras_ns_hisi_hbm_param_init(void);
|
||||
+bool ras_ns_hisi_hbm_acls_enabled(void);
|
||||
+bool ras_ns_hisi_hbm_isolation_page_enabled(void);
|
||||
+#endif
|
||||
+
|
||||
#endif
|
||||
--
|
||||
2.30.0
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.7
|
||||
Release: 15
|
||||
Release: 16
|
||||
License: GPLv2
|
||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||
URL: https://github.com/mchehab/rasdaemon.git
|
||||
@ -55,6 +55,7 @@ Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
|
||||
Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
|
||||
Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
|
||||
Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch
|
||||
Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
|
||||
|
||||
%description
|
||||
The rasdaemon program is a daemon which monitors the platform
|
||||
@ -73,7 +74,7 @@ autoheader
|
||||
libtoolize --automake --copy --debug --force
|
||||
automake --add-missing
|
||||
%ifarch %{arm} aarch64
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-hisi-hbm-memory-acls
|
||||
%else
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror
|
||||
%endif
|
||||
@ -106,6 +107,12 @@ fi
|
||||
/usr/bin/systemctl disable rasdaemon.service >/dev/null 2>&1 || :
|
||||
|
||||
%changelog
|
||||
* Fri Dec 8 2023 Junhao He <hejunhao3@huawei.com> - 0.6.7-16
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:Add HBM Memory ACLS support for HiSilicon
|
||||
|
||||
* Wed Dec 6 2023 caijian <caijian11@h-partners.com> - 0.6.7-15
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user