!114 rasdaemon: Add HBM Memory ACLS support for HiSilicon
From: @hejunhao3 Reviewed-by: @lvying6 Signed-off-by: @lvying6
This commit is contained in:
commit
01a769dc66
287
0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
Normal file
287
0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
Normal file
@ -0,0 +1,287 @@
|
||||
From f37172d9fb8891f54d9e6cf218f9ff8482828b46 Mon Sep 17 00:00:00 2001
|
||||
From: Junhao He <hejunhao3@huawei.com>
|
||||
Date: Fri, 13 Oct 2023 18:10:16 +0800
|
||||
Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon
|
||||
|
||||
When a hardware error occurs in a cell of the HBM memory, the internal
|
||||
SRAM of the memory controller is used to replace the faulty memory; this
|
||||
method is ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS
|
||||
RAS, and the rasdaemon records it and runs the ACLS to replace the faulty
|
||||
memory.
|
||||
|
||||
HBM ACLS can repair one (258-bit) memory cell at a time. The HBM can
|
||||
check which HBM cell the physical address belongs to and filter invalid
|
||||
HBM addresses. Multiple RAS errors are reported if memory errors occur
|
||||
in different HBM cells.
|
||||
|
||||
The feature depends on the Linux kernel options CONFIG_HISI_HBMDEV [1] and
|
||||
CONFIG_HWPOISON_INJECT [2].
|
||||
|
||||
[1]: https://gitee.com/openeuler/kernel/pulls/2757
|
||||
[2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c
|
||||
|
||||
Signed-off-by: Junhao He <hejunhao3@huawei.com>
|
||||
---
|
||||
configure.ac | 11 ++++
|
||||
misc/rasdaemon.env | 7 ++-
|
||||
non-standard-hisilicon.c | 110 +++++++++++++++++++++++++++++++++++++
|
||||
ras-events.c | 3 +
|
||||
ras-non-standard-handler.c | 32 +++++++++++
|
||||
ras-non-standard-handler.h | 8 +++
|
||||
6 files changed, 170 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index c7af727..f1e1487 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x
|
||||
AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
|
||||
AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
|
||||
|
||||
+AC_ARG_ENABLE([hisi_hbm_memory_acls],
|
||||
+ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS]))
|
||||
+
|
||||
+AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [
|
||||
+ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS")
|
||||
+ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes])
|
||||
+AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"])
|
||||
+
|
||||
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||||
|
||||
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||||
@@ -212,4 +222,5 @@ compile time options summary
|
||||
Memory CE PFA : $USE_MEMORY_CE_PFA
|
||||
AMP RAS errors : $USE_AMP_NS_DECODE
|
||||
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
|
||||
+ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS
|
||||
EOF
|
||||
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
|
||||
index 7cb18e8..b20d527 100644
|
||||
--- a/misc/rasdaemon.env
|
||||
+++ b/misc/rasdaemon.env
|
||||
@@ -43,4 +43,9 @@ CPU_CE_THRESHOLD="18"
|
||||
CPU_ISOLATION_CYCLE="24h"
|
||||
|
||||
# Prevent excessive isolation from causing an avalanche effect
|
||||
-CPU_ISOLATION_LIMIT="10"
|
||||
\ No newline at end of file
|
||||
+CPU_ISOLATION_LIMIT="10"
|
||||
+
|
||||
+# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on the HiSilicon platform (yes|no).
|
||||
+HISI_HBM_MEMORY_ACLS="no"
|
||||
+# Specify whether rasdaemon isolates the error page that HiSilicon HBM ACLS fails to repair (yes|no).
|
||||
+HISI_HBM_ISOLATION_PAGE="no"
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 756adf8..62a8386 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -19,6 +19,17 @@
|
||||
#define HISI_BUF_LEN 2048
|
||||
#define HISI_PCIE_INFO_BUF_LEN 256
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+# define HISI_TYPE_UINT32_WIDTH 32
|
||||
+/* Specify the Hisilicon HBMC HBM error type */
|
||||
+# define HISI_HBM_ERR_TYPE 0
|
||||
+# define HISI_HBM_ERR_ACLS BIT(0)
|
||||
+# define HISI_HBM_ACLS_ADDL 1
|
||||
+# define HISI_HBM_ACLS_ADDH 2
|
||||
+# define HISI_HBM_ACLS_ARRAY_SIZE 12
|
||||
+# define HISI_SUBMOD_HBMC_HBM 6
|
||||
+#endif
|
||||
+
|
||||
struct hisi_common_error_section {
|
||||
uint32_t val_bits;
|
||||
uint8_t version;
|
||||
@@ -360,6 +371,100 @@ static int add_hisi_common_table(struct ras_events *ras,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+#include <errno.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+static int write_file(const char *name, unsigned long long value)
|
||||
+{
|
||||
+ char fname[MAX_PATH + 1] = "/sys/kernel/";
|
||||
+ FILE *file;
|
||||
+ int ret;
|
||||
+
|
||||
+ strcat(fname, name);
|
||||
+ if (access(fname, W_OK)) {
|
||||
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot access '%s': %s\n",
|
||||
+ fname, strerror(errno));
|
||||
+ return -errno;
|
||||
+ }
|
||||
+
|
||||
+ file = fopen(fname, "w");
|
||||
+ if (!file) {
|
||||
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
|
||||
+ fname, strerror(errno));
|
||||
+ return -errno;
|
||||
+ }
|
||||
+
|
||||
+ ret = fprintf(file, "0x%llx\n", value);
|
||||
+ if (ret < 0)
|
||||
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n",
|
||||
+ fname, value, strerror(errno));
|
||||
+
|
||||
+ fclose(file);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err,
|
||||
+ int page_size)
|
||||
+{
|
||||
+ unsigned long long paddr;
|
||||
+ unsigned long long pfn;
|
||||
+ int ret;
|
||||
+
|
||||
+ if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
|
||||
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: No valid address array length (%d)\n",
|
||||
+ err->reg_array_size);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ if (!page_size)
|
||||
+ return -1;
|
||||
+
|
||||
+ paddr = err->reg_array[HISI_HBM_ACLS_ADDH];
|
||||
+ paddr <<= HISI_TYPE_UINT32_WIDTH;
|
||||
+ paddr += err->reg_array[HISI_HBM_ACLS_ADDL];
|
||||
+ pfn = paddr / page_size;
|
||||
+
|
||||
+ ret = write_file("hbm_memory/acls/acls_query", paddr);
|
||||
+ if (ret < 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ ret = write_file("debug/hwpoison/corrupt-pfn", pfn);
|
||||
+ if (ret < 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ ret = write_file("hbm_memory/acls/acls_repair", paddr);
|
||||
+ if (ret < 0 && ras_ns_hisi_hbm_isolation_page_enabled()) {
|
||||
+ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep the pfn (0x%llx) offline\n",
|
||||
+ pfn);
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ ret = write_file("debug/hwpoison/unpoison-pfn", pfn);
|
||||
+ if (ret < 0)
|
||||
+ return ret;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err,
|
||||
+ int page_size)
|
||||
+{
|
||||
+ if (strcmp(module_name[err->module_id], "HBMC") ||
|
||||
+ err->submodule_id != HISI_SUBMOD_HBMC_HBM)
|
||||
+ return;
|
||||
+
|
||||
+ if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)))
|
||||
+ return;
|
||||
+
|
||||
+ if (!(err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS))
|
||||
+ return;
|
||||
+
|
||||
+ if (hisi_hbmc_hbm_acls(err, page_size))
|
||||
+ log(TERM, LOG_WARNING, "Failed to handler HiSilicon HBM ACLS\n");
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
static int decode_hisi_common_section(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
@@ -395,6 +500,11 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||||
}
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+ if (ras_ns_hisi_hbm_acls_enabled())
|
||||
+ hisi_hbm_acls_handler(err, ras->page_size);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 9093954..3b10525 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -901,6 +901,9 @@ int handle_ras_events(int record_events)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NON_STANDARD
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+ ras_ns_hisi_hbm_param_init();
|
||||
+#endif
|
||||
rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
|
||||
ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT);
|
||||
if (!rc)
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index 20d514b..3ed0900 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -24,6 +24,38 @@
|
||||
|
||||
static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+static bool ras_ns_hisi_hbm_acls;
|
||||
+static bool ras_ns_hisi_hbm_isolation_page;
|
||||
+
|
||||
+void ras_ns_hisi_hbm_param_init(void)
|
||||
+{
|
||||
+ char *env;
|
||||
+
|
||||
+ env = getenv("HISI_HBM_MEMORY_ACLS");
|
||||
+ if (env && strcasecmp(env, "yes") == 0) {
|
||||
+ log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n");
|
||||
+ ras_ns_hisi_hbm_acls = true;
|
||||
+ }
|
||||
+
|
||||
+ env = getenv("HISI_HBM_ISOLATION_PAGE");
|
||||
+ if (env && strcasecmp(env, "yes") == 0) {
|
||||
+ log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n");
|
||||
+ ras_ns_hisi_hbm_isolation_page = true;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+bool ras_ns_hisi_hbm_acls_enabled(void)
|
||||
+{
|
||||
+ return ras_ns_hisi_hbm_acls;
|
||||
+}
|
||||
+
|
||||
+bool ras_ns_hisi_hbm_isolation_page_enabled(void)
|
||||
+{
|
||||
+ return ras_ns_hisi_hbm_isolation_page;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
|
||||
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
|
||||
}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
index 341206a..1c2a6e7 100644
|
||||
--- a/ras-non-standard-handler.h
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -46,4 +46,12 @@ void ras_ns_finalize_vendor_tables(void);
|
||||
static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_HISI_HBM_MEMORY_ACLS
|
||||
+#include <stdbool.h>
|
||||
+
|
||||
+void ras_ns_hisi_hbm_param_init(void);
|
||||
+bool ras_ns_hisi_hbm_acls_enabled(void);
|
||||
+bool ras_ns_hisi_hbm_isolation_page_enabled(void);
|
||||
+#endif
|
||||
+
|
||||
#endif
|
||||
--
|
||||
2.30.0
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.7
|
||||
Release: 15
|
||||
Release: 16
|
||||
License: GPLv2
|
||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||
URL: https://github.com/mchehab/rasdaemon.git
|
||||
@ -55,6 +55,7 @@ Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
|
||||
Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
|
||||
Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
|
||||
Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch
|
||||
Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
|
||||
|
||||
%description
|
||||
The rasdaemon program is a daemon which monitors the platform
|
||||
@ -73,7 +74,7 @@ autoheader
|
||||
libtoolize --automake --copy --debug --force
|
||||
automake --add-missing
|
||||
%ifarch %{arm} aarch64
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-hisi-hbm-memory-acls
|
||||
%else
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror
|
||||
%endif
|
||||
@ -106,6 +107,12 @@ fi
|
||||
/usr/bin/systemctl disable rasdaemon.service >/dev/null 2>&1 || :
|
||||
|
||||
%changelog
|
||||
* Fri Dec 8 2023 Junhao He <hejunhao3@huawei.com> - 0.6.7-16
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:Add HBM Memory ACLS support for HiSilicon
|
||||
|
||||
* Wed Dec 6 2023 caijian <caijian11@h-partners.com> - 0.6.7-15
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user