diff --git a/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch new file mode 100644 index 0000000..c6b5161 --- /dev/null +++ b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch @@ -0,0 +1,287 @@ +From f37172d9fb8891f54d9e6cf218f9ff8482828b46 Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Fri, 13 Oct 2023 18:10:16 +0800 +Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon + +When a hardware error occurs in a cell of the HBM memory, the internal +SRAM of the memory controller is used to replace the faulty memory, this +method is ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS +RAS, and the rasdaemon record it and runs the ACLS to replace the faulty +memory. + +HBM ACLS can repair one cell (258-bit) memory at a time. The HBM can +check which HBM cell the physical address belongs to and filter invalid +HBM addresses. Multiple RAS errors are reported if memory errors occur +in different HBM cells. + +The feature depends on the linux kernel CONFIG_HISI_HBMDEV [1] and +CONFIG_HWPOISON_INJECT [2]. 
+ +[1]: https://gitee.com/openeuler/kernel/pulls/2757 +[2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c + +Signed-off-by: Junhao He +--- + configure.ac | 11 ++++ + misc/rasdaemon.env | 7 ++- + non-standard-hisilicon.c | 110 +++++++++++++++++++++++++++++++++++++ + ras-events.c | 3 + + ras-non-standard-handler.c | 32 +++++++++++ + ras-non-standard-handler.h | 8 +++ + 6 files changed, 170 insertions(+), 1 deletion(-) + +diff --git a/configure.ac b/configure.ac +index c7af727..f1e1487 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x + AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) + ++AC_ARG_ENABLE([hisi_hbm_memory_acls], ++ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS])) ++ ++AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS") ++ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS]) ++]) ++AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -212,4 +222,5 @@ compile time options summary + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION ++ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 7cb18e8..b20d527 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -43,4 +43,9 @@ CPU_CE_THRESHOLD="18" 
+ CPU_ISOLATION_CYCLE="24h"
+ 
+ # Prevent excessive isolation from causing an avalanche effect
+-CPU_ISOLATION_LIMIT="10"
+\ No newline at end of file
++CPU_ISOLATION_LIMIT="10"
++
++# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no).
++HISI_HBM_MEMORY_ACLS="no"
++# Make rasdaemon isolate (keep offline) the error page that HiSilicon HBM ACLS fails to repair (yes|no).
++HISI_HBM_ISOLATION_PAGE="no"
+diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
+index 756adf8..62a8386 100644
+--- a/non-standard-hisilicon.c
++++ b/non-standard-hisilicon.c
+@@ -19,6 +19,17 @@
+ #define HISI_BUF_LEN 2048
+ #define HISI_PCIE_INFO_BUF_LEN 256
+ 
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++# define HISI_TYPE_UINT32_WIDTH 32
++/* Specify the Hisilicon HBMC HBM error type */
++# define HISI_HBM_ERR_TYPE 0
++# define HISI_HBM_ERR_ACLS BIT(0)
++# define HISI_HBM_ACLS_ADDL 1
++# define HISI_HBM_ACLS_ADDH 2
++# define HISI_HBM_ACLS_ARRAY_SIZE 12
++# define HISI_SUBMOD_HBMC_HBM 6
++#endif
++
+ struct hisi_common_error_section {
+ 	uint32_t val_bits;
+ 	uint8_t version;
+@@ -360,6 +371,100 @@ static int add_hisi_common_table(struct ras_events *ras,
+ 	return 0;
+ }
+ 
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++#include <errno.h>
++#include <fcntl.h>
++#include <unistd.h>
++
++/*
++ * Write @value in hex to /sys/kernel/@name; return 0 or a negative errno.
++ * write(2) is used: buffered stdio would defer a rejected store to fclose().
++ */
++static int write_file(const char *name, unsigned long long value)
++{
++	char fname[MAX_PATH + 1];
++	char buf[32];
++	ssize_t n;
++	int fd, len, ret = 0;
++
++	len = snprintf(fname, sizeof(fname), "/sys/kernel/%s", name);
++	if (len < 0 || len >= (int)sizeof(fname))
++		return -ENAMETOOLONG;
++	fd = open(fname, O_WRONLY);
++	if (fd < 0) {
++		ret = -errno;	/* saved first: log() may clobber errno */
++		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
++		    fname, strerror(-ret));
++		return ret;
++	}
++
++	len = snprintf(buf, sizeof(buf), "0x%llx\n", value);
++	n = write(fd, buf, len);
++	if (n != len) {
++		ret = n < 0 ? -errno : -EIO;	/* short write: errno is not set */
++		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n",
++		    fname, value, strerror(-ret));
++	}
++
++	close(fd);
++	return ret;
++}
++
++/* ACLS sequence: query the address, poison its page, repair, then unpoison */
++static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err,
++			      int page_size)
++{
++	unsigned long long paddr, pfn;
++	int ret;
++
++	if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
++		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: No valid address array length (%d)\n",
++		    err->reg_array_size);
++		return -1;
++	}
++	if (page_size <= 0)	/* also guards the division below */
++		return -1;
++
++	paddr = ((unsigned long long)err->reg_array[HISI_HBM_ACLS_ADDH] <<
++		 HISI_TYPE_UINT32_WIDTH) + err->reg_array[HISI_HBM_ACLS_ADDL];
++	pfn = paddr / page_size;
++
++	ret = write_file("hbm_memory/acls/acls_query", paddr);
++	if (ret < 0)
++		return ret;
++
++	ret = write_file("debug/hwpoison/corrupt-pfn", pfn);
++	if (ret < 0)
++		return ret;
++
++	ret = write_file("hbm_memory/acls/acls_repair", paddr);
++	if (ret < 0 && ras_ns_hisi_hbm_isolation_page_enabled()) {
++		log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep the pfn (0x%llx) offline\n", pfn);
++		return ret;
++	}
++
++	return write_file("debug/hwpoison/unpoison-pfn", pfn);
++}
++
++/* Run ACLS only for HBMC/HBM records whose first register flags an ACLS error */
++static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err,
++				  int page_size)
++{
++	/* module_id is event data: bound it first (assumes module_name is an array) */
++	if (err->module_id >= sizeof(module_name) / sizeof(module_name[0]) ||
++	    strcmp(module_name[err->module_id], "HBMC") ||
++	    err->submodule_id != HISI_SUBMOD_HBMC_HBM)
++		return;
++
++	if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)) ||
++	    !(err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS))
++		return;
++
++	if (hisi_hbmc_hbm_acls(err, page_size))
++		log(TERM, LOG_WARNING, "Failed to handler HiSilicon HBM ACLS\n");
++}
++#endif
++
+ static int decode_hisi_common_section(struct ras_events *ras,
+ 				      struct ras_ns_ev_decoder *ev_decoder,
+ 				      struct trace_seq *s,
+@@ -395,6 +500,11 @@ static int decode_hisi_common_section(struct ras_events *ras,
+ 		step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
+ 	}
+ 
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++	if (ras_ns_hisi_hbm_acls_enabled())
++		hisi_hbm_acls_handler(err, ras->page_size);
++#endif
++
+ 	return 0;
+ }
+ 
+diff --git a/ras-events.c b/ras-events.c
+index 9093954..3b10525 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -901,6 +901,9 @@ int handle_ras_events(int record_events)
+ #endif
+ 
+ #ifdef HAVE_NON_STANDARD
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++	ras_ns_hisi_hbm_param_init();
++#endif
+ 	rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
+ 			       ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT);
+ 	if (!rc)
+diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
+index 20d514b..3ed0900 100644
+--- a/ras-non-standard-handler.c
++++ b/ras-non-standard-handler.c
+@@ -24,6 +24,38 @@
+ 
+ static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
+ 
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++static bool ras_ns_hisi_hbm_acls;
++static bool ras_ns_hisi_hbm_isolation_page;
++
++void ras_ns_hisi_hbm_param_init(void)
++{
++	char *env;
++
++	env = getenv("HISI_HBM_MEMORY_ACLS");
++	if (env && strcasecmp(env, "yes") == 0) {
++		log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n");
++		ras_ns_hisi_hbm_acls = true;
++	}
++
++	env = getenv("HISI_HBM_ISOLATION_PAGE");
++	if (env && strcasecmp(env, "yes") == 0) {
++		log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n");
++		ras_ns_hisi_hbm_isolation_page = true;
++	}
++}
++
++bool ras_ns_hisi_hbm_acls_enabled(void)
++{
++	return ras_ns_hisi_hbm_acls;
++}
++
++bool ras_ns_hisi_hbm_isolation_page_enabled(void)
++{
++	return ras_ns_hisi_hbm_isolation_page;
++}
++#endif
++
+ void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
+ 	trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
+ }
+diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
+index 341206a..1c2a6e7 100644
+--- a/ras-non-standard-handler.h
++++ b/ras-non-standard-handler.h
+@@ -46,4 +46,12 @@ void ras_ns_finalize_vendor_tables(void);
+ static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
+ #endif
+ 
++#ifdef HAVE_HISI_HBM_MEMORY_ACLS
++#include <stdbool.h>
++/* HBM ACLS switches, set from HISI_HBM_MEMORY_ACLS / HISI_HBM_ISOLATION_PAGE */
++void ras_ns_hisi_hbm_param_init(void);
++bool ras_ns_hisi_hbm_acls_enabled(void);
++bool ras_ns_hisi_hbm_isolation_page_enabled(void);
++#endif
++
+ #endif
+-- 
+2.30.0
+
diff --git a/rasdaemon.spec b/rasdaemon.spec
index 61e4823..58219f9 100644
--- a/rasdaemon.spec
+++ b/rasdaemon.spec
@@ -1,6 +1,6 @@
 Name: rasdaemon
 Version: 0.6.7
-Release: 15
+Release: 16
 License: GPLv2
 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
 URL: https://github.com/mchehab/rasdaemon.git
@@ -55,6 +55,7 @@ Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
 Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
 Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
 Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch
+Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
 
 %description
 The rasdaemon program is a daemon which monitors the platform
@@ -73,7 +74,7 @@ autoheader
 libtoolize --automake --copy --debug --force
 automake --add-missing
 %ifarch %{arm} aarch64
-%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation
+%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-hisi-hbm-memory-acls
 %else
 %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror
 %endif
@@ -106,6 +107,12 @@ fi
 /usr/bin/systemctl disable rasdaemon.service >/dev/null 2>&1 || :
 
 %changelog
+* Fri Dec 8 2023 Junhao He - 0.6.7-16
+- Type:feature
+- ID:NA
+- SUG:NA
+- DESC:Add HBM Memory ACLS support for HiSilicon
+
 * Wed Dec 6 2023 caijian - 0.6.7-15
 - Type:bugfix
 - ID:NA