revert "rasdaemon: Add HBM Memory ACLS support for HiSilicon"

This reverts commit 156e08f2fdc6eef9df379384aa867860db27a085

The HBM ACLS scheme that rasdaemon relies on has changed. Moreover, the
new solution only needs to be submitted to the openEuler-22.03-LTS-SP4
branch, so this branch drops support for HiSilicon HBM Memory ACLS.

Signed-off-by: Junhao He <hejunhao3@huawei.com>
This commit is contained in:
Junhao He 2024-09-02 10:47:34 +08:00
parent 1aac672887
commit ce79115bd5
2 changed files with 283 additions and 2 deletions

View File

@ -0,0 +1,274 @@
From 77c2ccb26b5da0c24a82ae956164fe527723dabd Mon Sep 17 00:00:00 2001
From: Junhao He <hejunhao3@huawei.com>
Date: Mon, 2 Sep 2024 10:24:59 +0800
Subject: [PATCH] revert "rasdaemon: Add HBM Memory ACLS support for HiSilicon"
The HBM ACLS scheme that rasdaemon relies on has changed. Moreover, the
new solution only needs to be uploaded to the openEuler-22.03-LTS-SP4
branch, so this branch cancels support for HiSilicon HBM Memory ACLS.
Signed-off-by: Junhao He <hejunhao3@huawei.com>
---
configure.ac | 11 ----
misc/rasdaemon.env | 5 --
non-standard-hisilicon.c | 110 -------------------------------------
ras-events.c | 3 -
ras-non-standard-handler.c | 32 -----------
ras-non-standard-handler.h | 8 ---
6 files changed, 169 deletions(-)
diff --git a/configure.ac b/configure.ac
index 30c90d2..d098fcf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -171,16 +171,6 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x
AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
-AC_ARG_ENABLE([hisi_hbm_memory_acls],
- AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS]))
-
-AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [
- AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS")
- AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS])
-])
-AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes])
-AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"])
-
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
@@ -222,5 +212,4 @@ compile time options summary
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
- HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS
EOF
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index d754128..ca12a1a 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -45,10 +45,5 @@ CPU_ISOLATION_CYCLE="24h"
# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"
-# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no).
-HISI_HBM_MEMORY_ACLS="no"
-# Specify rasdaemon to isolation the error page which fails to be repaired by HiSilicon HBM ACLS (yes|no).
-HISI_HBM_ISOLATION_PAGE="no"
-
# Disable specified events by config
DISABLE="block:block_rq_complete"
\ No newline at end of file
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
index 25c4903..7296d28 100644
--- a/non-standard-hisilicon.c
+++ b/non-standard-hisilicon.c
@@ -19,17 +19,6 @@
#define HISI_BUF_LEN 2048
#define HISI_PCIE_INFO_BUF_LEN 256
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
-# define HISI_TYPE_UINT32_WIDTH 32
-/* Specify the Hisilicon HBMC HBM error type */
-# define HISI_HBM_ERR_TYPE 0
-# define HISI_HBM_ERR_ACLS BIT(0)
-# define HISI_HBM_ACLS_ADDL 1
-# define HISI_HBM_ACLS_ADDH 2
-# define HISI_HBM_ACLS_ARRAY_SIZE 12
-# define HISI_SUBMOD_HBMC_HBM 6
-#endif
-
struct hisi_common_error_section {
uint32_t val_bits;
uint8_t version;
@@ -369,100 +358,6 @@ static int add_hisi_common_table(struct ras_events *ras,
return 0;
}
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
-#include <errno.h>
-#include <unistd.h>
-
-static int write_file(const char *name, unsigned long long value)
-{
- char fname[MAX_PATH + 1] = "/sys/kernel/";
- FILE *file;
- int ret;
-
- strcat(fname, name);
- if (access(fname, W_OK)) {
- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot access '%s': %s\n",
- fname, strerror(errno));
- return -errno;
- }
-
- file = fopen(fname, "w");
- if (!file) {
- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n",
- fname, strerror(errno));
- return -errno;
- }
-
- ret = fprintf(file, "0x%llx\n", value);
- if (ret < 0)
- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n",
- fname, value, strerror(errno));
-
- fclose(file);
- return ret;
-}
-
-static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err,
- int page_size)
-{
- unsigned long long paddr;
- unsigned long long pfn;
- int ret;
-
- if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: No valid address array length (%d)\n",
- err->reg_array_size);
- return -1;
- }
-
- if (!page_size)
- return -1;
-
- paddr = err->reg_array[HISI_HBM_ACLS_ADDH];
- paddr <<= HISI_TYPE_UINT32_WIDTH;
- paddr += err->reg_array[HISI_HBM_ACLS_ADDL];
- pfn = paddr / page_size;
-
- ret = write_file("hbm_memory/acls/acls_query", paddr);
- if (ret < 0)
- return ret;
-
- ret = write_file("debug/hwpoison/corrupt-pfn", pfn);
- if (ret < 0)
- return ret;
-
- ret = write_file("hbm_memory/acls/acls_repair", paddr);
- if (ret < 0 && ras_ns_hisi_hbm_isolation_page_enabled()) {
- log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep the pfn (0x%llx) offline\n",
- pfn);
- return ret;
- }
-
- ret = write_file("debug/hwpoison/unpoison-pfn", pfn);
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err,
- int page_size)
-{
- if (strcmp(module_name[err->module_id], "HBMC") ||
- err->submodule_id != HISI_SUBMOD_HBMC_HBM)
- return;
-
- if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)))
- return;
-
- if (!(err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS))
- return;
-
- if (hisi_hbmc_hbm_acls(err, page_size))
- log(TERM, LOG_WARNING, "Failed to handler HiSilicon HBM ACLS\n");
-}
-#endif
-
static int decode_hisi_common_section(struct ras_events *ras,
struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
@@ -498,11 +393,6 @@ static int decode_hisi_common_section(struct ras_events *ras,
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
}
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
- if (ras_ns_hisi_hbm_acls_enabled())
- hisi_hbm_acls_handler(err, ras->page_size);
-#endif
-
return 0;
}
diff --git a/ras-events.c b/ras-events.c
index d2a7a4e..ed2198b 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -951,9 +951,6 @@ int handle_ras_events(int record_events)
#endif
#ifdef HAVE_NON_STANDARD
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
- ras_ns_hisi_hbm_param_init();
-#endif
if (is_disabled_event("ras", "non_standard_event")) {
log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
"ras", "non_standard_event");
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
index 3ed0900..20d514b 100644
--- a/ras-non-standard-handler.c
+++ b/ras-non-standard-handler.c
@@ -24,38 +24,6 @@
static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
-static bool ras_ns_hisi_hbm_acls;
-static bool ras_ns_hisi_hbm_isolation_page;
-
-void ras_ns_hisi_hbm_param_init(void)
-{
- char *env;
-
- env = getenv("HISI_HBM_MEMORY_ACLS");
- if (env && strcasecmp(env, "yes") == 0) {
- log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n");
- ras_ns_hisi_hbm_acls = true;
- }
-
- env = getenv("HISI_HBM_ISOLATION_PAGE");
- if (env && strcasecmp(env, "yes") == 0) {
- log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n");
- ras_ns_hisi_hbm_isolation_page = true;
- }
-}
-
-bool ras_ns_hisi_hbm_acls_enabled(void)
-{
- return ras_ns_hisi_hbm_acls;
-}
-
-bool ras_ns_hisi_hbm_isolation_page_enabled(void)
-{
- return ras_ns_hisi_hbm_isolation_page;
-}
-#endif
-
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
}
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
index 1c2a6e7..341206a 100644
--- a/ras-non-standard-handler.h
+++ b/ras-non-standard-handler.h
@@ -46,12 +46,4 @@ void ras_ns_finalize_vendor_tables(void);
static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
#endif
-#ifdef HAVE_HISI_HBM_MEMORY_ACLS
-#include <stdbool.h>
-
-void ras_ns_hisi_hbm_param_init(void);
-bool ras_ns_hisi_hbm_acls_enabled(void);
-bool ras_ns_hisi_hbm_isolation_page_enabled(void);
-#endif
-
#endif
--
2.33.0

View File

@ -1,6 +1,6 @@
Name: rasdaemon Name: rasdaemon
Version: 0.6.7 Version: 0.6.7
Release: 20 Release: 21
License: GPLv2 License: GPLv2
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
URL: https://github.com/mchehab/rasdaemon.git URL: https://github.com/mchehab/rasdaemon.git
@ -60,6 +60,7 @@ Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
Patch9009: add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch Patch9009: add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch
Patch9010: fix-rasdaemon-print-loading-config-logs-multiple-times.patch Patch9010: fix-rasdaemon-print-loading-config-logs-multiple-times.patch
Patch9011: bugfix-fix-cpu-isolate-errors-when-some-cpus-are-.patch Patch9011: bugfix-fix-cpu-isolate-errors-when-some-cpus-are-.patch
Patch9012: 0001-revert-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiS.patch
%description %description
@ -79,7 +80,7 @@ autoheader
libtoolize --automake --copy --debug --force libtoolize --automake --copy --debug --force
automake --add-missing automake --add-missing
%ifarch %{arm} aarch64 %ifarch %{arm} aarch64
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-hisi-hbm-memory-acls %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation
%else %else
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror
%endif %endif
@ -114,6 +115,12 @@ if [ $1 -eq 0 ] ; then
fi fi
%changelog %changelog
* Mon Sep 2 2024 Junhao He <hejunhao3@huawei.com> - 0.6.7-21
- Type:bugfix
- ID:NA
- SUG:NA
- DESC:Remove the support for HiSilicon HBM Memory ACLS.
* Thu Apr 25 2024 yangjunshuo <yangjunshuo@huawei.com> - 0.6.7-20 * Thu Apr 25 2024 yangjunshuo <yangjunshuo@huawei.com> - 0.6.7-20
- Type:bugfix - Type:bugfix
- ID:NA - ID:NA