A process may map only some of the pages in a folio, and might be missed
if it maps the poisoned page but not the head page, or it might be
unnecessarily hit if it maps the head page but not the poisoned page.
Fixes: 7af446a841a2 ("HWPOISON, hugetlb: enable error handling path for hugepage")
Cc: stable(a)vger.kernel.org
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
---
mm/memory-failure.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6953bda11e6e..82e15baabb48 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1570,7 +1570,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(hpage))
+ if (!page_mapped(p))
return true;
if (PageSwapCache(p)) {
@@ -1621,10 +1621,10 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
try_to_unmap(folio, ttu);
}
- unmap_success = !page_mapped(hpage);
+ unmap_success = !page_mapped(p);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(p));
/*
* try_to_unmap() might put mlocked page in lru cache, so call
--
2.42.0
There are two major types of uncorrected errors (UC):
- Action Required: The error is detected and the processor has already
consumed the memory. The OS must take action (for example, offline the
failing page or kill the failing thread) to recover from this
uncorrectable error.
- Action Optional: The error is detected outside of the processor execution
context. Some data in memory is corrupted, but it has not been consumed.
The OS may optionally take action to recover from this uncorrectable error.
For X86 platforms, we can easily distinguish between these two types
based on the MCA Bank. On arm64 platforms, however, the memory failure
flags for all UCs whose severity is GHES_SEV_RECOVERABLE are currently
set to 0, i.e., Action Optional.
If a UC is detected by a background scrubber, it is obviously an Action
Optional error. For other errors, we should conservatively regard them
as Action Required.
cper_sec_mem_err::error_type identifies the type of error that occurred
if CPER_MEM_VALID_ERROR_TYPE is set. So, set the memory failure flags to 0
for Scrub Uncorrected Error (type 14). Otherwise, set the memory failure
flags to MF_ACTION_REQUIRED.
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
---
drivers/acpi/apei/ghes.c | 10 ++++++++--
include/linux/cper.h | 3 +++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 80ad530583c9..6c03059cbfc6 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -474,8 +474,14 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
if (sec_sev == GHES_SEV_CORRECTED &&
(gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
flags = MF_SOFT_OFFLINE;
- if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
- flags = 0;
+ if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) {
+ if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+ flags = mem_err->error_type == CPER_MEM_SCRUB_UC ?
+ 0 :
+ MF_ACTION_REQUIRED;
+ else
+ flags = MF_ACTION_REQUIRED;
+ }
if (flags != -1)
return ghes_do_memory_failure(mem_err->physical_addr, flags);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index eacb7dd7b3af..b77ab7636614 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -235,6 +235,9 @@ enum {
#define CPER_MEM_VALID_BANK_ADDRESS 0x100000
#define CPER_MEM_VALID_CHIP_ID 0x200000
+#define CPER_MEM_SCRUB_CE 13
+#define CPER_MEM_SCRUB_UC 14
+
#define CPER_MEM_EXT_ROW_MASK 0x3
#define CPER_MEM_EXT_ROW_SHIFT 16
--
2.20.1.9.gb50a0d7
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Commit 834449872105 ("sc16is7xx: Fix for multi-channel stall") stopped
sc16is7xx_port_irq() from looping multiple times when there were still
interrupts to serve. It simply changed the do {} while(1) loop to a
do {} while(0) loop, which makes the loop itself obsolete.
Clean up the code by removing this obsolete do {} while(0) loop.
Fixes: 834449872105 ("sc16is7xx: Fix for multi-channel stall")
Cc: stable(a)vger.kernel.org
Suggested-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
---
drivers/tty/serial/sc16is7xx.c | 81 ++++++++++++++++------------------
1 file changed, 39 insertions(+), 42 deletions(-)
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index ced2446909a2..44a11c89c949 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -725,58 +725,55 @@ static void sc16is7xx_update_mlines(struct sc16is7xx_one *one)
static bool sc16is7xx_port_irq(struct sc16is7xx_port *s, int portno)
{
bool rc = true;
+ unsigned int iir, rxlen;
struct uart_port *port = &s->p[portno].port;
struct sc16is7xx_one *one = to_sc16is7xx_one(port, port);
mutex_lock(&one->efr_lock);
- do {
- unsigned int iir, rxlen;
+ iir = sc16is7xx_port_read(port, SC16IS7XX_IIR_REG);
+ if (iir & SC16IS7XX_IIR_NO_INT_BIT) {
+ rc = false;
+ goto out_port_irq;
+ }
- iir = sc16is7xx_port_read(port, SC16IS7XX_IIR_REG);
- if (iir & SC16IS7XX_IIR_NO_INT_BIT) {
- rc = false;
- goto out_port_irq;
- }
+ iir &= SC16IS7XX_IIR_ID_MASK;
- iir &= SC16IS7XX_IIR_ID_MASK;
+ switch (iir) {
+ case SC16IS7XX_IIR_RDI_SRC:
+ case SC16IS7XX_IIR_RLSE_SRC:
+ case SC16IS7XX_IIR_RTOI_SRC:
+ case SC16IS7XX_IIR_XOFFI_SRC:
+ rxlen = sc16is7xx_port_read(port, SC16IS7XX_RXLVL_REG);
- switch (iir) {
- case SC16IS7XX_IIR_RDI_SRC:
- case SC16IS7XX_IIR_RLSE_SRC:
- case SC16IS7XX_IIR_RTOI_SRC:
- case SC16IS7XX_IIR_XOFFI_SRC:
- rxlen = sc16is7xx_port_read(port, SC16IS7XX_RXLVL_REG);
+ /*
+ * There is a silicon bug that makes the chip report a
+ * time-out interrupt but no data in the FIFO. This is
+ * described in errata section 18.1.4.
+ *
+ * When this happens, read one byte from the FIFO to
+ * clear the interrupt.
+ */
+ if (iir == SC16IS7XX_IIR_RTOI_SRC && !rxlen)
+ rxlen = 1;
- /*
- * There is a silicon bug that makes the chip report a
- * time-out interrupt but no data in the FIFO. This is
- * described in errata section 18.1.4.
- *
- * When this happens, read one byte from the FIFO to
- * clear the interrupt.
- */
- if (iir == SC16IS7XX_IIR_RTOI_SRC && !rxlen)
- rxlen = 1;
-
- if (rxlen)
- sc16is7xx_handle_rx(port, rxlen, iir);
- break;
+ if (rxlen)
+ sc16is7xx_handle_rx(port, rxlen, iir);
+ break;
/* CTSRTS interrupt comes only when CTS goes inactive */
- case SC16IS7XX_IIR_CTSRTS_SRC:
- case SC16IS7XX_IIR_MSI_SRC:
- sc16is7xx_update_mlines(one);
- break;
- case SC16IS7XX_IIR_THRI_SRC:
- sc16is7xx_handle_tx(port);
- break;
- default:
- dev_err_ratelimited(port->dev,
- "ttySC%i: Unexpected interrupt: %x",
- port->line, iir);
- break;
- }
- } while (0);
+ case SC16IS7XX_IIR_CTSRTS_SRC:
+ case SC16IS7XX_IIR_MSI_SRC:
+ sc16is7xx_update_mlines(one);
+ break;
+ case SC16IS7XX_IIR_THRI_SRC:
+ sc16is7xx_handle_tx(port);
+ break;
+ default:
+ dev_err_ratelimited(port->dev,
+ "ttySC%i: Unexpected interrupt: %x",
+ port->line, iir);
+ break;
+ }
out_port_irq:
mutex_unlock(&one->efr_lock);
--
2.39.2