The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e73b0101ae5124bf7cd3fb5d250302ad2f16a416 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch(a)tkos.co.il>
Date: Sun, 17 Jan 2021 15:17:02 +0200
Subject: [PATCH] gpio: mvebu: fix pwm .get_state period calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The period is the sum of on and off values. That is, calculate period as
($on + $off) / clkrate
instead of
$off / clkrate - $on / clkrate
that makes no sense.
Reported-by: Russell King <linux(a)armlinux.org.uk>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
Fixes: 757642f9a584e ("gpio: mvebu: Add limited PWM support")
Signed-off-by: Baruch Siach <baruch(a)tkos.co.il>
Signed-off-by: Bartosz Golaszewski <bgolaszewski(a)baylibre.com>
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 672681a976f5..a912a8fed197 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -676,20 +676,17 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip,
else
state->duty_cycle = 1;
+ val = (unsigned long long) u; /* on duration */
regmap_read(mvpwm->regs, mvebu_pwmreg_blink_off_duration(mvpwm), &u);
- val = (unsigned long long) u * NSEC_PER_SEC;
+ val += (unsigned long long) u; /* period = on + off duration */
+ val *= NSEC_PER_SEC;
do_div(val, mvpwm->clk_rate);
- if (val < state->duty_cycle) {
+ if (val > UINT_MAX)
+ state->period = UINT_MAX;
+ else if (val)
+ state->period = val;
+ else
state->period = 1;
- } else {
- val -= state->duty_cycle;
- if (val > UINT_MAX)
- state->period = UINT_MAX;
- else if (val)
- state->period = val;
- else
- state->period = 1;
- }
regmap_read(mvchip->regs, GPIO_BLINK_EN_OFF + mvchip->offset, &u);
if (u)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e73b0101ae5124bf7cd3fb5d250302ad2f16a416 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch(a)tkos.co.il>
Date: Sun, 17 Jan 2021 15:17:02 +0200
Subject: [PATCH] gpio: mvebu: fix pwm .get_state period calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The period is the sum of on and off values. That is, calculate period as
($on + $off) / clkrate
instead of
$off / clkrate - $on / clkrate
that makes no sense.
Reported-by: Russell King <linux(a)armlinux.org.uk>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
Fixes: 757642f9a584e ("gpio: mvebu: Add limited PWM support")
Signed-off-by: Baruch Siach <baruch(a)tkos.co.il>
Signed-off-by: Bartosz Golaszewski <bgolaszewski(a)baylibre.com>
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 672681a976f5..a912a8fed197 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -676,20 +676,17 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip,
else
state->duty_cycle = 1;
+ val = (unsigned long long) u; /* on duration */
regmap_read(mvpwm->regs, mvebu_pwmreg_blink_off_duration(mvpwm), &u);
- val = (unsigned long long) u * NSEC_PER_SEC;
+ val += (unsigned long long) u; /* period = on + off duration */
+ val *= NSEC_PER_SEC;
do_div(val, mvpwm->clk_rate);
- if (val < state->duty_cycle) {
+ if (val > UINT_MAX)
+ state->period = UINT_MAX;
+ else if (val)
+ state->period = val;
+ else
state->period = 1;
- } else {
- val -= state->duty_cycle;
- if (val > UINT_MAX)
- state->period = UINT_MAX;
- else if (val)
- state->period = val;
- else
- state->period = 1;
- }
regmap_read(mvchip->regs, GPIO_BLINK_EN_OFF + mvchip->offset, &u);
if (u)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e73b0101ae5124bf7cd3fb5d250302ad2f16a416 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch(a)tkos.co.il>
Date: Sun, 17 Jan 2021 15:17:02 +0200
Subject: [PATCH] gpio: mvebu: fix pwm .get_state period calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The period is the sum of on and off values. That is, calculate period as
($on + $off) / clkrate
instead of
$off / clkrate - $on / clkrate
that makes no sense.
Reported-by: Russell King <linux(a)armlinux.org.uk>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
Fixes: 757642f9a584e ("gpio: mvebu: Add limited PWM support")
Signed-off-by: Baruch Siach <baruch(a)tkos.co.il>
Signed-off-by: Bartosz Golaszewski <bgolaszewski(a)baylibre.com>
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 672681a976f5..a912a8fed197 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -676,20 +676,17 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip,
else
state->duty_cycle = 1;
+ val = (unsigned long long) u; /* on duration */
regmap_read(mvpwm->regs, mvebu_pwmreg_blink_off_duration(mvpwm), &u);
- val = (unsigned long long) u * NSEC_PER_SEC;
+ val += (unsigned long long) u; /* period = on + off duration */
+ val *= NSEC_PER_SEC;
do_div(val, mvpwm->clk_rate);
- if (val < state->duty_cycle) {
+ if (val > UINT_MAX)
+ state->period = UINT_MAX;
+ else if (val)
+ state->period = val;
+ else
state->period = 1;
- } else {
- val -= state->duty_cycle;
- if (val > UINT_MAX)
- state->period = UINT_MAX;
- else if (val)
- state->period = val;
- else
- state->period = 1;
- }
regmap_read(mvchip->regs, GPIO_BLINK_EN_OFF + mvchip->offset, &u);
if (u)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9c7d9017a49fb8516c13b7bff59b7da2abed23e1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Date: Fri, 8 Jan 2021 19:05:59 +0100
Subject: [PATCH] x86: PM: Register syscore_ops for scale invariance
On x86 scale invariace tends to be disabled during resume from
suspend-to-RAM, because the MPERF or APERF MSR values are not as
expected then due to updates taking place after the platform
firmware has been invoked to complete the suspend transition.
That, of course, is not desirable, especially if the schedutil
scaling governor is in use, because the lack of scale invariance
causes it to be less reliable.
To counter that effect, modify init_freq_invariance() to register
a syscore_ops object for scale invariance with the ->resume callback
pointing to init_counter_refs() which will run on the CPU starting
the resume transition (the other CPUs will be taken care of the
"online" operations taking place later).
Fixes: e2b0d619b400 ("x86, sched: check for counters overflow in frequency invariant accounting")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Acked-by: Giovanni Gherdovich <ggherdovich(a)suse.cz>
Link: https://lkml.kernel.org/r/1803209.Mvru99baaF@kreacher
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ca66af96a54..117e24fbfd8a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
+#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
+{
+ /* Bail out if registered already. */
+ if (freq_invariance_syscore_ops.node.prev)
+ return;
+
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
+
static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cf9d052aa6005f1e8dfaf491d83bf37f368af69e Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Thu, 14 Jan 2021 19:16:24 -0800
Subject: [PATCH] pinctrl: qcom: Don't clear pending interrupts when enabling
In Linux, if a driver does disable_irq() and later does enable_irq()
on its interrupt, I believe it's expecting these properties:
* If an interrupt was pending when the driver disabled then it will
still be pending after the driver re-enables.
* If an edge-triggered interrupt comes in while an interrupt is
disabled it should assert when the interrupt is re-enabled.
If you think that the above sounds a lot like the disable_irq() and
enable_irq() are supposed to be masking/unmasking the interrupt
instead of disabling/enabling it then you've made an astute
observation. Specifically when talking about interrupts, "mask"
usually means to stop posting interrupts but keep tracking them and
"disable" means to fully shut off interrupt detection. It's
unfortunate that this is so confusing, but presumably this is all the
way it is for historical reasons.
Perhaps more confusing than the above is that, even though clients of
IRQs themselves don't have a way to request mask/unmask
vs. disable/enable calls, IRQ chips themselves can implement both.
...and yet more confusing is that if an IRQ chip implements
disable/enable then they will be called when a client driver calls
disable_irq() / enable_irq().
It does feel like some of the above could be cleared up. However,
without any other core interrupt changes it should be clear that when
an IRQ chip gets a request to "disable" an IRQ that it has to treat it
like a mask of that IRQ.
In any case, after that long interlude you can see that the "unmask
and clear" can break things. Maulik tried to fix it so that we no
longer did "unmask and clear" in commit 71266d9d3936 ("pinctrl: qcom:
Move clearing pending IRQ to .irq_request_resources callback"), but it
only handled the PDC case and it had problems (it caused
sc7180-trogdor devices to fail to suspend). Let's fix.
>From my understanding the source of the phantom interrupt in the
were these two things:
1. One that could have been introduced in msm_gpio_irq_set_type()
(only for the non-PDC case).
2. Edges could have been detected when a GPIO was muxed away.
Fixing case #1 is easy. We can just add a clear in
msm_gpio_irq_set_type().
Fixing case #2 is harder. Let's use a concrete example. In
sc7180-trogdor.dtsi we configure the uart3 to have two pinctrl states,
sleep and default, and mux between the two during runtime PM and
system suspend (see geni_se_resources_{on,off}() for more
details). The difference between the sleep and default state is that
the RX pin is muxed to a GPIO during sleep and muxed to the UART
otherwise.
As per Qualcomm, when we mux the pin over to the UART function the PDC
(or the non-PDC interrupt detection logic) is still watching it /
latching edges. These edges don't cause interrupts because the
current code masks the interrupt unless we're entering suspend.
However, as soon as we enter suspend we unmask the interrupt and it's
counted as a wakeup.
Let's deal with the problem like this:
* When we mux away, we'll mask our interrupt. This isn't necessary in
the above case since the client already masked us, but it's a good
idea in general.
* When we mux back will clear any interrupts and unmask.
Fixes: 4b7618fdc7e6 ("pinctrl: qcom: Add irq_enable callback for msm gpio")
Fixes: 71266d9d3936 ("pinctrl: qcom: Move clearing pending IRQ to .irq_request_resources callback")
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: Maulik Shah <mkshah(a)codeaurora.org>
Tested-by: Maulik Shah <mkshah(a)codeaurora.org>
Reviewed-by: Stephen Boyd <swboyd(a)chromium.org>
Link: https://lore.kernel.org/r/20210114191601.v7.4.I7cf3019783720feb57b958c95c2b…
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
index 192ed31eabf4..d70caecd21d2 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -51,6 +51,7 @@
* @dual_edge_irqs: Bitmap of irqs that need sw emulated dual edge
* detection.
* @skip_wake_irqs: Skip IRQs that are handled by wakeup interrupt controller
+ * @disabled_for_mux: These IRQs were disabled because we muxed away.
* @soc: Reference to soc_data of platform specific data.
* @regs: Base addresses for the TLMM tiles.
* @phys_base: Physical base address
@@ -72,6 +73,7 @@ struct msm_pinctrl {
DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
DECLARE_BITMAP(skip_wake_irqs, MAX_NR_GPIO);
+ DECLARE_BITMAP(disabled_for_mux, MAX_NR_GPIO);
const struct msm_pinctrl_soc_data *soc;
void __iomem *regs[MAX_NR_TILES];
@@ -179,6 +181,10 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
unsigned group)
{
struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+ struct gpio_chip *gc = &pctrl->chip;
+ unsigned int irq = irq_find_mapping(gc->irq.domain, group);
+ struct irq_data *d = irq_get_irq_data(irq);
+ unsigned int gpio_func = pctrl->soc->gpio_func;
const struct msm_pingroup *g;
unsigned long flags;
u32 val, mask;
@@ -195,6 +201,20 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
if (WARN_ON(i == g->nfuncs))
return -EINVAL;
+ /*
+ * If an GPIO interrupt is setup on this pin then we need special
+ * handling. Specifically interrupt detection logic will still see
+ * the pin twiddle even when we're muxed away.
+ *
+ * When we see a pin with an interrupt setup on it then we'll disable
+ * (mask) interrupts on it when we mux away until we mux back. Note
+ * that disable_irq() refcounts and interrupts are disabled as long as
+ * at least one disable_irq() has been called.
+ */
+ if (d && i != gpio_func &&
+ !test_and_set_bit(d->hwirq, pctrl->disabled_for_mux))
+ disable_irq(irq);
+
raw_spin_lock_irqsave(&pctrl->lock, flags);
val = msm_readl_ctl(pctrl, g);
@@ -204,6 +224,20 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
raw_spin_unlock_irqrestore(&pctrl->lock, flags);
+ if (d && i == gpio_func &&
+ test_and_clear_bit(d->hwirq, pctrl->disabled_for_mux)) {
+ /*
+ * Clear interrupts detected while not GPIO since we only
+ * masked things.
+ */
+ if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
+ irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, false);
+ else
+ msm_ack_intr_status(pctrl, g);
+
+ enable_irq(irq);
+ }
+
return 0;
}
@@ -781,7 +815,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
raw_spin_unlock_irqrestore(&pctrl->lock, flags);
}
-static void msm_gpio_irq_clear_unmask(struct irq_data *d, bool status_clear)
+static void msm_gpio_irq_unmask(struct irq_data *d)
{
struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
@@ -799,14 +833,6 @@ static void msm_gpio_irq_clear_unmask(struct irq_data *d, bool status_clear)
raw_spin_lock_irqsave(&pctrl->lock, flags);
- /*
- * clear the interrupt status bit before unmask to avoid
- * any erroneous interrupts that would have got latched
- * when the interrupt is not in use.
- */
- if (status_clear)
- msm_ack_intr_status(pctrl, g);
-
val = msm_readl_intr_cfg(pctrl, g);
val |= BIT(g->intr_raw_status_bit);
val |= BIT(g->intr_enable_bit);
@@ -826,7 +852,7 @@ static void msm_gpio_irq_enable(struct irq_data *d)
irq_chip_enable_parent(d);
if (!test_bit(d->hwirq, pctrl->skip_wake_irqs))
- msm_gpio_irq_clear_unmask(d, true);
+ msm_gpio_irq_unmask(d);
}
static void msm_gpio_irq_disable(struct irq_data *d)
@@ -841,11 +867,6 @@ static void msm_gpio_irq_disable(struct irq_data *d)
msm_gpio_irq_mask(d);
}
-static void msm_gpio_irq_unmask(struct irq_data *d)
-{
- msm_gpio_irq_clear_unmask(d, false);
-}
-
/**
* msm_gpio_update_dual_edge_parent() - Prime next edge for IRQs handled by parent.
* @d: The irq dta.
@@ -934,6 +955,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
const struct msm_pingroup *g;
unsigned long flags;
+ bool was_enabled;
u32 val;
if (msm_gpio_needs_dual_edge_parent_workaround(d, type)) {
@@ -995,6 +1017,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
* could cause the INTR_STATUS to be set for EDGE interrupts.
*/
val = msm_readl_intr_cfg(pctrl, g);
+ was_enabled = val & BIT(g->intr_raw_status_bit);
val |= BIT(g->intr_raw_status_bit);
if (g->intr_detection_width == 2) {
val &= ~(3 << g->intr_detection_bit);
@@ -1044,6 +1067,14 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
}
msm_writel_intr_cfg(val, pctrl, g);
+ /*
+ * The first time we set RAW_STATUS_EN it could trigger an interrupt.
+ * Clear the interrupt. This is safe because we have
+ * IRQCHIP_SET_TYPE_MASKED.
+ */
+ if (!was_enabled)
+ msm_ack_intr_status(pctrl, g);
+
if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
msm_gpio_update_dual_edge_pos(pctrl, g, d);
@@ -1097,16 +1128,11 @@ static int msm_gpio_irq_reqres(struct irq_data *d)
}
/*
- * Clear the interrupt that may be pending before we enable
- * the line.
- * This is especially a problem with the GPIOs routed to the
- * PDC. These GPIOs are direct-connect interrupts to the GIC.
- * Disabling the interrupt line at the PDC does not prevent
- * the interrupt from being latched at the GIC. The state at
- * GIC needs to be cleared before enabling.
+ * The disable / clear-enable workaround we do in msm_pinmux_set_mux()
+ * only works if disable is not lazy since we only clear any bogus
+ * interrupt in hardware. Explicitly mark the interrupt as UNLAZY.
*/
- if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
- irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0);
+ irq_set_status_flags(d->irq, IRQ_DISABLE_UNLAZY);
return 0;
out:
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dad4e5b390866ca902653df0daa864ae4b8d4147 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Sat, 23 Jan 2021 21:01:52 -0800
Subject: [PATCH] mm: fix page reference leak in soft_offline_page()
The conversion to move pfn_to_online_page() internal to
soft_offline_page() missed that the get_user_pages() reference taken by
the madvise() path needs to be dropped when pfn_to_online_page() fails.
Note the direct sysfs-path to soft_offline_page() does not perform a
get_user_pages() lookup.
When soft_offline_page() is handed a pfn_valid() && !pfn_to_online_page()
pfn the kernel hangs at dax-device shutdown due to a leaked reference.
Link: https://lkml.kernel.org/r/161058501210.1840162.8108917599181157327.stgit@dw…
Fixes: feec24a6139d ("mm, soft-offline: convert parameter to pfn")
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Qian Cai <cai(a)lca.pw>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 04d9f154a130..e9481632fcd1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1885,6 +1885,12 @@ static int soft_offline_free_page(struct page *page)
return rc;
}
+static void put_ref_page(struct page *page)
+{
+ if (page)
+ put_page(page);
+}
+
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
@@ -1910,20 +1916,26 @@ static int soft_offline_free_page(struct page *page)
int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
- struct page *page;
bool try_again = true;
+ struct page *page, *ref_page = NULL;
+
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
if (!pfn_valid(pfn))
return -ENXIO;
+ if (flags & MF_COUNT_INCREASED)
+ ref_page = pfn_to_page(pfn);
+
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
- if (!page)
+ if (!page) {
+ put_ref_page(ref_page);
return -EIO;
+ }
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
- if (flags & MF_COUNT_INCREASED)
- put_page(page);
+ put_ref_page(ref_page);
return 0;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5c447d274f3746fbed6e695e7b9a2d7bd8b31b71 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb(a)google.com>
Date: Sat, 23 Jan 2021 21:01:15 -0800
Subject: [PATCH] mm: fix numa stats for thp migration
Currently the kernel is not correctly updating the numa stats for
NR_FILE_PAGES and NR_SHMEM on THP migration. Fix that.
For NR_FILE_DIRTY and NR_ZONE_WRITE_PENDING, although at the moment
there is no need to handle THP migration as kernel still does not have
write support for file THP but to be more future proof, this patch adds
the THP support for those stats as well.
Link: https://lkml.kernel.org/r/20210108155813.2914586-2-shakeelb@google.com
Fixes: e71769ae52609 ("mm: enable thp migration for shmem thp")
Signed-off-by: Shakeel Butt <shakeelb(a)google.com>
Acked-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: Roman Gushchin <guro(a)fb.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/migrate.c b/mm/migrate.c
index 613794f6a433..c0efe921bca5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -402,6 +402,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct zone *oldzone, *newzone;
int dirty;
int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
if (!mapping) {
/* Anonymous page without mapping */
@@ -437,7 +438,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */
+ page_ref_add(newpage, nr); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -459,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (PageTransHuge(page)) {
int i;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
+ for (i = 1; i < nr; i++) {
xas_next(&xas);
xas_store(&xas, newpage);
}
@@ -470,7 +471,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - thp_nr_pages(page));
+ page_ref_unfreeze(page, expected_count - nr);
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -493,17 +494,17 @@ int migrate_page_move_mapping(struct address_space *mapping,
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
- __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
- __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_lruvec_state(old_lruvec, NR_SHMEM);
- __inc_lruvec_state(new_lruvec, NR_SHMEM);
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
if (dirty && mapping_can_writeback(mapping)) {
- __dec_lruvec_state(old_lruvec, NR_FILE_DIRTY);
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
- __inc_lruvec_state(new_lruvec, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
local_irq_enable();