The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x d7893093a7417527c0d73c9832244e65c9d0114f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062812-bloated-equal-cc64@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 15 Jun 2023 22:33:57 +0200
Subject: [PATCH] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..d4ce5cb5c953 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875f986f..174d6232b87f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d74cdd4..483df0427678 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x d7893093a7417527c0d73c9832244e65c9d0114f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062811-ambush-finishing-abd6@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 15 Jun 2023 22:33:57 +0200
Subject: [PATCH] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..d4ce5cb5c953 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875f986f..174d6232b87f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d74cdd4..483df0427678 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x d7893093a7417527c0d73c9832244e65c9d0114f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062809-ultimate-spider-e3a0@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 15 Jun 2023 22:33:57 +0200
Subject: [PATCH] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..d4ce5cb5c953 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875f986f..174d6232b87f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d74cdd4..483df0427678 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x d7893093a7417527c0d73c9832244e65c9d0114f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062808-each-phony-f7b3@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 15 Jun 2023 22:33:57 +0200
Subject: [PATCH] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..d4ce5cb5c953 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875f986f..174d6232b87f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d74cdd4..483df0427678 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x d7893093a7417527c0d73c9832244e65c9d0114f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062806-handcuff-depress-af1d@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 15 Jun 2023 22:33:57 +0200
Subject: [PATCH] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054c84be..d4ce5cb5c953 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875f986f..174d6232b87f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d74cdd4..483df0427678 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
From: Philip Yang <Philip.Yang(a)amd.com>
Under VRAM usage pression, map to GPU may fail to create pt bo and
vmbo->shadow_list is not initialized, then ttm_bo_release calling
amdgpu_bo_vm_destroy to access vmbo->shadow_list generates below
dmesg and NULL pointer access backtrace:
Set vmbo destroy callback to amdgpu_bo_vm_destroy only after creating pt
bo successfully, otherwise use default callback amdgpu_bo_destroy.
amdgpu: amdgpu_vm_bo_update failed
amdgpu: update_gpuvm_pte() failed
amdgpu: Failed to map bo to gpuvm
amdgpu 0000:43:00.0: amdgpu: Failed to map peer:0000:43:00.0 mem_domain:2
BUG: kernel NULL pointer dereference, address:
RIP: 0010:amdgpu_bo_vm_destroy+0x4d/0x80 [amdgpu]
Call Trace:
<TASK>
ttm_bo_release+0x207/0x320 [amdttm]
amdttm_bo_init_reserved+0x1d6/0x210 [amdttm]
amdgpu_bo_create+0x1ba/0x520 [amdgpu]
amdgpu_bo_create_vm+0x3a/0x80 [amdgpu]
amdgpu_vm_pt_create+0xde/0x270 [amdgpu]
amdgpu_vm_ptes_update+0x63b/0x710 [amdgpu]
amdgpu_vm_update_range+0x2e7/0x6e0 [amdgpu]
amdgpu_vm_bo_update+0x2bd/0x600 [amdgpu]
update_gpuvm_pte+0x160/0x420 [amdgpu]
amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x313/0x1130 [amdgpu]
kfd_ioctl_map_memory_to_gpu+0x115/0x390 [amdgpu]
kfd_ioctl+0x24a/0x5b0 [amdgpu]
Signed-off-by: Philip Yang <Philip.Yang(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 9a3c6067bd2ee2ca2652fbb0679f422f3c9109f9)
This fixes a regression introduced by commit 1cc40dccad76 ("drm/amdgpu:
fix Null pointer dereference error in amdgpu_device_recover_vram") in
5.15.118. It's a hand modified cherry-pick because that commit that
introduced the regression touched nearby code and the context is now
incorrect.
Cc: Linux Regressions <regressions(a)lists.linux.dev>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2650
Fixes: 1cc40dccad76 ("drm/amdgpu: fix Null pointer dereference error in amdgpu_device_recover_vram")
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index d03a4519f945..8a0b652da4f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -685,7 +685,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev,
* num of amdgpu_vm_pt entries.
*/
BUG_ON(bp->bo_ptr_size < sizeof(struct amdgpu_bo_vm));
- bp->destroy = &amdgpu_bo_vm_destroy;
r = amdgpu_bo_create(adev, bp, &bo_ptr);
if (r)
return r;
--
2.34.1
From: Bob Peterson <rpeterso(a)redhat.com>
[ Upstream commit 504a10d9e46bc37b23d0a1ae2f28973c8516e636 ]
On corrupt gfs2 file systems the evict code can try to reference the
journal descriptor structure, jdesc, after it has been freed and set to
NULL. The sequence of events is:
init_journal()
...
fail_jindex:
gfs2_jindex_free(sdp); <------frees journals, sets jdesc = NULL
if (gfs2_holder_initialized(&ji_gh))
gfs2_glock_dq_uninit(&ji_gh);
fail:
iput(sdp->sd_jindex); <--references jdesc in evict_linked_inode
evict()
gfs2_evict_inode()
evict_linked_inode()
ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
<------references the now freed/zeroed sd_jdesc pointer.
The call to gfs2_trans_begin is done because the truncate_inode_pages
call can cause gfs2 events that require a transaction, such as removing
journaled data (jdata) blocks from the journal.
This patch fixes the problem by adding a check for sdp->sd_jdesc to
function gfs2_evict_inode. In theory, this should only happen to corrupt
gfs2 file systems, when gfs2 detects the problem, reports it, then tries
to evict all the system inodes it has read in up to that point.
Reported-by: Yang Lan <lanyang0908(a)gmail.com>
Signed-off-by: Bob Peterson <rpeterso(a)redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba(a)redhat.com>
[DP: adjusted context]
Signed-off-by: Dragos-Marian Panait <dragos.panait(a)windriver.com>
---
fs/gfs2/super.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9c593fd50c6a..baf0a70460c0 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1258,6 +1258,14 @@ static void gfs2_evict_inode(struct inode *inode)
if (inode->i_nlink || sb_rdonly(sb))
goto out;
+ /*
+ * In case of an incomplete mount, gfs2_evict_inode() may be called for
+ * system files without having an active journal to write to. In that
+ * case, skip the filesystem evict.
+ */
+ if (!sdp->sd_jdesc)
+ goto out;
+
if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
gfs2_holder_mark_uninitialized(&gh);
--
2.40.1
I was able to reproduce crash on 5.15.y kernel during COW, and
when the grandchild process attempts a write to a private page
inherited from the child process and the private page contains
a memory uncorrectable error. The way to reproduce is described
in Tony's patch, using his ras-tools/einj_mem_uc.
And the patch series fixed the panic issue in 5.15.y.
Followed here is the backport of Tony patch series to stable 5.15
and stable 6.1. Both backport have encountered trivial conflicts
due to missing dependencies, details are provided in each patch.
Please let me know whether the backport is acceptable.
Tony Luck (2):
mm, hwpoison: try to recover from copy-on write faults
mm, hwpoison: when copy-on-write hits poison, take page offline
include/linux/highmem.h | 24 ++++++++++++++++++++++++
include/linux/mm.h | 5 ++++-
mm/memory.c | 33 +++++++++++++++++++++++----------
3 files changed, 51 insertions(+), 11 deletions(-)
--
2.18.4
The variable 'asd', which may be NULL, is dereferenced before
check. The problem has been fixed by the following patch
which can be cleanly applied to the 5.10 branch.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 81c1d029016001f994ce1c46849c5e9900d8eab8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062315-example-anger-442b@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 81c1d029016001f994ce1c46849c5e9900d8eab8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Tue, 20 Jun 2023 18:24:21 +0200
Subject: [PATCH] mptcp: consolidate fallback and non fallback state machine
An orphaned msk releases the used resources via the worker,
when the latter first see the msk in CLOSED status.
If the msk status transitions to TCP_CLOSE in the release callback
invoked by the worker's final release_sock(), such instance of the
workqueue will not take any action.
Additionally the MPTCP code prevents scheduling the worker once the
socket reaches the CLOSE status: such msk resources will be leaked.
The only code path that can trigger the above scenario is the
__mptcp_check_send_data_fin() in fallback mode.
Address the issue removing the special handling of fallback socket
in __mptcp_check_send_data_fin(), consolidating the state machine
for fallback and non fallback socket.
Since non-fallback sockets do not send and do not receive data_fin,
the mptcp code can update the msk internal status to match the next
step in the SM every time data fin (ack) should be generated or
received.
As a consequence we can remove a bunch of checks for fallback from
the fastpath.
Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks")
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a40dae31cec..27d206f7af62 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -44,7 +44,7 @@ enum {
static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
static void __mptcp_destroy_sock(struct sock *sk);
-static void __mptcp_check_send_data_fin(struct sock *sk);
+static void mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;
@@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- return !__mptcp_check_fallback(msk) &&
- ((1 << sk->sk_state) &
+ return ((1 << sk->sk_state) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
msk->write_seq == READ_ONCE(msk->snd_una);
}
@@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
u64 rcv_data_fin_seq;
bool ret = false;
- if (__mptcp_check_fallback(msk))
- return ret;
-
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
* msk->rcv_data_fin was set when parsing the incoming options
@@ -623,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk)
}
ret = true;
- mptcp_send_ack(msk);
+ if (!__mptcp_check_fallback(msk))
+ mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
return ret;
@@ -1609,7 +1606,7 @@ out:
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
if (do_check_data_fin)
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
@@ -2680,8 +2677,6 @@ static void mptcp_worker(struct work_struct *work)
if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto unlock;
- mptcp_check_data_fin_ack(sk);
-
mptcp_check_fastclose(msk);
mptcp_pm_nl_work(msk);
@@ -2689,7 +2684,8 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
+ mptcp_check_data_fin_ack(sk);
mptcp_check_data_fin(sk);
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2828,6 +2824,12 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
pr_debug("Fallback");
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
+
+ /* simulate the data_fin ack reception to let the state
+ * machine move forward
+ */
+ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
+ mptcp_schedule_work(sk);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
tcp_send_ack(ssk);
@@ -2867,7 +2869,7 @@ static int mptcp_close_state(struct sock *sk)
return next & TCP_ACTION_FIN;
}
-static void __mptcp_check_send_data_fin(struct sock *sk)
+static void mptcp_check_send_data_fin(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2885,19 +2887,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
- /* fallback socket will not get data_fin/ack, can move to the next
- * state now
- */
- if (__mptcp_check_fallback(msk)) {
- WRITE_ONCE(msk->snd_una, msk->write_seq);
- if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- mptcp_close_wake_up(sk);
- } else if (sk->sk_state == TCP_FIN_WAIT1) {
- inet_sk_state_store(sk, TCP_FIN_WAIT2);
- }
- }
-
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2917,7 +2906,7 @@ static void __mptcp_wr_shutdown(struct sock *sk)
WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
WRITE_ONCE(msk->snd_data_fin_enable, 1);
- __mptcp_check_send_data_fin(sk);
+ mptcp_check_send_data_fin(sk);
}
static void __mptcp_destroy_sock(struct sock *sk)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 4688daa6b38b..d9c8b21c6076 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
__subflow_state_change(sk);
+ msk = mptcp_sk(parent);
if (subflow_simultaneous_connect(sk)) {
mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk);
- mptcp_rcv_space_init(mptcp_sk(parent), sk);
- pr_fallback(mptcp_sk(parent));
+ mptcp_rcv_space_init(msk, sk);
+ pr_fallback(msk);
subflow->conn_finished = 1;
mptcp_set_connected(parent);
}
@@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk)
subflow_sched_work_if_closed(mptcp_sk(parent), sk);
- if (__mptcp_check_fallback(mptcp_sk(parent)) &&
- !subflow->rx_eof && subflow_is_done(sk)) {
- subflow->rx_eof = 1;
- mptcp_subflow_eof(parent);
- }
+ /* when the fallback subflow closes the rx side, trigger a 'dummy'
+ * ingress data fin, so that the msk state will follow along
+ */
+ if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
+ mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
+ mptcp_schedule_work(parent);
}
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
Add a NULL check for the 'bdev' parameter of
dm_verity_loadpin_is_bdev_trusted(). The function is called
by loadpin_check(), which passes the block device that
corresponds to the super block of the file system from which
a file is being loaded. Generally a super_block structure has
an associated block device, however that is not always the
case (e.g. tmpfs).
Cc: stable(a)vger.kernel.org # v6.0+
Fixes: b6c1c5745ccc ("dm: Add verity helpers for LoadPin")
Signed-off-by: Matthias Kaehlcke <mka(a)chromium.org>
---
drivers/md/dm-verity-loadpin.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/md/dm-verity-loadpin.c b/drivers/md/dm-verity-loadpin.c
index 4f78cc55c251..0666699b6858 100644
--- a/drivers/md/dm-verity-loadpin.c
+++ b/drivers/md/dm-verity-loadpin.c
@@ -58,6 +58,9 @@ bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev)
int srcu_idx;
bool trusted = false;
+ if (bdev == NULL)
+ return false;
+
if (list_empty(&dm_verity_loadpin_trusted_root_digests))
return false;
--
2.41.0.255.g8b1d071c50-goog
From: Bob Peterson <rpeterso(a)redhat.com>
[ Upstream commit 504a10d9e46bc37b23d0a1ae2f28973c8516e636 ]
On corrupt gfs2 file systems the evict code can try to reference the
journal descriptor structure, jdesc, after it has been freed and set to
NULL. The sequence of events is:
init_journal()
...
fail_jindex:
gfs2_jindex_free(sdp); <------frees journals, sets jdesc = NULL
if (gfs2_holder_initialized(&ji_gh))
gfs2_glock_dq_uninit(&ji_gh);
fail:
iput(sdp->sd_jindex); <--references jdesc in evict_linked_inode
evict()
gfs2_evict_inode()
evict_linked_inode()
ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
<------references the now freed/zeroed sd_jdesc pointer.
The call to gfs2_trans_begin is done because the truncate_inode_pages
call can cause gfs2 events that require a transaction, such as removing
journaled data (jdata) blocks from the journal.
This patch fixes the problem by adding a check for sdp->sd_jdesc to
function gfs2_evict_inode. In theory, this should only happen to corrupt
gfs2 file systems, when gfs2 detects the problem, reports it, then tries
to evict all the system inodes it has read in up to that point.
Reported-by: Yang Lan <lanyang0908(a)gmail.com>
Signed-off-by: Bob Peterson <rpeterso(a)redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba(a)redhat.com>
[DP: adjusted context]
Signed-off-by: Dragos-Marian Panait <dragos.panait(a)windriver.com>
---
fs/gfs2/super.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 56bfed0a5873..73290263402a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1575,6 +1575,14 @@ static void gfs2_evict_inode(struct inode *inode)
if (inode->i_nlink || sb_rdonly(sb))
goto out;
+ /*
+ * In case of an incomplete mount, gfs2_evict_inode() may be called for
+ * system files without having an active journal to write to. In that
+ * case, skip the filesystem evict.
+ */
+ if (!sdp->sd_jdesc)
+ goto out;
+
if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
gfs2_holder_mark_uninitialized(&gh);
--
2.40.1
From: Bob Peterson <rpeterso(a)redhat.com>
[ Upstream commit 504a10d9e46bc37b23d0a1ae2f28973c8516e636 ]
On corrupt gfs2 file systems the evict code can try to reference the
journal descriptor structure, jdesc, after it has been freed and set to
NULL. The sequence of events is:
init_journal()
...
fail_jindex:
gfs2_jindex_free(sdp); <------frees journals, sets jdesc = NULL
if (gfs2_holder_initialized(&ji_gh))
gfs2_glock_dq_uninit(&ji_gh);
fail:
iput(sdp->sd_jindex); <--references jdesc in evict_linked_inode
evict()
gfs2_evict_inode()
evict_linked_inode()
ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
<------references the now freed/zeroed sd_jdesc pointer.
The call to gfs2_trans_begin is done because the truncate_inode_pages
call can cause gfs2 events that require a transaction, such as removing
journaled data (jdata) blocks from the journal.
This patch fixes the problem by adding a check for sdp->sd_jdesc to
function gfs2_evict_inode. In theory, this should only happen to corrupt
gfs2 file systems, when gfs2 detects the problem, reports it, then tries
to evict all the system inodes it has read in up to that point.
Reported-by: Yang Lan <lanyang0908(a)gmail.com>
Signed-off-by: Bob Peterson <rpeterso(a)redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba(a)redhat.com>
[DP: adjusted context]
Signed-off-by: Dragos-Marian Panait <dragos.panait(a)windriver.com>
---
fs/gfs2/super.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 3cc2237e5896..bb0eaa4638e3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1586,6 +1586,14 @@ static void gfs2_evict_inode(struct inode *inode)
if (inode->i_nlink || sb_rdonly(sb))
goto out;
+ /*
+ * In case of an incomplete mount, gfs2_evict_inode() may be called for
+ * system files without having an active journal to write to. In that
+ * case, skip the filesystem evict.
+ */
+ if (!sdp->sd_jdesc)
+ goto out;
+
if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
gfs2_holder_mark_uninitialized(&gh);
--
2.40.1
After switching from dwarf_decl_file() to die_get_decl_file(), it is not
possible to add probes for certain functions:
$ perf probe -x /usr/lib/systemd/systemd-logind match_unit_removed
A function DIE doesn't have decl_line. Maybe broken DWARF?
A function DIE doesn't have decl_line. Maybe broken DWARF?
Probe point 'match_unit_removed' not found.
Error: Failed to add events.
The problem is that die_get_decl_file() uses the wrong CU to search for
the file. elfutils commit e1db5cdc9f has some good explanation for this:
dwarf_decl_file uses dwarf_attr_integrate to get the DW_AT_decl_file
attribute. This means the attribute might come from a different DIE
in a different CU. If so, we need to use the CU associated with the
attribute, not the original DIE, to resolve the file name.
This patch uses the same source of information as elfutils: use attribute
DW_AT_decl_file and use this CU to search for the file.
Fixes: dc9a5d2ccd5c ("perf probe: Fix to get declared file name from clang DWARF5")
Signed-off-by: Georg Müller <georgmueller(a)gmx.net>
Link: https://lore.kernel.org/r/5a00d5a5-7be7-ef8a-4044-9a16249fff25@gmx.net/
Cc: stable(a)vger.kernel.org
---
tools/perf/util/dwarf-aux.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index b07414409771..137b3ed9897b 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -478,8 +478,10 @@ static const char *die_get_file_name(Dwarf_Die *dw_die, int idx)
{
Dwarf_Die cu_die;
Dwarf_Files *files;
+ Dwarf_Attribute attr_mem;
- if (idx < 0 || !dwarf_diecu(dw_die, &cu_die, NULL, NULL) ||
+ if (idx < 0 || !dwarf_attr_integrate(dw_die, DW_AT_decl_file, &attr_mem) ||
+ !dwarf_cu_die(attr_mem.cu, &cu_die, NULL, NULL, NULL, NULL, NULL, NULL) ||
dwarf_getsrcfiles(&cu_die, &files, NULL) != 0)
return NULL;
--
2.41.0
After commit 0e96ea5c3eb5904e5dc2f ("MIPS: Loongson64: Clean up use of
cc-ifversion") we get a build error when make modules_install:
cc1: error: '-mloongson-mmi' must be used with '-mhard-float'
The reason is when make modules_install, 'call cc-option' doesn't work
in $(KBUILD_CFLAGS) of 'CHECKFLAGS'. Then there is no -mno-loongson-mmi
applied and -march=loongson3a enable MMI instructions.
Fix this by partially reverting to the old logic, use 'call cc-option'
to conditionally apply -march=loongson3a and -march=mips64r2.
Fixes: 0e96ea5c3eb5904e5dc2f ("MIPS: Loongson64: Clean up use of cc-ifversion")
Cc: stable(a)vger.kernel.org
Cc: Nathan Chancellor <nathan(a)kernel.org>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
arch/mips/Makefile | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index a7a4ee66a9d3..7fb76d12829e 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -186,11 +186,8 @@ cflags-$(CONFIG_CPU_LOONGSON2F) += -march=loongson2f -Wa,--trap
# Some -march= flags enable MMI instructions, and GCC complains about that
# support being enabled alongside -msoft-float. Thus explicitly disable MMI.
cflags-$(CONFIG_CPU_LOONGSON2EF) += $(call cc-option,-mno-loongson-mmi)
-ifdef CONFIG_CPU_LOONGSON64
cflags-$(CONFIG_CPU_LOONGSON64) += -Wa,--trap
-cflags-$(CONFIG_CC_IS_GCC) += -march=loongson3a
-cflags-$(CONFIG_CC_IS_CLANG) += -march=mips64r2
-endif
+cflags-$(CONFIG_CPU_LOONGSON64) += $(call cc-option,-march=loongson3a,-march=mips64r2)
cflags-$(CONFIG_CPU_LOONGSON64) += $(call cc-option,-mno-loongson-mmi)
cflags-$(CONFIG_CPU_R4000_WORKAROUNDS) += $(call cc-option,-mfix-r4000,)
--
2.39.3
From: Zack Rusin <zackr(a)vmware.com>
Cursor planes on virtualized drivers have special meaning and require
that the clients handle them in specific ways, e.g. the cursor plane
should react to the mouse movement the way a mouse cursor would be
expected to and the client is required to set hotspot properties on it
in order for the mouse events to be routed correctly.
This breaks the contract as specified by the "universal planes". Fix it
by disabling the cursor planes on virtualized drivers while adding
a foundation on top of which it's possible to special case mouse cursor
planes for clients that want it.
Disabling the cursor planes makes some kms compositors which were broken,
e.g. Weston, fallback to software cursor which works fine or at least
better than currently while having no effect on others, e.g. gnome-shell
or kwin, which put virtualized drivers on a deny-list when running in
atomic context to make them fallback to legacy kms and avoid this issue.
Signed-off-by: Zack Rusin <zackr(a)vmware.com>
Fixes: 681e7ec73044 ("drm: Allow userspace to ask for universal plane list (v2)")
Cc: <stable(a)vger.kernel.org> # v5.4+
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)linux.ie>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Gerd Hoffmann <kraxel(a)redhat.com>
Cc: Hans de Goede <hdegoede(a)redhat.com>
Cc: Gurchetan Singh <gurchetansingh(a)chromium.org>
Cc: Chia-I Wu <olvaffe(a)gmail.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: virtualization(a)lists.linux-foundation.org
Cc: spice-devel(a)lists.freedesktop.org
Acked-by: Pekka Paalanen <pekka.paalanen(a)collabora.com>
Reviewed-by: Javier Martinez Canillas <javierm(a)redhat.com>
---
drivers/gpu/drm/drm_plane.c | 13 +++++++++++++
drivers/gpu/drm/qxl/qxl_drv.c | 2 +-
drivers/gpu/drm/vboxvideo/vbox_drv.c | 2 +-
drivers/gpu/drm/virtio/virtgpu_drv.c | 3 ++-
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +-
include/drm/drm_drv.h | 9 +++++++++
include/drm/drm_file.h | 12 ++++++++++++
7 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 24e7998d1731..c6bbb0c209f4 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -678,6 +678,19 @@ int drm_mode_getplane_res(struct drm_device *dev, void *data,
!file_priv->universal_planes)
continue;
+ /*
+ * If we're running on a virtualized driver then,
+ * unless userspace advertizes support for the
+ * virtualized cursor plane, disable cursor planes
+ * because they'll be broken due to missing cursor
+ * hotspot info.
+ */
+ if (plane->type == DRM_PLANE_TYPE_CURSOR &&
+ drm_core_check_feature(dev, DRIVER_CURSOR_HOTSPOT) &&
+ file_priv->atomic &&
+ !file_priv->supports_virtualized_cursor_plane)
+ continue;
+
if (drm_lease_held(file_priv, plane->base.id)) {
if (count < plane_resp->count_planes &&
put_user(plane->base.id, plane_ptr + count))
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index b30ede1cf62d..91930e84a9cd 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -283,7 +283,7 @@ static const struct drm_ioctl_desc qxl_ioctls[] = {
};
static struct drm_driver qxl_driver = {
- .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
+ .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
.dumb_create = qxl_mode_dumb_create,
.dumb_map_offset = drm_gem_ttm_dumb_map_offset,
diff --git a/drivers/gpu/drm/vboxvideo/vbox_drv.c b/drivers/gpu/drm/vboxvideo/vbox_drv.c
index 4fee15c97c34..8ecd0863fad7 100644
--- a/drivers/gpu/drm/vboxvideo/vbox_drv.c
+++ b/drivers/gpu/drm/vboxvideo/vbox_drv.c
@@ -172,7 +172,7 @@ DEFINE_DRM_GEM_FOPS(vbox_fops);
static const struct drm_driver driver = {
.driver_features =
- DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC,
+ DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
.fops = &vbox_fops,
.name = DRIVER_NAME,
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index a7ec5a3770da..60b1fd23229c 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -176,7 +176,8 @@ static const struct drm_driver driver = {
* If KMS is disabled DRIVER_MODESET and DRIVER_ATOMIC are masked
* out via drm_device::driver_features:
*/
- .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC,
+ .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC |
+ DRIVER_CURSOR_HOTSPOT,
.open = virtio_gpu_driver_open,
.postclose = virtio_gpu_driver_postclose,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 8b24ecf60e3e..d3e308fdfd5b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1611,7 +1611,7 @@ static const struct file_operations vmwgfx_driver_fops = {
static const struct drm_driver driver = {
.driver_features =
- DRIVER_MODESET | DRIVER_RENDER | DRIVER_ATOMIC | DRIVER_GEM,
+ DRIVER_MODESET | DRIVER_RENDER | DRIVER_ATOMIC | DRIVER_GEM | DRIVER_CURSOR_HOTSPOT,
.ioctls = vmw_ioctls,
.num_ioctls = ARRAY_SIZE(vmw_ioctls),
.master_set = vmw_master_set,
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index b77f2c7275b7..8303016665dd 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -104,6 +104,15 @@ enum drm_driver_feature {
* acceleration should be handled by two drivers that are connected using auxiliary bus.
*/
DRIVER_COMPUTE_ACCEL = BIT(7),
+ /**
+ * @DRIVER_CURSOR_HOTSPOT:
+ *
+ * Driver supports and requires cursor hotspot information in the
+ * cursor plane (e.g. cursor plane has to actually track the mouse
+ * cursor and the clients are required to set hotspot in order for
+ * the cursor planes to work correctly).
+ */
+ DRIVER_CURSOR_HOTSPOT = BIT(8),
/* IMPORTANT: Below are all the legacy flags, add new ones above. */
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index 966912053cb0..91cf7f452f86 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -228,6 +228,18 @@ struct drm_file {
*/
bool is_master;
+ /**
+ * @supports_virtualized_cursor_plane:
+ *
+ * This client is capable of handling the cursor plane with the
+ * restrictions imposed on it by the virtualized drivers.
+ *
+ * This implies that the cursor plane has to behave like a cursor
+ * i.e. track cursor movement. It also requires setting of the
+ * hotspot properties by the client on the cursor plane.
+ */
+ bool supports_virtualized_cursor_plane;
+
/**
* @master:
*
--
2.39.2
From: Xiubo Li <xiubli(a)redhat.com>
If just before the revoke request, which will increase the 'seq', is
sent out the clients released the corresponding caps and sent out
the cap update request to MDS with old 'seq', the mds will miss
checking the seqs and calculating the caps.
We should always send an ack for revoke requests.
Cc: stable(a)vger.kernel.org
Cc: Patrick Donnelly <pdonnell(a)redhat.com>
URL: https://tracker.ceph.com/issues/61782
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
fs/ceph/caps.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1052885025b3..eee2fbca3430 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3737,6 +3737,15 @@ static void handle_cap_grant(struct inode *inode,
}
BUG_ON(cap->issued & ~cap->implemented);
+ /* don't let check_caps skip sending a response to MDS for revoke msgs */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
+ cap->mds_wanted = 0;
+ if (cap == ci->i_auth_cap)
+ check_caps = 1; /* check auth cap only */
+ else
+ check_caps = 2; /* check all caps */
+ }
+
if (extra_info->inline_version > 0 &&
extra_info->inline_version >= ci->i_inline_version) {
ci->i_inline_version = extra_info->inline_version;
--
2.40.1
__register_btf_kfunc_id_set() assumes .BTF to be part of the module's
.ko file if CONFIG_DEBUG_INFO_BTF is enabled. If that's not the case,
the function prints an error message and return an error. As a result,
such modules cannot be loaded.
However, the section could be stripped out during a build process. It
would be better to let the modules loaded, because their basic
functionalities have no problem[1], though the BTF functionalities will
not be supported. Make the function to lower the level of the message
from error to warn, and return no error.
[1] https://lore.kernel.org/bpf/20220219082037.ow2kbq5brktf4f2u@apollo.legion/
Reported-by: Alexander Egorenkov <Alexander.Egorenkov(a)ibm.com>
Link: https://lore.kernel.org/bpf/87y228q66f.fsf@oc8242746057.ibm.com/
Suggested-by: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Link: https://lore.kernel.org/bpf/20220219082037.ow2kbq5brktf4f2u@apollo.legion/
Fixes: dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf")
Cc: <stable(a)vger.kernel.org> # 5.17.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
kernel/bpf/btf.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6b682b8e4b50..d683f034996f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7848,14 +7848,10 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
btf = btf_get_module_btf(kset->owner);
if (!btf) {
- if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- pr_err("missing vmlinux BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
- if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
- pr_err("missing module BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ pr_warn("missing vmlinux BTF, cannot register kfuncs\n");
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register kfuncs\n");
return 0;
}
if (IS_ERR(btf))
--
2.25.1
From: Souradeep Chakrabarti <schakrabarti(a)linux.microsoft.com>
VF unload gets stuck in MANA driver, when the host is not responding.
The function mana_dealloc_queues() tries to clear the inflight packets,
and gets stuck in while loop. Another problem in this scenario is the
timeout from hwc send request.
These patch add fix for the same.
In mana driver we are adding a timeout in the while loop, to fix it.
Also we are adding a new attribute in mana_context, which gets set when
mana_hwc_send_request() hits a timeout because of host unresponsiveness.
Souradeep Chakrabarti (2):
net: mana: Fix MANA VF unload when host is unresponsive
net: mana: Fix MANA VF unload when host is unresponsive
.../net/ethernet/microsoft/mana/gdma_main.c | 4 +++-
.../net/ethernet/microsoft/mana/hw_channel.c | 12 +++++++++++-
drivers/net/ethernet/microsoft/mana/mana_en.c | 19 +++++++++++++++++--
include/net/mana/mana.h | 2 ++
4 files changed, 33 insertions(+), 4 deletions(-)
--
2.34.1
From: Zack Rusin <zackr(a)vmware.com>
Cursor planes on virtualized drivers have special meaning and require
that the clients handle them in specific ways, e.g. the cursor plane
should react to the mouse movement the way a mouse cursor would be
expected to and the client is required to set hotspot properties on it
in order for the mouse events to be routed correctly.
This breaks the contract as specified by the "universal planes". Fix it
by disabling the cursor planes on virtualized drivers while adding
a foundation on top of which it's possible to special case mouse cursor
planes for clients that want it.
Disabling the cursor planes makes some kms compositors which were broken,
e.g. Weston, fallback to software cursor which works fine or at least
better than currently while having no effect on others, e.g. gnome-shell
or kwin, which put virtualized drivers on a deny-list when running in
atomic context to make them fallback to legacy kms and avoid this issue.
Signed-off-by: Zack Rusin <zackr(a)vmware.com>
Fixes: 681e7ec73044 ("drm: Allow userspace to ask for universal plane list (v2)")
Cc: <stable(a)vger.kernel.org> # v5.4+
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)linux.ie>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Gerd Hoffmann <kraxel(a)redhat.com>
Cc: Hans de Goede <hdegoede(a)redhat.com>
Cc: Gurchetan Singh <gurchetansingh(a)chromium.org>
Cc: Chia-I Wu <olvaffe(a)gmail.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: virtualization(a)lists.linux-foundation.org
Cc: spice-devel(a)lists.freedesktop.org
---
drivers/gpu/drm/drm_plane.c | 13 +++++++++++++
drivers/gpu/drm/qxl/qxl_drv.c | 2 +-
drivers/gpu/drm/vboxvideo/vbox_drv.c | 2 +-
drivers/gpu/drm/virtio/virtgpu_drv.c | 2 +-
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +-
include/drm/drm_drv.h | 9 +++++++++
include/drm/drm_file.h | 12 ++++++++++++
7 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 24e7998d1731..a4a39f4834e2 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -678,6 +678,19 @@ int drm_mode_getplane_res(struct drm_device *dev, void *data,
!file_priv->universal_planes)
continue;
+ /*
+ * If we're running on a virtualized driver then,
+ * unless userspace advertizes support for the
+ * virtualized cursor plane, disable cursor planes
+ * because they'll be broken due to missing cursor
+ * hotspot info.
+ */
+ if (plane->type == DRM_PLANE_TYPE_CURSOR &&
+ drm_core_check_feature(dev, DRIVER_CURSOR_HOTSPOT) &&
+ file_priv->atomic &&
+ !file_priv->supports_virtualized_cursor_plane)
+ continue;
+
if (drm_lease_held(file_priv, plane->base.id)) {
if (count < plane_resp->count_planes &&
put_user(plane->base.id, plane_ptr + count))
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index b30ede1cf62d..91930e84a9cd 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -283,7 +283,7 @@ static const struct drm_ioctl_desc qxl_ioctls[] = {
};
static struct drm_driver qxl_driver = {
- .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
+ .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
.dumb_create = qxl_mode_dumb_create,
.dumb_map_offset = drm_gem_ttm_dumb_map_offset,
diff --git a/drivers/gpu/drm/vboxvideo/vbox_drv.c b/drivers/gpu/drm/vboxvideo/vbox_drv.c
index 4fee15c97c34..8ecd0863fad7 100644
--- a/drivers/gpu/drm/vboxvideo/vbox_drv.c
+++ b/drivers/gpu/drm/vboxvideo/vbox_drv.c
@@ -172,7 +172,7 @@ DEFINE_DRM_GEM_FOPS(vbox_fops);
static const struct drm_driver driver = {
.driver_features =
- DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC,
+ DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
.fops = &vbox_fops,
.name = DRIVER_NAME,
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index a7ec5a3770da..8f4bb8a4e952 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -176,7 +176,7 @@ static const struct drm_driver driver = {
* If KMS is disabled DRIVER_MODESET and DRIVER_ATOMIC are masked
* out via drm_device::driver_features:
*/
- .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC,
+ .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
.open = virtio_gpu_driver_open,
.postclose = virtio_gpu_driver_postclose,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 8b24ecf60e3e..d3e308fdfd5b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1611,7 +1611,7 @@ static const struct file_operations vmwgfx_driver_fops = {
static const struct drm_driver driver = {
.driver_features =
- DRIVER_MODESET | DRIVER_RENDER | DRIVER_ATOMIC | DRIVER_GEM,
+ DRIVER_MODESET | DRIVER_RENDER | DRIVER_ATOMIC | DRIVER_GEM | DRIVER_CURSOR_HOTSPOT,
.ioctls = vmw_ioctls,
.num_ioctls = ARRAY_SIZE(vmw_ioctls),
.master_set = vmw_master_set,
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index b77f2c7275b7..8303016665dd 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -104,6 +104,15 @@ enum drm_driver_feature {
* acceleration should be handled by two drivers that are connected using auxiliary bus.
*/
DRIVER_COMPUTE_ACCEL = BIT(7),
+ /**
+ * @DRIVER_CURSOR_HOTSPOT:
+ *
+ * Driver supports and requires cursor hotspot information in the
+ * cursor plane (e.g. cursor plane has to actually track the mouse
+ * cursor and the clients are required to set hotspot in order for
+ * the cursor planes to work correctly).
+ */
+ DRIVER_CURSOR_HOTSPOT = BIT(8),
/* IMPORTANT: Below are all the legacy flags, add new ones above. */
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index 966912053cb0..91cf7f452f86 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -228,6 +228,18 @@ struct drm_file {
*/
bool is_master;
+ /**
+ * @supports_virtualized_cursor_plane:
+ *
+ * This client is capable of handling the cursor plane with the
+ * restrictions imposed on it by the virtualized drivers.
+ *
+ * This implies that the cursor plane has to behave like a cursor
+ * i.e. track cursor movement. It also requires setting of the
+ * hotspot properties by the client on the cursor plane.
+ */
+ bool supports_virtualized_cursor_plane;
+
/**
* @master:
*
--
2.39.2
On 03.04.23 08:14, Purohit, Kaushal wrote:
> Hi,
>
Hi,
> Referring to patch with commit ID (*e10dcb1b6ba714243ad5a35a11b91cc14103a9a9*).
>
> This is a spec violation for CDC NCM class driver. Driver clearly says the significance of network capabilities. (snapshot below)
>
> However, with the mentioned patch these values are disrespected and commands specific to these capabilities are sent from the host regardless of device' capabilities to handle them.
Right. So for your device, the correct behavior would be to do
nothing, wouldn't it? The packets would be delivered and the host
needs to filter and discard unrequested packets.
> Currently we are setting these bits to 0 indicating no capabilities on our device and still we observe that Host (Linux kernel host cdc driver) has been sending requests specific to these capabilities.
>
> Please let me know if there is a better way to indicate host that device does not have these capabilities.
no you are doing things as they are supposed to be done and
the host is at fault. This kernel bug needs to be fixed.
Regards
Oliver
From: Long Li <longli(a)microsoft.com>
It's inefficient to ring the doorbell page every time a WQE is posted to
the received queue.
Move the code for ringing doorbell page to where after we have posted all
WQEs to the receive queue during a callback from napi_poll().
Tests showed no regression in network latency benchmarks.
Cc: stable(a)vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Long Li <longli(a)microsoft.com>
---
Change log:
v2:
Check for comp_read > 0 as it might be negative on completion error.
Set rq.wqe_cnt to 0 according to BNIC spec.
drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
drivers/net/ethernet/microsoft/mana/mana_en.c | 10 ++++++++--
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..ef11d09a3655 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -300,8 +300,11 @@ static void mana_gd_ring_doorbell(struct gdma_context *gc, u32 db_index,
void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue)
{
+ /* BNIC Spec specifies that client should set 0 for rq.wqe_cnt
+ * This value is not used in sq
+ */
mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type,
- queue->id, queue->head * GDMA_WQE_BU_SIZE, 1);
+ queue->id, queue->head * GDMA_WQE_BU_SIZE, 0);
}
void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index cd4d5ceb9f2d..1d8abe63fcb8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1383,8 +1383,8 @@ static void mana_post_pkt_rxq(struct mana_rxq *rxq)
recv_buf_oob = &rxq->rx_oobs[curr_index];
- err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req,
- &recv_buf_oob->wqe_inf);
+ err = mana_gd_post_work_request(rxq->gdma_rq, &recv_buf_oob->wqe_req,
+ &recv_buf_oob->wqe_inf);
if (WARN_ON_ONCE(err))
return;
@@ -1654,6 +1654,12 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
mana_process_rx_cqe(rxq, cq, &comp[i]);
}
+ if (comp_read > 0) {
+ struct gdma_context *gc = rxq->gdma_rq->gdma_dev->gdma_context;
+
+ mana_gd_wq_ring_doorbell(gc, rxq->gdma_rq);
+ }
+
if (rxq->xdp_flush)
xdp_do_flush();
}
--
2.34.1
From: Johannes Berg <johannes.berg(a)intel.com>
[ Upstream commit 34d4e3eb67fed9c19719bedb748e5a8b6ccc97a5 ]
Since links are only controlled by userspace via cfg80211
in AP mode, also only remove them from the driver in that
case.
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman(a)intel.com>
Link: https://lore.kernel.org/r/20230608163202.ed65b94916fa.I2458c46888284cc5ce30…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/util.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d1a89e82ead08..4053d65d0218b 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,7 +5,7 @@
* Copyright 2007-2009 Johannes Berg <johannes(a)sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
*/
#include <linux/export.h>
#include <linux/bitops.h>
@@ -2548,6 +2548,13 @@ void cfg80211_remove_links(struct wireless_dev *wdev)
{
unsigned int link_id;
+ /*
+ * links are controlled by upper layers (userspace/cfg)
+ * only for AP mode, so only remove them here for AP
+ */
+ if (wdev->iftype != NL80211_IFTYPE_AP)
+ return;
+
wdev_lock(wdev);
if (wdev->valid_links) {
for_each_valid_link(wdev, link_id)
--
2.39.2
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 57fc0f1ceaa4016354cf6f88533e20b56190e41a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062330-scrawny-capture-257c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 57fc0f1ceaa4016354cf6f88533e20b56190e41a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Tue, 20 Jun 2023 18:24:23 +0200
Subject: [PATCH] mptcp: ensure listener is unhashed before updating the sk
status
The MPTCP protocol access the listener subflow in a lockless
manner in a couple of places (poll, diag). That works only if
the msk itself leaves the listener status only after that the
subflow itself has been closed/disconnected. Otherwise we risk
deadlock in diag, as reported by Christoph.
Address the issue ensuring that the first subflow (the listener
one) is always disconnected before updating the msk socket status.
Reported-by: Christoph Paasch <cpaasch(a)apple.com>
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/407
Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 59f8f3124855..1224dfca5bf3 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
if (err)
return err;
+ inet_sk_state_store(newsk, TCP_LISTEN);
err = kernel_listen(ssock, backlog);
if (err)
return err;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a66ec341485e..a6c7f2d24909 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2368,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
- if (ssk->sk_state == TCP_LISTEN) {
- tcp_set_state(ssk, TCP_CLOSE);
- mptcp_subflow_queue_clean(sk, ssk);
- inet_csk_listen_stop(ssk);
- mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
- }
-
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -2902,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
return EPOLLIN | EPOLLRDNORM;
}
-static void mptcp_listen_inuse_dec(struct sock *sk)
+static void mptcp_check_listen_stop(struct sock *sk)
{
- if (inet_sk_state_load(sk) == TCP_LISTEN)
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ struct sock *ssk;
+
+ if (inet_sk_state_load(sk) != TCP_LISTEN)
+ return;
+
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ ssk = mptcp_sk(sk)->first;
+ if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
+ return;
+
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ mptcp_subflow_queue_clean(sk, ssk);
+ inet_csk_listen_stop(ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+ tcp_set_state(ssk, TCP_CLOSE);
+ release_sock(ssk);
}
bool __mptcp_close(struct sock *sk, long timeout)
@@ -2918,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
- mptcp_listen_inuse_dec(sk);
+ mptcp_check_listen_stop(sk);
inet_sk_state_store(sk, TCP_CLOSE);
goto cleanup;
}
@@ -3035,7 +3042,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
if (msk->fastopening)
return -EBUSY;
- mptcp_listen_inuse_dec(sk);
+ mptcp_check_listen_stop(sk);
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_stop_timer(sk);
Hi ,
We are the most trusted data partner who excels in offering only authentic, updated, and verified Design Automation Conference 2023 Attendees list
I hope your organization is planning to Build brand awareness, engage with decision makers, launch new products and services at the upcoming Design Automation Conference - DAC_2023, Conference & Exhibition will take place from July 9th - July 13th , 2023, in San Francisco, CA.
Attendees_List of Design Automation Conference - DAC is now available-for-purchase with below data fields in an excel spread sheet.
Let me know if you'd be interested in acquiring any of the above list, so I can get back to you with a sample_file and associated details for your review.
If I have reached the wrong person, please direct me to the right person.
Thanks,
Sharon Smith
Global Lead Generation | Walnut, CA 91789, USA
If you don't want to include yourself in our mailing list, please reply with "No Thanks"
From: "Darrick J. Wong" <djwong(a)kernel.org>
commit 22ed903eee23a5b174e240f1cdfa9acf393a5210 upstream.
syzbot detected a crash during log recovery:
XFS (loop0): Mounting V5 Filesystem bfdc47fc-10d8-4eed-a562-11a831b3f791
XFS (loop0): Torn write (CRC failure) detected at log block 0x180. Truncating head block from 0x200.
XFS (loop0): Starting recovery (logdev: internal)
==================================================================
BUG: KASAN: slab-out-of-bounds in xfs_btree_lookup_get_block+0x15c/0x6d0 fs/xfs/libxfs/xfs_btree.c:1813
Read of size 8 at addr ffff88807e89f258 by task syz-executor132/5074
CPU: 0 PID: 5074 Comm: syz-executor132 Not tainted 6.2.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x1b1/0x290 lib/dump_stack.c:106
print_address_description+0x74/0x340 mm/kasan/report.c:306
print_report+0x107/0x1f0 mm/kasan/report.c:417
kasan_report+0xcd/0x100 mm/kasan/report.c:517
xfs_btree_lookup_get_block+0x15c/0x6d0 fs/xfs/libxfs/xfs_btree.c:1813
xfs_btree_lookup+0x346/0x12c0 fs/xfs/libxfs/xfs_btree.c:1913
xfs_btree_simple_query_range+0xde/0x6a0 fs/xfs/libxfs/xfs_btree.c:4713
xfs_btree_query_range+0x2db/0x380 fs/xfs/libxfs/xfs_btree.c:4953
xfs_refcount_recover_cow_leftovers+0x2d1/0xa60 fs/xfs/libxfs/xfs_refcount.c:1946
xfs_reflink_recover_cow+0xab/0x1b0 fs/xfs/xfs_reflink.c:930
xlog_recover_finish+0x824/0x920 fs/xfs/xfs_log_recover.c:3493
xfs_log_mount_finish+0x1ec/0x3d0 fs/xfs/xfs_log.c:829
xfs_mountfs+0x146a/0x1ef0 fs/xfs/xfs_mount.c:933
xfs_fs_fill_super+0xf95/0x11f0 fs/xfs/xfs_super.c:1666
get_tree_bdev+0x400/0x620 fs/super.c:1282
vfs_get_tree+0x88/0x270 fs/super.c:1489
do_new_mount+0x289/0xad0 fs/namespace.c:3145
do_mount fs/namespace.c:3488 [inline]
__do_sys_mount fs/namespace.c:3697 [inline]
__se_sys_mount+0x2d3/0x3c0 fs/namespace.c:3674
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7f89fa3f4aca
Code: 83 c4 08 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fffd5fb5ef8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 00646975756f6e2c RCX: 00007f89fa3f4aca
RDX: 0000000020000100 RSI: 0000000020009640 RDI: 00007fffd5fb5f10
RBP: 00007fffd5fb5f10 R08: 00007fffd5fb5f50 R09: 000000000000970d
R10: 0000000000200800 R11: 0000000000000206 R12: 0000000000000004
R13: 0000555556c6b2c0 R14: 0000000000200800 R15: 00007fffd5fb5f50
</TASK>
The fuzzed image contains an AGF with an obviously garbage
agf_refcount_level value of 32, and a dirty log with a buffer log item
for that AGF. The ondisk AGF has a higher LSN than the recovered log
item. xlog_recover_buf_commit_pass2 reads the buffer, compares the
LSNs, and decides to skip replay because the ondisk buffer appears to be
newer.
Unfortunately, the ondisk buffer is corrupt, but recovery just read the
buffer with no buffer ops specified:
error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno,
buf_f->blf_len, buf_flags, &bp, NULL);
Skipping the buffer leaves its contents in memory unverified. This sets
us up for a kernel crash because xfs_refcount_recover_cow_leftovers
reads the buffer (which is still around in XBF_DONE state, so no read
verification) and creates a refcountbt cursor of height 32. This is
impossible so we run off the end of the cursor object and crash.
Fix this by invoking the verifier on all skipped buffers and aborting
log recovery if the ondisk buffer is corrupt. It might be smarter to
force replay the log item atop the buffer and then see if it'll pass the
write verifier (like ext4 does) but for now let's go with the
conservative option where we stop immediately.
Link: https://syzkaller.appspot.com/bug?extid=7e9494b8b399902e994e
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Dave Chinner <dchinner(a)redhat.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
Signed-off-by: Chandan Babu R <chandan.babu(a)oracle.com>
Acked-by: Darrick J. Wong <djwong(a)kernel.org>
---
Hi Greg,
This is a backport of a patch that has already been merged into 6.1.y,
5.15.y and 5.10.y. I have tested this patch and have not found any new
regressions arising because of it. Please commit this patch into 5.4.y
tree.
fs/xfs/xfs_log_recover.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 84f6c8628db5..d9b906d75dfa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2783,6 +2783,16 @@ xlog_recover_buffer_pass2(
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+ /*
+ * We're skipping replay of this buffer log item due to the log
+ * item LSN being behind the ondisk buffer. Verify the buffer
+ * contents since we aren't going to run the write verifier.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_read(bp);
+ error = bp->b_error;
+ }
goto out_release;
}
--
2.39.1
Code which interacts with timestamps needs to use the ktime_t type
returned by functions like ktime_get. The int type does not offer
enough space to store these values, and attempting to use it is a
recipe for problems. In this particular case, overflows would occur
when calculating/storing timestamps leading to incorrect values being
reported to userspace. In some cases these bad timestamps cause input
handling in userspace to appear hung.
Link: https://gitlab.freedesktop.org/libinput/libinput/-/issues/901
Fixes: 17d793f3ed53 ("HID: wacom: insert timestamp to packed Bluetooth (BT) events")
CC: stable(a)vger.kernel.org
Signed-off-by: Jason Gerecke <jason.gerecke(a)wacom.com>
---
v2: Use div_u64 to perform division to deal with ARC and ARM architectures
(as found by the kernel test robot)
drivers/hid/wacom_wac.c | 6 +++---
drivers/hid/wacom_wac.h | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index 2ccf838371343..174bf03908d7c 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -1314,7 +1314,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom)
struct input_dev *pen_input = wacom->pen_input;
unsigned char *data = wacom->data;
int number_of_valid_frames = 0;
- int time_interval = 15000000;
+ ktime_t time_interval = 15000000;
ktime_t time_packet_received = ktime_get();
int i;
@@ -1348,7 +1348,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom)
if (number_of_valid_frames) {
if (wacom->hid_data.time_delayed)
time_interval = ktime_get() - wacom->hid_data.time_delayed;
- time_interval /= number_of_valid_frames;
+ time_interval = div_u64(time_interval, number_of_valid_frames);
wacom->hid_data.time_delayed = time_packet_received;
}
@@ -1359,7 +1359,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom)
bool range = frame[0] & 0x20;
bool invert = frame[0] & 0x10;
int frames_number_reversed = number_of_valid_frames - i - 1;
- int event_timestamp = time_packet_received - frames_number_reversed * time_interval;
+ ktime_t event_timestamp = time_packet_received - frames_number_reversed * time_interval;
if (!valid)
continue;
diff --git a/drivers/hid/wacom_wac.h b/drivers/hid/wacom_wac.h
index 1a40bb8c5810c..ee21bb260f22f 100644
--- a/drivers/hid/wacom_wac.h
+++ b/drivers/hid/wacom_wac.h
@@ -324,7 +324,7 @@ struct hid_data {
int ps_connected;
bool pad_input_event_flag;
unsigned short sequence_number;
- int time_delayed;
+ ktime_t time_delayed;
};
struct wacom_remote_data {
--
2.41.0
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f1a0898b5d6a77d332d036da03bad6fa9770de5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062633-grip-headway-6fcc@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1a0898b5d6a77d332d036da03bad6fa9770de5b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Fri, 9 Jun 2023 14:29:39 -0700
Subject: [PATCH] wifi: iwlwifi: mvm: spin_lock_bh() to fix lockdep regression
Lockdep on 6.4-rc on ThinkPad X1 Carbon 5th says
=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
6.4.0-rc5 #1 Not tainted
-----------------------------------------------------
kworker/3:1/49 [HC0[0]:SC0[4]:HE1:SE0] is trying to acquire:
ffff8881066fa368 (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}, at: rs_drv_get_rate+0x46/0xe7
and this task is already holding:
ffff8881066f80a8 (&sta->rate_ctrl_lock){+.-.}-{2:2}, at: rate_control_get_rate+0xbd/0x126
which would create a new lock dependency:
(&sta->rate_ctrl_lock){+.-.}-{2:2} -> (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}
but this new dependency connects a SOFTIRQ-irq-safe lock:
(&sta->rate_ctrl_lock){+.-.}-{2:2}
etc. etc. etc.
Changing the spin_lock() in rs_drv_get_rate() to spin_lock_bh() was not
enough to pacify lockdep, but changing them all on pers.lock has worked.
Fixes: a8938bc881d2 ("wifi: iwlwifi: mvm: Add locking to the rate read flow")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Link: https://lore.kernel.org/r/79ffcc22-9775-cb6d-3ffd-1a517c40beef@google.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 23266d0c9ce4..9a20468345e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2692,7 +2692,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
lq_sta = mvm_sta;
- spin_lock(&lq_sta->pers.lock);
+ spin_lock_bh(&lq_sta->pers.lock);
iwl_mvm_hwrate_to_tx_rate_v1(lq_sta->last_rate_n_flags,
info->band, &info->control.rates[0]);
info->control.rates[0].count = 1;
@@ -2707,7 +2707,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
iwl_mvm_hwrate_to_tx_rate_v1(last_ucode_rate, info->band,
&txrc->reported_rate);
}
- spin_unlock(&lq_sta->pers.lock);
+ spin_unlock_bh(&lq_sta->pers.lock);
}
static void *rs_drv_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,
@@ -3264,11 +3264,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
/* If it's locked we are in middle of init flow
* just wait for next tx status to update the lq_sta data
*/
- if (!spin_trylock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
+ if (!spin_trylock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
return;
__iwl_mvm_rs_tx_status(mvm, sta, tid, info, ndp);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -4117,9 +4117,9 @@ void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm,
} else {
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- spin_lock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_lock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
rs_drv_rate_init(mvm, sta, band);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
}
The patch below does not apply to the 6.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.3.y
git checkout FETCH_HEAD
git cherry-pick -x f1a0898b5d6a77d332d036da03bad6fa9770de5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062625-squeamish-trident-4e6a@gregkh' --subject-prefix 'PATCH 6.3.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1a0898b5d6a77d332d036da03bad6fa9770de5b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Fri, 9 Jun 2023 14:29:39 -0700
Subject: [PATCH] wifi: iwlwifi: mvm: spin_lock_bh() to fix lockdep regression
Lockdep on 6.4-rc on ThinkPad X1 Carbon 5th says
=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
6.4.0-rc5 #1 Not tainted
-----------------------------------------------------
kworker/3:1/49 [HC0[0]:SC0[4]:HE1:SE0] is trying to acquire:
ffff8881066fa368 (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}, at: rs_drv_get_rate+0x46/0xe7
and this task is already holding:
ffff8881066f80a8 (&sta->rate_ctrl_lock){+.-.}-{2:2}, at: rate_control_get_rate+0xbd/0x126
which would create a new lock dependency:
(&sta->rate_ctrl_lock){+.-.}-{2:2} -> (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}
but this new dependency connects a SOFTIRQ-irq-safe lock:
(&sta->rate_ctrl_lock){+.-.}-{2:2}
etc. etc. etc.
Changing the spin_lock() in rs_drv_get_rate() to spin_lock_bh() was not
enough to pacify lockdep, but changing them all on pers.lock has worked.
Fixes: a8938bc881d2 ("wifi: iwlwifi: mvm: Add locking to the rate read flow")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Link: https://lore.kernel.org/r/79ffcc22-9775-cb6d-3ffd-1a517c40beef@google.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 23266d0c9ce4..9a20468345e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2692,7 +2692,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
lq_sta = mvm_sta;
- spin_lock(&lq_sta->pers.lock);
+ spin_lock_bh(&lq_sta->pers.lock);
iwl_mvm_hwrate_to_tx_rate_v1(lq_sta->last_rate_n_flags,
info->band, &info->control.rates[0]);
info->control.rates[0].count = 1;
@@ -2707,7 +2707,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
iwl_mvm_hwrate_to_tx_rate_v1(last_ucode_rate, info->band,
&txrc->reported_rate);
}
- spin_unlock(&lq_sta->pers.lock);
+ spin_unlock_bh(&lq_sta->pers.lock);
}
static void *rs_drv_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,
@@ -3264,11 +3264,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
/* If it's locked we are in middle of init flow
* just wait for next tx status to update the lq_sta data
*/
- if (!spin_trylock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
+ if (!spin_trylock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
return;
__iwl_mvm_rs_tx_status(mvm, sta, tid, info, ndp);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -4117,9 +4117,9 @@ void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm,
} else {
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- spin_lock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_lock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
rs_drv_rate_init(mvm, sta, band);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
}
When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list. Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first. However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first. As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.
To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable. The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).
This is Cc: stable because (when combined with appropriate userspace
changes) it fixes a severe performance and stability problem for Qubes
OS users.
Cc: stable(a)vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
---
drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
2 files changed, 41 insertions(+), 11 deletions(-)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
static void gnttab_handle_deferred(struct timer_list *);
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10;
+
static void gnttab_handle_deferred(struct timer_list *unused)
{
- unsigned int nr = 10;
+ unsigned int nr = READ_ONCE(free_per_iteration);
+ const bool ignore_limit = nr == 0;
struct deferred_entry *first = NULL;
unsigned long flags;
+ size_t freed = 0;
spin_lock_irqsave(&gnttab_list_lock, flags);
- while (nr--) {
+ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
struct deferred_entry *entry
= list_first_entry(&deferred_list,
struct deferred_entry, list);
@@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
list_del(&entry->list);
spin_unlock_irqrestore(&gnttab_list_lock, flags);
if (_gnttab_end_foreign_access_ref(entry->ref)) {
+ uint64_t ret = atomic64_sub_return(1, &deferred_count);
put_free_entry(entry->ref);
- pr_debug("freeing g.e. %#x (pfn %#lx)\n",
- entry->ref, page_to_pfn(entry->page));
+ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+ entry->ref, page_to_pfn(entry->page),
+ (unsigned long long)ret);
put_page(entry->page);
+ freed++;
kfree(entry);
entry = NULL;
} else {
@@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
spin_lock_irqsave(&gnttab_list_lock, flags);
if (entry)
list_add_tail(&entry->list, &deferred_list);
- else if (list_empty(&deferred_list))
- break;
}
- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ if (list_empty(&deferred_list))
+ WARN_ON(atomic64_read(&deferred_count));
+ else if (!timer_pending(&deferred_timer)) {
deferred_timer.expires = jiffies + HZ;
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ pr_debug("Freed %zu references", freed);
}
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
{
struct deferred_entry *entry;
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- const char *what = KERN_WARNING "leaking";
+ uint64_t leaked, deferred;
entry = kmalloc(sizeof(*entry), gfp);
if (!page) {
@@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
- what = KERN_DEBUG "deferring";
+ deferred = atomic64_add_return(1, &deferred_count);
+ leaked = atomic64_read(&leaked_count);
+ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+ } else {
+ deferred = atomic64_read(&deferred_count);
+ leaked = atomic64_add_return(1, &leaked_count);
+ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
}
- printk("%s g.e. %#x (pfn %#lx)\n",
- what, ref, page ? page_to_pfn(page) : -1);
}
+module_param(free_per_iteration, uint, 0600);
+
int gnttab_try_end_foreign_access(grant_ref_t ref)
{
int ret = _gnttab_end_foreign_access_ref(ref);
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
Hello!
Following to the initial discussion
https://lore.kernel.org/all/20220701110341.3094023-1-s.hauer@pengutronix.de
which caused the revert commit:
Are there any plans to fix this issue for 5.10.y (and maybe other
stable branches)?
Thanks in advance!
On Thu, Jun 22, 2023 at 6:46 AM Kegl Rohit <keglrohit(a)gmail.com> wrote:
>
> After reverting the revert :), the data corruption did not happen anymore!
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git/comm…
>
> On Wed, Jun 21, 2023 at 7:55 PM Kegl Rohit <keglrohit(a)gmail.com> wrote:
> >
> > ok, looking at the 5.10.184 gpmi-nand.c:
> >
> > #define BF_GPMI_TIMING1_BUSY_TIMEOUT(v) \
> > (((v) << BP_GPMI_TIMING1_BUSY_TIMEOUT) & BM_GPMI_TIMING1_BUSY_TIMEOUT)
> >
> > hw->timing1 = BF_GPMI_TIMING1_BUSY_TIMEOUT(busy_timeout_cycles * 4096);
> >
> > and then 5.19 (upstream patch source 0fddf9ad06fd9f439f137139861556671673e31c)
> > https://github.com/gregkh/linux/commit/0fddf9ad06fd9f439f137139861556671673…
> >
> > hw->timing1 = BF_GPMI_TIMING1_BUSY_TIMEOUT(DIV_ROUND_UP(busy_timeout_cycles,
> > 4096));
> >
> > could be the cause. DIV_ROUND_UP is most likely a division and
> > busy_timeout_cycles * 4096 a multiplication!
> >
> > The backport is wrong, because on the 5.10 kernel tree commit
> > cc5ee0e0eed0bec2b7cc1d0feb9405e884eace7d was reverted and on mainline
> > not.
> > https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git/comm…
> >
> > => now in 5.10.184 this line "hw->timing1 ..." is wrong!
> >
> > I will test this tomorrow.
> >
> > On Wed, Jun 21, 2023 at 5:26 PM han.xu <han.xu(a)nxp.com> wrote:
> > >
> > > On 23/06/21 04:27PM, Kegl Rohit wrote:
> > > > Hello!
> > > >
> > > > Using imx7d and rt stable kernel tree.
> > > >
> > > > After upgrading to v5.10.184-rt90 the rootfs ubifs mtd partition got corrupted.
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git/tag/…
> > > >
> > > > After reverting the latest patch
> > > > (e4e4b24b42e710db058cc2a79a7cf16bf02b4915), the rootfs partition did
> > > > not get corrupted.
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git/comm…
> > > >
> > > > The commit message states the timeout calculation was changed.
> > > > Here are the calculated timeouts `busy_timeout_cycles` before (_old)
> > > > and after the patch (_new):
> > > >
> > > > [ 0.491534] busy_timeout_cycles_old 4353
> > > > [ 0.491604] busy_timeout_cycles_new 1424705
> > > > [ 0.492300] nand: device found, Manufacturer ID: 0xc2, Chip ID: 0xdc
> > > > [ 0.492310] nand: Macronix MX30LF4G28AC
> > > > [ 0.492316] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
> > > > 2048, OOB size: 112
> > > > [ 0.492488] busy_timeout_cycles_old 4353
> > > > [ 0.492493] busy_timeout_cycles_new 1424705
> > > > [ 0.492863] busy_timeout_cycles_old 2510
> > > > [ 0.492872] busy_timeout_cycles_new 350000
> > > >
> > > > The new timeouts are set a lot higher. Higher timeouts should not be
> > > > an issue. Lower timeouts could be an issue.
> > > > But because of this high timeouts gpmi-nand is broken for us.
> > > >
> > > > For now we simple reverted the change.
> > > > The new calculations seem to be flaky, a previous "fix backport" was
> > > > already reverted because of data corruption.
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git/comm…
> > > >
> > > > Any guesses why the high timeout causes issues?
> > >
> > > high timeout with wrong calculation may overflow and causes DEVICE_BUSY_TIMEOUT
> > > register turns to be 0.
> > >
> > > >
> > > >
> > > > Thanks in advance!
> > > >
> > > > ______________________________________________________
> > > > Linux MTD discussion mailing list
> > > > http://lists.infradead.org/mailman/listinfo/linux-mtd/