Commit 25b892b583cc ("ARM: dts: arm: Update register-bit-led nodes
'reg' and node names") added a 'reg' property to nodes. This change has
the side effect of changing how the kernel generates the device name.
The assumption was a translatable 'reg' address is unique. However, in
the case of the register-bit-led binding (and a few others) that is not
the case. The 'mask' property must also be used in this case to make a
unique device name.
Fixes: 25b892b583cc ("ARM: dts: arm: Update register-bit-led nodes 'reg' and node names")
Reported-by: Guenter Roeck <linux(a)roeck-us.net>
Cc: stable(a)vger.kernel.org
Cc: Frank Rowand <frowand.list(a)gmail.com>
Cc: Linus Walleij <linus.walleij(a)linaro.org>
Signed-off-by: Rob Herring <robh(a)kernel.org>
---
This should be applied to stable to minimize DT ABI breakage.
---
drivers/of/platform.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 07813fb1ef37..b3faf89744aa 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -76,6 +76,7 @@ static void of_device_make_bus_id(struct device *dev)
struct device_node *node = dev->of_node;
const __be32 *reg;
u64 addr;
+ u32 mask;
/* Construct the name, using parent nodes if necessary to ensure uniqueness */
while (node->parent) {
@@ -85,8 +86,13 @@ static void of_device_make_bus_id(struct device *dev)
*/
reg = of_get_property(node, "reg", NULL);
if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) {
- dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
- addr, node, dev_name(dev));
+ if (!of_property_read_u32(node, "mask", &mask))
+ dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
+ addr, ffs(mask) - 1, node, dev_name(dev));
+
+ else
+ dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
+ addr, node, dev_name(dev));
return;
}
--
2.32.0
From: Amit Kumar Mahapatra <amit.kumar-mahapatra(a)xilinx.com>
[ Upstream commit 167721a5909f867f8c18c8e78ea58e705ad9bbd4 ]
In kernel 5.4, support has been added for reading MTD devices via the nvmem
API.
For this the mtd devices are registered as read-only NVMEM providers under
sysfs with the same name as the flash partition label property.
So if flash partition label property of multiple flash devices are
identical then the second mtd device fails to get registered as a NVMEM
provider.
This patch fixes the issue by having different label property for different
flashes.
Signed-off-by: Amit Kumar Mahapatra <amit.kumar-mahapatra(a)xilinx.com>
Signed-off-by: Michal Simek <michal.simek(a)xilinx.com>
Link: https://lore.kernel.org/r/6c4b9b9232b93d9e316a63c086540fd5bf6b8687.16236842…
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts b/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts
index 4a86efa32d687..f7124e15f0ff6 100644
--- a/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts
+++ b/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts
@@ -131,7 +131,7 @@
reg = <0>;
partition@0 {
- label = "data";
+ label = "spi0-data";
reg = <0x0 0x100000>;
};
};
@@ -149,7 +149,7 @@
reg = <0>;
partition@0 {
- label = "data";
+ label = "spi1-data";
reg = <0x0 0x84000>;
};
};
--
2.33.0
From: Arnd Bergmann <arnd(a)arndb.de>
Naresh and Antonio ran into a build failure with latest Debian
armhf compilers, with lots of output like
tmp/ccY3nOAs.s:2215: Error: selected processor does not support `cpsid i' in ARM mode
As it turns out, $(cc-option) fails early here when the FPU is not
selected before CPU architecture is selected, as the compiler
option check runs before enabling -msoft-float, which causes
a problem when testing a target architecture level without an FPU:
cc1: error: '-mfloat-abi=hard': selected architecture lacks an FPU
Passing e.g. -march=armv6k+fp in place of -march=armv6k would avoid this
issue, but the fallback logic is already broken because all supported
compilers (gcc-5 and higher) are much more recent than these options,
and building with -march=armv5t as a fallback no longer works.
The best way forward that I see is to just remove all the checks, which
also has the nice side-effect of slightly improving the startup time for
'make'.
The -mtune=marvell-f option was apparently never supported by any mainline
compiler, and the custom Codesourcery gcc build that did support is
now too old to build kernels, so just use -mtune=xscale unconditionally
for those.
This should be safe to apply on all stable kernels, and will be required
in order to keep building them with gcc-11 and higher.
Reported-by: Antonio Terceiro <antonio.terceiro(a)linaro.org>
Reported-by: Naresh Kamboju <naresh.kamboju(a)linaro.org>
Reported-by: Sebastian Andrzej Siewior <sebastian(a)breakpoint.cc>
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=996419
Cc: Matthias Klose <doko(a)debian.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/arm/Makefile | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 847c31e7c368..fa45837b8065 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -60,15 +60,15 @@ KBUILD_CFLAGS += $(call cc-option,-fno-ipa-sra)
# Note that GCC does not numerically define an architecture version
# macro, but instead defines a whole series of macros which makes
# testing for a specific architecture or later rather impossible.
-arch-$(CONFIG_CPU_32v7M) =-D__LINUX_ARM_ARCH__=7 -march=armv7-m -Wa,-march=armv7-m
-arch-$(CONFIG_CPU_32v7) =-D__LINUX_ARM_ARCH__=7 $(call cc-option,-march=armv7-a,-march=armv5t -Wa$(comma)-march=armv7-a)
-arch-$(CONFIG_CPU_32v6) =-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6,-march=armv5t -Wa$(comma)-march=armv6)
+arch-$(CONFIG_CPU_32v7M) =-D__LINUX_ARM_ARCH__=7 -march=armv7-m
+arch-$(CONFIG_CPU_32v7) =-D__LINUX_ARM_ARCH__=7 -march=armv7-a
+arch-$(CONFIG_CPU_32v6) =-D__LINUX_ARM_ARCH__=6 -march=armv6
# Only override the compiler option if ARMv6. The ARMv6K extensions are
# always available in ARMv7
ifeq ($(CONFIG_CPU_32v6),y)
-arch-$(CONFIG_CPU_32v6K) =-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6k,-march=armv5t -Wa$(comma)-march=armv6k)
+arch-$(CONFIG_CPU_32v6K) =-D__LINUX_ARM_ARCH__=6 -march=armv6k
endif
-arch-$(CONFIG_CPU_32v5) =-D__LINUX_ARM_ARCH__=5 $(call cc-option,-march=armv5te,-march=armv4t)
+arch-$(CONFIG_CPU_32v5) =-D__LINUX_ARM_ARCH__=5 -march=armv5te
arch-$(CONFIG_CPU_32v4T) =-D__LINUX_ARM_ARCH__=4 -march=armv4t
arch-$(CONFIG_CPU_32v4) =-D__LINUX_ARM_ARCH__=4 -march=armv4
arch-$(CONFIG_CPU_32v3) =-D__LINUX_ARM_ARCH__=3 -march=armv3m
@@ -82,7 +82,7 @@ tune-$(CONFIG_CPU_ARM720T) =-mtune=arm7tdmi
tune-$(CONFIG_CPU_ARM740T) =-mtune=arm7tdmi
tune-$(CONFIG_CPU_ARM9TDMI) =-mtune=arm9tdmi
tune-$(CONFIG_CPU_ARM940T) =-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM946E) =$(call cc-option,-mtune=arm9e,-mtune=arm9tdmi)
+tune-$(CONFIG_CPU_ARM946E) =-mtune=arm9e
tune-$(CONFIG_CPU_ARM920T) =-mtune=arm9tdmi
tune-$(CONFIG_CPU_ARM922T) =-mtune=arm9tdmi
tune-$(CONFIG_CPU_ARM925T) =-mtune=arm9tdmi
@@ -90,11 +90,11 @@ tune-$(CONFIG_CPU_ARM926T) =-mtune=arm9tdmi
tune-$(CONFIG_CPU_FA526) =-mtune=arm9tdmi
tune-$(CONFIG_CPU_SA110) =-mtune=strongarm110
tune-$(CONFIG_CPU_SA1100) =-mtune=strongarm1100
-tune-$(CONFIG_CPU_XSCALE) =$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
-tune-$(CONFIG_CPU_XSC3) =$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
-tune-$(CONFIG_CPU_FEROCEON) =$(call cc-option,-mtune=marvell-f,-mtune=xscale)
-tune-$(CONFIG_CPU_V6) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
-tune-$(CONFIG_CPU_V6K) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
+tune-$(CONFIG_CPU_XSCALE) =-mtune=xscale
+tune-$(CONFIG_CPU_XSC3) =-mtune=xscale
+tune-$(CONFIG_CPU_FEROCEON) =-mtune=xscale
+tune-$(CONFIG_CPU_V6) =-mtune=arm1136j-s
+tune-$(CONFIG_CPU_V6K) =-mtune=arm1136j-s
# Evaluate tune cc-option calls now
tune-y := $(tune-y)
--
2.29.2
Hi Thomas,
We (ChromeOS) have run into an issue which we believe is related to
the following errata on 11th Gen Intel Core CPUs:
"TGL034 A SYSENTER FOLLOWING AN XSAVE OR A VZEROALL MAY LEAD TO
UNEXPECTED SYSTEM BEHAVIOR" [1]
Essentially we notice that the value returned by a RDPKRU instruction
will flip after some amount of time when running on kernels earlier
than 5.14. I have a simple repro that can be used [2].
After a little digging it appears a lot of work was done to refactor
that code and I bisected to the following commit which fixes the
issue:
commit 954436989cc550dd91aab98363240c9c0a4b7e23
Author: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed Jun 23 14:02:21 2021 +0200
x86/fpu: Remove PKRU handling from switch_fpu_finish()
I backported this patch to 5.4 and it does appear to fix the issue
because it avoids XSAVE. However, I have no idea if it's actually
fixing anything or if the behavior is working as intended. So we're
curious, does it make sense to pull back that patch, would that patch
be enough? Any guidance here would be appreciated because this does
seem broken (because of how it was previously implemented) for those
CPUs prior to 5.14, which is why I'm CCing stable@.
Thanks in advance,
Brian
1. https://cdrdv2.intel.com/v1/dl/getContent/631123?explicitVersion=true
2. https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f
Resend the email using plain text.
I found some kernel performance regression issues that might be
related w/ 4.14.y LTS commit.
4.14.y commit: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=…
The issue is observed when "console=" is used as a kernel parameter to
disable the kernel console.
I browsed android common kernel logs and the upstream stable kernel
tree, found some related changes.
printk: handle blank console arguments passed in. (link:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=…)
Revert "init/console: Use ttynull as a fallback when there is no
console" (link:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=…)
It looks like upstream also noticed the regression introduced by the
commit, and the workaround is to use "ttynull" to handle "console="
case. But the "ttynull" was reverted due to some other reasons
mentioned in the commit message.
Any insight or recommendation will be appreciated.
Thanks,
Yi Fan
The document 'ACPI for Arm Components 1.0' defines the following
_HID mappings:
-'Prime cell UART (PL011)': ARMH0011
-'SBSA UART': ARMHB000
Use the sbsa-uart driver when a device is described with
the 'ARMHB000' _HID.
Note:
PL011 devices currently use the sbsa-uart driver instead of the
uart-pl011 driver. Indeed, PL011 devices are not bound to a clock
in ACPI. It is not possible to change their baudrate.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pierre Gondois <Pierre.Gondois(a)arm.com>
---
drivers/tty/serial/amba-pl011.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index d361cd84ff8c..52518a606c06 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2947,6 +2947,7 @@ MODULE_DEVICE_TABLE(of, sbsa_uart_of_match);
static const struct acpi_device_id __maybe_unused sbsa_uart_acpi_match[] = {
{ "ARMH0011", 0 },
+ { "ARMHB000", 0 },
{},
};
MODULE_DEVICE_TABLE(acpi, sbsa_uart_acpi_match);
--
2.17.1
From: Johannes Berg <johannes.berg(a)intel.com>
In commit 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header
bitmap") we accidentally pointed the position to the wrong place, so
we overwrite a present bitmap, and thus cause all kinds of trouble.
To see the issue, note that the previous code read:
pos = (void *)(it_present + 1);
The requirement now is that we need to calculate pos via it_optional,
to not trigger the compiler hardening checks, as:
pos = (void *)&rthdr->it_optional[...];
Rewriting the original expression, we get (obviously, since that just
adds "+ x - x" terms):
pos = (void *)(it_present + 1 + rthdr->it_optional - rthdr->it_optional)
and moving the "+ rthdr->it_optional" outside to be used as an array:
pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional];
The original is off by one, fix it.
Cc: stable(a)vger.kernel.org
Fixes: 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header bitmap")
Reported-by: Sid Hayn <sidhayn(a)gmail.com>
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
---
net/mac80211/rx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fc5c608d02e2..3562730ea0f8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -364,7 +364,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
* the compiler to think we have walked past the end of the
* struct member.
*/
- pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional];
+ pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional];
/* the order of the following fields is important */
--
2.31.1
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The patch titled
Subject: shm: extend forced shm destroy to support objects from several IPC nses
has been removed from the -mm tree. Its filename was
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Subject: shm: extend forced shm destroy to support objects from several IPC nses
Currently, exit_shm function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).
That particular patch is attempt to fix the problem by extending exit_shm
mechanism to handle shm's destroy from several IPC ns'es.
To achieve that we do several things:
1. add namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
initialize this pointer by current task IPC ns
3. exit_shm() fully reworked such that it traverses over all shp's in
task->sysvshm.shm_clist and gets IPC namespace not from current task as
it was before but from shp's object itself, then call shm_destroy(shp,
ns).
Note. We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we using
special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter
only if IPC ns not in the "state of destruction".
Q/A
Q: Why we can access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther than IPC namespace
lifetime, so, if we get shp object from the task->sysvshm.shm_clist
while holding task_lock(task) nobody can steal our namespace.
Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: Not. It's just fixes non-covered case when process may leave IPC
namespace without getting task->sysvshm.shm_clist list cleaned up.
Link: https://lkml.kernel.org/r/20211027224348.611025-3-alexander.mikhalitsyn@vir…
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/ipc_namespace.h | 15 ++
include/linux/sched/task.h | 2
include/linux/shm.h | 2
ipc/shm.c | 170 +++++++++++++++++++++++---------
4 files changed, 142 insertions(+), 47 deletions(-)
--- a/include/linux/ipc_namespace.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/ipc_namespace.h
@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_
return ns;
}
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ if (ns) {
+ if (refcount_inc_not_zero(&ns->ns.count))
+ return ns;
+ }
+
+ return NULL;
+}
+
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_
{
return ns;
}
+
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ return ns;
+}
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
--- a/include/linux/sched/task.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/sched/task.h
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_sta
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
--- a/include/linux/shm.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/shm.h
@@ -11,7 +11,7 @@ struct file;
#ifdef CONFIG_SYSVIPC
struct sysv_shm {
- struct list_head shm_clist;
+ struct list_head shm_clist;
};
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
--- a/ipc/shm.c~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/ipc/shm.c
@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke
struct pid *shm_lprid;
struct ucounts *mlock_ucounts;
- /* The task created the shm object. NULL if the task is dead. */
+ /*
+ * The task created the shm object, for looking up
+ * task->sysvshm.shm_clist_lock
+ */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /*
+ * list by creator. shm_clist_lock required for read/write
+ * if list_empty(), then the creator is dead already
+ */
+ struct list_head shm_clist;
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ WARN_ON(ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,36 @@ static void shm_rcu_free(struct rcu_head
kfree(shp);
}
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+/*
+ * It has to be called with shp locked.
+ * It must be called before ipc_rmid()
+ */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
{
- list_del(&s->shm_clist);
- ipc_rmid(&shm_ids(ns), &s->shm_perm);
+ struct task_struct *creator;
+
+ /*
+ * A concurrent exit_shm may do a list_del_init() as well.
+ * Just do nothing if exit_shm already did the work
+ */
+ if (list_empty(&shp->shm_clist))
+ return;
+
+ /*
+ * shp->shm_creator is guaranteed to be valid *only*
+ * if shp->shm_clist is not empty.
+ */
+ creator = shp->shm_creator;
+
+ task_lock(creator);
+ list_del_init(&shp->shm_clist);
+ task_unlock(creator);
+}
+
+static inline void shm_rmid(struct shmid_kernel *s)
+{
+ shm_clist_rm(s);
+ ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}
@@ -283,7 +319,7 @@ static void shm_destroy(struct ipc_names
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
- shm_rmid(ns, shp);
+ shm_rmid(shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
@@ -306,10 +342,10 @@ static void shm_destroy(struct ipc_names
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -340,7 +376,7 @@ static void shm_close(struct vm_area_str
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +397,10 @@ static int shm_try_destroy_orphaned(int
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -382,48 +418,87 @@ void shm_destroy_orphaned(struct ipc_nam
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
- struct shmid_kernel *shp, *n;
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist))
- return;
+ task_lock(task);
+
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_unlock(task);
+ break;
+ }
+
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
+
+ /* 1) unlink */
+ list_del_init(&shp->shm_clist);
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
/*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
+ * 2) Get pointer to the ipc namespace. It is worth to say
+ * that this pointer is guaranteed to be valid because
+ * shp lifetime is always shorter than namespace lifetime
+ * in which shp lives.
+ * We taken task_lock it means that shp won't be freed.
*/
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ ns = shp->ns;
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- shp->shm_creator = NULL;
+ /*
+ * 3) If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced) {
+ task_unlock(task);
+ continue;
+ }
- if (shm_may_destroy(ns, shp)) {
+ /*
+ * 4) get a reference to the namespace.
+ * The refcount could be already 0. If it is 0, then
+ * the shm objects will be free by free_ipc_work().
+ */
+ ns = get_ipc_ns_not_zero(ns);
+ if (ns) {
+ /*
+ * 5) get a reference to the shp itself.
+ * This cannot fail: shm_clist_rm() is called before
+ * ipc_rmid(), thus the refcount cannot be 0.
+ */
+ WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
+ }
+
+ task_unlock(task);
+
+ if (ns) {
+ down_write(&shm_ids(ns).rwsem);
shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ /*
+ * rcu_read_lock was implicitly taken in
+ * shm_lock_by_ptr, it's safe to call
+ * ipc_rcu_putref here
+ */
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp from namespace
+ * idr/kht while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
}
}
-
- /* Remove the list head from any segments still attached. */
- list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -680,7 +755,11 @@ static int newseg(struct ipc_namespace *
if (error < 0)
goto no_id;
+ shp->ns = ns;
+
+ task_lock(current);
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ task_unlock(current);
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1573,7 +1652,8 @@ out_nattch:
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
_
Patches currently in -mm which might be from alexander.mikhalitsyn(a)virtuozzo.com are
ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch
I hope you have positive and great start this new week, consider
this e-mail as a kind reminder of my previous e-mail sent to you
at the beginning of last week.
Do confirm if you got it ?
Warm regards .... Mrs Karen Ngui,
This reverts commit 2fd3e5efe791946be0957c8e1eed9560b541fe46.
The above commit replaces page_address(bv->bv_page) by bvec_virt(bv) to
avoid directly access to bv->bv_page, but in situation bv->bv_offset is
not zero and page_address(bv->bv_page) is not equal to bvec_virt(bv). In
such case a memory corruption may happen because memory in next page is
tainted by following line in do_btree_node_write(),
memcpy(bvec_virt(bv), addr, PAGE_SIZE);
This patch reverts the mentioned commit to avoid the memory corruption.
Fixes: 2fd3e5efe791 ("bcache: use bvec_virt")
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: stable(a)vger.kernel.org # 5.15
---
drivers/md/bcache/btree.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 93b67b8d31c3..88c573eeb598 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -378,7 +378,7 @@ static void do_btree_node_write(struct btree *b)
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, b->bio, iter_all) {
- memcpy(bvec_virt(bv), addr, PAGE_SIZE);
+ memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
addr += PAGE_SIZE;
}
--
2.31.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:14 -0700
Subject: [PATCH] binder: use cred instead of task for getsecid
Use the 'struct cred' saved at binder_open() to lookup
the security ID via security_cred_getsecid(). This
ensures that the security context that opened binder
is the one used to generate the secctx.
Cc: stable(a)vger.kernel.org # 5.4+
Fixes: ec74136ded79 ("binder: create node flag to request sender's security context")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1571e01cfa52..49b08c04fa09 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2713,16 +2713,7 @@ static void binder_transaction(struct binder_proc *proc,
u32 secid;
size_t added_size;
- /*
- * Arguably this should be the task's subjective LSM secid but
- * we can't reliably access the subjective creds of a task
- * other than our own so we must use the objective creds, which
- * are safe to access. The downside is that if a task is
- * temporarily overriding it's creds it will not be reflected
- * here; however, it isn't clear that binder would handle that
- * case well anyway.
- */
- security_task_getsecid_obj(proc->tsk, &secid);
+ security_cred_getsecid(proc->cred, &secid);
ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
if (ret) {
return_error = BR_FAILED_REPLY;
diff --git a/include/linux/security.h b/include/linux/security.h
index 9be72166e859..cc6d39358336 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new,
{
}
+static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ *secid = 0;
+}
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Tue, 12 Oct 2021 09:56:12 -0700
Subject: [PATCH] binder: use euid from cred instead of using task
Save the 'struct cred' associated with a binder process
at initial open to avoid potential race conditions
when converting to an euid.
Set a transaction's sender_euid from the 'struct cred'
saved at binder_open() instead of looking up the euid
from the binder proc's 'struct task'. This ensures
the euid is associated with the security context that
of the task that opened binder.
Cc: stable(a)vger.kernel.org # 4.4+
Fixes: 457b9a6f09f0 ("Staging: android: add binder driver")
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Suggested-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
Suggested-by: Jann Horn <jannh(a)google.com>
Acked-by: Casey Schaufler <casey(a)schaufler-ca.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..231cff9b3b75 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2702,7 +2702,7 @@ static void binder_transaction(struct binder_proc *proc,
t->from = thread;
else
t->from = NULL;
- t->sender_euid = task_euid(proc->tsk);
+ t->sender_euid = proc->cred->euid;
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
@@ -4343,6 +4343,7 @@ static void binder_free_proc(struct binder_proc *proc)
}
binder_alloc_deferred_release(&proc->alloc);
put_task_struct(proc->tsk);
+ put_cred(proc->cred);
binder_stats_deleted(BINDER_STAT_PROC);
kfree(proc);
}
@@ -5021,6 +5022,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ proc->cred = get_cred(filp->f_cred);
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->freeze_wait);
proc->default_priority = task_nice(current);
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..e7d4920b3368 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -364,6 +364,9 @@ struct binder_ref {
* (invariant after initialized)
* @tsk task_struct for group_leader of process
* (invariant after initialized)
+ * @cred struct cred associated with the `struct file`
+ * in binder_open()
+ * (invariant after initialized)
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -424,6 +427,7 @@ struct binder_proc {
struct list_head waiting_threads;
int pid;
struct task_struct *tsk;
+ const struct cred *cred;
struct hlist_node deferred_work_node;
int deferred_work;
int outstanding_txns;
The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that will run when the total free pages go below a watermark to
ensure that there are always some free pages available to, for
example, support efficient page faults.
With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is sometimes accessed
without any protection and when it is protected it is done
inconsistently with any one of the spin locks associated with the
individual NUMA nodes.
The consequence of sgx_nr_free_pages not being protected is that
its value may not accurately reflect the actual number of free
pages on the system, impacting the availability of free pages in
support of many flows. The problematic scenario is when the
reclaimer never runs because it believes there to be sufficient
free pages while any attempt to allocate a page fails because there
are no free pages available. The worst scenario observed was a
user space hang because of repeated page faults caused by
no free pages ever made available.
Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.
Cc: stable(a)vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre(a)intel.com>
---
arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 63d3de02bbcc..8558d7d5f3e7 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
-/* The free page list lock protected variables prepend the lock. */
-static unsigned long sgx_nr_free_pages;
+atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;
@@ -403,14 +402,15 @@ static void sgx_reclaim_pages(void)
spin_lock(&node->lock);
list_add_tail(&epc_page->list, &node->free_page_list);
- sgx_nr_free_pages++;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
@@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- sgx_nr_free_pages--;
spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
return page;
}
@@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
list_add_tail(&page->list, &node->free_page_list);
- sgx_nr_free_pages++;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
--
2.25.1
The patch titled
Subject: shm: extend forced shm destroy to support objects from several IPC nses
has been added to the -mm tree. Its filename is
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses-simplified.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/shm-extend-forced-shm-destroy-to-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/shm-extend-forced-shm-destroy-to-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Subject: shm: extend forced shm destroy to support objects from several IPC nses
Currently, exit_shm function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).
That particular patch is attempt to fix the problem by extending exit_shm
mechanism to handle shm's destroy from several IPC ns'es.
To achieve that we do several things:
1. add namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
initialize this pointer by current task IPC ns
3. exit_shm() fully reworked such that it traverses over all shp's in
task->sysvshm.shm_clist and gets IPC namespace not from current task as
it was before but from shp's object itself, then call shm_destroy(shp,
ns).
Note. We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we using
special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter
only if IPC ns not in the "state of destruction".
Q/A
Q: Why we can access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther
than IPC namespace lifetime, so, if we get shp object from the
task->sysvshm.shm_clist while holding task_lock(task) nobody can
steal our namespace.
Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: Not. It's just fixes non-covered case when process may leave
IPC namespace without getting task->sysvshm.shm_clist list cleaned up.
Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife…
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/ipc_namespace.h | 15 ++
include/linux/sched/task.h | 2
ipc/shm.c | 189 ++++++++++++++++++++++++--------
3 files changed, 159 insertions(+), 47 deletions(-)
--- a/include/linux/ipc_namespace.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses-simplified
+++ a/include/linux/ipc_namespace.h
@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_
return ns;
}
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ if (ns) {
+ if (refcount_inc_not_zero(&ns->ns.count))
+ return ns;
+ }
+
+ return NULL;
+}
+
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_
{
return ns;
}
+
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ return ns;
+}
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
--- a/include/linux/sched/task.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses-simplified
+++ a/include/linux/sched/task.h
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_sta
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
--- a/ipc/shm.c~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses-simplified
+++ a/ipc/shm.c
@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke
struct pid *shm_lprid;
struct ucounts *mlock_ucounts;
- /* The task created the shm object. NULL if the task is dead. */
+ /*
+ * The task created the shm object, for
+ * task_lock(shp->shm_creator)
+ */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /*
+ * List by creator. task_lock(->shm_creator) required for read/write.
+ * If list_empty(), then the creator is dead already.
+ */
+ struct list_head shm_clist;
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ WARN_ON(ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head
kfree(shp);
}
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+/*
+ * It has to be called with shp locked.
+ * It must be called before ipc_rmid()
+ */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
+{
+ struct task_struct *creator;
+
+ /* ensure that shm_creator does not disappear */
+ rcu_read_lock();
+
+ /*
+ * A concurrent exit_shm may do a list_del_init() as well.
+ * Just do nothing if exit_shm already did the work
+ */
+ if (!list_empty(&shp->shm_clist)) {
+ /*
+ * shp->shm_creator is guaranteed to be valid *only*
+ * if shp->shm_clist is not empty.
+ */
+ creator = shp->shm_creator;
+
+ task_lock(creator);
+ /*
+ * list_del_init() is a nop if the entry was already removed
+ * from the list.
+ */
+ list_del_init(&shp->shm_clist);
+ task_unlock(creator);
+ }
+ rcu_read_unlock();
+}
+
+static inline void shm_rmid(struct shmid_kernel *s)
{
- list_del(&s->shm_clist);
- ipc_rmid(&shm_ids(ns), &s->shm_perm);
+ shm_clist_rm(s);
+ ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}
@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_names
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
- shm_rmid(ns, shp);
+ shm_rmid(shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
@@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_names
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -340,7 +383,7 @@ static void shm_close(struct vm_area_str
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_nam
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
- struct shmid_kernel *shp, *n;
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist))
- return;
+ task_lock(task);
+
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_unlock(task);
+ break;
+ }
+
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
/*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
+ * 1) Get pointer to the ipc namespace. It is worth to say
+ * that this pointer is guaranteed to be valid because
+ * shp lifetime is always shorter than namespace lifetime
+ * in which shp lives.
+ * We taken task_lock it means that shp won't be freed.
*/
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ ns = shp->ns;
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- shp->shm_creator = NULL;
+ /*
+ * 2) If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced)
+ goto unlink_continue;
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ /*
+ * 3) get a reference to the namespace.
+ * The refcount could be already 0. If it is 0, then
+ * the shm objects will be free by free_ipc_work().
+ */
+ ns = get_ipc_ns_not_zero(ns);
+ if (!ns) {
+unlink_continue:
+ list_del_init(&shp->shm_clist);
+ task_unlock(task);
+ continue;
}
- }
- /* Remove the list head from any segments still attached. */
- list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
+ /*
+ * 4) get a reference to shp.
+ * This cannot fail: shm_clist_rm() is called before
+ * ipc_rmid(), thus the refcount cannot be 0.
+ */
+ WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
+
+ /*
+ * 5) unlink the shm segment from the list of segments
+ * created by current.
+ * This must be done last. After unlinking,
+ * only the refcounts obtained above prevent IPC_RMID
+ * from destroying the segment or the namespace.
+ */
+ list_del_init(&shp->shm_clist);
+
+ task_unlock(task);
+
+ /*
+ * 6) we have all references
+ * Thus lock & if needed destroy shp.
+ */
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ /*
+ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
+ * safe to call ipc_rcu_putref here
+ */
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp from namespace
+ * idr/kht while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
+ }
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace *
if (error < 0)
goto no_id;
+ shp->ns = ns;
+
+ task_lock(current);
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ task_unlock(current);
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1573,7 +1669,8 @@ out_nattch:
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
_
Patches currently in -mm which might be from alexander.mikhalitsyn(a)virtuozzo.com are
ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses-simplified.patch
The backing memory was not freed, after loading it back to the SGX
reserved memory. This problem was not prevalent with the systems that
were available at the time when SGX was first upstreamed, as the swapped
memory could grow only up to 128 MB. However, Icelake Server can have
gigabytes of SGX memory, and thus this has become a real bottleneck.
Free the swap space for other use by calling shmem_truncate_range(),
when a page is faulted back.
Cc: stable(a)vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
arch/x86/kernel/cpu/sgx/encl.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 001808e3901c..f2d3f2e5028f 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -22,6 +22,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
{
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
struct sgx_encl *encl = encl_page->encl;
+ struct inode *inode = file_inode(encl->backing);
struct sgx_pageinfo pginfo;
struct sgx_backing b;
pgoff_t page_index;
@@ -60,6 +61,9 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
sgx_encl_put_backing(&b, false);
+ /* Free the backing memory. */
+ shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+
return ret;
}
--
2.32.0
The patch titled
Subject: shm: extend forced shm destroy to support objects from several IPC nses
has been added to the -mm tree. Its filename is
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/shm-extend-forced-shm-destroy-to-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/shm-extend-forced-shm-destroy-to-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Subject: shm: extend forced shm destroy to support objects from several IPC nses
Currently, exit_shm function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).
That particular patch is attempt to fix the problem by extending exit_shm
mechanism to handle shm's destroy from several IPC ns'es.
To achieve that we do several things:
1. add namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
initialize this pointer by current task IPC ns
3. exit_shm() fully reworked such that it traverses over all shp's in
task->sysvshm.shm_clist and gets IPC namespace not from current task as
it was before but from shp's object itself, then call shm_destroy(shp,
ns).
Note. We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we using
special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter
only if IPC ns not in the "state of destruction".
Q/A
Q: Why we can access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther
than IPC namespace lifetime, so, if we get shp object from the
task->sysvshm.shm_clist while holding task_lock(task) nobody can
steal our namespace.
Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: Not. It's just fixes non-covered case when process may leave
IPC namespace without getting task->sysvshm.shm_clist list cleaned up.
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/ipc_namespace.h | 15 ++
include/linux/sched/task.h | 2
ipc/shm.c | 186 ++++++++++++++++++++++++--------
3 files changed, 156 insertions(+), 47 deletions(-)
--- a/include/linux/ipc_namespace.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/ipc_namespace.h
@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_
return ns;
}
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ if (ns) {
+ if (refcount_inc_not_zero(&ns->ns.count))
+ return ns;
+ }
+
+ return NULL;
+}
+
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_
{
return ns;
}
+
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ return ns;
+}
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
--- a/include/linux/sched/task.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/sched/task.h
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_sta
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
--- a/ipc/shm.c~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/ipc/shm.c
@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke
struct pid *shm_lprid;
struct ucounts *mlock_ucounts;
- /* The task created the shm object. NULL if the task is dead. */
+ /*
+ * The task created the shm object, for
+ * task_lock(shp->shm_creator)
+ */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /*
+ * List by creator. task_lock(->shm_creator) required for read/write.
+ * If list_empty(), then the creator is dead already.
+ */
+ struct list_head shm_clist;
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ WARN_ON(ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head
kfree(shp);
}
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+/*
+ * It has to be called with shp locked.
+ * It must be called before ipc_rmid()
+ */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
{
- list_del(&s->shm_clist);
- ipc_rmid(&shm_ids(ns), &s->shm_perm);
+ struct task_struct *creator;
+
+ /* ensure that shm_creator does not disappear */
+ rcu_read_lock();
+
+ /*
+ * A concurrent exit_shm may do a list_del_init() as well.
+ * Just do nothing if exit_shm already did the work
+ */
+ if (!list_empty(&shp->shm_clist)) {
+ /*
+ * shp->shm_creator is guaranteed to be valid *only*
+ * if shp->shm_clist is not empty.
+ */
+ creator = shp->shm_creator;
+
+ task_lock(creator);
+ /*
+ * list_del_init() is a nop if the entry was already removed
+ * from the list.
+ */
+ list_del_init(&shp->shm_clist);
+ task_unlock(creator);
+ }
+ rcu_read_unlock();
+}
+
+static inline void shm_rmid(struct shmid_kernel *s)
+{
+ shm_clist_rm(s);
+ ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}
@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_names
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
- shm_rmid(ns, shp);
+ shm_rmid(shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
@@ -303,10 +346,10 @@ static void shm_destroy(struct ipc_names
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -337,7 +380,7 @@ static void shm_close(struct vm_area_str
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -358,10 +401,10 @@ static int shm_try_destroy_orphaned(int
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -379,48 +422,94 @@ void shm_destroy_orphaned(struct ipc_nam
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
- struct shmid_kernel *shp, *n;
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist))
- return;
+ task_lock(task);
+
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_unlock(task);
+ break;
+ }
+
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
/*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
+ * 1) get a reference to shp.
+ * This must be done first: Right now, task_lock() prevents
+ * any concurrent IPC_RMID calls. After the list_del_init(),
+ * IPC_RMID will not acquire task_lock(->shm_creator)
+ * anymore.
*/
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- shp->shm_creator = NULL;
+ /* 2) unlink */
+ list_del_init(&shp->shm_clist);
+
+ /*
+ * 3) Get pointer to the ipc namespace. It is worth to say
+ * that this pointer is guaranteed to be valid because
+ * shp lifetime is always shorter than namespace lifetime
+ * in which shp lives.
+ * We taken task_lock it means that shp won't be freed.
+ */
+ ns = shp->ns;
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ /*
+ * 4) If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced) {
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+ task_unlock(task);
+ continue;
}
- }
- /* Remove the list head from any segments still attached. */
- list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
+ /*
+ * 5) get a reference to the namespace.
+ * The refcount could be already 0. If it is 0, then
+ * the shm objects will be free by free_ipc_work().
+ */
+ ns = get_ipc_ns_not_zero(ns);
+ if (!ns) {
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+ task_unlock(task);
+ continue;
+ }
+ task_unlock(task);
+
+ /*
+ * 6) we have all references
+ * Thus lock & if needed destroy shp.
+ */
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ /*
+ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
+ * safe to call ipc_rcu_putref here
+ */
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp from namespace
+ * idr/kht while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
+ }
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -676,7 +765,11 @@ static int newseg(struct ipc_namespace *
if (error < 0)
goto no_id;
+ shp->ns = ns;
+
+ task_lock(current);
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ task_unlock(current);
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1567,7 +1660,8 @@ out_nattch:
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
_
Patches currently in -mm which might be from alexander.mikhalitsyn(a)virtuozzo.com are
ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
A long story behind all of that...
Some time ago I met kernel crash after CRIU restore procedure,
fortunately, it was CRIU restore, so, I had dump files and could
do restore many times and crash reproduced easily. After some
investigation I've constructed the minimal reproducer. It was
found that it's use-after-free and it happens only if
sysctl kernel.shm_rmid_forced = 1.
The key of the problem is that the exit_shm() function
not handles shp's object destroy when task->sysvshm.shm_clist
contains items from different IPC namespaces. In most cases
this list will contain only items from one IPC namespace.
Why this list may contain object from different namespaces?
Function exit_shm() designed to clean up this list always when
process leaves IPC namespace. But we made a mistake a long time ago
and not add exit_shm() call into setns() syscall procedures.
1st second idea was just to add this call to setns() syscall but
it's obviously changes semantics of setns() syscall and that's
userspace-visible change. So, I gave up this idea.
First real attempt to address the issue was just to omit forced destroy
if we meet shp object not from current task IPC namespace [1]. But
that was not the best idea because task->sysvshm.shm_clist was
protected by rwsem which belongs to current task IPC namespace.
It means that list corruption may occur.
Second approach is just extend exit_shm() to properly handle
shp's from different IPC namespaces [2]. This is really
non-trivial thing, I've put a lot of effort into that but
not believed that it's possible to make it fully safe, clean
and clear.
Thanks to the efforts of Manfred Spraul working and elegant
solution was designed. Thanks a lot, Manfred!
Eric also suggested the way to address the issue in
("[RFC][PATCH] shm: In shm_exit destroy all created and never attached segments")
Eric's idea was to maintain a list of shm_clists one per IPC namespace,
use lock-less lists. But there is some extra memory consumption-related concerns.
Alternative solution which was suggested by me was implemented in
("shm: reset shm_clist on setns but omit forced shm destroy")
Idea is pretty simple, we add exit_shm() syscall to setns() but DO NOT
destroy shm segments even if sysctl kernel.shm_rmid_forced = 1, we just
clean up the task->sysvshm.shm_clist list. This chages semantics of
setns() syscall a little bit but in comparision to "naive" solution
when we just add exit_shm() without any special exclusions this looks
like a safer option.
[1] https://lkml.org/lkml/2021/7/6/1108
[2] https://lkml.org/lkml/2021/7/14/736
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: Manfred Spraul <manfred(a)colorfullife.com>
Cc: Alexander Mikhalitsyn <alexander(a)mihalicyn.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Alexander Mikhalitsyn (2):
ipc: WARN if trying to remove ipc object which is absent
shm: extend forced shm destroy to support objects from several IPC
nses
include/linux/ipc_namespace.h | 15 +++
include/linux/sched/task.h | 2 +-
include/linux/shm.h | 2 +-
ipc/shm.c | 170 +++++++++++++++++++++++++---------
ipc/util.c | 6 +-
5 files changed, 145 insertions(+), 50 deletions(-)
--
2.31.1
Some devices tend to trigger SYS_ERR interrupt while the host handling
SYS_ERR state of the device during power up. This creates a race
condition and causes a failure in booting up the device.
The issue is seen on the Sierra Wireless EM9191 modem during SYS_ERR
handling in mhi_async_power_up(). Once the host detects that the device
is in SYS_ERR state, it issues MHI_RESET and waits for the device to
process the reset request. During this time, the device triggers SYS_ERR
interrupt to the host and host starts handling SYS_ERR execution.
So by the time the device has completed reset, host starts SYS_ERR
handling. This causes the race condition and the modem fails to boot.
Hence, register the IRQ handler only after handling the SYS_ERR check
to avoid getting spurious IRQs from the device.
Cc: stable(a)vger.kernel.org
Fixes: e18d4e9fa79b ("bus: mhi: core: Handle syserr during power_up")
Reported-by: Aleksander Morgado <aleksander(a)aleksander.es>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
---
drivers/bus/mhi/core/pm.c | 26 +++++++++++---------------
1 file changed, 11 insertions(+), 15 deletions(-)
diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c
index fb99e3727155..ec5f11166820 100644
--- a/drivers/bus/mhi/core/pm.c
+++ b/drivers/bus/mhi/core/pm.c
@@ -1055,10 +1055,6 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
mutex_lock(&mhi_cntrl->pm_mutex);
mhi_cntrl->pm_state = MHI_PM_DISABLE;
- ret = mhi_init_irq_setup(mhi_cntrl);
- if (ret)
- goto error_setup_irq;
-
/* Setup BHI INTVEC */
write_lock_irq(&mhi_cntrl->pm_lock);
mhi_write_reg(mhi_cntrl, mhi_cntrl->bhi, BHI_INTVEC, 0);
@@ -1072,7 +1068,7 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
dev_err(dev, "%s is not a valid EE for power on\n",
TO_MHI_EXEC_STR(current_ee));
ret = -EIO;
- goto error_async_power_up;
+ goto error_setup_irq;
}
state = mhi_get_mhi_state(mhi_cntrl);
@@ -1082,19 +1078,18 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
if (state == MHI_STATE_SYS_ERR) {
mhi_set_mhi_state(mhi_cntrl, MHI_STATE_RESET);
ret = wait_event_timeout(mhi_cntrl->state_event,
- MHI_PM_IN_FATAL_STATE(mhi_cntrl->pm_state) ||
- mhi_read_reg_field(mhi_cntrl,
- mhi_cntrl->regs,
- MHICTRL,
- MHICTRL_RESET_MASK,
- MHICTRL_RESET_SHIFT,
+ mhi_read_reg_field(mhi_cntrl,
+ mhi_cntrl->regs,
+ MHICTRL,
+ MHICTRL_RESET_MASK,
+ MHICTRL_RESET_SHIFT,
&val) ||
!val,
msecs_to_jiffies(mhi_cntrl->timeout_ms));
if (!ret) {
ret = -EIO;
dev_info(dev, "Failed to reset MHI due to syserr state\n");
- goto error_async_power_up;
+ goto error_setup_irq;
}
/*
@@ -1104,6 +1099,10 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
mhi_write_reg(mhi_cntrl, mhi_cntrl->bhi, BHI_INTVEC, 0);
}
+ ret = mhi_init_irq_setup(mhi_cntrl);
+ if (ret)
+ goto error_setup_irq;
+
/* Transition to next state */
next_state = MHI_IN_PBL(current_ee) ?
DEV_ST_TRANSITION_PBL : DEV_ST_TRANSITION_READY;
@@ -1116,9 +1115,6 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
return 0;
-error_async_power_up:
- mhi_deinit_free_irq(mhi_cntrl);
-
error_setup_irq:
mhi_cntrl->pm_state = MHI_PM_DISABLE;
mutex_unlock(&mhi_cntrl->pm_mutex);
--
2.25.1
From: Charan Teja Reddy <charante(a)codeaurora.org>
[ Upstream commit f492283b157053e9555787262f058ae33096f568 ]
It is expected from the clients to follow the below steps on an imported
dmabuf fd:
a) dmabuf = dma_buf_get(fd) // Get the dmabuf from fd
b) dma_buf_attach(dmabuf); // Clients attach to the dmabuf
o Here the kernel does some slab allocations, say for
dma_buf_attachment and may be some other slab allocation in the
dmabuf->ops->attach().
c) Client may need to do dma_buf_map_attachment().
d) Accordingly dma_buf_unmap_attachment() should be called.
e) dma_buf_detach () // Clients detach to the dmabuf.
o Here the slab allocations made in b) are freed.
f) dma_buf_put(dmabuf) // Can free the dmabuf if it is the last
reference.
Now say an erroneous client failed at step c) above thus it directly
called dma_buf_put(), step f) above. Considering that it may be the last
reference to the dmabuf, buffer will be freed with pending attachments
left to the dmabuf which can show up as the 'memory leak'. This should
at least be reported as the WARN().
Signed-off-by: Charan Teja Reddy <charante(a)codeaurora.org>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1627043468-16381-1-git-send-e…
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/dma-buf/dma-buf.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 758de0e9b2ddc..16bbc9bc9e6d1 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -79,6 +79,7 @@ static void dma_buf_release(struct dentry *dentry)
if (dmabuf->resv == (struct dma_resv *)&dmabuf[1])
dma_resv_fini(dmabuf->resv);
+ WARN_ON(!list_empty(&dmabuf->attachments));
module_put(dmabuf->owner);
kfree(dmabuf->name);
kfree(dmabuf);
--
2.33.0
From: Charan Teja Reddy <charante(a)codeaurora.org>
[ Upstream commit f492283b157053e9555787262f058ae33096f568 ]
It is expected from the clients to follow the below steps on an imported
dmabuf fd:
a) dmabuf = dma_buf_get(fd) // Get the dmabuf from fd
b) dma_buf_attach(dmabuf); // Clients attach to the dmabuf
o Here the kernel does some slab allocations, say for
dma_buf_attachment and may be some other slab allocation in the
dmabuf->ops->attach().
c) Client may need to do dma_buf_map_attachment().
d) Accordingly dma_buf_unmap_attachment() should be called.
e) dma_buf_detach () // Clients detach to the dmabuf.
o Here the slab allocations made in b) are freed.
f) dma_buf_put(dmabuf) // Can free the dmabuf if it is the last
reference.
Now say an erroneous client failed at step c) above thus it directly
called dma_buf_put(), step f) above. Considering that it may be the last
reference to the dmabuf, buffer will be freed with pending attachments
left to the dmabuf which can show up as the 'memory leak'. This should
at least be reported as the WARN().
Signed-off-by: Charan Teja Reddy <charante(a)codeaurora.org>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1627043468-16381-1-git-send-e…
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/dma-buf/dma-buf.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 511fe0d217a08..733c8b1c8467c 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -79,6 +79,7 @@ static void dma_buf_release(struct dentry *dentry)
if (dmabuf->resv == (struct dma_resv *)&dmabuf[1])
dma_resv_fini(dmabuf->resv);
+ WARN_ON(!list_empty(&dmabuf->attachments));
module_put(dmabuf->owner);
kfree(dmabuf->name);
kfree(dmabuf);
--
2.33.0
In commit 221abd4d478a ("staging: r8188eu: Remove no more necessary definitions
and code"), two entries were removed from RTW_ChannelPlanMap[], but not replaced
with zeros. The position within this table is important, thus the patch broke
systems operating in regulatory domains osted later than entry 0x13 in the table.
Unfortunately, the FCC entry comes before that point and most testers did not see
this problem.
Reported-and-tested-by: Zameer Manji <zmanji(a)gmail.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Fixes: 221abd4d478a ("staging: r8188eu: Remove no more necessary definitions and code")
Cc: Stable <stable(a)vger.kernel.org> # v5.5+
Signed-off-by: Larry Finger <Larry.Finger(a)lwfinger.net>
---
v2 - fixed use of () rsther than {} - found by kernel test robot
---
drivers/staging/r8188eu/core/rtw_mlme_ext.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/staging/r8188eu/core/rtw_mlme_ext.c b/drivers/staging/r8188eu/core/rtw_mlme_ext.c
index 55c3d4a6faeb..5b60e6df5f87 100644
--- a/drivers/staging/r8188eu/core/rtw_mlme_ext.c
+++ b/drivers/staging/r8188eu/core/rtw_mlme_ext.c
@@ -107,6 +107,7 @@ static struct rt_channel_plan_map RTW_ChannelPlanMap[RT_CHANNEL_DOMAIN_MAX] = {
{0x01}, /* 0x10, RT_CHANNEL_DOMAIN_JAPAN */
{0x02}, /* 0x11, RT_CHANNEL_DOMAIN_FCC_NO_DFS */
{0x01}, /* 0x12, RT_CHANNEL_DOMAIN_JAPAN_NO_DFS */
+ {0x00}, /* 0x13 */
{0x02}, /* 0x14, RT_CHANNEL_DOMAIN_TAIWAN_NO_DFS */
{0x00}, /* 0x15, RT_CHANNEL_DOMAIN_ETSI_NO_DFS */
{0x00}, /* 0x16, RT_CHANNEL_DOMAIN_KOREA_NO_DFS */
@@ -118,6 +119,7 @@ static struct rt_channel_plan_map RTW_ChannelPlanMap[RT_CHANNEL_DOMAIN_MAX] = {
{0x00}, /* 0x1C, */
{0x00}, /* 0x1D, */
{0x00}, /* 0x1E, */
+ {0x00}, /* 0x1F, */
/* 0x20 ~ 0x7F , New Define ===== */
{0x00}, /* 0x20, RT_CHANNEL_DOMAIN_WORLD_NULL */
{0x01}, /* 0x21, RT_CHANNEL_DOMAIN_ETSI1_NULL */
--
2.33.1
Good Day,
My name is Luis Fernandez.I am contacting you because we have
investors that have the capacity to invest in any massive project
in your country or invest in your existing project that requires
funding.
Kindly get back to me for more details.
Best Regards
Luis Fernandez
Since ARMv8.0 the upper 32 bits of ESR_ELx have been RES0, and recently
some of the upper bits gained a meaning and can be non-zero. For
example, when FEAT_LS64 is implemented, ESR_ELx[36:32] contain ISS2,
which for an ST64BV or ST64BV0 can be non-zero. This can be seen in ARM
DDI 0487G.b, page D13-3145, section D13.2.37.
Generally, we must not rely on RES0 bit remaining zero in future, and
when extracting ESR_ELx.EC we must mask out all other bits.
All C code uses the ESR_ELx_EC() macro, which masks out the irrelevant
bits, and therefore no alterations are required to C code to avoid
consuming irrelevant bits.
In a couple of places the KVM assembly extracts ESR_ELx.EC using LSR on
an X register, and so could in theory consume previously RES0 bits. In
both cases this is for comparison with EC values ESR_ELx_EC_HVC32 and
ESR_ELx_EC_HVC64, for which the upper bits of ESR_ELx must currently be
zero, but this could change in future.
This patch adjusts the KVM vectors to use UBFX rather than LSR to
extract ESR_ELx.EC, ensuring these are robust to future additions to
ESR_ELx.
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Alexandru Elisei <alexandru.elisei(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: James Morse <james.morse(a)arm.com>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
---
arch/arm64/include/asm/esr.h | 1 +
arch/arm64/kvm/hyp/hyp-entry.S | 2 +-
arch/arm64/kvm/hyp/nvhe/host.S | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index a305ce256090..d52a0b269ee8 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -68,6 +68,7 @@
#define ESR_ELx_EC_MAX (0x3F)
#define ESR_ELx_EC_SHIFT (26)
+#define ESR_ELx_EC_WIDTH (6)
#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT)
#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 9aa9b73475c9..b6b6801d96d5 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -44,7 +44,7 @@
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
- lsr x0, x0, #ESR_ELx_EC_SHIFT
+ ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH
cmp x0, #ESR_ELx_EC_HVC64
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 4b652ffb591d..d310d2b2c8b4 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -115,7 +115,7 @@ SYM_FUNC_END(__hyp_do_panic)
.L__vect_start\@:
stp x0, x1, [sp, #-16]!
mrs x0, esr_el2
- lsr x0, x0, #ESR_ELx_EC_SHIFT
+ ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH
cmp x0, #ESR_ELx_EC_HVC64
b.ne __host_exit
--
2.11.0
Commit 761ed4a94582 ("tty: serial_core: convert uart_close to use
tty_port_close") converted serial core to use tty_port_close() but
failed to notice that the transmit buffer still needs to be freed on
final close.
Not freeing the transmit buffer means that the buffer is no longer
cleared on next open so that any ioctl() waiting for the buffer to drain
might wait indefinitely (e.g. on termios changes) or that stale data can
end up being transmitted in case tx is restarted.
Furthermore, the buffer of any port that has been opened would leak on
driver unbind.
Note that the port lock is held when clearing the buffer pointer due to
the ldisc race worked around by commit a5ba1d95e46e ("uart: fix race
between uart_put_char() and uart_shutdown()").
Also note that the tty-port shutdown() callback is not called for
console ports so it is not strictly necessary to free the buffer page
after releasing the lock (cf. d72402145ace ("tty/serial: do not free
trasnmit buffer page under port lock")).
Reported-by: Baruch Siach <baruch(a)tkos.co.il>
Link: https://lore.kernel.org/r/319321886d97c456203d5c6a576a5480d07c3478.16357816…
Fixes: 761ed4a94582 ("tty: serial_core: convert uart_close to use tty_port_close")
Cc: stable(a)vger.kernel.org # 4.9
Cc: Rob Herring <robh(a)kernel.org>
Signed-off-by: Johan Hovold <johan(a)kernel.org>
---
drivers/tty/serial/serial_core.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 0e2e35ab64c7..58834698739c 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1542,6 +1542,7 @@ static void uart_tty_port_shutdown(struct tty_port *port)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport = uart_port_check(state);
+ char *buf;
/*
* At this point, we stop accepting input. To do this, we
@@ -1563,8 +1564,18 @@ static void uart_tty_port_shutdown(struct tty_port *port)
*/
tty_port_set_suspended(port, 0);
- uart_change_pm(state, UART_PM_STATE_OFF);
+ /*
+ * Free the transmit buffer.
+ */
+ spin_lock_irq(&uport->lock);
+ buf = state->xmit.buf;
+ state->xmit.buf = NULL;
+ spin_unlock_irq(&uport->lock);
+ if (buf)
+ free_page((unsigned long)buf);
+
+ uart_change_pm(state, UART_PM_STATE_OFF);
}
static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
--
2.32.0
Dear Linux stable folks,
Am 28.10.21 um 16:21 schrieb Alex Deucher:
> The DMA mask on SI parts is 40 bits not 44. Copy
> paste typo.
>
> Fixes: 244511f386ccb9 ("drm/amdgpu: simplify and cleanup setting the dma mask")
> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1762
> Acked-by: Christian König <christian.koenig(a)amd.com>
> Tested-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
> Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
This is commit 403475be6d8b122c3e6b8a47e075926d7299e5ef in Linus’ master
branch. Could you please apply it to the stable series (5.4+)?
Kind regards,
Paul
commit c7cb42e94473aafe553c0f2a3d8ca904599399ed upstream.
When handling THP hwpoison checked if the THP is in allocation or free
stage since hwpoison may mistreat it as hugetlb page. After commit
415c64c1453a ("mm/memory-failure: split thp earlier in memory error
handling") the problem has been fixed, so this check is no longer
needed. Remove it. The side effect of the removal is hwpoison may
report unsplit THP instead of unknown error for shmem THP. It seems not
like a big deal.
The following patch "mm: filemap: check if THP has hwpoisoned subpage
for PMD page fault" depends on this, which fixes shmem THP with
hwpoisoned subpage(s) are mapped PMD wrongly. So this patch needs to be
backported to -stable as well.
Link: https://lkml.kernel.org/r/20211020210755.23964-2-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Suggested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
---
mm/memory-failure.c | 14 --------------
1 file changed, 14 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 01445ddff58d..bd2cd4dd59b6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -956,20 +956,6 @@ static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
- if (!PageHuge(head) && PageTransHuge(head)) {
- /*
- * Non anonymous thp exists only in allocation/free time. We
- * can't handle such a case correctly, so let's give it up.
- * This should be better than triggering BUG_ON when kernel
- * tries to touch the "partially handled" page.
- */
- if (!PageAnon(head)) {
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- page_to_pfn(page));
- return 0;
- }
- }
-
if (get_page_unless_zero(head)) {
if (head == compound_head(page))
return 1;
--
2.26.2
From: Mike Marciniszyn <mike.marciniszyn(a)cornelisnetworks.com>
This series ports upstream commit:
d39bf40e55e6 ("IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt fields")
Gustavo A. R. Silva (1):
IB/qib: Use struct_size() helper
Mike Marciniszyn (1):
IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt
fields
drivers/infiniband/hw/qib/qib_user_sdma.c | 35 ++++++++++++++++++++++---------
1 file changed, 25 insertions(+), 10 deletions(-)
--
Changes from v1:
Correct signed off for Mike Marciniszyn
Hi Greg and Sasha,
Please apply commit 3d5e7a28b1ea ("KVM: x86: avoid warning with
-Wbitwise-instead-of-logical") to 5.10, 5.14, and 5.15, where it
resolves a build error with tip of tree clang due to -Werror:
arch/x86/kvm/mmu/mmu.c:3548:15: error: use of bitwise '|' with boolean operands [-Werror,-Wbitwise-instead-of-logical]
reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) |
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
||
arch/x86/kvm/mmu/mmu.c:3548:15: note: cast one or both operands to int to silence this warning
1 error generated.
It applies cleanly to 5.14 and 5.15 and I have attached a backport for
5.10. I have added Paolo in case he has any objections to this.
Cheers,
Nathan
Good Day,
My name is Luis Fernandez.I am contacting you because we have
investors that have the capacity to invest in any massive project
in your country or invest in your existing project that requires
funding.
Kindly get back to me for more details.
Best Regards
Luis Fernandez
2021-11-08 0:22 GMT+09:00, Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>:
> All the error handling paths of 'smb2_sess_setup()' end to 'out_err'.
>
> All but the new error handling path added by the commit given in the Fixes
> tag below.
>
> Fix this error handling path and branch to 'out_err' as well.
>
> Fixes: 0d994cd482ee ("ksmbd: add buffer validation in session setup")
Cc: stable(a)vger.kernel.org # v5.15
> Signed-off-by: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Acked-by: Namjae Jeon <linkinjeon(a)kernel.org>
Thanks!
Hi Greg,
could you cherry-pick the following three commits to 5.14.y tree?
The Cc-to-stable was missing on those, although they should have been
picked up.
cbea6e5a7772b7a5b80baa8f98fd77853487fd2a
ALSA: pcm: Check mmap capability of runtime dma buffer at first
0899a7a23047f106c06888769d6cd6ff43d7395f
ALSA: pci: rme: Set up buffer type properly
4d9e9153f1c64d91a125c6967bc0bfb0bb653ea0
ALSA: pci: cs46xx: Fix set up buffer type properly
They are needed only for 5.14.y, not for older versions.
The relevant bug report is found at:
https://bugzilla.kernel.org/show_bug.cgi?id=214947
Thanks!
Takashi
The function can_rx_offload_threaded_irq_finish() is needed to trigger
the NAPI thread to deliver read CAN frames to the networking stack.
This patch adds the missing call to can_rx_offload_threaded_irq_finish()
in case of a bus off, before leaving the interrupt handler to avoid
packet starvation.
Link: https://lore.kernel.org/all/20211106201526.44292-1-mkl@pengutronix.de
Fixes: 30bfec4fec59 ("can: rx-offload: can_rx_offload_threaded_irq_finish(): add new function to be called from threaded interrupt")
Cc: stable(a)vger.kernel.org
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 673861ab665a..212fcd1554e4 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -2290,8 +2290,10 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id)
* check will fail, too. So leave IRQ handler
* directly.
*/
- if (priv->can.state == CAN_STATE_BUS_OFF)
+ if (priv->can.state == CAN_STATE_BUS_OFF) {
+ can_rx_offload_threaded_irq_finish(&priv->offload);
return IRQ_HANDLED;
+ }
}
handled = IRQ_HANDLED;
--
2.33.0
From: Zhang Changzhong <zhangchangzhong(a)huawei.com>
The TP.CM_BAM message must be sent to the global address [1], so add a
check to drop TP.CM_BAM sent to a non-global address.
Without this patch, the receiver will treat the following packets as
normal RTS/CTS transport:
18EC0102#20090002FF002301
18EB0102#0100000000000000
18EB0102#020000FFFFFFFFFF
[1] SAE-J1939-82 2015 A.3.3 Row 1.
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol")
Link: https://lore.kernel.org/all/1635431907-15617-4-git-send-email-zhangchangzho…
Cc: stable(a)vger.kernel.org
Signed-off-by: Zhang Changzhong <zhangchangzhong(a)huawei.com>
Acked-by: Oleksij Rempel <o.rempel(a)pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
net/can/j1939/transport.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 05eb3d059e17..a271688780a2 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -2023,6 +2023,11 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb)
extd = J1939_ETP;
fallthrough;
case J1939_TP_CMD_BAM:
+ if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) {
+ netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
fallthrough;
case J1939_TP_CMD_RTS:
if (skcb->addr.type != extd)
--
2.33.0
The patch titled
Subject: shm: extend forced shm destroy to support objects from several IPC nses
has been added to the -mm tree. Its filename is
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/shm-extend-forced-shm-destroy-to-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/shm-extend-forced-shm-destroy-to-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Subject: shm: extend forced shm destroy to support objects from several IPC nses
Currently, exit_shm function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).
That particular patch is attempt to fix the problem by extending exit_shm
mechanism to handle shm's destroy from several IPC ns'es.
To achieve that we do several things:
1. add namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
initialize this pointer by current task IPC ns
3. exit_shm() fully reworked such that it traverses over all shp's in
task->sysvshm.shm_clist and gets IPC namespace not from current task as
it was before but from shp's object itself, then call shm_destroy(shp,
ns).
Note. We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we using
special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter
only if IPC ns not in the "state of destruction".
Q/A
Q: Why we can access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther than IPC namespace
lifetime, so, if we get shp object from the task->sysvshm.shm_clist
while holding task_lock(task) nobody can steal our namespace.
Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: Not. It's just fixes non-covered case when process may leave IPC
namespace without getting task->sysvshm.shm_clist list cleaned up.
Link: https://lkml.kernel.org/r/20211027224348.611025-3-alexander.mikhalitsyn@vir…
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Manfred Spraul <manfred(a)colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/ipc_namespace.h | 15 ++
include/linux/sched/task.h | 2
include/linux/shm.h | 2
ipc/shm.c | 170 +++++++++++++++++++++++---------
4 files changed, 142 insertions(+), 47 deletions(-)
--- a/include/linux/ipc_namespace.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/ipc_namespace.h
@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_
return ns;
}
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ if (ns) {
+ if (refcount_inc_not_zero(&ns->ns.count))
+ return ns;
+ }
+
+ return NULL;
+}
+
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_
{
return ns;
}
+
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ return ns;
+}
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
--- a/include/linux/sched/task.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/sched/task.h
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_sta
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
--- a/include/linux/shm.h~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/include/linux/shm.h
@@ -11,7 +11,7 @@ struct file;
#ifdef CONFIG_SYSVIPC
struct sysv_shm {
- struct list_head shm_clist;
+ struct list_head shm_clist;
};
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
--- a/ipc/shm.c~shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses
+++ a/ipc/shm.c
@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke
struct pid *shm_lprid;
struct ucounts *mlock_ucounts;
- /* The task created the shm object. NULL if the task is dead. */
+ /*
+ * The task created the shm object, for looking up
+ * task->sysvshm.shm_clist_lock
+ */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /*
+ * list by creator. shm_clist_lock required for read/write
+ * if list_empty(), then the creator is dead already
+ */
+ struct list_head shm_clist;
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ WARN_ON(ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,36 @@ static void shm_rcu_free(struct rcu_head
kfree(shp);
}
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+/*
+ * It has to be called with shp locked.
+ * It must be called before ipc_rmid()
+ */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
{
- list_del(&s->shm_clist);
- ipc_rmid(&shm_ids(ns), &s->shm_perm);
+ struct task_struct *creator;
+
+ /*
+ * A concurrent exit_shm may do a list_del_init() as well.
+ * Just do nothing if exit_shm already did the work
+ */
+ if (list_empty(&shp->shm_clist))
+ return;
+
+ /*
+ * shp->shm_creator is guaranteed to be valid *only*
+ * if shp->shm_clist is not empty.
+ */
+ creator = shp->shm_creator;
+
+ task_lock(creator);
+ list_del_init(&shp->shm_clist);
+ task_unlock(creator);
+}
+
+static inline void shm_rmid(struct shmid_kernel *s)
+{
+ shm_clist_rm(s);
+ ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}
@@ -283,7 +319,7 @@ static void shm_destroy(struct ipc_names
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
- shm_rmid(ns, shp);
+ shm_rmid(shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
@@ -306,10 +342,10 @@ static void shm_destroy(struct ipc_names
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -340,7 +376,7 @@ static void shm_close(struct vm_area_str
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +397,10 @@ static int shm_try_destroy_orphaned(int
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -382,48 +418,87 @@ void shm_destroy_orphaned(struct ipc_nam
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
- struct shmid_kernel *shp, *n;
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist))
- return;
+ task_lock(task);
+
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_unlock(task);
+ break;
+ }
+
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
+
+ /* 1) unlink */
+ list_del_init(&shp->shm_clist);
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
/*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
+ * 2) Get pointer to the ipc namespace. It is worth to say
+ * that this pointer is guaranteed to be valid because
+ * shp lifetime is always shorter than namespace lifetime
+ * in which shp lives.
+ * We taken task_lock it means that shp won't be freed.
*/
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ ns = shp->ns;
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- shp->shm_creator = NULL;
+ /*
+ * 3) If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced) {
+ task_unlock(task);
+ continue;
+ }
- if (shm_may_destroy(ns, shp)) {
+ /*
+ * 4) get a reference to the namespace.
+ * The refcount could be already 0. If it is 0, then
+ * the shm objects will be free by free_ipc_work().
+ */
+ ns = get_ipc_ns_not_zero(ns);
+ if (ns) {
+ /*
+ * 5) get a reference to the shp itself.
+ * This cannot fail: shm_clist_rm() is called before
+ * ipc_rmid(), thus the refcount cannot be 0.
+ */
+ WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
+ }
+
+ task_unlock(task);
+
+ if (ns) {
+ down_write(&shm_ids(ns).rwsem);
shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ /*
+ * rcu_read_lock was implicitly taken in
+ * shm_lock_by_ptr, it's safe to call
+ * ipc_rcu_putref here
+ */
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp from namespace
+ * idr/kht while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
}
}
-
- /* Remove the list head from any segments still attached. */
- list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -680,7 +755,11 @@ static int newseg(struct ipc_namespace *
if (error < 0)
goto no_id;
+ shp->ns = ns;
+
+ task_lock(current);
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ task_unlock(current);
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1573,7 +1652,8 @@ out_nattch:
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
_
Patches currently in -mm which might be from alexander.mikhalitsyn(a)virtuozzo.com are
ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch
shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
This is a note to let you know that I've just added the patch titled
xhci: Fix USB 3.1 enumeration issues by increasing roothub
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From e1959faf085b004e6c3afaaaa743381f00e7c015 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Fri, 5 Nov 2021 18:00:36 +0200
Subject: xhci: Fix USB 3.1 enumeration issues by increasing roothub
power-on-good delay
Some USB 3.1 enumeration issues were reported after the hub driver removed
the minimum 100ms limit for the power-on-good delay.
Since commit 90d28fb53d4a ("usb: core: reduce power-on-good delay time of
root hub") the hub driver sets the power-on-delay based on the
bPwrOn2PwrGood value in the hub descriptor.
xhci driver has a 20ms bPwrOn2PwrGood value for both roothubs based
on xhci spec section 5.4.8, but it's clearly not enough for the
USB 3.1 devices, causing enumeration issues.
Tests indicate full 100ms delay is needed.
Reported-by: Walt Jr. Brake <mr.yming81(a)gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Fixes: 90d28fb53d4a ("usb: core: reduce power-on-good delay time of root hub")
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20211105160036.549516-1-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/host/xhci-hub.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index a3f875eea751..af946c42b6f0 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -257,7 +257,6 @@ static void xhci_common_hub_descriptor(struct xhci_hcd *xhci,
{
u16 temp;
- desc->bPwrOn2PwrGood = 10; /* xhci section 5.4.9 says 20ms max */
desc->bHubContrCurrent = 0;
desc->bNbrPorts = ports;
@@ -292,6 +291,7 @@ static void xhci_usb2_hub_descriptor(struct usb_hcd *hcd, struct xhci_hcd *xhci,
desc->bDescriptorType = USB_DT_HUB;
temp = 1 + (ports / 8);
desc->bDescLength = USB_DT_HUB_NONVAR_SIZE + 2 * temp;
+ desc->bPwrOn2PwrGood = 10; /* xhci section 5.4.8 says 20ms */
/* The Device Removable bits are reported on a byte granularity.
* If the port doesn't exist within that byte, the bit is set to 0.
@@ -344,6 +344,7 @@ static void xhci_usb3_hub_descriptor(struct usb_hcd *hcd, struct xhci_hcd *xhci,
xhci_common_hub_descriptor(xhci, desc, ports);
desc->bDescriptorType = USB_DT_SS_HUB;
desc->bDescLength = USB_DT_SS_HUB_SIZE;
+ desc->bPwrOn2PwrGood = 50; /* usb 3.1 may fail if less than 100ms */
/* header decode latency should be zero for roothubs,
* see section 4.23.5.2.
--
2.33.1
I'm announcing the release of the 5.15.1 kernel.
All users of the 5.15 kernel series must upgrade.
The updated 5.15.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.15.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
drivers/amba/bus.c | 3
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 --------------
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 5
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 -
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 3
drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 4
drivers/media/firewire/firedtv-avc.c | 14 +-
drivers/media/firewire/firedtv-ci.c | 2
drivers/net/ethernet/sfc/ethtool_common.c | 10 -
drivers/net/wireless/ath/wcn36xx/main.c | 10 -
drivers/net/wireless/ath/wcn36xx/pmc.c | 5
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1
drivers/soc/imx/gpcv2.c | 4
drivers/usb/core/hcd.c | 29 +----
drivers/usb/host/xhci.c | 1
include/linux/usb/hcd.h | 2
sound/usb/quirks.c | 2
20 files changed, 32 insertions(+), 158 deletions(-)
Anson Jacob (1):
drm/amd/display: Revert "Directly retrain link from debugfs"
Bryan O'Donoghue (1):
Revert "wcn36xx: Disable bmps when encryption is disabled"
Christian König (1):
drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"
Dan Carpenter (1):
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Erik Ekman (1):
sfc: Fix reading non-legacy supported link modes
Greg Kroah-Hartman (3):
Revert "xhci: Set HCD flag to defer primary roothub registration"
Revert "usb: core: hcd: Add support for deferring roothub registration"
Linux 5.15.1
Lucas Stach (1):
Revert "soc: imx: gpcv2: move reset assert after requesting domain power up"
Matthew Brost (1):
Revert "drm/i915/gt: Propagate change in error status to children on unhold"
Takashi Iwai (1):
ALSA: usb-audio: Add quirk for Audient iD14
Wang Kefeng (1):
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Yifan Zhang (1):
drm/amdkfd: fix boot failure when iommu is disabled in Picasso.
I'm announcing the release of the 5.14.17 kernel.
All users of the 5.14 kernel series must upgrade.
The updated 5.14.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.14.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
drivers/amba/bus.c | 3
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 --------------
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 5
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 -
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 3
drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 4
drivers/gpu/drm/i915/i915_reg.h | 8 -
drivers/gpu/drm/i915/intel_dram.c | 30 -----
drivers/media/firewire/firedtv-avc.c | 14 +-
drivers/media/firewire/firedtv-ci.c | 2
drivers/net/ethernet/sfc/ethtool_common.c | 10 -
drivers/net/vrf.c | 4
drivers/net/wireless/ath/wcn36xx/main.c | 10 -
drivers/net/wireless/ath/wcn36xx/pmc.c | 5
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1
drivers/scsi/scsi.c | 4
drivers/scsi/scsi_sysfs.c | 9 +
drivers/soc/imx/gpcv2.c | 4
drivers/usb/core/hcd.c | 29 +----
drivers/usb/host/xhci.c | 1
include/linux/usb/hcd.h | 2
sound/usb/mixer_maps.c | 8 +
25 files changed, 52 insertions(+), 199 deletions(-)
Anson Jacob (1):
drm/amd/display: Revert "Directly retrain link from debugfs"
Bryan O'Donoghue (1):
Revert "wcn36xx: Disable bmps when encryption is disabled"
Christian König (1):
drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"
Dan Carpenter (1):
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Erik Ekman (1):
sfc: Fix reading non-legacy supported link modes
Eugene Crosser (1):
vrf: Revert "Reset skb conntrack connection..."
Greg Kroah-Hartman (3):
Revert "xhci: Set HCD flag to defer primary roothub registration"
Revert "usb: core: hcd: Add support for deferring roothub registration"
Linux 5.14.17
José Roberto de Souza (1):
drm/i915: Remove memory frequency calculation
Lucas Stach (1):
Revert "soc: imx: gpcv2: move reset assert after requesting domain power up"
Matthew Brost (1):
Revert "drm/i915/gt: Propagate change in error status to children on unhold"
Ming Lei (1):
scsi: core: Put LLD module refcnt after SCSI device is released
Takashi Iwai (2):
ALSA: usb-audio: Add Schiit Hel device to mixer map quirk table
ALSA: usb-audio: Add Audient iD14 to mixer map quirk table
Wang Kefeng (1):
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Yifan Zhang (1):
drm/amdkfd: fix boot failure when iommu is disabled in Picasso.
I'm announcing the release of the 5.4.158 kernel.
All users of the 5.4 kernel series must upgrade.
The updated 5.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 -
drivers/amba/bus.c | 3 --
drivers/gpu/drm/ttm/ttm_bo_util.c | 1
drivers/media/firewire/firedtv-avc.c | 14 +++++++++---
drivers/media/firewire/firedtv-ci.c | 2 +
drivers/net/ethernet/microchip/lan743x_main.c | 10 +++++---
drivers/net/ethernet/sfc/ethtool.c | 10 +-------
drivers/net/vrf.c | 4 ---
drivers/scsi/scsi.c | 4 ++-
drivers/scsi/scsi_sysfs.c | 9 ++++++++
drivers/usb/core/hcd.c | 29 +++++---------------------
drivers/usb/host/xhci.c | 1
include/linux/usb/hcd.h | 2 -
13 files changed, 40 insertions(+), 51 deletions(-)
Dan Carpenter (1):
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Erik Ekman (1):
sfc: Fix reading non-legacy supported link modes
Eugene Crosser (1):
vrf: Revert "Reset skb conntrack connection..."
Greg Kroah-Hartman (4):
Revert "xhci: Set HCD flag to defer primary roothub registration"
Revert "usb: core: hcd: Add support for deferring roothub registration"
Revert "drm/ttm: fix memleak in ttm_transfered_destroy"
Linux 5.4.158
Ming Lei (1):
scsi: core: Put LLD module refcnt after SCSI device is released
Wang Kefeng (1):
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Yuiko Oshino (1):
net: ethernet: microchip: lan743x: Fix skb allocation failure
I'm announcing the release of the 4.19.216 kernel.
All users of the 4.19 kernel series must upgrade.
The updated 4.19.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.19.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 -
arch/arc/include/asm/pgtable.h | 2 +
arch/arm/include/asm/pgtable-2level.h | 2 +
arch/arm/include/asm/pgtable-3level.h | 2 +
arch/mips/include/asm/pgtable-32.h | 3 ++
arch/powerpc/include/asm/pte-common.h | 2 +
arch/riscv/include/asm/pgtable-32.h | 2 +
drivers/amba/bus.c | 3 --
drivers/infiniband/hw/qib/qib_user_sdma.c | 34 +++++++++++++++++++++---------
drivers/media/firewire/firedtv-avc.c | 14 +++++++++---
drivers/media/firewire/firedtv-ci.c | 2 +
drivers/net/ethernet/sfc/ethtool.c | 10 +-------
drivers/scsi/scsi.c | 4 ++-
drivers/scsi/scsi_sysfs.c | 9 +++++++
include/asm-generic/pgtable.h | 13 +++++++++++
15 files changed, 78 insertions(+), 26 deletions(-)
Arnd Bergmann (1):
arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed
Dan Carpenter (1):
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Erik Ekman (1):
sfc: Fix reading non-legacy supported link modes
Greg Kroah-Hartman (1):
Linux 4.19.216
Gustavo A. R. Silva (1):
IB/qib: Use struct_size() helper
Mike Marciniszyn (1):
IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt fields
Ming Lei (1):
scsi: core: Put LLD module refcnt after SCSI device is released
Wang Kefeng (1):
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
This is the start of the stable review cycle for the 5.14.17 release.
There are 16 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 14:11:51 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.14.17-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.14.17-rc1
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Audient iD14 to mixer map quirk table
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Schiit Hel device to mixer map quirk table
Matthew Brost <matthew.brost(a)intel.com>
Revert "drm/i915/gt: Propagate change in error status to children on unhold"
Anson Jacob <Anson.Jacob(a)amd.com>
drm/amd/display: Revert "Directly retrain link from debugfs"
Christian König <christian.koenig(a)amd.com>
drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
Revert "wcn36xx: Disable bmps when encryption is disabled"
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Lucas Stach <l.stach(a)pengutronix.de>
Revert "soc: imx: gpcv2: move reset assert after requesting domain power up"
José Roberto de Souza <jose.souza(a)intel.com>
drm/i915: Remove memory frequency calculation
Yifan Zhang <yifan1.zhang(a)amd.com>
drm/amdkfd: fix boot failure when iommu is disabled in Picasso.
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "usb: core: hcd: Add support for deferring roothub registration"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "xhci: Set HCD flag to defer primary roothub registration"
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Eugene Crosser <crosser(a)average.org>
vrf: Revert "Reset skb conntrack connection..."
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
Ming Lei <ming.lei(a)redhat.com>
scsi: core: Put LLD module refcnt after SCSI device is released
-------------
Diffstat:
Makefile | 4 +-
drivers/amba/bus.c | 3 -
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 ----------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 5 --
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ---
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +
.../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 3 +-
.../gpu/drm/i915/gt/intel_execlists_submission.c | 4 --
drivers/gpu/drm/i915/i915_reg.h | 8 ---
drivers/gpu/drm/i915/intel_dram.c | 30 +-------
drivers/media/firewire/firedtv-avc.c | 14 +++-
drivers/media/firewire/firedtv-ci.c | 2 +
drivers/net/ethernet/sfc/ethtool_common.c | 10 +--
drivers/net/vrf.c | 4 --
drivers/net/wireless/ath/wcn36xx/main.c | 10 ---
drivers/net/wireless/ath/wcn36xx/pmc.c | 5 +-
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 -
drivers/scsi/scsi.c | 4 +-
drivers/scsi/scsi_sysfs.c | 9 +++
drivers/soc/imx/gpcv2.c | 4 +-
drivers/usb/core/hcd.c | 29 ++------
drivers/usb/host/xhci.c | 1 -
include/linux/usb/hcd.h | 2 -
sound/usb/mixer_maps.c | 8 +++
25 files changed, 53 insertions(+), 200 deletions(-)
When run below command to remove ethernet driver on
stratix10 platform, there will be warning trace as below:
$ cd /sys/class/net/etha01/device/driver
$ echo ff800000.ethernet > unbind
WARNING: CPU: 3 PID: 386 at drivers/clk/clk.c:810 clk_core_unprepare+0x114/0x274
Modules linked in: sch_fq_codel
CPU: 3 PID: 386 Comm: sh Tainted: G W 5.10.74-yocto-standard #1
Hardware name: SoCFPGA Stratix 10 SoCDK (DT)
pstate: 00000005 (nzcv daif -PAN -UAO -TCO BTYPE=--)
pc : clk_core_unprepare+0x114/0x274
lr : clk_core_unprepare+0x114/0x274
sp : ffff800011bdbb10
clk_core_unprepare+0x114/0x274
clk_unprepare+0x38/0x50
stmmac_remove_config_dt+0x40/0x80
stmmac_pltfr_remove+0x64/0x80
platform_drv_remove+0x38/0x60
... ..
el0_sync_handler+0x1a4/0x1b0
el0_sync+0x180/0x1c0
This issue is introduced by introducing upstream commit 8f269102baf7
("net: stmmac: disable clocks in stmmac_remove_config_dt()")
Because clock has been disabled in function stmmac_dvr_remove()
It not reasonable the remove clock disable action from function
stmmac_remove_config_dt(), because it is mainly used in probe failed,
and other platform drivers also use this common function. So, remove
stmmac_remove_config_dt() from stmmac_pltfr_remove(), only other
necessary code.
Fixes: 1af3a8e91f1a ("net: stmmac: disable clocks in stmmac_remove_config_dt()")
Signed-off-by: Meng Li <Meng.Li(a)windriver.com>
---
Some extra comments as below:
1. This patch is only for linux-stable kernel v5.10, so the fixed commit ID is the one
in linux-stable kernel, not the one in mainline upsteam kernel.
2. I created a patch only to fix the linux-stable kernel v5.10, not submit it to upstream kernel.
The reason as below:
In fact, upstream kernel doesn't have this issue any more. Because it has a patch to improve
the clock management and other 4 patches to fix the 1st patch. Detial patches as below:
5ec55823438e("net: stmmac: add clocks management for gmac driver")
30f347ae7cc1("net: stmmac: fix missing unlock on error in stmmac_suspend()")
b3dcb3127786("net: stmmac: correct clocks enabled in stmmac_vlan_rx_kill_vid()")
4691ffb18ac9("net: stmmac: fix system hang if change mac address after interface ifdown")
ab00f3e051e8("net: stmmac: fix issue where clk is being unprepared twice")
But I think it is a little complex to backport all the 5 patches. Moreover, it may be related
with other patches and code context mofification.
Therefore, I create a simple and clear patch to only this issue on linux-stable kernel, v 5.10
---
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 53be8fc1d125..0fb702ce2408 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -706,7 +706,8 @@ int stmmac_pltfr_remove(struct platform_device *pdev)
if (plat->exit)
plat->exit(pdev, plat->bsp_priv);
- stmmac_remove_config_dt(pdev, plat);
+ of_node_put(plat->phy_node);
+ of_node_put(plat->mdio_node);
return ret;
}
--
2.17.1
This is the start of the stable review cycle for the 5.10.78 release.
There are 14 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 17:01:02 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.78-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.10.78-rc2
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Audient iD14 to mixer map quirk table
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Schiit Hel device to mixer map quirk table
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
Revert "wcn36xx: Disable bmps when encryption is disabled"
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "drm/ttm: fix memleak in ttm_transfered_destroy"
Yang Shi <shy828301(a)gmail.com>
mm: khugepaged: skip huge page collapse for special files
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "usb: core: hcd: Add support for deferring roothub registration"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "xhci: Set HCD flag to defer primary roothub registration"
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Yuiko Oshino <yuiko.oshino(a)microchip.com>
net: ethernet: microchip: lan743x: Fix skb allocation failure
Eugene Crosser <crosser(a)average.org>
vrf: Revert "Reset skb conntrack connection..."
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
Lee Jones <lee.jones(a)linaro.org>
Revert "io_uring: reinforce cancel on flush during exit"
Ming Lei <ming.lei(a)redhat.com>
scsi: core: Put LLD module refcnt after SCSI device is released
-------------
Diffstat:
Makefile | 4 ++--
drivers/amba/bus.c | 3 ---
drivers/gpu/drm/ttm/ttm_bo_util.c | 1 -
drivers/media/firewire/firedtv-avc.c | 14 ++++++++++---
drivers/media/firewire/firedtv-ci.c | 2 ++
drivers/net/ethernet/microchip/lan743x_main.c | 10 +++++----
drivers/net/ethernet/sfc/ethtool_common.c | 10 ++-------
drivers/net/vrf.c | 4 ----
drivers/net/wireless/ath/wcn36xx/main.c | 10 ---------
drivers/net/wireless/ath/wcn36xx/pmc.c | 5 +----
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 -
drivers/scsi/scsi.c | 4 +++-
drivers/scsi/scsi_sysfs.c | 9 +++++++++
drivers/usb/core/hcd.c | 29 ++++++---------------------
drivers/usb/host/xhci.c | 1 -
fs/io_uring.c | 3 ++-
include/linux/usb/hcd.h | 2 --
mm/khugepaged.c | 17 +++++++++-------
sound/usb/mixer_maps.c | 8 ++++++++
19 files changed, 62 insertions(+), 75 deletions(-)
From: Rongwei Wang <rongwei.wang(a)linux.alibaba.com>
Subject: mm, thp: fix incorrect unmap behavior for private pages
When truncating pagecache on file THP, the private pages of a process
should not be unmapped mapping. This incorrect behavior on a dynamic
shared libraries which will cause related processes to happen core dump.
A simple test for a DSO (Prerequisite is the DSO mapped in file THP):
int main(int argc, char *argv[])
{
int fd;
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
}
close(fd);
return 0;
}
The test only to open a target DSO, and do nothing. But this operation
will lead one or more process to happen core dump. This patch mainly to
fix this bug.
Link: https://lkml.kernel.org/r/20211025092134.18562-3-rongwei.wang@linux.alibaba…
Fixes: eb6ecbed0aa2 ("mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs")
Signed-off-by: Rongwei Wang <rongwei.wang(a)linux.alibaba.com>
Tested-by: Xu Yu <xuyu(a)linux.alibaba.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Song Liu <song(a)kernel.org>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Collin Fijalkovich <cfijalkovich(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/open.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
--- a/fs/open.c~mm-thp-fix-incorrect-unmap-behavior-for-private-pages
+++ a/fs/open.c
@@ -857,8 +857,17 @@ static int do_dentry_open(struct file *f
*/
smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
+ struct address_space *mapping = inode->i_mapping;
+
filemap_invalidate_lock(inode->i_mapping);
- truncate_pagecache(inode, 0);
+ /*
+ * unmap_mapping_range just need to be called once
+ * here, because the private pages is not need to be
+ * unmapped mapping (e.g. data segment of dynamic
+ * shared libraries here).
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
filemap_invalidate_unlock(inode->i_mapping);
}
}
_
From: Vasily Averin <vvs(a)virtuozzo.com>
Subject: memcg: prohibit unconditional exceeding the limit of dying tasks
Memory cgroup charging allows killed or exiting tasks to exceed the hard
limit. It is assumed that the amount of the memory charged by those tasks
is bound and most of the memory will get released while the task is
exiting. This is resembling a heuristic for the global OOM situation when
tasks get access to memory reserves. There is no global memory shortage
at the memcg level so the memcg heuristic is more relieved.
The above assumption is overly optimistic though. E.g. vmalloc can scale
to really large requests and the heuristic would allow that. We used to
have an early break in the vmalloc allocator for killed tasks but this has
been reverted by commit b8c8a338f75e ("Revert "vmalloc: back off when the
current task is killed""). There are likely other similar code paths
which do not check for fatal signals in an allocation&charge loop. Also
there are some kernel objects charged to a memcg which are not bound to a
process life time.
It has been observed that it is not really hard to trigger these bypasses
and cause global OOM situation.
One potential way to address these runaways would be to limit the amount
of excess (similar to the global OOM with limited oom reserves). This is
certainly possible but it is not really clear how much of an excess is
desirable and still protects from global OOMs as that would have to
consider the overall memcg configuration.
This patch is addressing the problem by removing the heuristic altogether.
Bypass is only allowed for requests which either cannot fail or where the
failure is not desirable while excess should be still limited (e.g.
atomic requests). Implementation wise a killed or dying task fails to
charge if it has passed the OOM killer stage. That should give all forms
of reclaim chance to restore the limit before the failure (ENOMEM) and
tell the caller to back off.
In addition, this patch renames should_force_charge() helper to
task_is_dying() because now its use is not associated witch forced
charging.
This patch depends on pagefault_out_of_memory() to not trigger
out_of_memory(), because then a memcg failure can unwind to VM_FAULT_OOM
and cause a global OOM killer.
Link: https://lkml.kernel.org/r/8f5cebbb-06da-4902-91f0-6566fc4b4203@virtuozzo.com
Signed-off-by: Vasily Averin <vvs(a)virtuozzo.com>
Suggested-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: Uladzislau Rezki <urezki(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 27 ++++++++-------------------
1 file changed, 8 insertions(+), 19 deletions(-)
--- a/mm/memcontrol.c~memcg-prohibit-unconditional-exceeding-the-limit-of-dying-tasks
+++ a/mm/memcontrol.c
@@ -234,7 +234,7 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
@@ -1624,7 +1624,7 @@ static bool mem_cgroup_out_of_memory(str
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -2579,6 +2579,7 @@ static int try_charge_memcg(struct mem_c
struct page_counter *counter;
enum oom_status oom_status;
unsigned long nr_reclaimed;
+ bool passed_oom = false;
bool may_swap = true;
bool drained = false;
unsigned long pflags;
@@ -2614,15 +2615,6 @@ retry:
goto force;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2679,8 +2671,9 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (fatal_signal_pending(current))
- goto force;
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
@@ -2689,14 +2682,10 @@ retry:
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
_
From: Michal Hocko <mhocko(a)suse.com>
Subject: mm, oom: do not trigger out_of_memory from the #PF
Any allocation failure during the #PF path will return with VM_FAULT_OOM
which in turn results in pagefault_out_of_memory. This can happen for 2
different reasons. a) Memcg is out of memory and we rely on
mem_cgroup_oom_synchronize to perform the memcg OOM handling or b) normal
allocation fails.
The latter is quite problematic because allocation paths already trigger
out_of_memory and the page allocator tries really hard to not fail
allocations. Anyway, if the OOM killer has been already invoked there is
no reason to invoke it again from the #PF path. Especially when the OOM
condition might be gone by that time and we have no way to find out other
than allocate.
Moreover if the allocation failed and the OOM killer hasn't been invoked
then we are unlikely to do the right thing from the #PF context because we
have already lost the allocation context and restictions and therefore
might oom kill a task from a different NUMA domain.
This all suggests that there is no legitimate reason to trigger
out_of_memory from pagefault_out_of_memory so drop it. Just to be sure
that no #PF path returns with VM_FAULT_OOM without allocation print a
warning that this is happening before we restart the #PF.
[VvS: #PF allocation can hit into limit of cgroup v1 kmem controller.
This is a local problem related to memcg, however, it causes unnecessary
global OOM kills that are repeated over and over again and escalate into a
real disaster. This has been broken since kmem accounting has been
introduced for cgroup v1 (3.8). There was no kmem specific reclaim for
the separate limit so the only way to handle kmem hard limit was to return
with ENOMEM. In upstream the problem will be fixed by removing the
outdated kmem limit, however stable and LTS kernels cannot do it and are
still affected. This patch fixes the problem and should be backported
into stable/LTS.]
Link: https://lkml.kernel.org/r/f5fd8dd8-0ad4-c524-5f65-920b01972a42@virtuozzo.com
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Signed-off-by: Vasily Averin <vvs(a)virtuozzo.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Cc: Uladzislau Rezki <urezki(a)gmail.com>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/oom_kill.c | 22 ++++++++--------------
1 file changed, 8 insertions(+), 14 deletions(-)
--- a/mm/oom_kill.c~mm-oom-do-not-trigger-out_of_memory-from-the-pf
+++ a/mm/oom_kill.c
@@ -1120,19 +1120,15 @@ bool out_of_memory(struct oom_control *o
}
/*
- * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
- * killing is already in progress so do nothing.
+ * The pagefault handler calls here because some allocation has failed. We have
+ * to take care of the memcg OOM here because this is the only safe context without
+ * any locks held but let the oom killer triggered from the allocation context care
+ * about the global OOM.
*/
void pagefault_out_of_memory(void)
{
- struct oom_control oc = {
- .zonelist = NULL,
- .nodemask = NULL,
- .memcg = NULL,
- .gfp_mask = 0,
- .order = 0,
- };
+ static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (mem_cgroup_oom_synchronize(true))
return;
@@ -1140,10 +1136,8 @@ void pagefault_out_of_memory(void)
if (fatal_signal_pending(current))
return;
- if (!mutex_trylock(&oom_lock))
- return;
- out_of_memory(&oc);
- mutex_unlock(&oom_lock);
+ if (__ratelimit(&pfoom_rs))
+ pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
}
SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
_
From: Vasily Averin <vvs(a)virtuozzo.com>
Subject: mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks
Patch series "memcg: prohibit unconditional exceeding the limit of dying tasks", v3.
Memory cgroup charging allows killed or exiting tasks to exceed the hard
limit. It can be misused and allowed to trigger global OOM from inside a
memcg-limited container. On the other hand if memcg fails allocation,
called from inside #PF handler it triggers global OOM from inside
pagefault_out_of_memory().
To prevent these problems this patchset:
a) removes execution of out_of_memory() from pagefault_out_of_memory(),
becasue nobody can explain why it is necessary.
b) allow memcg to fail allocation of dying/killed tasks.
This patch (of 3):
Any allocation failure during the #PF path will return with VM_FAULT_OOM
which in turn results in pagefault_out_of_memory which in turn executes
out_out_memory() and can kill a random task.
An allocation might fail when the current task is the oom victim and there
are no memory reserves left. The OOM killer is already handled at the
page allocator level for the global OOM and at the charging level for the
memcg one. Both have much more information about the scope of
allocation/charge request. This means that either the OOM killer has been
invoked properly and didn't lead to the allocation success or it has been
skipped because it couldn't have been invoked. In both cases triggering
it from here is pointless and even harmful.
It makes much more sense to let the killed task die rather than to wake up
an eternally hungry oom-killer and send him to choose a fatter victim for
breakfast.
Link: https://lkml.kernel.org/r/0828a149-786e-7c06-b70a-52d086818ea3@virtuozzo.com
Signed-off-by: Vasily Averin <vvs(a)virtuozzo.com>
Suggested-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Cc: Uladzislau Rezki <urezki(a)gmail.com>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/oom_kill.c | 3 +++
1 file changed, 3 insertions(+)
--- a/mm/oom_kill.c~mm-oom-pagefault_out_of_memory-dont-force-global-oom-for-dying-tasks
+++ a/mm/oom_kill.c
@@ -1137,6 +1137,9 @@ void pagefault_out_of_memory(void)
if (mem_cgroup_oom_synchronize(true))
return;
+ if (fatal_signal_pending(current))
+ return;
+
if (!mutex_trylock(&oom_lock))
return;
out_of_memory(&oc);
_
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Subject: mm/filemap.c: remove bogus VM_BUG_ON
It is not safe to check page->index without holding the page lock. It can
be changed if the page is moved between the swap cache and the page cache
for a shmem file, for example. There is a VM_BUG_ON below which checks
page->index is correct after taking the page lock.
Link: https://lkml.kernel.org/r/20210818144932.940640-1-willy@infradead.org
Fixes: 5c211ba29deb ("mm: add and use find_lock_entries")
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Reported-by: <syzbot+c87be4f669d920c76330(a)syzkaller.appspotmail.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 1 -
1 file changed, 1 deletion(-)
--- a/mm/filemap.c~mm-remove-bogus-vm_bug_on
+++ a/mm/filemap.c
@@ -2093,7 +2093,6 @@ unsigned find_lock_entries(struct addres
if (!xa_is_value(page)) {
if (page->index < start)
goto put;
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
_
From: Jan Kara <jack(a)suse.cz>
Subject: ocfs2: fix data corruption on truncate
Patch series "ocfs2: Truncate data corruption fix".
As further testing has shown, commit 5314454ea3f ("ocfs2: fix data
corruption after conversion from inline format") didn't fix all the data
corruption issues the customer started observing after 6dbf7bb55598 ("fs:
Don't invalidate page buffers in block_write_full_page()") This time I
have tracked them down to two bugs in ocfs2 truncation code.
One bug (truncating page cache before clearing tail cluster and setting
i_size) could cause data corruption even before 6dbf7bb55598, but before
that commit it needed a race with page fault, after 6dbf7bb55598 it
started to be pretty deterministic.
Another bug (zeroing pages beyond old i_size) used to be harmless
inefficiency before commit 6dbf7bb55598. But after commit 6dbf7bb55598 in
combination with the first bug it resulted in deterministic data
corruption.
Although fixing only the first problem is needed to stop data corruption,
I've fixed both issues to make the code more robust.
This patch (of 2):
ocfs2_truncate_file() did unmap invalidate page cache pages before zeroing
partial tail cluster and setting i_size. Thus some pages could be left
(and likely have left if the cluster zeroing happened) in the page cache
beyond i_size after truncate finished letting user possibly see stale data
once the file was extended again. Also the tail cluster zeroing was not
guaranteed to finish before truncate finished causing possible stale data
exposure. The problem started to be particularly easy to hit after commit
6dbf7bb55598 "fs: Don't invalidate page buffers in
block_write_full_page()" stopped invalidation of pages beyond i_size from
page writeback path.
Fix these problems by unmapping and invalidating pages in the page cache
after the i_size is reduced and tail cluster is zeroed out.
Link: https://lkml.kernel.org/r/20211025150008.29002-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20211025151332.11301-1-jack@suse.cz
Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/file.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
--- a/fs/ocfs2/file.c~ocfs2-fix-data-corruption-on-truncate
+++ a/fs/ocfs2/file.c
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *in
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *in
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
_
This simply adds proper support for panel backlights that can be controlled
via VESA's backlight control protocol, but which also require that we
enable and disable the backlight via PWM instead of via the DPCD interface.
We also enable this by default, in order to fix some people's backlights
that were broken by not having this enabled.
For reference, backlights that require this and use VESA's backlight
interface tend to be laptops with hybrid GPUs, but this very well may
change in the future.
v4:
* Make sure that we call intel_backlight_level_to_pwm() in
intel_dp_aux_vesa_enable_backlight() - vsyrjala
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Link: https://gitlab.freedesktop.org/drm/intel/-/issues/3680
Fixes: fe7d52bccab6 ("drm/i915/dp: Don't use DPCD backlights that need PWM enable/disable")
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # v5.12+
---
.../drm/i915/display/intel_dp_aux_backlight.c | 27 ++++++++++++++-----
1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
index 569d17b4d00f..f05b71c01b8e 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
@@ -293,6 +293,13 @@ intel_dp_aux_vesa_enable_backlight(const struct intel_crtc_state *crtc_state,
struct intel_panel *panel = &connector->panel;
struct intel_dp *intel_dp = enc_to_intel_dp(connector->encoder);
+ if (!panel->backlight.edp.vesa.info.aux_enable) {
+ u32 pwm_level = intel_backlight_invert_pwm_level(connector,
+ panel->backlight.pwm_level_max);
+
+ panel->backlight.pwm_funcs->enable(crtc_state, conn_state, pwm_level);
+ }
+
drm_edp_backlight_enable(&intel_dp->aux, &panel->backlight.edp.vesa.info, level);
}
@@ -304,6 +311,10 @@ static void intel_dp_aux_vesa_disable_backlight(const struct drm_connector_state
struct intel_dp *intel_dp = enc_to_intel_dp(connector->encoder);
drm_edp_backlight_disable(&intel_dp->aux, &panel->backlight.edp.vesa.info);
+
+ if (!panel->backlight.edp.vesa.info.aux_enable)
+ panel->backlight.pwm_funcs->disable(old_conn_state,
+ intel_backlight_invert_pwm_level(connector, 0));
}
static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector, enum pipe pipe)
@@ -321,6 +332,15 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector,
if (ret < 0)
return ret;
+ if (!panel->backlight.edp.vesa.info.aux_enable) {
+ ret = panel->backlight.pwm_funcs->setup(connector, pipe);
+ if (ret < 0) {
+ drm_err(&i915->drm,
+ "Failed to setup PWM backlight controls for eDP backlight: %d\n",
+ ret);
+ return ret;
+ }
+ }
panel->backlight.max = panel->backlight.edp.vesa.info.max;
panel->backlight.min = 0;
if (current_mode == DP_EDP_BACKLIGHT_CONTROL_MODE_DPCD) {
@@ -340,12 +360,7 @@ intel_dp_aux_supports_vesa_backlight(struct intel_connector *connector)
struct intel_dp *intel_dp = intel_attached_dp(connector);
struct drm_i915_private *i915 = dp_to_i915(intel_dp);
- /* TODO: We currently only support AUX only backlight configurations, not backlights which
- * require a mix of PWM and AUX controls to work. In the mean time, these machines typically
- * work just fine using normal PWM controls anyway.
- */
- if ((intel_dp->edp_dpcd[1] & DP_EDP_BACKLIGHT_AUX_ENABLE_CAP) &&
- drm_edp_backlight_supported(intel_dp->edp_dpcd)) {
+ if (drm_edp_backlight_supported(intel_dp->edp_dpcd)) {
drm_dbg_kms(&i915->drm, "AUX Backlight Control Supported!\n");
return true;
}
--
2.31.1
Added updating of request frame number for elapsed frames,
otherwise frame number will remain as previous use of request.
This will allow function driver to correctly track frames in
case of Missed ISOC occurs.
Added setting request actual length to 0 for elapsed frames.
In Slave mode when pushing data to RxFIFO by dwords, request
actual length incrementing accordingly. But before whole packet
will be pushed into RxFIFO and send to host can occurs Missed
ISOC and data will not send to host. So, in this case request
actual length should be reset to 0.
Fixes: 91bb163e1e4f ("usb: dwc2: gadget: Fix ISOC flow for BDMA and Slave")
Signed-off-by: Minas Harutyunyan <Minas.Harutyunyan(a)synopsys.com>
---
drivers/usb/dwc2/gadget.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c
index 11d85a6e0b0d..2190225bf3da 100644
--- a/drivers/usb/dwc2/gadget.c
+++ b/drivers/usb/dwc2/gadget.c
@@ -1198,6 +1198,8 @@ static void dwc2_hsotg_start_req(struct dwc2_hsotg *hsotg,
}
ctrl |= DXEPCTL_CNAK;
} else {
+ hs_req->req.frame_number = hs_ep->target_frame;
+ hs_req->req.actual = 0;
dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA);
return;
}
@@ -2857,9 +2859,12 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep)
do {
hs_req = get_ep_head(hs_ep);
- if (hs_req)
+ if (hs_req) {
+ hs_req->req.frame_number = hs_ep->target_frame;
+ hs_req->req.actual = 0;
dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req,
-ENODATA);
+ }
dwc2_gadget_incr_frame_num(hs_ep);
/* Update current frame number value. */
hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg);
@@ -2912,8 +2917,11 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep)
while (dwc2_gadget_target_frame_elapsed(ep)) {
hs_req = get_ep_head(ep);
- if (hs_req)
+ if (hs_req) {
+ hs_req->req.frame_number = ep->target_frame;
+ hs_req->req.actual = 0;
dwc2_hsotg_complete_request(hsotg, ep, hs_req, -ENODATA);
+ }
dwc2_gadget_incr_frame_num(ep);
/* Update current frame number value. */
@@ -3002,8 +3010,11 @@ static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep)
while (dwc2_gadget_target_frame_elapsed(hs_ep)) {
hs_req = get_ep_head(hs_ep);
- if (hs_req)
+ if (hs_req) {
+ hs_req->req.frame_number = hs_ep->target_frame;
+ hs_req->req.actual = 0;
dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA);
+ }
dwc2_gadget_incr_frame_num(hs_ep);
/* Update current frame number value. */
base-commit: c26f1c109d21f2ea874e4a85c0c76c385b8f46cb
--
2.11.0
This is the start of the stable review cycle for the 4.19.216 release.
There are 7 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 14:11:51 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.216-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.216-rc1
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Arnd Bergmann <arnd(a)arndb.de>
arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
Mike Marciniszyn <mike.marciniszyn(a)cornelisnetworks.com>
IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt fields
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
IB/qib: Use struct_size() helper
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Ming Lei <ming.lei(a)redhat.com>
scsi: core: Put LLD module refcnt after SCSI device is released
-------------
Diffstat:
Makefile | 4 ++--
arch/arc/include/asm/pgtable.h | 2 ++
arch/arm/include/asm/pgtable-2level.h | 2 ++
arch/arm/include/asm/pgtable-3level.h | 2 ++
arch/mips/include/asm/pgtable-32.h | 3 +++
arch/powerpc/include/asm/pte-common.h | 2 ++
arch/riscv/include/asm/pgtable-32.h | 2 ++
drivers/amba/bus.c | 3 ---
drivers/infiniband/hw/qib/qib_user_sdma.c | 34 ++++++++++++++++++++++---------
drivers/media/firewire/firedtv-avc.c | 14 ++++++++++---
drivers/media/firewire/firedtv-ci.c | 2 ++
drivers/net/ethernet/sfc/ethtool.c | 10 ++-------
drivers/scsi/scsi.c | 4 +++-
drivers/scsi/scsi_sysfs.c | 9 ++++++++
include/asm-generic/pgtable.h | 13 ++++++++++++
15 files changed, 79 insertions(+), 27 deletions(-)
This is the start of the stable review cycle for the 5.4.158 release.
There are 9 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 14:11:51 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.158-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.4.158-rc1
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "drm/ttm: fix memleak in ttm_transfered_destroy"
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "usb: core: hcd: Add support for deferring roothub registration"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "xhci: Set HCD flag to defer primary roothub registration"
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Yuiko Oshino <yuiko.oshino(a)microchip.com>
net: ethernet: microchip: lan743x: Fix skb allocation failure
Eugene Crosser <crosser(a)average.org>
vrf: Revert "Reset skb conntrack connection..."
Ming Lei <ming.lei(a)redhat.com>
scsi: core: Put LLD module refcnt after SCSI device is released
-------------
Diffstat:
Makefile | 4 ++--
drivers/amba/bus.c | 3 ---
drivers/gpu/drm/ttm/ttm_bo_util.c | 1 -
drivers/media/firewire/firedtv-avc.c | 14 ++++++++++---
drivers/media/firewire/firedtv-ci.c | 2 ++
drivers/net/ethernet/microchip/lan743x_main.c | 10 +++++----
drivers/net/ethernet/sfc/ethtool.c | 10 ++-------
drivers/net/vrf.c | 4 ----
drivers/scsi/scsi.c | 4 +++-
drivers/scsi/scsi_sysfs.c | 9 +++++++++
drivers/usb/core/hcd.c | 29 ++++++---------------------
drivers/usb/host/xhci.c | 1 -
include/linux/usb/hcd.h | 2 --
13 files changed, 41 insertions(+), 52 deletions(-)
This is the start of the stable review cycle for the 5.15.1 release.
There are 12 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 14:11:51 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.15.1-rc1…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.15.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.15.1-rc1
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add quirk for Audient iD14
Matthew Brost <matthew.brost(a)intel.com>
Revert "drm/i915/gt: Propagate change in error status to children on unhold"
Anson Jacob <Anson.Jacob(a)amd.com>
drm/amd/display: Revert "Directly retrain link from debugfs"
Christian König <christian.koenig(a)amd.com>
drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
Revert "wcn36xx: Disable bmps when encryption is disabled"
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Lucas Stach <l.stach(a)pengutronix.de>
Revert "soc: imx: gpcv2: move reset assert after requesting domain power up"
Yifan Zhang <yifan1.zhang(a)amd.com>
drm/amdkfd: fix boot failure when iommu is disabled in Picasso.
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "usb: core: hcd: Add support for deferring roothub registration"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "xhci: Set HCD flag to defer primary roothub registration"
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
-------------
Diffstat:
Makefile | 4 +-
drivers/amba/bus.c | 3 -
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 ----------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 5 --
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ---
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +
.../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 3 +-
.../gpu/drm/i915/gt/intel_execlists_submission.c | 4 --
drivers/media/firewire/firedtv-avc.c | 14 +++-
drivers/media/firewire/firedtv-ci.c | 2 +
drivers/net/ethernet/sfc/ethtool_common.c | 10 +--
drivers/net/wireless/ath/wcn36xx/main.c | 10 ---
drivers/net/wireless/ath/wcn36xx/pmc.c | 5 +-
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 -
drivers/soc/imx/gpcv2.c | 4 +-
drivers/usb/core/hcd.c | 29 ++------
drivers/usb/host/xhci.c | 1 -
include/linux/usb/hcd.h | 2 -
sound/usb/quirks.c | 2 +
20 files changed, 33 insertions(+), 159 deletions(-)
This is the start of the stable review cycle for the 5.10.78 release.
There are 16 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 06 Nov 2021 14:11:51 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.78-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.10.78-rc1
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Audient iD14 to mixer map quirk table
Takashi Iwai <tiwai(a)suse.de>
ALSA: usb-audio: Add Schiit Hel device to mixer map quirk table
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
Revert "wcn36xx: Disable bmps when encryption is disabled"
Wang Kefeng <wangkefeng.wang(a)huawei.com>
ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "drm/ttm: fix memleak in ttm_transfered_destroy"
Yang Shi <shy828301(a)gmail.com>
mm: khugepaged: skip huge page collapse for special files
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "usb: core: hcd: Add support for deferring roothub registration"
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "xhci: Set HCD flag to defer primary roothub registration"
Dan Carpenter <dan.carpenter(a)oracle.com>
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
Yang Shi <shy828301(a)gmail.com>
mm: filemap: check if THP has hwpoisoned subpage for PMD page fault
Yang Shi <shy828301(a)gmail.com>
mm: hwpoison: remove the unnecessary THP check
Yuiko Oshino <yuiko.oshino(a)microchip.com>
net: ethernet: microchip: lan743x: Fix skb allocation failure
Eugene Crosser <crosser(a)average.org>
vrf: Revert "Reset skb conntrack connection..."
Erik Ekman <erik(a)kryo.se>
sfc: Fix reading non-legacy supported link modes
Lee Jones <lee.jones(a)linaro.org>
Revert "io_uring: reinforce cancel on flush during exit"
Ming Lei <ming.lei(a)redhat.com>
scsi: core: Put LLD module refcnt after SCSI device is released
-------------
Diffstat:
Makefile | 4 ++--
drivers/amba/bus.c | 3 ---
drivers/gpu/drm/ttm/ttm_bo_util.c | 1 -
drivers/media/firewire/firedtv-avc.c | 14 ++++++++++---
drivers/media/firewire/firedtv-ci.c | 2 ++
drivers/net/ethernet/microchip/lan743x_main.c | 10 +++++----
drivers/net/ethernet/sfc/ethtool_common.c | 10 ++-------
drivers/net/vrf.c | 4 ----
drivers/net/wireless/ath/wcn36xx/main.c | 10 ---------
drivers/net/wireless/ath/wcn36xx/pmc.c | 5 +----
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 -
drivers/scsi/scsi.c | 4 +++-
drivers/scsi/scsi_sysfs.c | 9 +++++++++
drivers/usb/core/hcd.c | 29 ++++++---------------------
drivers/usb/host/xhci.c | 1 -
fs/io_uring.c | 3 ++-
include/linux/page-flags.h | 23 +++++++++++++++++++++
include/linux/usb/hcd.h | 2 --
mm/huge_memory.c | 2 ++
mm/khugepaged.c | 17 +++++++++-------
mm/memory-failure.c | 28 +++++++++++++-------------
mm/memory.c | 9 +++++++++
mm/page_alloc.c | 4 +++-
sound/usb/mixer_maps.c | 8 ++++++++
24 files changed, 113 insertions(+), 90 deletions(-)
Stable Maintainers,
Please consider this patch for back-porting to v4.4y and v4.9.y
4c761daf8bb9a ("net: hso: register netdev later to avoid a race condition")
It should apply cleanly to both branches.
Kind regards,
Lee
--
Lee Jones [李琼斯]
Senior Technical Lead - Developer Services
Linaro.org │ Open source software for Arm SoCs
Follow Linaro: Facebook | Twitter | Blog
When running as PVH or HVM guest with actual memory < max memory the
hypervisor is using "populate on demand" in order to allow the guest
to balloon down from its maximum memory size. For this to work
correctly the guest must not touch more memory pages than its target
memory size as otherwise the PoD cache will be exhausted and the guest
is crashed as a result of that.
In extreme cases ballooning down might not be finished today before
the init process is started, which can consume lots of memory.
In order to avoid random boot crashes in such cases, add a late init
call to wait for ballooning down having finished for PVH/HVM guests.
Warn on console if initial ballooning fails, panic() after stalling
for more than 3 minutes per default. Add a module parameter for
changing this timeout.
Cc: <stable(a)vger.kernel.org>
Reported-by: Marek Marczykowski-Górecki <marmarek(a)invisiblethingslab.com>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
---
V2:
- add warning and panic() when stalling (Marek Marczykowski-Górecki)
- don't wait if credit > 0
V3:
- issue warning only after ballooning failed (Marek Marczykowski-Górecki)
- make panic() timeout configurable via parameter
V4:
- fix boot parameter (Boris Ostrovsky)
- set new state directly in update_schedule() (Boris Ostrovsky)
---
.../admin-guide/kernel-parameters.txt | 7 ++
drivers/xen/balloon.c | 86 ++++++++++++++-----
2 files changed, 70 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 43dc35fe5bc0..1396fd2d9031 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6349,6 +6349,13 @@
improve timer resolution at the expense of processing
more timer interrupts.
+ xen.balloon_boot_timeout= [XEN]
+ The time (in seconds) to wait before giving up to boot
+ in case initial ballooning fails to free enough memory.
+ Applies only when running as HVM or PVH guest and
+ started with less memory configured than allowed at
+ max. Default is 180.
+
xen.event_eoi_delay= [XEN]
How long to delay EOI handling in case of event
storms (jiffies). Default is 10.
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 3a50f097ed3e..3a661b7697d4 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -58,6 +58,7 @@
#include <linux/percpu-defs.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
+#include <linux/moduleparam.h>
#include <asm/page.h>
#include <asm/tlb.h>
@@ -73,6 +74,12 @@
#include <xen/page.h>
#include <xen/mem-reservation.h>
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
+
+static uint __read_mostly balloon_boot_timeout = 180;
+module_param(balloon_boot_timeout, uint, 0444);
+
static int xen_hotplug_unpopulated;
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
@@ -125,12 +132,12 @@ static struct ctl_table xen_root[] = {
* BP_ECANCELED: error, balloon operation canceled.
*/
-enum bp_state {
+static enum bp_state {
BP_DONE,
BP_WAIT,
BP_EAGAIN,
BP_ECANCELED
-};
+} balloon_state = BP_DONE;
/* Main waiting point for xen-balloon thread. */
static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq);
@@ -199,18 +206,15 @@ static struct page *balloon_next_page(struct page *page)
return list_entry(next, struct page, lru);
}
-static enum bp_state update_schedule(enum bp_state state)
+static void update_schedule(void)
{
- if (state == BP_WAIT)
- return BP_WAIT;
-
- if (state == BP_ECANCELED)
- return BP_ECANCELED;
+ if (balloon_state == BP_WAIT || balloon_state == BP_ECANCELED)
+ return;
- if (state == BP_DONE) {
+ if (balloon_state == BP_DONE) {
balloon_stats.schedule_delay = 1;
balloon_stats.retry_count = 1;
- return BP_DONE;
+ return;
}
++balloon_stats.retry_count;
@@ -219,7 +223,8 @@ static enum bp_state update_schedule(enum bp_state state)
balloon_stats.retry_count > balloon_stats.max_retry_count) {
balloon_stats.schedule_delay = 1;
balloon_stats.retry_count = 1;
- return BP_ECANCELED;
+ balloon_state = BP_ECANCELED;
+ return;
}
balloon_stats.schedule_delay <<= 1;
@@ -227,7 +232,7 @@ static enum bp_state update_schedule(enum bp_state state)
if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;
- return BP_EAGAIN;
+ balloon_state = BP_EAGAIN;
}
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
@@ -494,9 +499,9 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
* Stop waiting if either state is BP_DONE and ballooning action is
* needed, or if the credit has changed while state is not BP_DONE.
*/
-static bool balloon_thread_cond(enum bp_state state, long credit)
+static bool balloon_thread_cond(long credit)
{
- if (state == BP_DONE)
+ if (balloon_state == BP_DONE)
credit = 0;
return current_credit() != credit || kthread_should_stop();
@@ -510,13 +515,12 @@ static bool balloon_thread_cond(enum bp_state state, long credit)
*/
static int balloon_thread(void *unused)
{
- enum bp_state state = BP_DONE;
long credit;
unsigned long timeout;
set_freezable();
for (;;) {
- switch (state) {
+ switch (balloon_state) {
case BP_DONE:
case BP_ECANCELED:
timeout = 3600 * HZ;
@@ -532,7 +536,7 @@ static int balloon_thread(void *unused)
credit = current_credit();
wait_event_freezable_timeout(balloon_thread_wq,
- balloon_thread_cond(state, credit), timeout);
+ balloon_thread_cond(credit), timeout);
if (kthread_should_stop())
return 0;
@@ -543,22 +547,23 @@ static int balloon_thread(void *unused)
if (credit > 0) {
if (balloon_is_inflated())
- state = increase_reservation(credit);
+ balloon_state = increase_reservation(credit);
else
- state = reserve_additional_memory();
+ balloon_state = reserve_additional_memory();
}
if (credit < 0) {
long n_pages;
n_pages = min(-credit, si_mem_available());
- state = decrease_reservation(n_pages, GFP_BALLOON);
- if (state == BP_DONE && n_pages != -credit &&
+ balloon_state = decrease_reservation(n_pages,
+ GFP_BALLOON);
+ if (balloon_state == BP_DONE && n_pages != -credit &&
n_pages < totalreserve_pages)
- state = BP_EAGAIN;
+ balloon_state = BP_EAGAIN;
}
- state = update_schedule(state);
+ update_schedule();
mutex_unlock(&balloon_mutex);
@@ -765,3 +770,38 @@ static int __init balloon_init(void)
return 0;
}
subsys_initcall(balloon_init);
+
+static int __init balloon_wait_finish(void)
+{
+ long credit, last_credit = 0;
+ unsigned long last_changed = 0;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ /* PV guests don't need to wait. */
+ if (xen_pv_domain() || !current_credit())
+ return 0;
+
+ pr_info("Waiting for initial ballooning down having finished.\n");
+
+ while ((credit = current_credit()) < 0) {
+ if (credit != last_credit) {
+ last_changed = jiffies;
+ last_credit = credit;
+ }
+ if (balloon_state == BP_ECANCELED) {
+ pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n",
+ -credit);
+ if (jiffies - last_changed >= HZ * balloon_boot_timeout)
+ panic("Initial ballooning failed!\n");
+ }
+
+ schedule_timeout_interruptible(HZ / 10);
+ }
+
+ pr_info("Initial ballooning down finished.\n");
+
+ return 0;
+}
+late_initcall_sync(balloon_wait_finish);
--
2.26.2
There are pclusters in runtime marked with Z_EROFS_PCLUSTER_TAIL
before actual I/O submission. Thus, the submission chain can be
extended if the following pcluster chain hook such tail pcluster.
As the related comment mentioned, if some page is made of a hooked
pcluster and another followed pcluster, it can be reused for
in-place I/O (since I/O should be submitted anyway):
_______________________________________________________________
| tail (partial) page | head (partial) page |
|_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
However, it's by no means safe to reuse as pagevec since if such
PRIMARY_HOOKED pclusters finally move into bypass chain without I/O
submission. It's somewhat hard to reproduce with LZ4 and I just
found it by ro_fsstress a LZMA image for long time.
I'm going to clean up related code together with multi-page folio
adaption in the next few months. Let's address it directly for
easier backporting for now.
Call trace for reference:
z_erofs_decompress_pcluster+0x10a/0x8a0 [erofs]
z_erofs_decompress_queue.isra.36+0x3c/0x60 [erofs]
z_erofs_runqueue+0x5f3/0x840 [erofs]
z_erofs_readahead+0x1e8/0x320 [erofs]
read_pages+0x91/0x270
page_cache_ra_unbounded+0x18b/0x240
filemap_get_pages+0x10a/0x5f0
filemap_read+0xa9/0x330
new_sync_read+0x11b/0x1a0
vfs_read+0xf1/0x190
Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support")
Cc: <stable(a)vger.kernel.org> # 4.19+
Signed-off-by: Gao Xiang <xiang(a)kernel.org>
---
fs/erofs/zdata.c | 13 +++++++------
fs/erofs/zpvec.h | 13 ++++++++++---
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 11c7a1aaebad..eb51df4a9f77 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -373,8 +373,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
/* callers must be with collection lock held */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
- struct page *page,
- enum z_erofs_page_type type)
+ struct page *page, enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
int ret;
@@ -384,9 +384,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
z_erofs_try_inplace_io(clt, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type);
+ ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ pvec_safereuse);
clt->cl->vcnt += (unsigned int)ret;
-
return ret ? 0 : -EAGAIN;
}
@@ -729,7 +729,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type);
+ err = z_erofs_attach_page(clt, page, page_type,
+ clt->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
@@ -737,7 +738,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
err = z_erofs_attach_page(clt, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
}
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
index dfd7fe0503bb..b05464f4a808 100644
--- a/fs/erofs/zpvec.h
+++ b/fs/erofs/zpvec.h
@@ -106,11 +106,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
struct page *page,
- enum z_erofs_page_type type)
+ enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
- if (!ctor->next && type)
- if (ctor->index + 1 == ctor->nr)
+ if (!ctor->next) {
+ /* some pages cannot be reused as pvec safely without I/O */
+ if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
+ type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
+
+ if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ ctor->index + 1 == ctor->nr)
return false;
+ }
if (ctor->index >= ctor->nr)
z_erofs_pagevec_ctor_pagedown(ctor, false);
--
2.20.1