When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list. Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first. However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first. As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.
To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable. The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).
This is Cc: stable because (when combined with appropriate userspace
changes) it fixes a severe performance and stability problem for Qubes
OS users.
Cc: stable(a)vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
---
drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 11 deletions(-)
Changes since v2:
- use atomic_inc_return(x) and atomic_dec_return(x) instead of
atomic_add_return(1, x) and atomic_sub_return(1, x) respectively.
- move module_param macro closer to the definition of
free_per_iteration.
- add blank line between declarations and statements.
Changes since v1:
- drop setting default via Kconfig
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..f13c3b76ad1eb7110e2a2981e9fa4e504174e431 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,21 @@ static LIST_HEAD(deferred_list);
static void gnttab_handle_deferred(struct timer_list *);
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10;
+module_param(free_per_iteration, uint, 0600);
+
static void gnttab_handle_deferred(struct timer_list *unused)
{
- unsigned int nr = 10;
+ unsigned int nr = READ_ONCE(free_per_iteration);
+ const bool ignore_limit = nr == 0;
struct deferred_entry *first = NULL;
unsigned long flags;
+ size_t freed = 0;
spin_lock_irqsave(&gnttab_list_lock, flags);
- while (nr--) {
+ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
struct deferred_entry *entry
= list_first_entry(&deferred_list,
struct deferred_entry, list);
@@ -515,10 +522,14 @@ static void gnttab_handle_deferred(struct timer_list *unused)
list_del(&entry->list);
spin_unlock_irqrestore(&gnttab_list_lock, flags);
if (_gnttab_end_foreign_access_ref(entry->ref)) {
+ uint64_t ret = atomic64_dec_return(&deferred_count);
+
put_free_entry(entry->ref);
- pr_debug("freeing g.e. %#x (pfn %#lx)\n",
- entry->ref, page_to_pfn(entry->page));
+ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+ entry->ref, page_to_pfn(entry->page),
+ (unsigned long long)ret);
put_page(entry->page);
+ freed++;
kfree(entry);
entry = NULL;
} else {
@@ -530,21 +541,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
spin_lock_irqsave(&gnttab_list_lock, flags);
if (entry)
list_add_tail(&entry->list, &deferred_list);
- else if (list_empty(&deferred_list))
- break;
}
- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ if (list_empty(&deferred_list))
+ WARN_ON(atomic64_read(&deferred_count));
+ else if (!timer_pending(&deferred_timer)) {
deferred_timer.expires = jiffies + HZ;
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ pr_debug("Freed %zu references", freed);
}
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
{
struct deferred_entry *entry;
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- const char *what = KERN_WARNING "leaking";
+ uint64_t leaked, deferred;
entry = kmalloc(sizeof(*entry), gfp);
if (!page) {
@@ -567,10 +579,16 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
- what = KERN_DEBUG "deferring";
+ deferred = atomic64_inc_return(&deferred_count);
+ leaked = atomic64_read(&leaked_count);
+ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+ } else {
+ deferred = atomic64_read(&deferred_count);
+ leaked = atomic64_inc_return(&leaked_count);
+ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
}
- printk("%s g.e. %#x (pfn %#lx)\n",
- what, ref, page ? page_to_pfn(page) : -1);
}
int gnttab_try_end_foreign_access(grant_ref_t ref)
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
Patch 1: Better detection of ip6tables vs ip6tables-legacy tools for
self tests. Fix for 6.4 and newer.
Patch 2: Only generate "new listener" event if listen operation
succeeds. Fix for 6.2 and newer.
Signed-off-by: Mat Martineau <martineau(a)kernel.org>
---
Matthieu Baerts (1):
selftests: mptcp: join: only check for ip6tables if needed
Paolo Abeni (1):
mptcp: more accurate NL event generation
net/mptcp/protocol.c | 3 +--
tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +---
2 files changed, 2 insertions(+), 5 deletions(-)
---
base-commit: 284779dbf4e98753458708783af8c35630674a21
change-id: 20230725-send-net-20230725-579ecd4326e8
Best regards,
--
Mat Martineau <martineau(a)kernel.org>
The patch titled
Subject: crypto, cifs: Fix error handling in extract_iter_to_sg()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
crypto-cifs-fix-error-handling-in-extract_iter_to_sg.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: David Howells <dhowells(a)redhat.com>
Subject: crypto, cifs: Fix error handling in extract_iter_to_sg()
Date: Wed, 26 Jul 2023 11:57:56 +0100
Fix error handling in extract_iter_to_sg(). Pages need to be unpinned, not
put in extract_user_to_sg() when handling IOVEC/UBUF sources.
The bug may result in a warning like the following:
WARNING: CPU: 1 PID: 20384 at mm/gup.c:229 __lse_atomic_add arch/arm64/include/asm/atomic_lse.h:27 [inline]
WARNING: CPU: 1 PID: 20384 at mm/gup.c:229 arch_atomic_add arch/arm64/include/asm/atomic.h:28 [inline]
WARNING: CPU: 1 PID: 20384 at mm/gup.c:229 raw_atomic_add include/linux/atomic/atomic-arch-fallback.h:537 [inline]
WARNING: CPU: 1 PID: 20384 at mm/gup.c:229 atomic_add include/linux/atomic/atomic-instrumented.h:105 [inline]
WARNING: CPU: 1 PID: 20384 at mm/gup.c:229 try_grab_page+0x108/0x160 mm/gup.c:252
...
pc : try_grab_page+0x108/0x160 mm/gup.c:229
lr : follow_page_pte+0x174/0x3e4 mm/gup.c:651
...
Call trace:
__lse_atomic_add arch/arm64/include/asm/atomic_lse.h:27 [inline]
arch_atomic_add arch/arm64/include/asm/atomic.h:28 [inline]
raw_atomic_add include/linux/atomic/atomic-arch-fallback.h:537 [inline]
atomic_add include/linux/atomic/atomic-instrumented.h:105 [inline]
try_grab_page+0x108/0x160 mm/gup.c:252
follow_pmd_mask mm/gup.c:734 [inline]
follow_pud_mask mm/gup.c:765 [inline]
follow_p4d_mask mm/gup.c:782 [inline]
follow_page_mask+0x12c/0x2e4 mm/gup.c:839
__get_user_pages+0x174/0x30c mm/gup.c:1217
__get_user_pages_locked mm/gup.c:1448 [inline]
__gup_longterm_locked+0x94/0x8f4 mm/gup.c:2142
internal_get_user_pages_fast+0x970/0xb60 mm/gup.c:3140
pin_user_pages_fast+0x4c/0x60 mm/gup.c:3246
iov_iter_extract_user_pages lib/iov_iter.c:1768 [inline]
iov_iter_extract_pages+0xc8/0x54c lib/iov_iter.c:1831
extract_user_to_sg lib/scatterlist.c:1123 [inline]
extract_iter_to_sg lib/scatterlist.c:1349 [inline]
extract_iter_to_sg+0x26c/0x6fc lib/scatterlist.c:1339
hash_sendmsg+0xc0/0x43c crypto/algif_hash.c:117
sock_sendmsg_nosec net/socket.c:725 [inline]
sock_sendmsg+0x54/0x60 net/socket.c:748
____sys_sendmsg+0x270/0x2ac net/socket.c:2494
___sys_sendmsg+0x80/0xdc net/socket.c:2548
__sys_sendmsg+0x68/0xc4 net/socket.c:2577
__do_sys_sendmsg net/socket.c:2586 [inline]
__se_sys_sendmsg net/socket.c:2584 [inline]
__arm64_sys_sendmsg+0x24/0x30 net/socket.c:2584
__invoke_syscall arch/arm64/kernel/syscall.c:38 [inline]
invoke_syscall+0x48/0x114 arch/arm64/kernel/syscall.c:52
el0_svc_common.constprop.0+0x44/0xe4 arch/arm64/kernel/syscall.c:142
do_el0_svc+0x38/0xa4 arch/arm64/kernel/syscall.c:191
el0_svc+0x2c/0xb0 arch/arm64/kernel/entry-common.c:647
el0t_64_sync_handler+0xc0/0xc4 arch/arm64/kernel/entry-common.c:665
el0t_64_sync+0x19c/0x1a0 arch/arm64/kernel/entry.S:591
Link: https://lkml.kernel.org/r/20571.1690369076@warthog.procyon.org.uk
Fixes: 018584697533 ("netfs: Add a function to extract an iterator into a scatterlist")
Reported-by: syzbot+9b82859567f2e50c123e(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-mm/000000000000273d0105ff97bf56@google.com/
Signed-off-by: David Howells <dhowells(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Steve French <stfrench(a)microsoft.com>
Cc: Sven Schnelle <svens(a)linux.ibm.com>
Cc: Herbert Xu <herbert(a)gondor.apana.org.au>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Jeff Layton <jlayton(a)kernel.org>
Cc: Shyam Prasad N <nspmangalore(a)gmail.com>
Cc: Rohith Surabattula <rohiths.msft(a)gmail.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Herbert Xu <herbert(a)gondor.apana.org.au>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/scatterlist.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/scatterlist.c~crypto-cifs-fix-error-handling-in-extract_iter_to_sg
+++ a/lib/scatterlist.c
@@ -1148,7 +1148,7 @@ static ssize_t extract_user_to_sg(struct
failed:
while (sgtable->nents > sgtable->orig_nents)
- put_page(sg_page(&sgtable->sgl[--sgtable->nents]));
+ unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents]));
return res;
}
_
Patches currently in -mm which might be from dhowells(a)redhat.com are
crypto-cifs-fix-error-handling-in-extract_iter_to_sg.patch
mm-merge-folio_has_private-filemap_release_folio-call-pairs.patch
mm-netfs-fscache-stop-read-optimisation-when-folio-removed-from-pagecache.patch
The original patch from Lin Ma enables the vdpa driver to use validation
netlink ops.
The second patch simply disables the validation skip which is no longer
neccesary. Patchset started of from this discussion [0].
[0] https://lore.kernel.org/virtualization/20230726074710-mutt-send-email-mst@k…
v2: cc'ed stable
Dragos Tatulea (1):
vdpa: Enable strict validation for netlinks ops
Lin Ma (1):
vdpa: Complement vdpa_nl_policy for nlattr length check
drivers/vdpa/vdpa.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
--
2.41.0
Unlike the other data structures provided during registration the
thermal core takes a copy of the thermal_zone_params provided to it and
stores that copy in the thermal_zone_device, taking care to free it on
unregistration. This is done because the parameters will be modified at
runtime.
Unfortunately the thermal_of code assumes that the params structure it
provides will be used throughout the lifetime of the device and since
the params are dynamically allocated based on the bindings it attempts
to free it on unregistration. This results in not only leaking the
original params but also double freeing the copy the core made, leading
to memory corruption.
Fix this by instead freeing the params parsed from the DT during
registration.
This issue causing instability on systems where thermal zones are
unregistered, especially visble on those systems where some zones
provided by a device have no trip points such as Allwinner systems.
For example with current mainline an arm64 defconfig is unbootable on
Pine64 Plus and LibreTech Tritium is massively unstable. These issues
have been there for a while and have been made more prominent by recent
memory management changes.
Fixes: 3fd6d6e2b4e80 ("thermal/of: Rework the thermal device tree initialization")
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
drivers/thermal/thermal_of.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 6fb14e521197..0af11cdfa2c1 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -442,13 +442,11 @@ static int thermal_of_unbind(struct thermal_zone_device *tz,
static void thermal_of_zone_unregister(struct thermal_zone_device *tz)
{
struct thermal_trip *trips = tz->trips;
- struct thermal_zone_params *tzp = tz->tzp;
struct thermal_zone_device_ops *ops = tz->ops;
thermal_zone_device_disable(tz);
thermal_zone_device_unregister(tz);
kfree(trips);
- kfree(tzp);
kfree(ops);
}
@@ -530,6 +528,9 @@ static struct thermal_zone_device *thermal_of_zone_register(struct device_node *
goto out_kfree_tzp;
}
+ /* The core will take a copy of tzp, free our copy here. */
+ kfree(tzp);
+
ret = thermal_zone_device_enable(tz);
if (ret) {
pr_err("Failed to enabled thermal zone '%s', id=%d: %d\n",
---
base-commit: fdf0eaf11452d72945af31804e2a1048ee1b574c
change-id: 20230722-thermal-fix-of-memory-corruption-73c023f8612b
Best regards,
--
Mark Brown <broonie(a)kernel.org>