From: John Stultz <jstultz(a)google.com>
[ Upstream commit bccdd808902f8c677317cec47c306e42b93b849e ]
In some cases when running the test-ww_mutex code, I was seeing
odd behavior where flush_workqueue sometimes seemed to return
before all the work threads were finished.
Often this would cause strange crashes as the mutexes would be
freed while they were being used.
Looking at the code, there is a lifetime problem as the
controlling thread that spawns the work allocates the
"struct stress" structures that are passed to the workqueue
threads. Then when the workqueue threads are finished,
they free the stress struct that was passed to them.
Unfortunately, the workqueue work_struct node is embedded in the
stress struct, which means the work_struct is freed before the work
thread returns, while flush_workqueue is still waiting on it.
It seems like a better idea to have the controlling thread
both allocate and free the stress structures, so that we can
be sure we don't corrupt the workqueue by freeing the structure
prematurely.
So this patch reworks the test to do so, and with this change
I no longer see the early flush_workqueue returns.
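To illustrate the hazard, here is a minimal sketch (hypothetical
names, not the actual test-ww_mutex code):

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct stress {
            struct work_struct work;   /* embedded in the allocation */
            /* ... locks, timeout, ... */
    };

    static void stress_work_fn(struct work_struct *work)
    {
            struct stress *stress = container_of(work, struct stress, work);

            /* ... exercise the ww_mutexes ... */

            kfree(stress);  /* BUG: the workqueue core still touches
                             * this work_struct after the callback
                             * returns, racing with flush_workqueue() */
    }

    /* The fix pattern: the controlling thread owns the allocation
     * (the stress_array in the diff below) and frees it only after
     * flush_workqueue() has completed. */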
Signed-off-by: John Stultz <jstultz(a)google.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20230922043616.19282-3-jstultz@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/locking/test-ww_mutex.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 93cca6e698600..7c5a8f05497f2 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -466,7 +466,6 @@ static void stress_inorder_work(struct work_struct *work)
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -531,7 +530,6 @@ static void stress_reorder_work(struct work_struct *work)
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
@@ -552,8 +550,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -564,15 +560,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -596,9 +601,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -613,6 +616,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
--
2.42.0
Recent changes to kernel_connect() and kernel_bind() ensure that
callers are insulated from changes to the address parameter made by BPF
SOCK_ADDR hooks. This patch wraps direct calls to ops->connect() and
ops->bind() with kernel_connect() and kernel_bind() to protect callers
in such cases.
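For reference, the insulation works roughly like this (a paraphrased
sketch of the wrapper, not a verbatim copy of net/socket.c):

    /* kernel_connect() hands a local copy of the address down to
     * ops->connect(), so a BPF SOCK_ADDR hook can only rewrite the
     * copy, never the caller's buffer. kernel_bind() is analogous. */
    int kernel_connect(struct socket *sock, struct sockaddr *addr,
                       int addrlen, int flags)
    {
            struct sockaddr_storage address;

            if (addrlen > sizeof(address))
                    return -EINVAL;

            memcpy(&address, addr, addrlen);
            return sock->ops->connect(sock, (struct sockaddr *)&address,
                                      addrlen, flags);
    }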
Link: https://lore.kernel.org/netdev/9944248dba1bce861375fcce9de663934d933ba9.cam…
Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect")
Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jordan Rife <jrife(a)google.com>
---
fs/dlm/lowcomms.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 67f8dd8a05ef2..6296c62c10fa9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1817,8 +1817,8 @@ static int dlm_tcp_bind(struct socket *sock)
memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
- result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
- addr_len);
+ result = kernel_bind(sock, (struct sockaddr *)&src_addr,
+ addr_len);
if (result < 0) {
/* This *may* not indicate a critical error */
log_print("could not bind for connect: %d", result);
@@ -1830,7 +1830,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
+ return kernel_connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1862,8 +1862,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
/* Bind to our port */
make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
- addr_len);
+ return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
+ addr_len);
}
static const struct dlm_proto_ops dlm_tcp_ops = {
@@ -1888,12 +1888,12 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
int ret;
/*
- * Make sock->ops->connect() function return in specified time,
+ * Make kernel_connect() function return in specified time,
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
sock_set_sndtimeo(sock->sk, 5);
- ret = sock->ops->connect(sock, addr, addr_len, 0);
+ ret = kernel_connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
return ret;
}
--
2.42.0.869.gea05f2083d-goog
Greg,
Friday before the merge window opened, I received a bug report
for the eventfs code that was in linux-next. I spent the next
5 days debugging and fixing it, which led to finding other bugs
in the code. Several of these other bugs also affect the 6.6
kernel.
The eventfs code was written in two parts to lower the complexity.
The first part added just the dynamic creation of the eventfs
file system and that was added to 6.6.
The second part went further and removed the one-to-one mapping between
dentry/inode and metadata, as all events have the same files. It replaced
the metadata for each file with callbacks, which caused quite a bit of
code churn.
As the merge window was already open, when I finished all the fixes
I just sent them on top of the linux-next changes along with my
pull request. That means there are 5 commits marked for stable
(or that should be marked for stable) that need to be applied to
6.6 but require a bit of tweaking, or even a new way of implementing
the fix!
After sending the pull request, I checked out 6.6, took those
5 changes, and fixed them up on top of it. I ran them through all
the tests I use before sending to Linus.
So these should be as good as the versions of the patches in Linus's tree.
I waited until Linus pulled in those changes to send this series out.
-- Steve
Steven Rostedt (Google) (5):
tracing: Have trace_event_file have ref counters
eventfs: Remove "is_freed" union with rcu head
eventfs: Save ownership and mode
eventfs: Delete eventfs_inode when the last dentry is freed
eventfs: Use simple_recursive_removal() to clean up dentries
----
fs/tracefs/event_inode.c | 288 +++++++++++++++++++++++--------------
include/linux/trace_events.h | 4 +
kernel/trace/trace.c | 15 ++
kernel/trace/trace.h | 3 +
kernel/trace/trace_events.c | 31 +++-
kernel/trace/trace_events_filter.c | 3 +
6 files changed, 231 insertions(+), 113 deletions(-)
The patch titled
Subject: mm: fix for negative counter: nr_file_hugepages
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-for-negative-counter-nr_file_hugepages.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Stefan Roesch <shr(a)devkernel.io>
Subject: mm: fix for negative counter: nr_file_hugepages
Date: Mon, 6 Nov 2023 10:19:18 -0800
While qualifying the 6.4 release, the following warning was detected in
messages:
vmstat_refresh: nr_file_hugepages -15664
The warning is caused by incorrect updating of the NR_FILE_THPS
counter in the function split_huge_page_to_list(). The if branch
checks for folio_test_swapbacked(), but the else branch is missing
the check for folio_test_pmd_mappable(). The other functions that
manipulate the counter, like __filemap_add_folio() and
filemap_unaccount_folio(), have the corresponding check.
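For reference, the increment side guards the counter in the same way
(roughly paraphrased from __filemap_add_folio(), not a verbatim copy):

    /* NR_FILE_THPS only counts PMD-sized file folios, so the stat
     * update is gated on folio_test_pmd_mappable(): */
    if (folio_test_pmd_mappable(folio))
            __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);

The fix below applies the same gate to the decrement path in
split_huge_page_to_list().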
I have a test case that reproduces the problem. It can be found here:
https://github.com/sroeschus/testcase/blob/main/vmstat_refresh/madv.c
The test case reproduces on an XFS filesystem. Running the same test
case on a BTRFS filesystem does not reproduce the problem.
AFAIK, versions 6.1 through 6.6 are affected by this problem.
Link: https://lkml.kernel.org/r/20231106181918.1091043-1-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr(a)devkernel.io>
Co-debugged-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/huge_memory.c~mm-fix-for-negative-counter-nr_file_hugepages
+++ a/mm/huge_memory.c
@@ -2772,7 +2772,8 @@ int split_huge_page_to_list(struct page
if (folio_test_swapbacked(folio)) {
__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
-nr);
- } else {
+ } else if (folio_test_pmd_mappable(folio)) {
+
__lruvec_stat_mod_folio(folio, NR_FILE_THPS,
-nr);
filemap_nr_thps_dec(mapping);
_
Patches currently in -mm which might be from shr(a)devkernel.io are
mm-fix-for-negative-counter-nr_file_hugepages.patch