The patch titled
Subject: Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
Date: Fri, 18 Oct 2024 17:17:23 +0000
This reverts commit e61ef21e27e8deed8c474e9f47f4aa7bc37e138c.
uffd_poll_thread may be called by other tests that do not initialize the
pthread_barrier, so this approach is not correct. This will revert to
using atomic_bool instead.
Link: https://lkml.kernel.org/r/20241018171734.2315053-3-edliaw@google.com
Fixes: e61ef21e27e8 ("selftests/mm: replace atomic_bool with pthread_barrier_t")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-common.c | 5 ++---
tools/testing/selftests/mm/uffd-common.h | 3 ++-
tools/testing/selftests/mm/uffd-unit-tests.c | 14 ++++++--------
3 files changed, 10 insertions(+), 12 deletions(-)
--- a/tools/testing/selftests/mm/uffd-common.c~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.c
@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
-pthread_barrier_t ready_for_fork;
+atomic_bool ready_for_fork;
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
@@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
- /* Ready for parent thread to fork */
- pthread_barrier_wait(&ready_for_fork);
+ ready_for_fork = true;
for (;;) {
ret = poll(pollfd, 2, -1);
--- a/tools/testing/selftests/mm/uffd-common.h~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.h
@@ -33,6 +33,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
+#include <stdatomic.h>
#include "../kselftest.h"
#include "vm_util.h"
@@ -104,7 +105,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
-extern pthread_barrier_t ready_for_fork;
+extern atomic_bool ready_for_fork;
extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -774,7 +774,7 @@ static void uffd_sigbus_test_common(bool
char c;
struct uffd_args args = { 0 };
- pthread_barrier_init(&ready_for_fork, NULL, 2);
+ ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@@ -791,9 +791,8 @@ static void uffd_sigbus_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
@@ -834,7 +833,7 @@ static void uffd_events_test_common(bool
char c;
struct uffd_args args = { 0 };
- pthread_barrier_init(&ready_for_fork, NULL, 2);
+ ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
if (uffd_register(uffd, area_dst, nr_pages * page_size,
@@ -845,9 +844,8 @@ static void uffd_events_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
_
Patches currently in -mm which might be from edliaw(a)google.com are
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
selftests-mm-fix-deadlock-for-fork-after-pthread_create-with-atomic_bool.patch
The patch titled
Subject: Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
Date: Fri, 18 Oct 2024 17:17:22 +0000
Patch series "selftests/mm: revert pthread_barrier change"
On Android arm, pthread_create followed by a fork caused a deadlock in
the case where the fork required work to be completed by the created
thread.
The previous patches incorrectly assumed that the parent would
always initialize the pthread_barrier for the child thread. This
reverts the change and replaces the fix for wp-fork-with-event with the
original use of atomic_bool.
This patch (of 3):
This reverts commit e142cc87ac4ec618f2ccf5f68aedcd6e28a59d9d.
fork_event_consumer may be called by other tests that do not initialize
the pthread_barrier, so this approach is not correct. The subsequent
patch will revert to using atomic_bool instead.
Link: https://lkml.kernel.org/r/20241018171734.2315053-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241018171734.2315053-2-edliaw@google.com
Fixes: e142cc87ac4e ("fix deadlock for fork after pthread_create on ARM")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 7 -------
1 file changed, 7 deletions(-)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -241,9 +241,6 @@ static void *fork_event_consumer(void *d
fork_event_args *args = data;
struct uffd_msg msg = { 0 };
- /* Ready for parent thread to fork */
- pthread_barrier_wait(&ready_for_fork);
-
/* Read until a full msg received */
while (uffd_read_msg(args->parent_uffd, &msg));
@@ -311,12 +308,8 @@ static int pagemap_test_fork(int uffd, b
/* Prepare a thread to resolve EVENT_FORK */
if (with_event) {
- pthread_barrier_init(&ready_for_fork, NULL, 2);
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
err("pthread_create()");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
}
child = fork();
_
Patches currently in -mm which might be from edliaw(a)google.com are
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
selftests-mm-fix-deadlock-for-fork-after-pthread_create-with-atomic_bool.patch
There is a race between laundromat handling of revoked delegations
and a client sending free_stateid operation. Laundromat thread
finds that delegation has expired and needs to be revoked so it
marks the delegation stid revoked and it puts it on a reaper list
but then it unlock the state lock and the actual delegation revocation
happens without the lock. Once the stid is marked revoked a racing
free_stateid processing thread does the following (1) it calls
list_del_init() which removes it from the reaper list and (2) frees
the delegation stid structure. The laundromat thread ends up not
calling the revoke_delegation() function for this particular delegation
but that means it will no release the lock lease that exists on
the file.
Now, a new open for this file comes in and ends up finding that
lease list isn't empty and calls nfsd_breaker_owns_lease() which ends
up trying to derefence a freed delegation stateid. Leading to the
followint use-after-free KASAN warning:
kernel: ==================================================================
kernel: BUG: KASAN: slab-use-after-free in nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: Read of size 8 at addr ffff0000e73cd0c8 by task nfsd/6205
kernel:
kernel: CPU: 2 UID: 0 PID: 6205 Comm: nfsd Kdump: loaded Not tainted 6.11.0-rc7+ #9
kernel: Hardware name: Apple Inc. Apple Virtualization Generic Platform, BIOS 2069.0.0.0.0 08/03/2024
kernel: Call trace:
kernel: dump_backtrace+0x98/0x120
kernel: show_stack+0x1c/0x30
kernel: dump_stack_lvl+0x80/0xe8
kernel: print_address_description.constprop.0+0x84/0x390
kernel: print_report+0xa4/0x268
kernel: kasan_report+0xb4/0xf8
kernel: __asan_report_load8_noabort+0x1c/0x28
kernel: nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: nfsd_file_do_acquire+0xb3c/0x11d0 [nfsd]
kernel: nfsd_file_acquire_opened+0x84/0x110 [nfsd]
kernel: nfs4_get_vfs_file+0x634/0x958 [nfsd]
kernel: nfsd4_process_open2+0xa40/0x1a40 [nfsd]
kernel: nfsd4_open+0xa08/0xe80 [nfsd]
kernel: nfsd4_proc_compound+0xb8c/0x2130 [nfsd]
kernel: nfsd_dispatch+0x22c/0x718 [nfsd]
kernel: svc_process_common+0x8e8/0x1960 [sunrpc]
kernel: svc_process+0x3d4/0x7e0 [sunrpc]
kernel: svc_handle_xprt+0x828/0xe10 [sunrpc]
kernel: svc_recv+0x2cc/0x6a8 [sunrpc]
kernel: nfsd+0x270/0x400 [nfsd]
kernel: kthread+0x288/0x310
kernel: ret_from_fork+0x10/0x20
This patch proposes a fixed that's based on adding 2 new additional
stid's sc_status values that help coordinate between the laundromat
and other operations (nfsd4_free_stateid() and nfsd4_delegreturn()).
First to make sure, that once the stid is marked revoked, it is not
removed by the nfsd4_free_stateid(), the laundromat take a reference
on the stateid. Then, coordinating whether the stid has been put
on the cl_revoked list or we are processing FREE_STATEID and need to
make sure to remove it from the list, each check that state and act
accordingly. If laundromat has added to the cl_revoke list before
the arrival of FREE_STATEID, then nfsd4_free_stateid() knows to remove
it from the list. If nfsd4_free_stateid() finds that operations arrived
before laundromat has placed it on cl_revoke list, it marks the state
freed and then laundromat will no longer add it to the list.
Also, for nfsd4_delegreturn() when looking for the specified stid,
we need to access stid that are marked removed or freeable, it means
the laundromat has started processing it but hasn't finished and this
delegreturn needs to return nfserr_deleg_revoked and not
nfserr_bad_stateid. The latter will not trigger a FREE_STATEID and the
lack of it will leave this stid on the cl_revoked list indefinitely.
Fixes: 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is
protected by clp->cl_lock")
CC: stable(a)vger.kernel.org
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
--- v3. (1) adds refcount to nfsd4_revoke_states() (2) adds comments
to revoke_delegation(), adds the WARN_ON_ONCE to make sure stid
state is what is expected and changes unlock placement.
---
fs/nfsd/nfs4state.c | 46 +++++++++++++++++++++++++++++++++++++--------
fs/nfsd/state.h | 2 ++
2 files changed, 40 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7905ab9d8bc6..28e9b52b01fd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1351,21 +1351,47 @@ static void destroy_delegation(struct nfs4_delegation *dp)
destroy_unhashed_deleg(dp);
}
+/**
+ * revoke_delegation - perform nfs4 delegation structure cleanup
+ * @dp: pointer to the delegation
+ *
+ * This function assumes that it's called either from the administrative
+ * interface (nfsd4_revoke_states()) that's revoking a specific delegation
+ * stateid or it's called from a laundromat thread (nfsd4_landromat()) that
+ * determined that this specific state has expired and needs to be revoked
+ * (both mark state with the appropriate stid sc_status mode). It is also
+ * assumed that a reference was take on the @dp state.
+ *
+ * If this function finds that the @dp state is SC_STATUS_FREED it means
+ * that a FREE_STATEID operation for this stateid has been processed and
+ * we can proceed to removing it from recalled list. However, if @dp state
+ * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked
+ * list and wait for the FREE_STATEID to arrive from the client. At the same
+ * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the
+ * nfsd4_free_stateid() function that this stateid has already been added
+ * to the cl_revoked list and that nfsd4_free_stateid() is now responsible
+ * for removing it from the list. Inspection of where the delegation state
+ * in the revocation process is protected by the clp->cl_lock.
+ */
static void revoke_delegation(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_stid.sc_client;
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ WARN_ON_ONCE(!(dp->dl_stid.sc_status &
+ (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)));
trace_nfsd_stid_revoke(&dp->dl_stid);
- if (dp->dl_stid.sc_status &
- (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
- spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
- list_add(&dp->dl_recall_lru, &clp->cl_revoked);
- spin_unlock(&clp->cl_lock);
+ spin_lock(&clp->cl_lock);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ goto out;
}
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
+out:
+ spin_unlock(&clp->cl_lock);
destroy_unhashed_deleg(dp);
}
@@ -1772,6 +1798,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
mutex_unlock(&stp->st_mutex);
break;
case SC_TYPE_DELEG:
+ refcount_inc(&stid->sc_count);
dp = delegstateid(stid);
spin_lock(&state_lock);
if (!unhash_delegation_locked(
@@ -6606,6 +6633,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(<, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7218,7 +7246,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
s->sc_status |= SC_STATUS_CLOSED;
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7548,7 +7578,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED|SC_STATUS_FREEABLE, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6351e6eca7cc..cc00d6b64b88 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
--
2.43.5
There is a race between laundromat handling of revoked delegations
and a client sending free_stateid operation. Laundromat thread
finds that delegation has expired and needs to be revoked so it
marks the delegation stid revoked and it puts it on a reaper list
but then it unlock the state lock and the actual delegation revocation
happens without the lock. Once the stid is marked revoked a racing
free_stateid processing thread does the following (1) it calls
list_del_init() which removes it from the reaper list and (2) frees
the delegation stid structure. The laundromat thread ends up not
calling the revoke_delegation() function for this particular delegation
but that means it will no release the lock lease that exists on
the file.
Now, a new open for this file comes in and ends up finding that
lease list isn't empty and calls nfsd_breaker_owns_lease() which ends
up trying to derefence a freed delegation stateid. Leading to the
followint use-after-free KASAN warning:
kernel: ==================================================================
kernel: BUG: KASAN: slab-use-after-free in nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: Read of size 8 at addr ffff0000e73cd0c8 by task nfsd/6205
kernel:
kernel: CPU: 2 UID: 0 PID: 6205 Comm: nfsd Kdump: loaded Not tainted 6.11.0-rc7+ #9
kernel: Hardware name: Apple Inc. Apple Virtualization Generic Platform, BIOS 2069.0.0.0.0 08/03/2024
kernel: Call trace:
kernel: dump_backtrace+0x98/0x120
kernel: show_stack+0x1c/0x30
kernel: dump_stack_lvl+0x80/0xe8
kernel: print_address_description.constprop.0+0x84/0x390
kernel: print_report+0xa4/0x268
kernel: kasan_report+0xb4/0xf8
kernel: __asan_report_load8_noabort+0x1c/0x28
kernel: nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: leases_conflict+0x68/0x370
kernel: __break_lease+0x204/0xc38
kernel: nfsd_open_break_lease+0x8c/0xf0 [nfsd]
kernel: nfsd_file_do_acquire+0xb3c/0x11d0 [nfsd]
kernel: nfsd_file_acquire_opened+0x84/0x110 [nfsd]
kernel: nfs4_get_vfs_file+0x634/0x958 [nfsd]
kernel: nfsd4_process_open2+0xa40/0x1a40 [nfsd]
kernel: nfsd4_open+0xa08/0xe80 [nfsd]
kernel: nfsd4_proc_compound+0xb8c/0x2130 [nfsd]
kernel: nfsd_dispatch+0x22c/0x718 [nfsd]
kernel: svc_process_common+0x8e8/0x1960 [sunrpc]
kernel: svc_process+0x3d4/0x7e0 [sunrpc]
kernel: svc_handle_xprt+0x828/0xe10 [sunrpc]
kernel: svc_recv+0x2cc/0x6a8 [sunrpc]
kernel: nfsd+0x270/0x400 [nfsd]
kernel: kthread+0x288/0x310
kernel: ret_from_fork+0x10/0x20
This patch proposes a fix that's based on adding 2 new additional
stid's sc_status values that help coordinate between the laundromat
and other operations (nfsd4_free_stateid() and nfsd4_delegreturn()).
First to make sure, that once the stid is marked revoked, it is not
removed by the nfsd4_free_stateid(), the laundromat take a reference
on the stateid. Then, coordinating whether the stid has been put
on the cl_revoked list or we are processing FREE_STATEID and need to
make sure to remove it from the list, each check that state and act
accordingly. If laundromat has added to the cl_revoke list before
the arrival of FREE_STATEID, then nfsd4_free_stateid() knows to remove
it from the list. If nfsd4_free_stateid() finds that operations arrived
before laundromat has placed it on cl_revoke list, it marks the state
freed and then laundromat will no longer add it to the list.
Also, for nfsd4_delegreturn() when looking for the specified stid,
we need to access stid that are marked removed or freeable, it means
the laundromat has started processing it but hasn't finished and this
delegreturn needs to return nfserr_deleg_revoked and not
nfserr_bad_stateid. The latter will not trigger a FREE_STATEID and the
lack of it will leave this stid on the cl_revoked list indefinitely.
Fixes: 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is
protected by clp->cl_lock")
CC: stable(a)vger.kernel.org
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
fs/nfsd/nfs4state.c | 15 ++++++++++++---
fs/nfsd/state.h | 2 ++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ac1859c7cc9d..cb989802e896 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1370,10 +1370,16 @@ static void revoke_delegation(struct nfs4_delegation *dp)
if (dp->dl_stid.sc_status &
(SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&clp->cl_lock);
+ goto out;
+ }
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
spin_unlock(&clp->cl_lock);
}
+out:
destroy_unhashed_deleg(dp);
}
@@ -6545,6 +6551,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(<, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7156,7 +7163,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (s->sc_status & SC_STATUS_REVOKED) {
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7486,7 +7495,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED|SC_STATUS_FREEABLE, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 79c743c01a47..35b3564c065f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
--
2.43.5
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Commit ea7e2d5e49c0 ("mm: call the security_mmap_file() LSM hook in
remap_file_pages()") fixed a security issue, it added an LSM check when
trying to remap file pages, so that LSMs have the opportunity to evaluate
such action like for other memory operations such as mmap() and mprotect().
However, that commit called security_mmap_file() inside the mmap_lock lock,
while the other calls do it before taking the lock, after commit
8b3ec6814c83 ("take security_mmap_file() outside of ->mmap_sem").
This caused lock inversion issue with IMA which was taking the mmap_lock
and i_mutex lock in the opposite way when the remap_file_pages() system
call was called.
Solve the issue by splitting the critical region in remap_file_pages() in
two regions: the first takes a read lock of mmap_lock and retrieves the VMA
and the file associated, and calculate the 'prot' and 'flags' variable; the
second takes a write lock on mmap_lock, checks that the VMA flags and the
VMA file descriptor are the same as the ones obtained in the first critical
region (otherwise the system call fails), and calls do_mmap().
In between, after releasing the read lock and taking the write lock, call
security_mmap_file(), and solve the lock inversion issue.
Cc: stable(a)vger.kernel.org
Fixes: ea7e2d5e49c0 ("mm: call the security_mmap_file() LSM hook in remap_file_pages()")
Reported-by: syzbot+91ae49e1c1a2634d20c0(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-security-module/66f7b10e.050a0220.46d20.0036.…
Reviewed-by: Roberto Sassu <roberto.sassu(a)huawei.com> (Calculate prot and flags earlier)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
---
mm/mmap.c | 62 ++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 45 insertions(+), 17 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c0fb43064b5..762944427e03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,6 +1640,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
+ vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
@@ -1656,12 +1657,53 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (mmap_write_lock_killable(mm))
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
+
+ vma = vma_lookup(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED)) {
+ mmap_read_unlock(mm);
+ return -EINVAL;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+
+ /* Save vm_flags used to calculate prot and flags, and recheck later. */
+ vm_flags = vma->vm_flags;
+ file = get_file(vma->vm_file);
+
+ mmap_read_unlock(mm);
+
+ ret = security_mmap_file(file, prot, flags);
+ if (ret) {
+ fput(file);
+ return ret;
+ }
+
+ ret = -EINVAL;
+
+ if (mmap_write_lock_killable(mm)) {
+ fput(file);
return -EINTR;
+ }
vma = vma_lookup(mm, start);
- if (!vma || !(vma->vm_flags & VM_SHARED))
+ if (!vma)
+ goto out;
+
+ if (vma->vm_flags != vm_flags)
+ goto out;
+
+ if (vma->vm_file != file)
goto out;
if (start + size > vma->vm_end) {
@@ -1689,25 +1731,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
}
- prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
- prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
- prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
-
- flags &= MAP_NONBLOCK;
- flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
-
- file = get_file(vma->vm_file);
- ret = security_mmap_file(vma->vm_file, prot, flags);
- if (ret)
- goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL);
-out_fput:
- fput(file);
out:
mmap_write_unlock(mm);
+ fput(file);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
--
2.34.1
If some remap_pfn_range() calls succeeded before one failed, we still have
buffer pages mapped into the userspace page tables when we drop the buffer
reference with comedi_buf_map_put(bm). The userspace mappings are only
cleaned up later in the mmap error path.
Fix it by explicitly flushing all mappings in our VMA on the error path.
See commit 79a61cc3fc04 ("mm: avoid leaving partial pfn mappings around in
error case").
Cc: stable(a)vger.kernel.org
Fixes: ed9eccbe8970 ("Staging: add comedi core")
Signed-off-by: Jann Horn <jannh(a)google.com>
---
Note: compile-tested only; I don't actually have comedi hardware, and I
don't know anything about comedi.
---
Changes in v3:
- gate zapping ptes on CONFIG_MMU (Intel kernel test robot)
- Link to v2: https://lore.kernel.org/r/20241015-comedi-tlb-v2-1-cafb0e27dd9a@google.com
Changes in v2:
- only do the zapping in the pfnmap path (Ian Abbott)
- use zap_vma_ptes() instead of zap_page_range_single() (Ian Abbott)
- Link to v1: https://lore.kernel.org/r/20241014-comedi-tlb-v1-1-4b699144b438@google.com
---
drivers/comedi/comedi_fops.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 1b481731df96..b9df9b19d4bd 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -2407,6 +2407,18 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma)
start += PAGE_SIZE;
}
+
+#ifdef CONFIG_MMU
+ /*
+ * Leaving behind a partial mapping of a buffer we're about to
+ * drop is unsafe, see remap_pfn_range_notrack().
+ * We need to zap the range here ourselves instead of relying
+ * on the automatic zapping in remap_pfn_range() because we call
+ * remap_pfn_range() in a loop.
+ */
+ if (retval)
+ zap_vma_ptes(vma, vma->vm_start, size);
+#endif
}
if (retval == 0) {
---
base-commit: 6485cf5ea253d40d507cd71253c9568c5470cd27
change-id: 20241014-comedi-tlb-400246505961
--
Jann Horn <jannh(a)google.com>
In psnet_open_pf_bar() and snet_open_vf_bar() a string later passed to
pcim_iomap_regions() is placed on the stack. Neither
pcim_iomap_regions() nor the functions it calls copy that string.
Should the string later ever be used, this, consequently, causes
undefined behavior since the stack frame will by then have disappeared.
Fix the bug by allocating the strings on the heap through
devm_kasprintf().
Cc: stable(a)vger.kernel.org # v6.3
Fixes: 51a8f9d7f587 ("virtio: vdpa: new SolidNET DPU driver.")
Reported-by: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Closes: https://lore.kernel.org/all/74e9109a-ac59-49e2-9b1d-d825c9c9f891@wanadoo.fr/
Suggested-by: Andy Shevchenko <andy(a)kernel.org>
Signed-off-by: Philipp Stanner <pstanner(a)redhat.com>
---
drivers/vdpa/solidrun/snet_main.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c
index 99428a04068d..c8b74980dbd1 100644
--- a/drivers/vdpa/solidrun/snet_main.c
+++ b/drivers/vdpa/solidrun/snet_main.c
@@ -555,7 +555,7 @@ static const struct vdpa_config_ops snet_config_ops = {
static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
{
- char name[50];
+ char *name;
int ret, i, mask = 0;
/* We don't know which BAR will be used to communicate..
* We will map every bar with len > 0.
@@ -573,7 +573,10 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
return -ENODEV;
}
- snprintf(name, sizeof(name), "psnet[%s]-bars", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ret = pcim_iomap_regions(pdev, mask, name);
if (ret) {
SNET_ERR(pdev, "Failed to request and map PCI BARs\n");
@@ -590,10 +593,13 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet)
{
- char name[50];
+ char *name;
int ret;
- snprintf(name, sizeof(name), "snet[%s]-bar", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
/* Request and map BAR */
ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name);
if (ret) {
--
2.46.1