From: Filipe Manana <fdmanana(a)suse.com>
commit d47704bd1c78c85831561bcf701b90dd66f811b2 upstream.
At find_delalloc_subrange(), when we need to get the next extent map, we
do a full search on the extent map tree (a red black tree). This is fine
but it's a lot more efficient to simply use rb_next(), which typically
requires iterating over fewer nodes of the tree and never needs to compare
the ranges of nodes with the one we are looking for.
So add a public helper to extent_map.{h,c} to get the extent map that
immediately follows another extent map, using rb_next(), and use that
helper at find_delalloc_subrange().
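As a rough illustration (this sketch is not part of the patch; it only shows how a caller might use the new helper from extent_map.h, assuming the tree's read lock is held across the whole walk so no extent map can be removed underneath it):

```
/* Illustrative only: walk extent maps forward from an arbitrary range. */
static void walk_extent_maps(struct extent_map_tree *em_tree, u64 start, u64 len)
{
	struct extent_map *em;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	while (em) {
		struct extent_map *next;

		/* ... inspect em->start / extent_map_end(em) here ... */

		/* rb_next() based lookup; takes a reference on the result */
		next = btrfs_next_extent_map(em_tree, em);
		free_extent_map(em);	/* drop our reference on em */
		em = next;
	}
	read_unlock(&em_tree->lock);
}
```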
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
Please add this patch to the next 6.1 stable release.
It happens to fix a bug recently reported at:
https://bugzilla.redhat.com/show_bug.cgi?id=2187312
Thanks.
fs/btrfs/extent_map.c | 31 +++++++++++++++++++++++++++++-
fs/btrfs/extent_map.h | 2 ++
fs/btrfs/file.c | 44 ++++++++++++++++++++++++++-----------------
3 files changed, 59 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8ae02aa632e..4abbe4b35253 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -523,7 +523,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
-static struct extent_map *next_extent_map(struct extent_map *em)
+static struct extent_map *next_extent_map(const struct extent_map *em)
{
struct rb_node *next;
@@ -533,6 +533,35 @@ static struct extent_map *next_extent_map(struct extent_map *em)
return container_of(next, struct extent_map, rb_node);
}
+/*
+ * Get the extent map that immediately follows another one.
+ *
+ * @tree: The extent map tree that the extent map belong to.
+ * Holding read or write access on the tree's lock is required.
+ * @em: An extent map from the given tree. The caller must ensure that
+ * between getting @em and between calling this function, the
+ * extent map @em is not removed from the tree - for example, by
+ * holding the tree's lock for the duration of those 2 operations.
+ *
+ * Returns the extent map that immediately follows @em, or NULL if @em is the
+ * last extent map in the tree.
+ */
+struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
+ const struct extent_map *em)
+{
+ struct extent_map *next;
+
+ /* The lock must be acquired either in read mode or write mode. */
+ lockdep_assert_held(&tree->lock);
+ ASSERT(extent_map_in_tree(em));
+
+ next = next_extent_map(em);
+ if (next)
+ refcount_inc(&next->refs);
+
+ return next;
+}
+
static struct extent_map *prev_extent_map(struct extent_map *em)
{
struct rb_node *prev;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ad311864272a..68d3f2c9ea1d 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -87,6 +87,8 @@ static inline u64 extent_map_block_end(struct extent_map *em)
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
+struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
+ const struct extent_map *em);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1bda59c68360..77202addead8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3248,40 +3248,50 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
*/
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
- read_unlock(&em_tree->lock);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ return (delalloc_len > 0);
+ }
/* extent_map_end() returns a non-inclusive end offset. */
- em_end = em ? extent_map_end(em) : 0;
+ em_end = extent_map_end(em);
/*
* If we have a hole/prealloc extent map, check the next one if this one
* ends before our range's end.
*/
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+ if ((em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
struct extent_map *next_em;
- read_lock(&em_tree->lock);
- next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
- read_unlock(&em_tree->lock);
-
+ next_em = btrfs_next_extent_map(em_tree, em);
free_extent_map(em);
- em_end = next_em ? extent_map_end(next_em) : 0;
+
+ /*
+ * There's no next extent map or the next one starts beyond our
+ * range, return the range found in the io tree (if any).
+ */
+ if (!next_em || next_em->start > end) {
+ read_unlock(&em_tree->lock);
+ free_extent_map(next_em);
+ return (delalloc_len > 0);
+ }
+
+ em_end = extent_map_end(next_em);
em = next_em;
}
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- free_extent_map(em);
- em = NULL;
- }
+ read_unlock(&em_tree->lock);
/*
- * No extent map or one for a hole or prealloc extent. Use the delalloc
- * range we found in the io tree if we have one.
+ * We have a hole or prealloc extent that ends at or beyond our range's
+ * end, return the range found in the io tree (if any).
*/
- if (!em)
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ free_extent_map(em);
return (delalloc_len > 0);
+ }
/*
* We don't have any range as EXTENT_DELALLOC in the io tree, so the
--
2.34.1
From: Brian Foster <bfoster(a)redhat.com>
commit 7cd3099f4925d7c15887d1940ebd65acd66100f5 upstream.
Per-inode ioend completion batching has a log reservation deadlock
vector between preallocated append transactions and transactions
that are acquired at completion time for other purposes (i.e.,
unwritten extent conversion or COW fork remaps). For example, if the
ioend completion workqueue task executes on a batch of ioends that
are sorted such that an append ioend sits at the tail, it's possible
for the outstanding append transaction reservation to block
allocation of transactions required to process preceding ioends in
the list.
Append ioend completion is historically the common path for on-disk
inode size updates. While file extending writes may have completed
sometime earlier, the on-disk inode size is only updated after
successful writeback completion. These transactions are preallocated
serially from writeback context to mitigate concurrency and
associated log reservation pressure across completions processed by
multi-threaded workqueue tasks.
However, now that delalloc blocks unconditionally map to unwritten
extents at physical block allocation time, size updates via append
ioends are relatively rare. This means that inode size updates most
commonly occur as part of the preexisting completion time
transaction to convert unwritten extents. As a result, there is no
longer a strong need to preallocate size update transactions.
Remove the preallocation of inode size update transactions to avoid
the ioend completion processing log reservation deadlock. Instead,
continue to send all potential size extending ioends to workqueue
context for completion and allocate the transaction from that
context. This ensures that no outstanding log reservation is owned
by the ioend completion worker task when it begins to process
ioends.
Signed-off-by: Brian Foster <bfoster(a)redhat.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
Reported-by: Christian Theune <ct(a)flyingcircus.io>
Link: https://lore.kernel.org/linux-xfs/CAOQ4uxjj2UqA0h4Y31NbmpHksMhVrXfXjLG4Tnz3…
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
Acked-by: Darrick J. Wong <djwong(a)kernel.org>
---
Greg,
One more fix from v5.13 that I missed from my backports.
Thanks,
Amir.
fs/xfs/xfs_aops.c | 45 +++------------------------------------------
1 file changed, 3 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 953de843d9c3..e341d6531e68 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -39,33 +39,6 @@ static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
XFS_I(ioend->io_inode)->i_d.di_size;
}
-STATIC int
-xfs_setfilesize_trans_alloc(
- struct iomap_ioend *ioend)
-{
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
- if (error)
- return error;
-
- ioend->io_private = tp;
-
- /*
- * We may pass freeze protection with a transaction. So tell lockdep
- * we released it.
- */
- __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
- /*
- * We hand off the transaction to the completion thread now, so
- * clear the flag here.
- */
- xfs_trans_clear_context(tp);
- return 0;
-}
-
/*
* Update on-disk file size now that data has been written to disk.
*/
@@ -191,12 +164,10 @@ xfs_end_ioend(
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- else
- ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
+ if (!error && xfs_ioend_is_append(ioend))
+ error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
- if (ioend->io_private)
- error = xfs_setfilesize_ioend(ioend, error);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -246,7 +217,7 @@ xfs_end_io(
static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
{
- return ioend->io_private ||
+ return xfs_ioend_is_append(ioend) ||
ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED);
}
@@ -259,8 +230,6 @@ xfs_end_bio(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
- ASSERT(xfs_ioend_needs_workqueue(ioend));
-
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
@@ -510,14 +479,6 @@ xfs_prepare_ioend(
ioend->io_offset, ioend->io_size);
}
- /* Reserve log space if we might write beyond the on-disk inode size. */
- if (!status &&
- ((ioend->io_flags & IOMAP_F_SHARED) ||
- ioend->io_type != IOMAP_UNWRITTEN) &&
- xfs_ioend_is_append(ioend) &&
- !ioend->io_private)
- status = xfs_setfilesize_trans_alloc(ioend);
-
memalloc_nofs_restore(nofs_flag);
if (xfs_ioend_needs_workqueue(ioend))
--
2.34.1
commit 542a56e8eb4467ae654eefab31ff194569db39cd upstream.
The VCN firmware loading path enables the indirect SRAM mode if it's
advertised as supported. We might have some cases of FW issues that
prevent this mode from working properly though, ending up in a failed
probe. An example below, observed in the Steam Deck:
[...]
[drm] failed to load ucode VCN0_RAM(0x3A)
[drm] psp gfx command LOAD_IP_FW(0x6) failed and response status is (0xFFFF0000)
amdgpu 0000:04:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring vcn_dec_0 test failed (-110)
[drm:amdgpu_device_init.cold [amdgpu]] *ERROR* hw_init of IP block <vcn_v3_0> failed -110
amdgpu 0000:04:00.0: amdgpu: amdgpu_device_ip_init failed
amdgpu 0000:04:00.0: amdgpu: Fatal error during GPU init
[...]
Disabling the VCN block circumvents this, but it's a very invasive
workaround that turns off the entire feature. So, let's add a quirk
on VCN loading that checks for known problematic BIOSes on Vangogh,
so we can proactively disable the indirect SRAM mode and allow the
HW to probe properly and the VCN IP block to work fine.
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2385
Fixes: 82132ecc5432 ("drm/amdgpu: enable Vangogh VCN indirect sram mode")
Fixes: 9a8cc8cabc1e ("drm/amdgpu: enable Vangogh VCN indirect sram mode")
Cc: stable(a)vger.kernel.org
Cc: James Zhu <James.Zhu(a)amd.com>
Cc: Leo Liu <leo.liu(a)amd.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
---
Hi folks, this was build/boot tested on Deck. I've also adjusted the
context, since the function was reworked in 6.2.
But it was a surprise for me not to see this fix already in 6.1.y, since
I had CCed stable, and the reason for that is really peculiar:
$ git log -1 --pretty="%an <%ae>: %s" 82132ecc5432
Leo Liu <leo.liu(a)amd.com>: drm/amdgpu: enable Vangogh VCN indirect sram mode
$ git describe --contains 82132ecc5432
v6.2-rc1~124^2~1^2~13
$ git log -1 --pretty="%an <%ae>: %s" 9a8cc8cabc1e
Leo Liu <leo.liu(a)amd.com>: drm/amdgpu: enable Vangogh VCN indirect sram mode
$ git describe --contains 9a8cc8cabc1e
v6.1-rc8~16^2^2
This is quite strange to me: we have 2 commit hashes pointing to the *same*
commit, and each one is present in a different release!
Since I originally marked this patch as fixing 82132ecc5432, 6.1.y stable
misses it, since it only contains 9a8cc8cabc1e (which is the same patch!).
Alex, do you have an idea why commits from the AMD tree sometimes appear
duplicated in mainline? Especially across different releases, this could
cause some confusion I guess.
Thanks in advance,
Guilherme
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index ce64ca1c6e66..5c1193dd7d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -26,6 +26,7 @@
#include <linux/firmware.h>
#include <linux/module.h>
+#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/debugfs.h>
#include <drm/drm_drv.h>
@@ -84,6 +85,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
{
unsigned long bo_size;
const char *fw_name;
+ const char *bios_ver;
const struct common_firmware_header *hdr;
unsigned char fw_check;
unsigned int fw_shared_size, log_offset;
@@ -159,6 +161,21 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
if ((adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) &&
(adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG))
adev->vcn.indirect_sram = true;
+ /*
+ * Some Steam Deck's BIOS versions are incompatible with the
+ * indirect SRAM mode, leading to amdgpu being unable to get
+ * properly probed (and even potentially crashing the kernel).
+ * Hence, check for these versions here - notice this is
+ * restricted to Vangogh (Deck's APU).
+ */
+ bios_ver = dmi_get_system_info(DMI_BIOS_VERSION);
+
+ if (bios_ver && (!strncmp("F7A0113", bios_ver, 7) ||
+ !strncmp("F7A0114", bios_ver, 7))) {
+ adev->vcn.indirect_sram = false;
+ dev_info(adev->dev,
+ "Steam Deck quirk: indirect SRAM disabled on BIOS %s\n", bios_ver);
+ }
break;
case IP_VERSION(3, 0, 16):
fw_name = FIRMWARE_DIMGREY_CAVEFISH;
--
2.40.0
From: Mel Gorman <mgorman(a)techsingularity.net>
commit 1c0908d8e441631f5b8ba433523cf39339ee2ba0 upstream.
Jan Kara reported the following bug triggering on 6.0.5-rt14 running dbench
on XFS on arm64.
kernel BUG at fs/inode.c:625!
Internal error: Oops - BUG: 0 [#1] PREEMPT_RT SMP
CPU: 11 PID: 6611 Comm: dbench Tainted: G E 6.0.0-rt14-rt+ #1
pc : clear_inode+0xa0/0xc0
lr : clear_inode+0x38/0xc0
Call trace:
clear_inode+0xa0/0xc0
evict+0x160/0x180
iput+0x154/0x240
do_unlinkat+0x184/0x300
__arm64_sys_unlinkat+0x48/0xc0
el0_svc_common.constprop.4+0xe4/0x2c0
do_el0_svc+0xac/0x100
el0_svc+0x78/0x200
el0t_64_sync_handler+0x9c/0xc0
el0t_64_sync+0x19c/0x1a0
It also affects 6.1-rc7-rt5 and affects a preempt-rt fork of 5.14 so this
is likely a bug that existed forever and only became visible when ARM
support was added to preempt-rt. The same problem does not occur on x86-64
and he also reported that converting sb->s_inode_wblist_lock to
raw_spinlock_t makes the problem disappear indicating that the RT spinlock
variant is the problem.
Which in turn means that RT mutexes on ARM64 and any other weakly ordered
architecture are affected by this independent of RT.
Will Deacon observed:
"I'd be more inclined to be suspicious of the slowpath tbh, as we need to
make sure that we have acquire semantics on all paths where the lock can
be taken. Looking at the rtmutex code, this really isn't obvious to me
-- for example, try_to_take_rt_mutex() appears to be able to return via
the 'takeit' label without acquire semantics and it looks like we might
be relying on the caller's subsequent _unlock_ of the wait_lock for
ordering, but that will give us release semantics which aren't correct."
Sebastian Andrzej Siewior prototyped a fix that does work based on that
comment but it was a little bit overkill and added some fences that should
not be necessary.
The lock owner is updated with an IRQ-safe raw spinlock held, but the
spin_unlock does not provide acquire semantics which are needed when
acquiring a mutex.
Add the necessary acquire semantics for lock owner updates in the slow path
acquisition and the waiter bit logic.
It successfully completed 10 iterations of the dbench workload while the
vanilla kernel fails on the first iteration.
[ bigeasy(a)linutronix.de: Initial prototype fix ]
Fixes: 700318d1d7b38 ("locking/rtmutex: Use acquire/release semantics")
Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core")
Reported-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20221202100223.6mevpbl7i6x5udfd@techsingularity.n…
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
---
Could this please be backported to 5.15 and earlier? It is already part
of the 6.x kernels. I asked about this at the end of January and I'm
kindly asking again ;)
This patch applies against v5.15. Should it not apply to earlier
versions, please let me know and I will kindly provide a backport.
I received reports that this fixes "mysterious" crashes and that is how
I noticed that it is not part of the earlier kernels.
kernel/locking/rtmutex.c | 55 ++++++++++++++++++++++++++++++------
kernel/locking/rtmutex_api.c | 6 ++--
2 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ea5a701ab2408..c9b21fd30bed5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -87,15 +87,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
* set this bit before looking at the lock.
*/
-static __always_inline void
-rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
@@ -104,7 +120,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -170,8 +187,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -206,6 +236,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
owner = *p;
} while (cmpxchg_relaxed(p, owner,
owner | RT_MUTEX_HAS_WAITERS) != owner);
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -1231,7 +1268,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the lock waiters bit
* unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1591,7 +1628,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1701,7 +1738,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the waiter bit unconditionally.
* We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
debug_rt_mutex_free_waiter(&waiter);
}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 5c9299aaabae1..a461be2f873db 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -245,7 +245,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
{
debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
+ rt_mutex_clear_owner(lock);
}
/**
@@ -360,7 +360,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -416,7 +416,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, false);
raw_spin_unlock_irq(&lock->wait_lock);
--
2.39.1
In upstream commit 77e52ae35463 ("futex: Move to kernel/futex/") the
futex code from kernel/futex.c was moved into kernel/futex/core.c in
preparation for the split-up of the implementation into various files.
Point kernel-doc references to the new files as otherwise the
documentation shows errors on build:
[...]
Error: Cannot open file ./kernel/futex.c
Error: Cannot open file ./kernel/futex.c
[...]
WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -sphinx-version 3.4.3 -internal ./kernel/futex.c' failed with return code 2
There is no direct upstream commit for this change. It is made in
analogy to commit bc67f1c454fb ("docs: futex: Fix kernel-doc
references") applied as consequence of the restructuring of the futex
code.
Fixes: 77e52ae35463 ("futex: Move to kernel/futex/")
Signed-off-by: Salvatore Bonaccorso <carnil(a)debian.org>
---
v1->v2:
- Fix typo in description about new target file for futex.c code
- Indent block with build log output
Documentation/kernel-hacking/locking.rst | 2 +-
Documentation/translations/it_IT/kernel-hacking/locking.rst | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index 6ed806e6061b..a6d89efede79 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -1358,7 +1358,7 @@ Mutex API reference
Futex API reference
===================
-.. kernel-doc:: kernel/futex.c
+.. kernel-doc:: kernel/futex/core.c
:internal:
Further reading
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst
index bf1acd6204ef..192ab8e28125 100644
--- a/Documentation/translations/it_IT/kernel-hacking/locking.rst
+++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst
@@ -1400,7 +1400,7 @@ Riferimento per l'API dei Mutex
Riferimento per l'API dei Futex
===============================
-.. kernel-doc:: kernel/futex.c
+.. kernel-doc:: kernel/futex/core.c
:internal:
Approfondimenti
--
2.40.0
From: Huanhuan Wang <huanhuan.wang(a)corigine.com>
There are two pointers in struct xfrm_dev_offload: *dev and *real_dev.
The *dev pointer refers to either the bonding interface or the real
interface: if bonding IPsec offload is used, it points to the bonding
interface; if not, it points to the real interface. *real_dev always
points to the real interface. So nfp should always use real_dev instead
of dev.
Prior to this change, the system became unresponsive when offloading
IPsec for a device which is a lower device of a bonding device.
Fixes: 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer")
CC: stable(a)vger.kernel.org
Signed-off-by: Huanhuan Wang <huanhuan.wang(a)corigine.com>
Acked-by: Simon Horman <simon.horman(a)corigine.com>
Signed-off-by: Louis Peens <louis.peens(a)corigine.com>
---
drivers/net/ethernet/netronome/nfp/crypto/ipsec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c
index c0dcce8ae437..b1f026b81dea 100644
--- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c
+++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c
@@ -269,7 +269,7 @@ static void set_sha2_512hmac(struct nfp_ipsec_cfg_add_sa *cfg, int *trunc_len)
static int nfp_net_xfrm_add_state(struct xfrm_state *x,
struct netlink_ext_ack *extack)
{
- struct net_device *netdev = x->xso.dev;
+ struct net_device *netdev = x->xso.real_dev;
struct nfp_ipsec_cfg_mssg msg = {};
int i, key_len, trunc_len, err = 0;
struct nfp_ipsec_cfg_add_sa *cfg;
@@ -513,7 +513,7 @@ static void nfp_net_xfrm_del_state(struct xfrm_state *x)
.cmd = NFP_IPSEC_CFG_MSSG_INV_SA,
.sa_idx = x->xso.offload_handle - 1,
};
- struct net_device *netdev = x->xso.dev;
+ struct net_device *netdev = x->xso.real_dev;
struct nfp_net *nn;
int err;
--
2.34.1
In upstream commit 77e52ae35463 ("futex: Move to kernel/futex/") the
futex code from kernel/futex.c was moved into kernel/futex/core in
preparation for the split-up of the implementation into various files.
Point kernel-doc references to the new files as otherwise the
documentation shows errors on build:
[...]
Error: Cannot open file ./kernel/futex.c
Error: Cannot open file ./kernel/futex.c
[...]
WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -sphinx-version 3.4.3 -internal ./kernel/futex.c' failed with return code 2
There is no direct upstream commit for this change. It is made in
analogy to commit bc67f1c454fb ("docs: futex: Fix kernel-doc
references") applied as consequence of the restructuring of the futex
code.
Fixes: 77e52ae35463 ("futex: Move to kernel/futex/")
Signed-off-by: Salvatore Bonaccorso <carnil(a)debian.org>
---
Documentation/kernel-hacking/locking.rst | 2 +-
Documentation/translations/it_IT/kernel-hacking/locking.rst | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index 6ed806e6061b..a6d89efede79 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -1358,7 +1358,7 @@ Mutex API reference
Futex API reference
===================
-.. kernel-doc:: kernel/futex.c
+.. kernel-doc:: kernel/futex/core.c
:internal:
Further reading
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst
index bf1acd6204ef..192ab8e28125 100644
--- a/Documentation/translations/it_IT/kernel-hacking/locking.rst
+++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst
@@ -1400,7 +1400,7 @@ Riferimento per l'API dei Mutex
Riferimento per l'API dei Futex
===============================
-.. kernel-doc:: kernel/futex.c
+.. kernel-doc:: kernel/futex/core.c
:internal:
Approfondimenti
--
2.40.0
From: Pingfan Liu <kernelfans(a)gmail.com>
Purgatory.ro is a standalone binary that is not linked against the rest of
the kernel. Its image is copied into an array that is linked to the
kernel, and from there kexec relocates it wherever it desires.
Unlike the debug info for vmlinux, which can be used for analyzing a
crash, such info is useless in purgatory.ro, and discarding it can save
about 200K of space.
Original:
259080 kexec-purgatory.o
Stripped debug info:
29152 kexec-purgatory.o
Signed-off-by: Pingfan Liu <kernelfans(a)gmail.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers(a)google.com>
Reviewed-by: Steve Wahl <steve.wahl(a)hpe.com>
Acked-by: Dave Young <dyoung(a)redhat.com>
Link: https://lore.kernel.org/r/1596433788-3784-1-git-send-email-kernelfans@gmail…
(cherry picked from commit 52416ffcf823ee11aa19792715664ab94757f111)
[Alyssa: fixed for LLVM_IAS=1 by adding -g to AFLAGS_REMOVE_*]
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
---
arch/x86/purgatory/Makefile | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 9733d1cc791d..969d2b2eb7d7 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -27,7 +27,7 @@ KCOV_INSTRUMENT := n
# make up the standalone purgatory.ro
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
-PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss
+PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
@@ -58,6 +58,9 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_string.o += $(PURGATORY_CFLAGS)
+AFLAGS_REMOVE_setup-x86_$(BITS).o += -g -Wa,-gdwarf-2
+AFLAGS_REMOVE_entry64.o += -g -Wa,-gdwarf-2
+
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
--
2.37.1
This is based on upstream commit
d83806c4c0cc ("purgatory: fix disabling debug info"), but adapted to
the linker flags used in 5.10.y.
Since 3a260e9844c9, the linker flags can contain -g instead of
-Wa,-gdwarf-2 (when using the LLVM assembler). As a result, in that
case, debug info was being generated for the purgatory objects, even
though the intention was that it not be.
Fixes: 3a260e9844c9 ("Makefile.debug: re-enable debug info for .S files")
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
Cc: stable(a)vger.kernel.org
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
---
arch/x86/purgatory/Makefile | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..ebaf329a2368 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -64,8 +64,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_string.o += $(PURGATORY_CFLAGS)
-AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2
+asflags-remove-y += -g -Wa,-gdwarf-2
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
--
2.37.1
This is based on upstream commit
d83806c4c0cc ("purgatory: fix disabling debug info"), but adapted to
the linker flags used in 5.15.y.
Since 0ee2f0567a56, the linker flags can contain -g instead of
-Wa,-gdwarf-2 (when using the LLVM assembler). As a result, in that
case, debug info was being generated for the purgatory objects, even
though the intention was that it not be.
Fixes: 0ee2f0567a56 ("Makefile.debug: re-enable debug info for .S files")
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
Cc: stable(a)vger.kernel.org
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
---
arch/x86/purgatory/Makefile | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..ebaf329a2368 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -64,8 +64,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_string.o += $(PURGATORY_CFLAGS)
-AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2
+asflags-remove-y += -g -Wa,-gdwarf-2
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
--
2.37.1
Since 32ef9e5054ec, -Wa,-gdwarf-2 is no longer used in KBUILD_AFLAGS.
Instead, it includes -g, the appropriate -gdwarf-* flag, and also the
-Wa versions of both of those if building with Clang and GNU as. As a
result, debug info was being generated for the purgatory objects, even
though the intention was that it not be.
Fixes: 32ef9e5054ec ("Makefile.debug: re-enable debug info for .S files")
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
Cc: stable(a)vger.kernel.org
Acked-by: Nick Desaulniers <ndesaulniers(a)google.com>
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
(cherry picked from commit d83806c4c0cccc0d6d3c3581a11983a9c186a138)
---
arch/riscv/purgatory/Makefile | 4 +---
arch/x86/purgatory/Makefile | 3 +--
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index dd58e1d99397..659e21862077 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -74,9 +74,7 @@ CFLAGS_string.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_ctype.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_ctype.o += $(PURGATORY_CFLAGS)
-AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2
+asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x))
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 17f09dc26381..82fec66d46d2 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_string.o += $(PURGATORY_CFLAGS)
-AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2
+asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x))
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
base-commit: cdc7aff9ed012801e62eedd99e4a5573eccac4db
--
2.37.1
The quilt patch titled
Subject: ia64: fix an addr to taddr in huge_pte_offset()
has been removed from the -mm tree. Its filename was
ia64-fix-an-addr-to-taddr-in-huge_pte_offset.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: ia64: fix an addr to taddr in huge_pte_offset()
Date: Sun, 16 Apr 2023 22:17:05 -0700 (PDT)
I know nothing of ia64 htlbpage_to_page(), but guess that the p4d
line should be using taddr rather than addr, like everywhere else.
Link: https://lkml.kernel.org/r/732eae88-3beb-246-2c72-281de786740@google.com
Fixes: c03ab9e32a2c ("ia64: add support for folded p4d page tables")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/ia64/mm/hugetlbpage.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/ia64/mm/hugetlbpage.c~ia64-fix-an-addr-to-taddr-in-huge_pte_offset
+++ a/arch/ia64/mm/hugetlbpage.c
@@ -58,7 +58,7 @@ huge_pte_offset (struct mm_struct *mm, u
pgd = pgd_offset(mm, taddr);
if (pgd_present(*pgd)) {
- p4d = p4d_offset(pgd, addr);
+ p4d = p4d_offset(pgd, taddr);
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, taddr);
if (pud_present(*pud)) {
_
Patches currently in -mm which might be from hughd(a)google.com are
The quilt patch titled
Subject: mm/hugetlb: fix uffd-wp during fork()
has been removed from the -mm tree. Its filename was
mm-hugetlb-fix-uffd-wp-during-fork.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Peter Xu <peterx(a)redhat.com>
Subject: mm/hugetlb: fix uffd-wp during fork()
Date: Mon, 17 Apr 2023 15:53:12 -0400
Patch series "mm/hugetlb: More fixes around uffd-wp vs fork() / RO pins",
v2.
This patch (of 6):
There're a bunch of things that were wrong:
- Reading uffd-wp bit from a swap entry should use pte_swp_uffd_wp()
rather than huge_pte_uffd_wp().
- When copying over a pte, we should drop uffd-wp bit when
!EVENT_FORK (aka, when !userfaultfd_wp(dst_vma)).
- When doing early CoW for private hugetlb (e.g. when the parent page was
pinned), uffd-wp bit should be properly carried over if necessary.
No bug was reported, probably because most people do not even care about
these corner cases, but they are still bugs and can be exposed by the
recently introduced unit tests, so fix all of them in one shot.
Link: https://lkml.kernel.org/r/20230417195317.898696-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20230417195317.898696-2-peterx@redhat.com
Fixes: bc70fbf269fd ("mm/hugetlb: handle uffd-wp during fork()")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Mika Penttilä <mpenttil(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-uffd-wp-during-fork
+++ a/mm/hugetlb.c
@@ -4953,11 +4953,15 @@ static bool is_hugetlb_entry_hwpoisoned(
static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
- struct folio *new_folio)
+ struct folio *new_folio, pte_t old)
{
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+
__folio_mark_uptodate(new_folio);
hugepage_add_new_anon_rmap(new_folio, vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1));
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
+ newpte = huge_pte_mkuffd_wp(newpte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, newpte);
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
folio_set_hugetlb_migratable(new_folio);
}
@@ -5032,14 +5036,12 @@ again:
*/
;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
- bool uffd_wp = huge_pte_uffd_wp(entry);
-
- if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_hugetlb_entry_migration(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- bool uffd_wp = huge_pte_uffd_wp(entry);
+ bool uffd_wp = pte_swp_uffd_wp(entry);
if (!is_readable_migration_entry(swp_entry) && cow) {
/*
@@ -5050,10 +5052,10 @@ again:
swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
if (userfaultfd_wp(src_vma) && uffd_wp)
- entry = huge_pte_mkuffd_wp(entry);
+ entry = pte_swp_mkuffd_wp(entry);
set_huge_pte_at(src, addr, src_pte, entry);
}
- if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
@@ -5118,7 +5120,8 @@ again:
/* huge_ptep of dst_pte won't change as in child */
goto again;
}
- hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
+ hugetlb_install_folio(dst_vma, dst_pte, addr,
+ new_folio, src_pte_old);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
@@ -5136,6 +5139,9 @@ again:
entry = huge_pte_wrprotect(entry);
}
+ if (!userfaultfd_wp(dst_vma))
+ entry = huge_pte_clear_uffd_wp(entry);
+
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(npages, dst);
}
_
Patches currently in -mm which might be from peterx(a)redhat.com are
On some Zhaoxin platforms, xHCI will prefetch TRBs for performance
improvement. However, this TRB prefetch mechanism may cross a page
boundary and access memory not allocated by the xHCI driver. In order to
fix this issue, two pages are allocated for TRBs and only the first
page will be used.
Cc: stable(a)vger.kernel.org
Signed-off-by: Weitao Wang <WeitaoWang-oc(a)zhaoxin.com>
---
drivers/usb/host/xhci-mem.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index d0a9467aa5fc..d5517400d874 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -2369,8 +2369,12 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags)
* and our use of dma addresses in the trb_address_map radix tree needs
* TRB_SEGMENT_SIZE alignment, so we pick the greater alignment need.
*/
- xhci->segment_pool = dma_pool_create("xHCI ring segments", dev,
- TRB_SEGMENT_SIZE, TRB_SEGMENT_SIZE, xhci->page_size);
+ if (xhci->quirks & XHCI_ZHAOXIN_TRB_FETCH)
+ xhci->segment_pool = dma_pool_create("xHCI ring segments", dev,
+ TRB_SEGMENT_SIZE * 2, TRB_SEGMENT_SIZE * 2, xhci->page_size * 2);
+ else
+ xhci->segment_pool = dma_pool_create("xHCI ring segments", dev,
+ TRB_SEGMENT_SIZE, TRB_SEGMENT_SIZE, xhci->page_size);
/* See Table 46 and Note on Figure 55 */
xhci->device_pool = dma_pool_create("xHCI input/output contexts", dev,
--
2.32.0
commit 08d0cc5f34265d1a1e3031f319f594bd1970976c upstream.
This change is desired because without it, it has been observed that
re-applying aspm settings can cause the system to crash with certain pci
devices (ie. Genesys GL9755).
Tested by issuing 100 suspend/resume cycles on a symptomatic system running
5.15.107.
L1 settings looked identical before and after:
```
localhost ~ # lspci -vvv -d 0x17a0: | grep L1Sub
L1SubCap: PCI-PM_L1.2+ PCI-PM_L1.1+ ASPM_L1.2+ ASPM_L1.1+ L1_PM_Substates+
L1SubCtl1: PCI-PM_L1.2- PCI-PM_L1.1+ ASPM_L1.2- ASPM_L1.1+
L1SubCtl2: T_PwrOn=3100us
```
Cc: <stable(a)vger.kernel.org> # 5.15.y
The OverCurrent condition is not standardized in the UHCI spec.
Zhaoxin UHCI controllers report the OverCurrent bit as active-off.
In order to handle the OverCurrent condition correctly, the uhci-hcd
driver needs to be told to expect the active-off behavior.
Suggested-by: Alan Stern <stern(a)rowland.harvard.edu>
Cc: stable(a)vger.kernel.org
Signed-off-by: Weitao Wang <WeitaoWang-oc(a)zhaoxin.com>
---
v1->v2
- Modify the description of this patch.
- Let Zhaoxin and VIA share a common oc_low flag
drivers/usb/host/uhci-pci.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/usb/host/uhci-pci.c b/drivers/usb/host/uhci-pci.c
index 3592f757fe05..034586911bb5 100644
--- a/drivers/usb/host/uhci-pci.c
+++ b/drivers/usb/host/uhci-pci.c
@@ -119,11 +119,12 @@ static int uhci_pci_init(struct usb_hcd *hcd)
uhci->rh_numports = uhci_count_ports(hcd);
- /* Intel controllers report the OverCurrent bit active on.
- * VIA controllers report it active off, so we'll adjust the
- * bit value. (It's not standardized in the UHCI spec.)
+ /* Intel controllers report the OverCurrent bit active on. VIA
+ * and ZHAOXIN controllers report it active off, so we'll adjust
+ * the bit value. (It's not standardized in the UHCI spec.)
*/
- if (to_pci_dev(uhci_dev(uhci))->vendor == PCI_VENDOR_ID_VIA)
+ if (to_pci_dev(uhci_dev(uhci))->vendor == PCI_VENDOR_ID_VIA ||
+ to_pci_dev(uhci_dev(uhci))->vendor == PCI_VENDOR_ID_ZHAOXIN)
uhci->oc_low = 1;
/* HP's server management chip requires a longer port reset delay. */
--
2.32.0
Hi all,
After merging the tip tree, today's linux-next build (arm
multi_v7_defconfig) failed like this:
/tmp/next/build/kernel/time/posix-cpu-timers.c: In function 'posix_cpu_timer_wait_running_nsleep':
/tmp/next/build/kernel/time/posix-cpu-timers.c:1310:30: error: 'timr' is a pointer; did you mean to use '->'?
1310 | spin_unlock_irq(&timr.it_lock);
| ^
| ->
/tmp/next/build/kernel/time/posix-cpu-timers.c:1312:28: error: 'timr' is a pointer; did you mean to use '->'?
1312 | spin_lock_irq(&timr.it_lock);
| ^
| ->
Caused by commit
2aaae4bf41b101f7e ("posix-cpu-timers: Implement the missing timer_wait_running callback")
The !POSIX_CPU_TIMERS_TASK_WORK case wasn't fully updated. I've used
the version of the tip tree from next-20230420 instead.
The following commit has been merged into the timers/core branch of tip:
Commit-ID: f7abf14f0001a5a47539d9f60bbdca649e43536b
Gitweb: https://git.kernel.org/tip/f7abf14f0001a5a47539d9f60bbdca649e43536b
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Mon, 17 Apr 2023 15:37:55 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 21 Apr 2023 15:34:33 +02:00
posix-cpu-timers: Implement the missing timer_wait_running callback
For some unknown reason the introduction of the timer_wait_running callback
missed fixing up posix CPU timers, which went unnoticed for almost four years.
Marco reported recently that the WARN_ON() in timer_wait_running()
triggers with a posix CPU timer test case.
Posix CPU timers have two execution models for expiring timers depending on
CONFIG_POSIX_CPU_TIMERS_TASK_WORK:
1) If not enabled, the expiry happens in hard interrupt context so
spin waiting on the remote CPU is reasonably time bound.
Implement an empty stub function for that case.
2) If enabled, the expiry happens in task work before returning to user
space or guest mode. The expired timers are marked as firing and moved
from the timer queue to a local list head with sighand lock held. Once
the timers are moved, sighand lock is dropped and the expiry happens in
fully preemptible context. That means the expiring task can be scheduled
out, migrated, interrupted etc. So spin waiting on it is more than
suboptimal.
The timer wheel has a timer_wait_running() mechanism for RT, which uses
a per CPU timer-base expiry lock which is held by the expiry code and the
task waiting for the timer function to complete blocks on that lock.
This does not work in the same way for posix CPU timers as there is no
timer base and expiry for process wide timers can run on any task
belonging to that process, but the concept of waiting on an expiry lock
can be used too in a slightly different way:
- Add a mutex to struct posix_cputimers_work. This struct is per task
and used to schedule the expiry task work from the timer interrupt.
- Add a task_struct pointer to struct cpu_timer which is used to store
the task which runs the expiry. That's filled in when the task
moves the expired timers to the local expiry list. That's not
affecting the size of the k_itimer union as there are bigger union
members already.
- Let the task take the expiry mutex around the expiry function
- Let the waiter acquire a task reference with rcu_read_lock() held and
block on the expiry mutex
This avoids spin-waiting on a task which might not even be on a CPU and
works nicely for RT too.
Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT")
Reported-by: Marco Elver <elver(a)google.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marco Elver <elver(a)google.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/87zg764ojw.ffs@tglx
---
include/linux/posix-timers.h | 17 ++++---
kernel/time/posix-cpu-timers.c | 81 +++++++++++++++++++++++++++------
kernel/time/posix-timers.c | 4 ++-
3 files changed, 82 insertions(+), 20 deletions(-)
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 2c6e99c..d607f51 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/alarmtimer.h>
#include <linux/timerqueue.h>
@@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk)
* cpu_timer - Posix CPU timer representation for k_itimer
* @node: timerqueue node to queue in the task/sig
* @head: timerqueue head on which this timer is queued
- * @task: Pointer to target task
+ * @pid: Pointer to target task PID
* @elist: List head for the expiry list
* @firing: Timer is currently firing
+ * @handling: Pointer to the task which handles expiry
*/
struct cpu_timer {
- struct timerqueue_node node;
- struct timerqueue_head *head;
- struct pid *pid;
- struct list_head elist;
- int firing;
+ struct timerqueue_node node;
+ struct timerqueue_head *head;
+ struct pid *pid;
+ struct list_head elist;
+ int firing;
+ struct task_struct __rcu *handling;
};
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
@@ -135,10 +138,12 @@ struct posix_cputimers {
/**
* posix_cputimers_work - Container for task work based posix CPU timer expiry
* @work: The task work to be scheduled
+ * @mutex: Mutex held around expiry in context of this task work
* @scheduled: @work has been scheduled already, no further processing
*/
struct posix_cputimers_work {
struct callback_head work;
+ struct mutex mutex;
unsigned int scheduled;
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b3..e9c6f9d 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a..808a247 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
Thomas,
Here's a small collection of irqchip patches for 6.4. The only
significant thing is the RISC-V IPI rework, which spans both the
irqchip subsystem and the arch code (and is Acked by Palmer).
The rest is a bunch of errata workarounds, fixes and cleanups.
Please pull,
M.
The following changes since commit 197b6b60ae7bc51dd0814953c562833143b292aa:
Linux 6.3-rc4 (2023-03-26 14:40:20 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git tags/irqchip-6.4
for you to fetch changes up to 2ff1b0839ddd514be4752c64c1c6facf91ff3a56:
Merge branch irq/misc-6.4 into irq/irqchip-next (2023-04-21 14:05:31 +0100)
----------------------------------------------------------------
irqchip changes for 6.4
- Large RISC-V IPI rework to make way for a new interrupt
architecture
- More LoongArch fixes from Jianmin Lv, fixing issues in the so-called
"dual-bridge" systems.
- Workaround for the NVIDIA T241 chip that gets confused in
3 and 4 socket configurations, leading to the GIC
malfunctioning in some contexts
- Drop support for non-firmware driven GIC configurations
now that the old ARM11MP Cavium board is gone
- Workaround for the Rockchip 3588 chip that doesn't
correctly deal with the shareability attributes.
- Replace uses of of_find_property() with the more appropriate
of_property_read_bool()
- Make bcm-6345-l1 request its MMIO region
- Add suspend support to the SiFive PLIC
- Drop support for stih415, stih416 and stid127 platforms
----------------------------------------------------------------
Alain Volmat (1):
irqchip/st: Remove stih415/stih416 and stid127 platforms support
Anup Patel (7):
RISC-V: Clear SIP bit only when using SBI IPI operations
irqchip/riscv-intc: Allow drivers to directly discover INTC hwnode
RISC-V: Treat IPIs as normal Linux IRQs
RISC-V: Allow marking IPIs as suitable for remote FENCEs
RISC-V: Use IPIs for remote TLB flush when possible
RISC-V: Use IPIs for remote icache flush when possible
irqchip/riscv-intc: Add empty irq_eoi() for chained irq handlers
Jianmin Lv (5):
irqchip/loongson-eiointc: Fix returned value on parsing MADT
irqchip/loongson-eiointc: Fix incorrect use of acpi_get_vec_parent
irqchip/loongson-eiointc: Fix registration of syscore_ops
irqchip/loongson-pch-pic: Fix registration of syscore_ops
irqchip/loongson-pch-pic: Fix pch_pic_acpi_init calling
Marc Zyngier (5):
irqchip/gic: Drop support for board files
Merge branch irq/gic-6.4 into irq/irqchip-next
Merge branch irq/riscv-ipi into irq/irqchip-next
Merge branch irq/loongarch-fixes-6.4 into irq/irqchip-next
Merge branch irq/misc-6.4 into irq/irqchip-next
Mason Huo (1):
irqchip/irq-sifive-plic: Add syscore callbacks for hibernation
Rob Herring (1):
irqchip: Use of_property_read_bool() for boolean properties
Sebastian Reichel (1):
irqchip/gic-v3: Add Rockchip 3588001 erratum workaround
Shanker Donthineni (1):
irqchip/gicv3: Workaround for NVIDIA erratum T241-FABRIC-4
Álvaro Fernández Rojas (1):
irqchip/bcm-6345-l1: Request memory region
Documentation/arm64/silicon-errata.rst | 5 +
arch/arm64/Kconfig | 10 ++
arch/riscv/Kconfig | 2 +
arch/riscv/include/asm/irq.h | 4 +
arch/riscv/include/asm/sbi.h | 9 +-
arch/riscv/include/asm/smp.h | 49 +++++++---
arch/riscv/kernel/Makefile | 1 +
arch/riscv/kernel/cpu-hotplug.c | 3 +-
arch/riscv/kernel/irq.c | 21 +++-
arch/riscv/kernel/sbi-ipi.c | 77 +++++++++++++++
arch/riscv/kernel/sbi.c | 100 +++----------------
arch/riscv/kernel/smp.c | 171 +++++++++++++++++----------------
arch/riscv/kernel/smpboot.c | 5 +-
arch/riscv/mm/cacheflush.c | 5 +-
arch/riscv/mm/tlbflush.c | 93 +++++++++++++++---
drivers/clocksource/timer-clint.c | 65 ++++++++++---
drivers/firmware/smccc/smccc.c | 26 +++++
drivers/firmware/smccc/soc_id.c | 28 +-----
drivers/irqchip/Kconfig | 3 +
drivers/irqchip/irq-bcm6345-l1.c | 6 +-
drivers/irqchip/irq-csky-apb-intc.c | 2 +-
drivers/irqchip/irq-gic-v2m.c | 2 +-
drivers/irqchip/irq-gic-v3-its.c | 35 +++++++
drivers/irqchip/irq-gic-v3.c | 115 +++++++++++++++++++---
drivers/irqchip/irq-gic.c | 60 +-----------
drivers/irqchip/irq-loongson-eiointc.c | 32 ++++--
drivers/irqchip/irq-loongson-pch-pic.c | 6 +-
drivers/irqchip/irq-riscv-intc.c | 71 ++++++++------
drivers/irqchip/irq-sifive-plic.c | 93 +++++++++++++++++-
drivers/irqchip/irq-st.c | 15 ---
include/linux/arm-smccc.h | 18 ++++
include/linux/irqchip/arm-gic.h | 6 --
32 files changed, 761 insertions(+), 377 deletions(-)
create mode 100644 arch/riscv/kernel/sbi-ipi.c
From: Kornel Dulęba <korneld(a)chromium.org>
Leverage gpiochip_line_is_irq to check whether a pin has an irq
associated with it. The previous check ("irq == 0") didn't make much
sense. The irq variable refers to the pinctrl irq, and has nothing to do
with an individual pin.
On some systems, during the suspend/resume cycle, the firmware leaves
an interrupt enabled on a pin that is not used by the kernel.
Without this patch, that caused an interrupt storm.
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217315
Signed-off-by: Kornel Dulęba <korneld(a)chromium.org>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
drivers/pinctrl/pinctrl-amd.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index 24465010397b..675c9826b78a 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -660,21 +660,21 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id)
* We must read the pin register again, in case the
* value was changed while executing
* generic_handle_domain_irq() above.
- * If we didn't find a mapping for the interrupt,
- * disable it in order to avoid a system hang caused
- * by an interrupt storm.
+ * If the line is not an irq, disable it in order to
+ * avoid a system hang caused by an interrupt storm.
*/
raw_spin_lock_irqsave(&gpio_dev->lock, flags);
regval = readl(regs + i);
- if (irq == 0) {
- regval &= ~BIT(INTERRUPT_ENABLE_OFF);
+ if (!gpiochip_line_is_irq(gc, irqnr + i)) {
+ regval &= ~BIT(INTERRUPT_MASK_OFF);
dev_dbg(&gpio_dev->pdev->dev,
"Disabling spurious GPIO IRQ %d\n",
irqnr + i);
+ } else {
+ ret = true;
}
writel(regval, regs + i);
raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
- ret = true;
}
}
/* did not cause wake on resume context for shared IRQ */
--
2.34.1
commit 4e5a04be88fe ("pinctrl: amd: disable and mask interrupts on probe")
had a mistake: in loop iteration 63 it would clear offset 0xFC instead
of 0x100. Offset 0xFC is actually `WAKE_INT_MASTER_REG`. This was
clearing bits 13 and 15 of that register, which significantly changed the
expected handling of GPIO0 on some platforms.
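For reference, the offset arithmetic is easy to check outside the kernel. The
following is only a standalone sketch; the 0xFC offset of WAKE_INT_MASTER_REG
and bits 13/15 come from the text above, everything else is illustrative:
```
#include <stdio.h>

#define WAKE_INT_MASTER_REG 0xfc

int main(void)
{
	unsigned int i = 63;
	unsigned int offset = i * 4;                     /* 0xfc, not 0x100 */
	unsigned int cleared = (1u << 13) | (1u << 15);  /* bits wiped by the bug */

	printf("iteration %u writes offset 0x%x (WAKE_INT_MASTER_REG is 0x%x)\n",
	       i, offset, WAKE_INT_MASTER_REG);
	printf("bits unintentionally cleared: 0x%x\n", cleared);
	return 0;
}
```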
commit b26cd9325be4 ("pinctrl: amd: Disable and mask interrupts on resume")
actually fixed this bug, but led to regressions on the Lenovo Z13 and some
other systems. This is because there was no handling in the driver for the
bit 15 debounce behavior.
Quoting a public BKDG:
```
EnWinBlueBtn. Read-write. Reset: 0. 0=GPIO0 detect debounced power button;
Power button override is 4 seconds. 1=GPIO0 detect debounced power button
in S3/S5/S0i3, and detect "pressed less than 2 seconds" and "pressed 2~10
seconds" in S0; Power button override is 10 seconds
```
Cross-referencing the same master register on Windows makes it obvious
that Windows doesn't use debounce values in this configuration, so align
the Linux driver to do this as well. This fixes wake on lid when
WAKE_INT_MASTER_REG is properly programmed.
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217315
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
drivers/pinctrl/pinctrl-amd.c | 7 +++++++
drivers/pinctrl/pinctrl-amd.h | 1 +
2 files changed, 8 insertions(+)
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index c250110f6775..6b9ae92017d4 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -125,6 +125,12 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset,
struct amd_gpio *gpio_dev = gpiochip_get_data(gc);
raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+
+ /* Use special handling for Pin0 debounce */
+ pin_reg = readl(gpio_dev->base + WAKE_INT_MASTER_REG);
+ if (pin_reg & INTERNAL_GPIO0_DEBOUNCE)
+ debounce = 0;
+
pin_reg = readl(gpio_dev->base + offset * 4);
if (debounce) {
@@ -219,6 +225,7 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc)
char *debounce_enable;
char *wake_cntrlz;
+ seq_printf(s, "WAKE_INT_MASTER_REG: 0x%08x\n", readl(gpio_dev->base + WAKE_INT_MASTER_REG));
for (bank = 0; bank < gpio_dev->hwbank_num; bank++) {
unsigned int time = 0;
unsigned int unit = 0;
diff --git a/drivers/pinctrl/pinctrl-amd.h b/drivers/pinctrl/pinctrl-amd.h
index 81ae8319a1f0..1cf2d06bbd8c 100644
--- a/drivers/pinctrl/pinctrl-amd.h
+++ b/drivers/pinctrl/pinctrl-amd.h
@@ -17,6 +17,7 @@
#define AMD_GPIO_PINS_BANK3 32
#define WAKE_INT_MASTER_REG 0xfc
+#define INTERNAL_GPIO0_DEBOUNCE (1 << 15)
#define EOI_MASK (1 << 29)
#define WAKE_INT_STATUS_REG0 0x2f8
--
2.34.1
Currently, on a handful of ASICs, we allow the framebuffer for a given
plane to exist in either VRAM or GTT. However, if the plane's new
framebuffer is in a different memory domain than its previous
framebuffer, flipping between them can cause the screen to flicker. So,
to fix this, don't perform an immediate flip in the aforementioned case.
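As a rough standalone sketch of the decision (the names below are invented
for illustration and are not the driver's), an async flip is only honored
immediately when the update is fast and both framebuffers share a memory
domain:
```
#include <stdbool.h>
#include <stdio.h>

enum mem_domain { MEM_DOMAIN_VRAM, MEM_DOMAIN_GTT };

static bool allow_immediate_flip(bool async_flip, bool fast_update,
				 enum mem_domain old_fb_domain,
				 enum mem_domain new_fb_domain)
{
	/* Only flip immediately when the memory domain did not change. */
	return async_flip && fast_update && old_fb_domain == new_fb_domain;
}

int main(void)
{
	/* Old framebuffer in GTT, new one in VRAM: fall back to a vsync'd flip. */
	printf("immediate flip allowed: %d\n",
	       allow_immediate_flip(true, true, MEM_DOMAIN_GTT, MEM_DOMAIN_VRAM));
	return 0;
}
```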
Cc: stable(a)vger.kernel.org
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2354
Reviewed-by: Roman Li <Roman.Li(a)amd.com>
Fixes: 81d0bcf99009 ("drm/amdgpu: make display pinning more flexible (v2)")
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
---
v2: make a number of clarifications to the commit message and drop
locking.
v3: use a stronger check
v4: drop mem_type
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index dfcb9815b5a8..76a776fd8437 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -7900,6 +7900,13 @@ static void amdgpu_dm_commit_cursors(struct drm_atomic_state *state)
amdgpu_dm_plane_handle_cursor_update(plane, old_plane_state);
}
+static inline uint32_t get_mem_type(struct drm_framebuffer *fb)
+{
+ struct amdgpu_bo *abo = gem_to_amdgpu_bo(fb->obj[0]);
+
+ return abo->tbo.resource ? abo->tbo.resource->mem_type : 0;
+}
+
static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
struct dc_state *dc_state,
struct drm_device *dev,
@@ -8042,11 +8049,13 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
/*
* Only allow immediate flips for fast updates that don't
- * change FB pitch, DCC state, rotation or mirroing.
+ * change memory domain, FB pitch, DCC state, rotation or
+ * mirroring.
*/
bundle->flip_addrs[planes_count].flip_immediate =
crtc->state->async_flip &&
- acrtc_state->update_type == UPDATE_TYPE_FAST;
+ acrtc_state->update_type == UPDATE_TYPE_FAST &&
+ get_mem_type(old_plane_state->fb) == get_mem_type(fb);
timestamp_ns = ktime_get_ns();
bundle->flip_addrs[planes_count].flip_timestamp_in_us = div_u64(timestamp_ns, 1000);
--
2.40.0
Until now, the page table walker counted increments to the PA and IPA
of a walk in two separate places. While the PA is incremented as soon as
a leaf PTE is installed in stage2_map_walker_try_leaf(), the IPA is
actually bumped in the generic table walker context. Critically,
__kvm_pgtable_visit() rereads the PTE after the LEAF callback returns
to work out if a table or leaf was installed, and only bumps the IPA for
a leaf PTE.
This arrangement worked fine when we handled faults behind the write lock,
as the walker had exclusive access to the stage-2 page tables. However,
commit 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel")
started handling all stage-2 faults behind the read lock, opening up a
race where a walker could increment the PA but not the IPA of a walk.
Nothing good ensues, as the walker starts mapping with the incorrect
IPA -> PA relationship.
For example, assume that two vCPUs took a data abort on the same IPA.
One observes that dirty logging is disabled, and the other observes that
it is enabled:
  vCPU attempting PMD mapping              vCPU attempting PTE mapping
  ======================================   =====================================
  /* install PMD */
  stage2_make_pte(ctx, leaf);
  data->phys += granule;
                                           /* replace PMD with a table */
                                           stage2_try_break_pte(ctx, data->mmu);
                                           stage2_make_pte(ctx, table);
  /* table is observed */
  ctx.old = READ_ONCE(*ptep);
  table = kvm_pte_table(ctx.old, level);

  /*
   * map walk continues w/o incrementing
   * IPA.
   */
  __kvm_pgtable_walk(..., level + 1);
Bring an end to the whole mess by using the IPA as the single source of
truth for how far along a walk has gotten. Work out the correct PA to
map by calculating the IPA offset from the beginning of the walk and adding
that to the starting physical address.
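In other words, the PA for any step of the walk can be recomputed from the
IPA progress alone. A minimal standalone illustration of that arithmetic,
with made-up addresses (the real walker does the equivalent of
phys = data->phys + (ctx->addr - ctx->start)):
```
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t walk_start_ipa = 0x80000000ULL;  /* IPA where the walk began */
	uint64_t phys_base      = 0x40000000ULL;  /* PA supplied for that IPA */
	uint64_t cur_ipa        = 0x80203000ULL;  /* IPA the walker is visiting */

	/* PA to map = starting PA + (current IPA - starting IPA) */
	uint64_t phys = phys_base + (cur_ipa - walk_start_ipa);

	printf("map IPA 0x%llx -> PA 0x%llx\n",
	       (unsigned long long)cur_ipa, (unsigned long long)phys);
	return 0;
}
```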
Cc: stable(a)vger.kernel.org
Fixes: 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/include/asm/kvm_pgtable.h | 1 +
arch/arm64/kvm/hyp/pgtable.c | 32 ++++++++++++++++++++++++----
2 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4cd6762bda80..dc3c072e862f 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -209,6 +209,7 @@ struct kvm_pgtable_visit_ctx {
kvm_pte_t old;
void *arg;
struct kvm_pgtable_mm_ops *mm_ops;
+ u64 start;
u64 addr;
u64 end;
u32 level;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 3d61bd3e591d..140f82300db5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -58,6 +58,7 @@
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
+ u64 start;
u64 addr;
u64 end;
};
@@ -201,6 +202,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
.old = READ_ONCE(*ptep),
.arg = data->walker->arg,
.mm_ops = mm_ops,
+ .start = data->start,
.addr = data->addr,
.end = data->end,
.level = level,
@@ -293,6 +295,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
+ .start = ALIGN_DOWN(addr, PAGE_SIZE),
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
@@ -794,20 +797,43 @@ static bool stage2_pte_executable(kvm_pte_t pte)
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
+static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
+ const struct stage2_map_data *data)
+{
+ u64 phys = data->phys;
+
+ /*
+ * Stage-2 walks to update ownership data are communicated to the map
+ * walker using an invalid PA. Avoid offsetting an already invalid PA,
+ * which could overflow and make the address valid again.
+ */
+ if (!kvm_phys_is_valid(phys))
+ return phys;
+
+ /*
+ * Otherwise, work out the correct PA based on how far the walk has
+ * gotten.
+ */
+ return phys + (ctx->addr - ctx->start);
+}
+
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+
if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
return false;
- return kvm_block_mapping_supported(ctx, data->phys);
+ return kvm_block_mapping_supported(ctx, phys);
}
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
kvm_pte_t new;
- u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+ u64 granule = kvm_granule_size(ctx->level);
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
@@ -841,8 +867,6 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
stage2_make_pte(ctx, new);
- if (kvm_phys_is_valid(phys))
- data->phys += granule;
return 0;
}
--
2.40.0.634.g4ca3ef3211-goog
Do not call gadget stop until the poll for the controller halt is
completed. DEVTEN is cleared as part of gadget stop, so the intention of
allowing ep0 events to continue while waiting for the controller halt is
currently not fulfilled.
Fixes: c96683798e27 ("usb: dwc3: ep0: Don't prepare beyond Setup stage")
Cc: stable(a)vger.kernel.org
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Wesley Cheng <quic_wcheng(a)quicinc.com>
---
drivers/usb/dwc3/gadget.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 9f492c8a7d0b..dd6057bad37e 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -2637,7 +2637,6 @@ static int dwc3_gadget_soft_disconnect(struct dwc3 *dwc)
* bit.
*/
dwc3_stop_active_transfers(dwc);
- __dwc3_gadget_stop(dwc);
spin_unlock_irqrestore(&dwc->lock, flags);
/*
@@ -2674,7 +2673,19 @@ static int dwc3_gadget_soft_disconnect(struct dwc3 *dwc)
* remaining event generated by the controller while polling for
* DSTS.DEVCTLHLT.
*/
- return dwc3_gadget_run_stop(dwc, false);
+ ret = dwc3_gadget_run_stop(dwc, false);
+
+ /*
+ * Stop the gadget after controller is halted, so that if needed, the
+ * events to update EP0 state can still occur while the run/stop
+ * routine polls for the halted state. DEVTEN is cleared as part of
+ * gadget stop.
+ */
+ spin_lock_irqsave(&dwc->lock, flags);
+ __dwc3_gadget_stop(dwc);
+ spin_unlock_irqrestore(&dwc->lock, flags);
+
+ return ret;
}
static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
Hi,
I would like to see these patches backported. They are needed so perf
can be cross-compiled with gcc on v5.15, v5.10 and v5.4.
I built it with tuxmake [1]; here are two example command lines:
tuxmake --runtime podman --target-arch arm64 --toolchain gcc-12 --kconfig defconfig perf
tuxmake --runtime podman --target-arch x86_64 --toolchain gcc-12 --kconfig defconfig perf
I tried to build perf with both gcc-11 and gcc-12.
The patches 'tools perf: Fix compilation error with new binutils'
and 'tools build: Add feature test for init_disassemble_info API changes'
didn't apply cleanly; that's why I'm sending these as a patchset.
When applying 'tools build: Add feature test for
init_disassemble_info API changes' to 5.4 there will be a minor merge
conflict; do you want me to send this patch as two separate patches, one
for v5.4 and another for v5.10?
The SHAs for these two patches in mainline are:
cfd59ca91467 tools build: Add feature test for init_disassemble_info API changes
83aa0120487e tools perf: Fix compilation error with new binutils
The above patches solve these errors:
util/annotate.c: In function 'symbol__disassemble_bpf':
util/annotate.c:1729:9: error: too few arguments to function 'init_disassemble_info'
1729 | init_disassemble_info(&info, s,
| ^~~~~~~~~~~~~~~~~~~~~
Please apply these to v5.10 and v5.4
a45b3d692623 tools include: add dis-asm-compat.h to handle version differences
d08c84e01afa perf sched: Cast PTHREAD_STACK_MIN to int as it may turn into sysconf(__SC_THREAD_STACK>
The above patches solve these errors:
/home/anders/src/kernel/stable-5.10/tools/include/linux/kernel.h:43:24: error: comparison of distinct pointer types lacks a cast [-Werror]
43 | (void) (&_max1 == &_max2); \
| ^~
builtin-sched.c:673:34: note: in expansion of macro 'max'
673 | (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
| ^~~
Please apply these to v5.15, v5.10 and v5.4
8e8bf60a6754 perf build: Fixup disabling of -Wdeprecated-declarations for the python scripting engine
4ee3c4da8b1b perf scripting python: Do not build fail on deprecation warnings
63a4354ae75c perf scripting perl: Ignore some warnings to keep building with perl headers
The build error that the above 3 patches solve is:
/usr/lib/x86_64-linux-gnu/perl/5.36/CORE/handy.h:125:23: error: cast from function call of type 'STRLEN' {aka 'long unsigned int'} to non-matching type '_Bool' [-Werror=bad-function-cast]
125 | #define cBOOL(cbool) ((bool) (cbool))
| ^
Cheers,
Anders
[1] https://tuxmake.org/
Andres Freund (2):
tools perf: Fix compilation error with new binutils
tools build: Add feature test for init_disassemble_info API changes
tools/build/Makefile.feature | 1 +
tools/build/feature/Makefile | 4 ++++
tools/build/feature/test-all.c | 4 ++++
tools/build/feature/test-disassembler-init-styled.c | 13 +++++++++++++
tools/perf/Makefile.config | 8 ++++++++
tools/perf/util/annotate.c | 7 ++++---
6 files changed, 34 insertions(+), 3 deletions(-)
create mode 100644 tools/build/feature/test-disassembler-init-styled.c
--
2.39.2
I'm announcing the release of the 4.19.281 kernel.
All users of the 4.19 kernel series must upgrade.
The updated 4.19.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.19.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Documentation/sound/hd-audio/models.rst | 2
Makefile | 2
arch/arm64/kvm/guest.c | 83 +++++++++++++++++++-----
arch/x86/kernel/sysfb_efi.c | 8 ++
arch/x86/kvm/vmx/vmx.c | 10 ++
arch/x86/pci/fixup.c | 21 ++++++
crypto/asymmetric_keys/verify_pefile.c | 12 ++-
drivers/gpio/gpio-davinci.c | 2
drivers/hwtracing/coresight/coresight-etm4x.c | 2
drivers/i2c/busses/i2c-imx-lpi2c.c | 2
drivers/iio/dac/cio-dac.c | 4 -
drivers/mtd/mtdblock.c | 12 ++-
drivers/mtd/ubi/build.c | 21 ++++--
drivers/mtd/ubi/wl.c | 5 -
drivers/net/ethernet/cadence/macb_main.c | 4 +
drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 8 ++
drivers/net/ethernet/sun/niu.c | 2
drivers/pinctrl/pinctrl-amd.c | 56 ++++++++++++----
drivers/power/supply/cros_usbpd-charger.c | 2
drivers/pwm/pwm-cros-ec.c | 1
drivers/scsi/ses.c | 20 ++---
drivers/tty/serial/sh-sci.c | 9 ++
drivers/usb/serial/cp210x.c | 1
drivers/usb/serial/option.c | 10 ++
drivers/watchdog/sbsa_gwdt.c | 1
fs/nfs/nfs4_fs.h | 2
fs/nfs/nfs4proc.c | 25 ++++---
fs/nfs/nfs4state.c | 8 +-
fs/nilfs2/segment.c | 3
fs/nilfs2/super.c | 2
fs/nilfs2/the_nilfs.c | 12 ++-
include/linux/ftrace.h | 2
kernel/cgroup/cpuset.c | 4 -
kernel/events/core.c | 2
kernel/trace/ring_buffer.c | 13 +++
mm/swapfile.c | 3
net/9p/trans_xen.c | 4 +
net/bluetooth/hidp/core.c | 2
net/bluetooth/l2cap_core.c | 24 +-----
net/core/netpoll.c | 19 +++++
net/ipv4/icmp.c | 5 +
net/ipv6/ip6_output.c | 7 +-
net/ipv6/udp.c | 8 +-
net/mac80211/sta_info.c | 3
net/sctp/socket.c | 4 +
net/sctp/stream_interleave.c | 3
sound/i2c/cs8427.c | 7 +-
sound/pci/emu10k1/emupcm.c | 4 -
sound/pci/hda/patch_realtek.c | 1
sound/pci/hda/patch_sigmatel.c | 10 ++
50 files changed, 349 insertions(+), 128 deletions(-)
Alexander Stein (1):
i2c: imx-lpi2c: clean rx/tx buffers upon new message
Bang Li (1):
mtdblock: tolerate corrected bit-flips
Basavaraj Natikar (1):
x86/PCI: Add quirk for AMD XHCI controller that loses MSI-X state in D3hot
Biju Das (2):
tty: serial: sh-sci: Fix transmit end interrupt handler
tty: serial: sh-sci: Fix Rx on RZ/G2L SCI
Bjørn Mork (1):
USB: serial: option: add Quectel RM500U-CN modem
Dave Martin (2):
KVM: arm64: Factor out core register ID enumeration
KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST
Denis Plotnikov (1):
qlcnic: check pci_reset_function result
Dhruva Gole (1):
gpio: davinci: Add irq chip flag to skip set wake
Enrico Sau (1):
USB: serial: option: add Telit FE990 compositions
Eric Dumazet (2):
icmp: guard against too small mtu
udp6: fix potential access to stale information
Felix Fietkau (1):
wifi: mac80211: fix invalid drv_sta_pre_rcu_remove calls for non-uploaded sta
George Cherian (1):
watchdog: sbsa_wdog: Make sure the timeout programming is within the limits
Grant Grundler (1):
power: supply: cros_usbpd: reclassify "default case!" as debug
Greg Kroah-Hartman (1):
Linux 4.19.281
Hans de Goede (1):
efi: sysfb_efi: Add quirk for Lenovo Yoga Book X91F/L
Harshit Mogalapalli (1):
niu: Fix missing unwind goto in niu_alloc_channels()
Jakub Kicinski (1):
net: don't let netpoll invoke NAPI if in xmit context
Jeremy Soller (1):
ALSA: hda/realtek: Add quirk for Clevo X370SNW
Jiri Kosina (1):
scsi: ses: Handle enclosure with just a primary component gracefully
John Keeping (1):
ftrace: Mark get_lock_parent_ip() __always_inline
Kan Liang (1):
perf/core: Fix the same task check in perf_event_set_output
Kees Jan Koster (1):
USB: serial: cp210x: add Silicon Labs IFS-USB-DATACABLE IDs
Kornel Dulęba (2):
pinctrl: amd: Disable and mask interrupts on resume
Revert "pinctrl: amd: Disable and mask interrupts on resume"
Lee Jones (1):
mtd: ubi: wl: Fix a couple of kernel-doc issues
Linus Walleij (1):
pinctrl: amd: Use irqchip template
Luiz Augusto von Dentz (1):
Bluetooth: L2CAP: Fix use-after-free in l2cap_disconnect_{req,rsp}
Marc Zyngier (1):
arm64: KVM: Fix system register enumeration
Min Li (1):
Bluetooth: Fix race condition in hidp_session_thread
Oswald Buddenhagen (4):
ALSA: emu10k1: fix capture interrupt handler unlinking
ALSA: hda/sigmatel: add pin overrides for Intel DP45SG motherboard
ALSA: i2c/cs8427: fix iec958 mixer control deactivation
ALSA: hda/sigmatel: fix S/PDIF out on Intel D*45* motherboards
Paolo Bonzini (1):
KVM: nVMX: add missing consistency checks for CR0 and CR4
Robbie Harwood (1):
verify_pefile: relax wrapper length check
Roman Gushchin (1):
net: macb: fix a memory corruption in extended buffer descriptor mode
Rongwei Wang (1):
mm/swap: fix swap_info_struct race between swapoff and get_swap_pages()
Ryusuke Konishi (2):
nilfs2: fix potential UAF of struct nilfs_sc_info in nilfs_segctor_thread()
nilfs2: fix sysfs interface lifetime
Sachi King (1):
pinctrl: amd: disable and mask interrupts on probe
Sandeep Singh (1):
pinctrl: Added IRQF_SHARED flag for amd-pinctrl driver
Steve Clevenger (1):
coresight-etm4: Fix for() loop drvdata->nr_addr_cmp range bug
Trond Myklebust (3):
NFSv4: Convert struct nfs4_state to use refcount_t
NFSv4: Check the return value of update_open_stateid()
NFSv4: Fix hangs when recovering open state after a server reboot
Uwe Kleine-König (1):
pwm: cros-ec: Explicitly set .polarity in .get_state()
Waiman Long (1):
cgroup/cpuset: Wake up cpuset_attach_wq tasks in cpuset_cancel_attach()
William Breathitt Gray (1):
iio: dac: cio-dac: Fix max DAC write value check for 12-bit
Xin Long (2):
sctp: check send stream number after wait_for_sndbuf
sctp: fix a potential overflow in sctp_ifwdtsn_skip
ZhaoLong Wang (1):
ubi: Fix deadlock caused by recursively holding work_sem
Zheng Wang (1):
9p/xen : Fix use after free bug in xen_9pfs_front_remove due to race condition
Zheng Yejian (1):
ring-buffer: Fix race while reader and writer are on the same page
Zhihao Cheng (1):
ubi: Fix failure attaching when vid_hdr offset equals to (sub)page size
Ziyang Xuan (1):
ipv6: Fix an uninit variable access bug in __ip6_make_skb()
The following commit has been merged into the timers/core branch of tip:
Commit-ID: 2aaae4bf41b101f7e58e8b06778b1cd9a1dddf94
Gitweb: https://git.kernel.org/tip/2aaae4bf41b101f7e58e8b06778b1cd9a1dddf94
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Mon, 17 Apr 2023 15:37:55 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Thu, 20 Apr 2023 09:47:26 +02:00
posix-cpu-timers: Implement the missing timer_wait_running callback
For some unknown reason the introduction of the timer_wait_running callback
failed to fix up posix CPU timers, which went unnoticed for almost four years.
Marco reported recently that the WARN_ON() in timer_wait_running()
triggers with a posix CPU timer test case.
Posix CPU timers have two execution models for expiring timers depending on
CONFIG_POSIX_CPU_TIMERS_TASK_WORK:
1) If not enabled, the expiry happens in hard interrupt context so
spin waiting on the remote CPU is reasonably time bound.
Implement an empty stub function for that case.
2) If enabled, the expiry happens in task work before returning to user
space or guest mode. The expired timers are marked as firing and moved
from the timer queue to a local list head with sighand lock held. Once
the timers are moved, sighand lock is dropped and the expiry happens in
fully preemptible context. That means the expiring task can be scheduled
out, migrated, interrupted etc. So spin waiting on it is more than
suboptimal.
The timer wheel has a timer_wait_running() mechanism for RT, which uses
a per CPU timer-base expiry lock which is held by the expiry code and the
task waiting for the timer function to complete blocks on that lock.
This does not work in the same way for posix CPU timers as there is no
timer base and expiry for process wide timers can run on any task
belonging to that process, but the concept of waiting on an expiry lock
can be used too in a slightly different way:
- Add a mutex to struct posix_cputimers_work. This struct is per task
and used to schedule the expiry task work from the timer interrupt.
- Add a task_struct pointer to struct cpu_timer which is used to store
the task which runs the expiry. That's filled in when the task
moves the expired timers to the local expiry list. That does not
affect the size of the k_itimer union as there are bigger union
members already.
- Let the task take the expiry mutex around the expiry function
- Let the waiter acquire a task reference with rcu_read_lock() held and
block on the expiry mutex
This avoids spin-waiting on a task which might not even be on a CPU and
works nicely for RT too.
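As a rough userspace analogy of that waiting scheme (pthreads, invented
names, not the kernel implementation): the expiry side holds a mutex for the
duration of the expiry function, and the waiter simply acquires and releases
that same mutex, so it blocks until expiry is done instead of spinning:
```
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t expiry_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *expiry_thread(void *arg)
{
	pthread_mutex_lock(&expiry_mutex);	/* "timer is firing" */
	printf("expiry: running timer function\n");
	sleep(1);				/* simulate preemptible expiry work */
	printf("expiry: done\n");
	pthread_mutex_unlock(&expiry_mutex);
	return NULL;
}

static void wait_for_expiry(void)
{
	/* Block until the expiry function has completed, then release again. */
	pthread_mutex_lock(&expiry_mutex);
	pthread_mutex_unlock(&expiry_mutex);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, expiry_thread, NULL);
	usleep(100 * 1000);	/* let the expiry side grab the mutex first */
	printf("waiter: timer is firing, waiting instead of spinning\n");
	wait_for_expiry();
	printf("waiter: expiry finished, safe to retry deletion\n");
	pthread_join(&t, NULL);
	return 0;
}
```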
Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT")
Reported-by: Marco Elver <elver(a)google.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marco Elver <elver(a)google.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/87zg764ojw.ffs@tglx
---
include/linux/posix-timers.h | 17 ++++---
kernel/time/posix-cpu-timers.c | 81 +++++++++++++++++++++++++++------
kernel/time/posix-timers.c | 4 ++-
3 files changed, 82 insertions(+), 20 deletions(-)
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 2c6e99c..d607f51 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/alarmtimer.h>
#include <linux/timerqueue.h>
@@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk)
* cpu_timer - Posix CPU timer representation for k_itimer
* @node: timerqueue node to queue in the task/sig
* @head: timerqueue head on which this timer is queued
- * @task: Pointer to target task
+ * @pid: Pointer to target task PID
* @elist: List head for the expiry list
* @firing: Timer is currently firing
+ * @handling: Pointer to the task which handles expiry
*/
struct cpu_timer {
- struct timerqueue_node node;
- struct timerqueue_head *head;
- struct pid *pid;
- struct list_head elist;
- int firing;
+ struct timerqueue_node node;
+ struct timerqueue_head *head;
+ struct pid *pid;
+ struct list_head elist;
+ int firing;
+ struct task_struct __rcu *handling;
};
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
@@ -135,10 +138,12 @@ struct posix_cputimers {
/**
* posix_cputimers_work - Container for task work based posix CPU timer expiry
* @work: The task work to be scheduled
+ * @mutex: Mutex held around expiry in context of this task work
* @scheduled: @work has been scheduled already, no further processing
*/
struct posix_cputimers_work {
struct callback_head work;
+ struct mutex mutex;
unsigned int scheduled;
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b3..93c5a19 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a..808a247 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
From: Xiaoyu Li <xiaoyu.li(a)corigine.com>
Before the referenced commit, if fewer interrupts were supported by the
hardware than requested, then pci_msix_vec_count() returned the number
supported. However, after the referenced commit, an error is returned
for this condition. This causes a regression in the NFP driver,
preventing probe from completing.
This situation may occur because the firmware allows more than one
queue to share an interrupt vector, and it is thus valid for the
firmware to advertise the number of queues that it does. However,
interrupt sharing is not currently implemented by the NFP driver, as it
seems likely - though not tested - that any gains obtained by having
more queues would be offset by the sharing of interrupts.
Address this problem by limiting the number of vectors requested to
the number supported by hardware.
Also correct the max/min_irq types: they were previously unsigned but
should be signed.
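A minimal standalone sketch of that clamping follows; the
pci_msix_vec_count() call is stubbed out and all names below are invented
for illustration, so this only mirrors the shape of the change, not the
driver code itself:
```
#include <stdio.h>

/* Stand-in for pci_msix_vec_count(); pretend the device exposes 8 vectors. */
static int fake_msix_vec_count(void)
{
	return 8;
}

static int alloc_irqs(int min_irqs, int wanted_irqs)
{
	int max_irqs = fake_msix_vec_count();

	if (max_irqs < 0)
		return max_irqs;        /* propagate the errno */
	if (wanted_irqs > max_irqs)
		wanted_irqs = max_irqs; /* clamp to what the hardware supports */
	if (wanted_irqs < min_irqs)
		return -1;              /* cannot satisfy the minimum */
	return wanted_irqs;
}

int main(void)
{
	printf("got %d vectors (wanted 16, min 4)\n", alloc_irqs(4, 16));
	return 0;
}
```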
Fixes: bab65e48cb06 ("PCI/MSI: Sanitize MSI-X checks")
CC: stable(a)vger.kernel.org
Signed-off-by: Xiaoyu Li <xiaoyu.li(a)corigine.com>
Acked-by: Simon Horman <simon.horman(a)corigine.com>
Signed-off-by: Louis Peens <louis.peens(a)corigine.com>
---
Changes: V1-->V2
* Updated the max/min_irq types to be signed instead of unsigned
* Fixed formatting of commit message to be better aligned at 72 chars
* Also updated the commit message to better explain why this is even
possible to happen, in response to the question from V1.
drivers/net/ethernet/netronome/nfp/nfp_net.h | 4 ++--
drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 12 +++++++++---
drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 9 +++++----
drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c | 8 ++++----
4 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 939cfce15830..960f69325287 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -971,9 +971,9 @@ int nfp_net_mbox_reconfig_and_unlock(struct nfp_net *nn, u32 mbox_cmd);
void nfp_net_mbox_reconfig_post(struct nfp_net *nn, u32 update);
int nfp_net_mbox_reconfig_wait_posted(struct nfp_net *nn);
-unsigned int
+int
nfp_net_irqs_alloc(struct pci_dev *pdev, struct msix_entry *irq_entries,
- unsigned int min_irqs, unsigned int want_irqs);
+ int min_irqs, int want_irqs);
void nfp_net_irqs_disable(struct pci_dev *pdev);
void
nfp_net_irqs_assign(struct nfp_net *nn, struct msix_entry *irq_entries,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 62f0bf91d1e1..ae309ea48356 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -362,14 +362,20 @@ int nfp_net_mbox_reconfig_and_unlock(struct nfp_net *nn, u32 mbox_cmd)
* @min_irqs: Minimal acceptable number of interrupts
* @wanted_irqs: Target number of interrupts to allocate
*
- * Return: Number of irqs obtained or 0 on error.
+ * Return: Number of irqs obtained or an errno.
*/
-unsigned int
+int
nfp_net_irqs_alloc(struct pci_dev *pdev, struct msix_entry *irq_entries,
- unsigned int min_irqs, unsigned int wanted_irqs)
+ int min_irqs, int wanted_irqs)
{
unsigned int i;
int got_irqs;
+ int max_irqs;
+
+ max_irqs = pci_msix_vec_count(pdev);
+ if (max_irqs < 0)
+ return max_irqs;
+ wanted_irqs = min_t(int, max_irqs, wanted_irqs);
for (i = 0; i < wanted_irqs; i++)
irq_entries[i].entry = i;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index cbe4972ba104..c1ac380542b5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -222,7 +222,8 @@ static void nfp_net_pf_clean_vnic(struct nfp_pf *pf, struct nfp_net *nn)
static int nfp_net_pf_alloc_irqs(struct nfp_pf *pf)
{
- unsigned int wanted_irqs, num_irqs, vnics_left, irqs_left;
+ unsigned int vnics_left, irqs_left;
+ int wanted_irqs, num_irqs;
struct nfp_net *nn;
/* Get MSI-X vectors */
@@ -237,10 +238,10 @@ static int nfp_net_pf_alloc_irqs(struct nfp_pf *pf)
num_irqs = nfp_net_irqs_alloc(pf->pdev, pf->irq_entries,
NFP_NET_MIN_VNIC_IRQS * pf->num_vnics,
wanted_irqs);
- if (!num_irqs) {
- nfp_warn(pf->cpp, "Unable to allocate MSI-X vectors\n");
+ if (num_irqs < 0) {
+ nfp_warn(pf->cpp, "Unable to allocate MSI-X vectors (err=%d)\n", num_irqs);
kfree(pf->irq_entries);
- return -ENOMEM;
+ return num_irqs;
}
/* Distribute IRQs to vNICs */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
index e19bb0150cb5..5f89c7198606 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
@@ -84,7 +84,7 @@ static int nfp_netvf_pci_probe(struct pci_dev *pdev,
u32 tx_bar_sz, rx_bar_sz;
int tx_bar_no, rx_bar_no;
struct nfp_net_vf *vf;
- unsigned int num_irqs;
+ int num_irqs;
u8 __iomem *ctrl_bar;
struct nfp_net *nn;
u32 startq;
@@ -255,9 +255,9 @@ static int nfp_netvf_pci_probe(struct pci_dev *pdev,
NFP_NET_MIN_VNIC_IRQS,
NFP_NET_NON_Q_VECTORS +
nn->dp.num_r_vecs);
- if (!num_irqs) {
- nn_warn(nn, "Unable to allocate MSI-X Vectors. Exiting\n");
- err = -EIO;
+ if (num_irqs < 0) {
+ nn_warn(nn, "Unable to allocate MSI-X Vectors. Exiting (err=%d)\n", num_irqs);
+ err = num_irqs;
goto err_unmap_rx;
}
nfp_net_irqs_assign(nn, vf->irq_entries, num_irqs);
--
2.34.1