From: Robin Murphy <robin.murphy(a)arm.com>
commit 309a15cb16bb075da1c99d46fb457db6a1a2669e upstream
To work around MMU-700 erratum 2812531 we need to ensure that certain
sequences of commands cannot be issued without an intervening sync. In
practice this falls out of our current command-batching machinery
anyway - each batch only contains a single type of invalidation command,
and ends with a sync. The only exception is when a batch is sufficiently
large to need issuing across multiple command queue slots, wherein the
earlier slots will not contain a sync and thus may in theory interleave
with another batch being issued in parallel to create an affected
sequence across the slot boundary.
Since MMU-700 supports range invalidate commands and thus we will prefer
to use them (which also happens to avoid conditions for other errata),
I'm not entirely sure it's even possible for a single high-level
invalidate call to generate a batch of more than 63 commands, but for
the sake of robustness and documentation, wire up an option to enforce
that a sync is always inserted for every slot issued.
The other aspect is that the relative order of DVM commands cannot be
controlled, so DVM cannot be used. Again that is already the status quo,
but since we have at least defined ARM_SMMU_FEAT_BTM, we can explicitly
disable it for documentation purposes even if it's not wired up anywhere
yet.
Signed-off-by: Robin Murphy <robin.murphy(a)arm.com>
Reviewed-by: Nicolin Chen <nicolinc(a)nvidia.com>
Link: https://lore.kernel.org/r/330221cdfd0003cd51b6c04e7ff3566741ad8374.16837312…
Signed-off-by: Will Deacon <will(a)kernel.org>
Signed-off-by: Easwar Hariharan <eahariha(a)linux.microsoft.com>
---
Documentation/arm64/silicon-errata.rst | 4 +++
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 39 +++++++++++++++++++++
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 +
3 files changed, 44 insertions(+)
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index 55492fea4427..120784507bc0 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -138,6 +138,10 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | MMU-500 | #841119,826419 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | MMU-600 | #1076982 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | MMU-700 | #2812531 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index d4d8bfee9feb..3e20c60e9c32 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -882,6 +882,12 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
{
int index;
+ if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
+ (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+ cmds->num = 0;
+ }
+
if (cmds->num == CMDQ_BATCH_ENTRIES) {
arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
cmds->num = 0;
@@ -3410,6 +3416,39 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
return 0;
}
+#define IIDR_IMPLEMENTER_ARM 0x43b
+#define IIDR_PRODUCTID_ARM_MMU_600 0x483
+#define IIDR_PRODUCTID_ARM_MMU_700 0x487
+
+static void arm_smmu_device_iidr_probe(struct arm_smmu_device *smmu)
+{
+ u32 reg;
+ unsigned int implementer, productid, variant, revision;
+
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IIDR);
+ implementer = FIELD_GET(IIDR_IMPLEMENTER, reg);
+ productid = FIELD_GET(IIDR_PRODUCTID, reg);
+ variant = FIELD_GET(IIDR_VARIANT, reg);
+ revision = FIELD_GET(IIDR_REVISION, reg);
+
+ switch (implementer) {
+ case IIDR_IMPLEMENTER_ARM:
+ switch (productid) {
+ case IIDR_PRODUCTID_ARM_MMU_600:
+ /* Arm erratum 1076982 */
+ if (variant == 0 && revision <= 2)
+ smmu->features &= ~ARM_SMMU_FEAT_SEV;
+ break;
+ case IIDR_PRODUCTID_ARM_MMU_700:
+ /* Arm erratum 2812531 */
+ smmu->features &= ~ARM_SMMU_FEAT_BTM;
+ smmu->options |= ARM_SMMU_OPT_CMDQ_FORCE_SYNC;
+ break;
+ }
+ break;
+ }
+}
+
static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
{
u32 reg;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index cd48590ada30..406ac0426762 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -644,6 +644,7 @@ struct arm_smmu_device {
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
#define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1)
#define ARM_SMMU_OPT_MSIPOLL (1 << 2)
+#define ARM_SMMU_OPT_CMDQ_FORCE_SYNC (1 << 3)
u32 options;
struct arm_smmu_cmdq cmdq;
--
2.25.1
The patch titled
Subject: mm: lock VMA in dup_anon_vma() before setting ->anon_vma
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Jann Horn <jannh(a)google.com>
Subject: mm: lock VMA in dup_anon_vma() before setting ->anon_vma
Date: Fri, 21 Jul 2023 05:46:43 +0200
When VMAs are merged, dup_anon_vma() is called with `dst` pointing to the
VMA that is being expanded to cover the area previously occupied by
another VMA. This currently happens while `dst` is not write-locked.
This means that, in the `src->anon_vma && !dst->anon_vma` case, as soon as
the assignment `dst->anon_vma = src->anon_vma` has happened, concurrent
page faults can happen on `dst` under the per-VMA lock. This is already
icky in itself, since such page faults can now install pages into `dst`
that are attached to an `anon_vma` that is not yet tied back to the
`anon_vma` with an `anon_vma_chain`. But if `anon_vma_clone()` fails due
to an out-of-memory error, things get much worse: `anon_vma_clone()` then
reverts `dst->anon_vma` back to NULL, and `dst` remains completely
unconnected to the `anon_vma`, even though we can have pages in the area
covered by `dst` that point to the `anon_vma`.
This means the `anon_vma` of such pages can be freed while the pages are
still mapped into userspace, which leads to UAF when a helper like
folio_lock_anon_vma_read() tries to look up the anon_vma of such a page.
This theoretically is a security bug, but I believe it is really hard to
actually trigger as an unprivileged user because it requires that you can
make an order-0 GFP_KERNEL allocation fail, and the page allocator tries
pretty hard to prevent that.
I think doing the vma_start_write() call inside dup_anon_vma() is the most
straightforward fix for now.
For a kernel-assisted reproducer, see the notes section of the patch mail.
Link: https://lkml.kernel.org/r/20230721034643.616851-1-jannh@google.com
Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
Signed-off-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/mmap.c~mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma
+++ a/mm/mmap.c
@@ -615,6 +615,7 @@ static inline int dup_anon_vma(struct vm
* anon pages imported.
*/
if (src->anon_vma && !dst->anon_vma) {
+ vma_start_write(dst);
dst->anon_vma = src->anon_vma;
return anon_vma_clone(dst, src);
}
_
Patches currently in -mm which might be from jannh(a)google.com are
mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch
mm-lock-vma-in-dup_anon_vma-before-setting-anon_vma.patch
I'm announcing the release of the 4.19.289 kernel.
All AMD processor users of the 4.19 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 4.19.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.19.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
arch/x86/kernel/cpu/microcode/amd.c | 2
7 files changed, 135 insertions(+), 74 deletions(-)
Borislav Petkov (AMD) (3):
x86/microcode/AMD: Load late on both threads too
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 4.19.289
I'm announcing the release of the 5.4.250 kernel.
All AMD processor users of the 5.4 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 5.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
arch/x86/kernel/cpu/microcode/amd.c | 2
7 files changed, 135 insertions(+), 74 deletions(-)
Borislav Petkov (AMD) (3):
x86/microcode/AMD: Load late on both threads too
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 5.4.250
I'm announcing the release of the 5.10.187 kernel.
All AMD processor users of the 5.10 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 5.10.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.10.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
arch/x86/kernel/cpu/microcode/amd.c | 2
7 files changed, 135 insertions(+), 74 deletions(-)
Borislav Petkov (AMD) (3):
x86/microcode/AMD: Load late on both threads too
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 5.10.187
I'm announcing the release of the 5.15.122 kernel.
All AMD processor users of the 5.15 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 5.15.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.15.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
6 files changed, 134 insertions(+), 73 deletions(-)
Borislav Petkov (AMD) (2):
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 5.15.122
I'm announcing the release of the 6.1.41 kernel.
All AMD processor users of the 6.1 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 6.1.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-6.1.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
6 files changed, 134 insertions(+), 73 deletions(-)
Borislav Petkov (AMD) (2):
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 6.1.41
I'm announcing the release of the 6.4.6 kernel.
All AMD processor users of the 6.4 kernel series who have not updated
their microcode to the latest version, must upgrade.
The updated 6.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-6.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/microcode.h | 1
arch/x86/include/asm/microcode_amd.h | 2
arch/x86/include/asm/msr-index.h | 1
arch/x86/kernel/cpu/amd.c | 199 ++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 2
6 files changed, 134 insertions(+), 73 deletions(-)
Borislav Petkov (AMD) (2):
x86/cpu/amd: Move the errata checking functionality up
x86/cpu/amd: Add a Zenbleed fix
Greg Kroah-Hartman (1):
Linux 6.4.6
The patch titled
Subject: mm: Fix memory ordering for mm_lock_seq and vm_lock_seq
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Jann Horn <jannh(a)google.com>
Subject: mm: Fix memory ordering for mm_lock_seq and vm_lock_seq
Date: Sat, 22 Jul 2023 00:51:07 +0200
mm->mm_lock_seq effectively functions as a read/write lock; therefore it
must be used with acquire/release semantics.
A specific example is the interaction between userfaultfd_register() and
lock_vma_under_rcu().
userfaultfd_register() does the following from the point where it changes
a VMA's flags to the point where concurrent readers are permitted again
(in a simple scenario where only a single private VMA is accessed and no
merging/splitting is involved):
userfaultfd_register
userfaultfd_set_vm_flags
vm_flags_reset
vma_start_write
down_write(&vma->vm_lock->lock)
vma->vm_lock_seq = mm_lock_seq [marks VMA as busy]
up_write(&vma->vm_lock->lock)
vm_flags_init
[sets VM_UFFD_* in __vm_flags]
vma->vm_userfaultfd_ctx.ctx = ctx
mmap_write_unlock
vma_end_write_all
WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA]
There are no memory barriers in between the __vm_flags update and the
mm->mm_lock_seq update that unlocks the VMA, so the unlock can be
reordered to above the `vm_flags_init()` call, which means from the
perspective of a concurrent reader, a VMA can be marked as a userfaultfd
VMA while it is not VMA-locked. That's bad, we definitely need a
store-release for the unlock operation.
The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly
fine because all accesses to vma->vm_lock_seq that matter are always
protected by the VMA lock. There is a racy read in vma_start_read()
though that can tolerate false-positives, so we should be using
WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN).
On the other side, lock_vma_under_rcu() works as follows in the relevant
region for locking and userfaultfd check:
lock_vma_under_rcu
vma_start_read
vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout]
down_read_trylock(&vma->vm_lock->lock)
vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check]
userfaultfd_armed
checks vma->vm_flags & __VM_UFFD_FLAGS
Here, the interesting aspect is how far down the mm->mm_lock_seq read can
be reordered - if this read is reordered down below the vma->vm_flags
access, this could cause lock_vma_under_rcu() to partly operate on
information that was read while the VMA was supposed to be locked. To
prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we
need to read it with a load-acquire.
Some of the comment wording is based on suggestions by Suren.
BACKPORT WARNING: One of the functions changed by this patch (which I've
written against Linus' tree) is vma_try_start_write(), but this function
no longer exists in mm/mm-everything. I don't know whether the merged
version of this patch will be ordered before or after the patch that
removes vma_try_start_write(). If you're backporting this patch to a tree
with vma_try_start_write(), make sure this patch changes that function.
Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com
Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
Signed-off-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 29 +++++++++++++++++++++++------
include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++
include/linux/mmap_lock.h | 10 ++++++++--
3 files changed, 59 insertions(+), 8 deletions(-)
--- a/include/linux/mmap_lock.h~mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq
+++ a/include/linux/mmap_lock.h
@@ -76,8 +76,14 @@ static inline void mmap_assert_write_loc
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- /* No races during update due to exclusive mmap_lock being held */
- WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+ /*
+ * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
+ * mmap_lock being held.
+ * We need RELEASE semantics here to ensure that preceding stores into
+ * the VMA take effect before we unlock it with this store.
+ * Pairs with ACQUIRE semantics in vma_start_read().
+ */
+ smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
static inline void vma_end_write_all(struct mm_struct *mm) {}
--- a/include/linux/mm.h~mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq
+++ a/include/linux/mm.h
@@ -641,8 +641,14 @@ static inline void vma_numab_state_free(
*/
static inline bool vma_start_read(struct vm_area_struct *vma)
{
- /* Check before locking. A race might cause false locked result. */
- if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -653,8 +659,13 @@ static inline bool vma_start_read(struct
* False unlocked result is impossible because we modify and check
* vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
* modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock);
return false;
}
@@ -676,7 +687,7 @@ static bool __is_vma_write_locked(struct
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ *mm_lock_seq = vma->vm_mm->mm_lock_seq;
return (vma->vm_lock_seq == *mm_lock_seq);
}
@@ -688,7 +699,13 @@ static inline void vma_start_write(struc
return;
down_write(&vma->vm_lock->lock);
- vma->vm_lock_seq = mm_lock_seq;
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
up_write(&vma->vm_lock->lock);
}
@@ -702,7 +719,7 @@ static inline bool vma_try_start_write(s
if (!down_write_trylock(&vma->vm_lock->lock))
return false;
- vma->vm_lock_seq = mm_lock_seq;
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
up_write(&vma->vm_lock->lock);
return true;
}
--- a/include/linux/mm_types.h~mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq
+++ a/include/linux/mm_types.h
@@ -514,6 +514,20 @@ struct vm_area_struct {
};
#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Can only be written (using WRITE_ONCE()) while holding both:
+ * - mmap_lock (in write mode)
+ * - vm_lock->lock (in write mode)
+ * Can be read reliably while holding one of:
+ * - mmap_lock (in read or write mode)
+ * - vm_lock->lock (in read or write mode)
+ * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
+ * while holding nothing (except RCU to keep the VMA struct allocated).
+ *
+ * This sequence counter is explicitly allowed to overflow; sequence
+ * counter reuse can only lead to occasional unnecessary use of the
+ * slowpath.
+ */
int vm_lock_seq;
struct vma_lock *vm_lock;
@@ -679,6 +693,20 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * This field has lock-like semantics, meaning it is sometimes
+ * accessed with ACQUIRE/RELEASE semantics.
+ * Roughly speaking, incrementing the sequence number is
+ * equivalent to releasing locks on VMAs; reading the sequence
+ * number can be part of taking a read lock on a VMA.
+ *
+ * Can be modified under write mmap_lock using RELEASE
+ * semantics.
+ * Can be read with no other protection when holding write
+ * mmap_lock.
+ * Can be read with ACQUIRE semantics if not holding write
+ * mmap_lock.
+ */
int mm_lock_seq;
#endif
_
Patches currently in -mm which might be from jannh(a)google.com are
mm-fix-memory-ordering-for-mm_lock_seq-and-vm_lock_seq.patch
The patch titled
Subject: Revert "um: Use swap() to make code cleaner"
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
revert-um-use-swap-to-make-code-cleaner.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Subject: Revert "um: Use swap() to make code cleaner"
Date: Mon, 24 Jul 2023 17:31:31 +0300
This reverts commit 9b0da3f22307af693be80f5d3a89dc4c7f360a85.
The sigio.c is clearly user space code which is handled by
arch/um/scripts/Makefile.rules (see USER_OBJS rule).
The above mentioned commit simply broke this agreement,
we may not use Linux kernel internal headers in them without
thorough thinking.
Hence, revert the wrong commit.
Link: https://lkml.kernel.org/r/20230724143131.30090-1-andriy.shevchenko@linux.in…
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202307212304.cH79zJp1-lkp@intel.com/
Cc: Anton Ivanov <anton.ivanov(a)cambridgegreys.com>
Cc: Herve Codina <herve.codina(a)bootlin.com>
Cc: Jason A. Donenfeld <Jason(a)zx2c4.com>
Cc: Johannes Berg <johannes(a)sipsolutions.net>
Cc: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Cc: Richard Weinberger <richard(a)nod.at>
Cc: Yang Guang <yang.guang5(a)zte.com.cn>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/um/os-Linux/sigio.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/arch/um/os-Linux/sigio.c~revert-um-use-swap-to-make-code-cleaner
+++ a/arch/um/os-Linux/sigio.c
@@ -3,7 +3,6 @@
* Copyright (C) 2002 - 2008 Jeff Dike (jdike(a){addtoit,linux.intel}.com)
*/
-#include <linux/minmax.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
@@ -51,7 +50,7 @@ static struct pollfds all_sigio_fds;
static int write_sigio_thread(void *unused)
{
- struct pollfds *fds;
+ struct pollfds *fds, tmp;
struct pollfd *p;
int i, n, respond_fd;
char c;
@@ -78,7 +77,9 @@ static int write_sigio_thread(void *unus
"write_sigio_thread : "
"read on socket failed, "
"err = %d\n", errno);
- swap(current_poll, next_poll);
+ tmp = current_poll;
+ current_poll = next_poll;
+ next_poll = tmp;
respond_fd = sigio_private[1];
}
else {
_
Patches currently in -mm which might be from andriy.shevchenko(a)linux.intel.com are
revert-um-use-swap-to-make-code-cleaner.patch
kernelh-split-out-count_args-and-concatenate-to-argsh.patch
x86-asm-replace-custom-count_args-concatenate-implementations.patch
arm64-smccc-replace-custom-count_args-concatenate-implementations.patch
genetlink-replace-custom-concatenate-implementation.patch
Hi,
as there are new hardware directives, we need a little adaptation
for the AUX invalidation sequence.
In this version we support all the engines affected by this
change.
The stable backport has some challenges because the original
patch that this series fixes has had more changes in between.
This patch is slowly exploding with code refactorings and
features added and fixed.
Thanks a lot Nirmoy, Andrzej and Matt for your review and for the
fruitful discussions!
Thanks,
Andi
Changelog:
=========
v7 -> v8
- Removed the aux invalidation from the device info and added a
helper function, instead (patch 2).
- Use "MTL and beyond" instead of "MTL+" in comments.
- Use the "gen12_" prefix instead of "intel_".
- In patch 6 return an int error instead of an error embedded in
the pointer in the intel_emit_pipe_control_cs() function and
propagate the error to the upper layers.
v6 -> v7
- Fix correct sequence applied to the correct engine. A little
confusion promptly cought by Nirmoy when applying to the VD
engine the sequence belonging to the compute engines. Thanks a
lot, Nirmoy!
v5 -> v6
- Fixed ccs flush in the engines VE and BCS. They are sent as a
separate command instead of added in the pipe control.
- Separated the CCS flusing in the pipe control patch with the
quiescing of the memory. They were meant to be on separate
patch already in the previous verision, but apparently I
squashed them by mistake.
v4 -> v5
- The AUX CCS is added as a device property instead of checking
against FLAT CCS. This adds the new HAS_AUX_CCS check
(Patch 2, new).
- little and trivial refactoring here and there.
- extended the flags{0,1}/bit_group_{0,1} renaming to other
functions.
- Created an intel_emit_pipe_control_cs() wrapper for submitting
the pipe control.
- Quiesce memory for all the engines, not just RCS (Patch 6,
new).
- The PIPE_CONTROL_CCS_FLUSH is added to all the engines.
- Remove redundant EMIT_FLUSH_CCS mode flag.
- Remove unnecessary NOOPs from the command streamer for
invalidating the CCS table.
- Use INVALID_MMIO_REG and gen12_get_aux_inv_reg() instad of
__MMIO(0) and reg.reg.
- Remove useless wrapper and just use gen12_get_aux_inv_reg().
v3 -> v4
- A trivial patch 3 is added to rename the flags with
bit_group_{0,1} to align with the datasheet naming.
- Patch 4 fixes a confusion I made where the CCS flag was
applied to the wrong bit group.
v2 -> v3
- added r-b from Nirmoy in patch 1 and 4.
- added patch 3 which enables the ccs_flush in the control pipe
for mtl+ compute and render engines.
- added redundant checks in patch 2 for enabling the EMIT_FLUSH
flag.
v1 -> v2
- add a clean up preliminary patch for the existing registers
- add support for more engines
- add the Fixes tag
Andi Shyti (7):
drm/i915/gt: Cleanup aux invalidation registers
drm/i915: Add the gen12_needs_ccs_aux_inv helper
drm/i915/gt: Rename flags with bit_group_X according to the datasheet
drm/i915/gt: Enable the CCS_FLUSH bit in the pipe control
drm/i915/gt: Refactor intel_emit_pipe_control_cs() in a single
function
drm/i915/gt: Ensure memory quiesced before invalidation for all
engines
drm/i915/gt: Support aux invalidation on all engines
Jonathan Cavitt (2):
drm/i915/gt: Ensure memory quiesced before invalidation
drm/i915/gt: Poll aux invalidation register bit on invalidation
drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 198 ++++++++++++-------
drivers/gpu/drm/i915/gt/gen8_engine_cs.h | 21 +-
drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 2 +
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 16 +-
drivers/gpu/drm/i915/gt/intel_lrc.c | 17 +-
5 files changed, 155 insertions(+), 99 deletions(-)
--
2.40.1
Dpt objects that are created from internal get evicted when there is
memory pressure and do not get restored when pinned during scanout. The
pinned page table entries look corrupted and programming the display
engine with the incorrect pte's result in DE throwing pipe faults.
Create DPT objects from shmem and mark the object as dirty when pinning so
that the object is restored when shrinker evicts an unpinned buffer object.
v2: Unconditionally mark the dpt objects dirty during pinning(Chris).
Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt")
Cc: <stable(a)vger.kernel.org> # v6.0+
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)linux.intel.com>
Suggested-by: Chris Wilson <chris.p.wilson(a)intel.com>
Signed-off-by: Fei Yang <fei.yang(a)intel.com>
Signed-off-by: Radhakrishna Sripada <radhakrishna.sripada(a)intel.com>
---
drivers/gpu/drm/i915/display/intel_dpt.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c b/drivers/gpu/drm/i915/display/intel_dpt.c
index 7c5fddb203ba..fbfd8f959f17 100644
--- a/drivers/gpu/drm/i915/display/intel_dpt.c
+++ b/drivers/gpu/drm/i915/display/intel_dpt.c
@@ -166,6 +166,8 @@ struct i915_vma *intel_dpt_pin(struct i915_address_space *vm)
i915_vma_get(vma);
}
+ dpt->obj->mm.dirty = true;
+
atomic_dec(&i915->gpu_error.pending_fb_pin);
intel_runtime_pm_put(&i915->runtime_pm, wakeref);
@@ -261,7 +263,7 @@ intel_dpt_create(struct intel_framebuffer *fb)
dpt_obj = i915_gem_object_create_stolen(i915, size);
if (IS_ERR(dpt_obj) && !HAS_LMEM(i915)) {
drm_dbg_kms(&i915->drm, "Allocating dpt from smem\n");
- dpt_obj = i915_gem_object_create_internal(i915, size);
+ dpt_obj = i915_gem_object_create_shmem(i915, size);
}
if (IS_ERR(dpt_obj))
return ERR_CAST(dpt_obj);
--
2.34.1
From: hackyzh002 <hackyzh002(a)gmail.com>
[ Upstream commit f828b681d0cd566f86351c0b913e6cb6ed8c7b9c ]
The type of size is unsigned, if size is 0x40000000, there will be an
integer overflow, size will be zero after size *= sizeof(uint32_t),
will cause uninitialized memory to be referenced later
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: hackyzh002 <hackyzh002(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/radeon/radeon_cs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 1ae31dbc61c64..5e61abb3dce5c 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -265,7 +265,8 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
{
struct drm_radeon_cs *cs = data;
uint64_t *chunk_array_ptr;
- unsigned size, i;
+ u64 size;
+ unsigned i;
u32 ring = RADEON_CS_RING_GFX;
s32 priority = 0;
--
2.39.2
Hi Nirmoy,
> static int mtl_dummy_pipe_control(struct i915_request *rq)
> {
> /* Wa_14016712196 */
> if (IS_MTL_GRAPHICS_STEP(rq->engine->i915, M, STEP_A0, STEP_B0) ||
> - IS_MTL_GRAPHICS_STEP(rq->engine->i915, P, STEP_A0, STEP_B0)) {
> - u32 *cs;
> -
> - /* dummy PIPE_CONTROL + depth flush */
> - cs = intel_ring_begin(rq, 6);
> - if (IS_ERR(cs))
> - return PTR_ERR(cs);
> - cs = gen12_emit_pipe_control(cs,
> - 0,
> - PIPE_CONTROL_DEPTH_CACHE_FLUSH,
> - LRC_PPHWSP_SCRATCH_ADDR);
> - intel_ring_advance(rq, cs);
> - }
> + IS_MTL_GRAPHICS_STEP(rq->engine->i915, P, STEP_A0, STEP_B0))
> + return gen12_emit_pipe_control_cs(rq, 0,
> + PIPE_CONTROL_DEPTH_CACHE_FLUSH,
> + LRC_PPHWSP_SCRATCH_ADDR);
>
> Don't think this will get backported to 5.8+. I think mtl introduced after
> that.
>
> If that is not an issue for rest of the series and then
to be honest I don't think I fully understood the stable
policies, as in this series only two are the patches that are
really fixing things and the rest are only support.
In this case I think this will produce a conflict that will be
eventually fixed (... I guess!).
> Reviewed-by: Nirmoy Das <nirmoy.das(a)intel.com>
Thanks,
Andi
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 26efd79c4624294e553aeaa3439c646729bad084
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072114-giblet-unzip-f1db@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
26efd79c4624 ("ftrace: Fix possible warning on checking all pages used in ftrace_process_locs()")
db42523b4f3e ("ftrace: Store the order of pages allocated in ftrace_page")
59300b36f85f ("ftrace: Check if pages were allocated before calling free_pages()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26efd79c4624294e553aeaa3439c646729bad084 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1(a)huawei.com>
Date: Wed, 12 Jul 2023 14:04:52 +0800
Subject: [PATCH] ftrace: Fix possible warning on checking all pages used in
ftrace_process_locs()
As comments in ftrace_process_locs(), there may be NULL pointers in
mcount_loc section:
> Some architecture linkers will pad between
> the different mcount_loc sections of different
> object files to satisfy alignments.
> Skip any NULL pointers.
After commit 20e5227e9f55 ("ftrace: allow NULL pointers in mcount_loc"),
NULL pointers will be accounted when allocating ftrace pages but skipped
before adding into ftrace pages, this may result in some pages not being
used. Then after commit 706c81f87f84 ("ftrace: Remove extra helper
functions"), warning may occur at:
WARN_ON(pg->next);
To fix it, only warn for case that no pointers skipped but pages not used
up, then free those unused pages after releasing ftrace_lock.
Link: https://lore.kernel.org/linux-trace-kernel/20230712060452.3175675-1-zhengye…
Cc: stable(a)vger.kernel.org
Fixes: 706c81f87f84 ("ftrace: Remove extra helper functions")
Suggested-by: Steven Rostedt <rostedt(a)goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3740aca79fe7..05c0024815bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
return cnt;
}
+static void ftrace_free_pages(struct ftrace_page *pages)
+{
+ struct ftrace_page *pg = pages;
+
+ while (pg) {
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
+ pages = pg->next;
+ kfree(pg);
+ pg = pages;
+ ftrace_number_of_groups--;
+ }
+}
+
static struct ftrace_page *
ftrace_allocate_pages(unsigned long num_to_init)
{
@@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- pg = start_pg;
- while (pg) {
- if (pg->records) {
- free_pages((unsigned long)pg->records, pg->order);
- ftrace_number_of_pages -= 1 << pg->order;
- }
- start_pg = pg->next;
- kfree(pg);
- pg = start_pg;
- ftrace_number_of_groups--;
- }
+ ftrace_free_pages(start_pg);
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
}
@@ -6471,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg_unuse = NULL;
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ unsigned long skipped = 0;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -6536,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod,
* object files to satisfy alignments.
* Skip any NULL pointers.
*/
- if (!addr)
+ if (!addr) {
+ skipped++;
continue;
+ }
end_offset = (pg->index+1) * sizeof(pg->records[0]);
if (end_offset > PAGE_SIZE << pg->order) {
@@ -6551,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod,
rec->ip = addr;
}
- /* We should have used all pages */
- WARN_ON(pg->next);
+ if (pg->next) {
+ pg_unuse = pg->next;
+ pg->next = NULL;
+ }
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
@@ -6574,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod,
out:
mutex_unlock(&ftrace_lock);
+ /* We should have used all pages unless we skipped some */
+ if (pg_unuse) {
+ WARN_ON(!skipped);
+ ftrace_free_pages(pg_unuse);
+ }
return ret;
}
The patch below does not apply to the 6.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.4.y
git checkout FETCH_HEAD
git cherry-pick -x 4481913607e58196c48a4fef5e6f45350684ec3c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072146-sports-deluge-22a1@gregkh' --subject-prefix 'PATCH 6.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4481913607e58196c48a4fef5e6f45350684ec3c Mon Sep 17 00:00:00 2001
From: Yunxiang Li <Yunxiang.Li(a)amd.com>
Date: Thu, 22 Jun 2023 10:18:03 -0400
Subject: [PATCH] drm/ttm: fix bulk_move corruption when adding a entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When the resource is the first in the bulk_move range, adding it again
(thus moving it to the tail) will corrupt the list since the first
pointer is not moved. This eventually lead to null pointer deref in
ttm_lru_bulk_move_del()
Fixes: fee2ede15542 ("drm/ttm: rework bulk move handling v5")
Signed-off-by: Yunxiang Li <Yunxiang.Li(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
CC: stable(a)vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20230622141902.28718-3-Yunxia…
Signed-off-by: Christian König <christian.koenig(a)amd.com>
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index 7333f7a87a2f..e51dbc7a2d53 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -86,6 +86,8 @@ static void ttm_lru_bulk_move_pos_tail(struct ttm_lru_bulk_move_pos *pos,
struct ttm_resource *res)
{
if (pos->last != res) {
+ if (pos->first == res)
+ pos->first = list_next_entry(res, lru);
list_move(&res->lru, &pos->last->lru);
pos->last = res;
}
@@ -111,7 +113,8 @@ static void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk,
{
struct ttm_lru_bulk_move_pos *pos = ttm_lru_bulk_move_pos(bulk, res);
- if (unlikely(pos->first == res && pos->last == res)) {
+ if (unlikely(WARN_ON(!pos->first || !pos->last) ||
+ pos->first == res && pos->last == res)) {
pos->first = NULL;
pos->last = NULL;
} else if (pos->first == res) {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072412-operable-craving-317a@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
dbf2bab7935b ("jbd2: simplify journal_clean_one_cp_list()")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
597599268e3b ("jbd2: discard dirty data when forgetting an un-journalled buffer")
f69120ce6c02 ("jbd2: fix sphinx kernel-doc build warnings")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072403-virtuous-straw-2f43@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
dbf2bab7935b ("jbd2: simplify journal_clean_one_cp_list()")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
597599268e3b ("jbd2: discard dirty data when forgetting an un-journalled buffer")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072453-tricolor-employed-7622@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
dbf2bab7935b ("jbd2: simplify journal_clean_one_cp_list()")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072443-tackiness-pennant-36bc@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
dbf2bab7935b ("jbd2: simplify journal_clean_one_cp_list()")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072441-deprive-glimpse-b891@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree")
4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan")
cf4ff938b47f ("ext4: correct the judgment of BUG in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072434-stylist-reflex-c474@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree")
4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan")
cf4ff938b47f ("ext4: correct the judgment of BUG in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072433-obsessed-limeade-e4b9@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072427-curler-operate-a8d0@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree")
4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan")
cf4ff938b47f ("ext4: correct the judgment of BUG in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072424-dejected-raisin-2dd7@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy")
b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072419-silt-kinswoman-c896@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree")
4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan")
cf4ff938b47f ("ext4: correct the judgment of BUG in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 6.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.4.y
git checkout FETCH_HEAD
git cherry-pick -x 46f881b5b1758dc4a35fba4a643c10717d0cf427
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072415-dyslexic-spoilage-fb58@gregkh' --subject-prefix 'PATCH 6.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:27 +0800
Subject: [PATCH] jbd2: fix a race when checking checkpoint buffer busy
Before removing checkpoint buffer from the t_checkpoint_list, we have to
check both BH_Dirty and BH_Lock bits together to distinguish buffers
have not been or were being written back. But __cp_buffer_busy() checks
them separately, it first check lock state and then check dirty, the
window between these two checks could be raced by writing back
procedure, which locks buffer and clears buffer dirty before I/O
completes. So it cannot guarantee checkpointing buffers been written
back to disk if some error happens later. Finally, it may clean
checkpoint transactions and lead to inconsistent filesystem.
jbd2_journal_forget() and __journal_try_to_free_buffer() also have the
same problem (journal_unmap_buffer() escape from this issue since it's
running under the buffer lock), so fix them through introducing a new
helper to try holding the buffer lock and remove really clean buffer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42b34cab64fb..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy && __cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
return 1;
}
+/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..6ef5022949c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
+ return;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
-out:
return;
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index bd660aac8e07..44c298aa58d4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072412-raft-circle-8700@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree")
4fca50d440cc ("ext4: make mballoc try target group first even with mb_optimize_scan")
cf4ff938b47f ("ext4: correct the judgment of BUG in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072404-hubcap-railroad-9c52@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
5d5460fa7932 ("ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()")
f52f3d2b9fba ("ext4: Give symbolic names to mballoc criterias")
7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
1b4200112108 ("ext4: Avoid scanning smaller extents in BG during CR1")
3ef5d2638796 ("ext4: Add counter to track successful allocation of goal length")
fdd9a00943a5 ("ext4: Add per CR extent scanned counter")
4eb7a4a1a33b ("ext4: Convert mballoc cr (criteria) to enum")
c3defd99d58c ("ext4: treat stripe in block unit")
361eb69fc99f ("ext4: Remove the logic to trim inode PAs")
3872778664e3 ("ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list")
a8e38fd37cff ("ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union")
93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()")
0830344c953a ("ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()")
7692094ac513 ("ext4: Move overlap assert logic into a separate function")
bcf434992145 ("ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()")
e86a718228b6 ("ext4: Stop searching if PA doesn't satisfy non-extent file")
91a48aaf59d0 ("ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5460fa7932bed3a9082a6a8852cfbdb46acbe8 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Date: Fri, 9 Jun 2023 16:04:03 +0530
Subject: [PATCH] ext4: fix off by one issue in
ext4_mb_choose_next_group_best_avail()
In ext4_mb_choose_next_group_best_avail(), we want the start order to be
1 less than goal length and the min_order to be, at max, 1 more than the
original length. This commit fixes an off by one issue that arose due to
the fact that 1 << fls(n) > (n).
After all the processing:
order = 1 order below goal len
min_order = maximum of the three:-
- order - trim_order
- 1 order below B2C(s_stripe)
- 1 order above original len
Cc: stable(a)kernel.org
Fixes: 33122aa930 ("ext4: Add allocation criteria 1.5 (CR1_5)")
Signed-off-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230609103403.112807-1-ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..456150ef6111 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x c2d6fd9d6f35079f1669f0100f05b46708c74b7f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072459-unmolded-serrated-1862@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
c2d6fd9d6f35 ("jbd2: recheck chechpointing non-dirty buffer")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
ccd3c4373eac ("jbd2: fix use after free in jbd2_log_do_checkpoint()")
f69120ce6c02 ("jbd2: fix sphinx kernel-doc build warnings")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2d6fd9d6f35079f1669f0100f05b46708c74b7f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:23 +0800
Subject: [PATCH] jbd2: recheck chechpointing non-dirty buffer
There is a long-standing metadata corruption issue that happens from
time to time, but it's very difficult to reproduce and analyse, benefit
from the JBD2_CYCLE_RECORD option, we found out that the problem is the
checkpointing process miss to write out some buffers which are raced by
another do_get_write_access(). Looks below for detail.
jbd2_log_do_checkpoint() //transaction X
//buffer A is dirty and not belones to any transaction
__buffer_relink_io() //move it to the IO list
__flush_batch()
write_dirty_buffer()
do_get_write_access()
clear_buffer_dirty
__jbd2_journal_file_buffer()
//add buffer A to a new transaction Y
lock_buffer(bh)
//doesn't write out
__jbd2_journal_remove_checkpoint()
//finish checkpoint except buffer A
//filesystem corrupt if the new transaction Y isn't fully write out.
Due to the t_checkpoint_list walking loop in jbd2_log_do_checkpoint()
have already handles waiting for buffers under IO and re-added new
transaction to complete commit, and it also removing cleaned buffers,
this makes sure the list will eventually get empty. So it's fine to
leave buffers on the t_checkpoint_list while flushing out and completely
stop using the t_checkpoint_io_list.
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Tested-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..25e3c20eb19f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -57,28 +57,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
}
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
/*
* Check a checkpoint buffer could be release or not.
*
@@ -183,6 +161,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,6 +221,11 @@ int jbd2_log_do_checkpoint(journal_t *journal)
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
+ /*
+ * The buffer may be writing back, or flushing out in the
+ * last couple of cycles, or re-adding into a new transaction,
+ * need to check it again until it's unlocked.
+ */
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
@@ -287,28 +271,32 @@ int jbd2_log_do_checkpoint(journal_t *journal)
}
if (!buffer_dirty(bh)) {
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +310,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x c2d6fd9d6f35079f1669f0100f05b46708c74b7f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072457-patio-doornail-5f44@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
c2d6fd9d6f35 ("jbd2: recheck chechpointing non-dirty buffer")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
ccd3c4373eac ("jbd2: fix use after free in jbd2_log_do_checkpoint()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2d6fd9d6f35079f1669f0100f05b46708c74b7f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:23 +0800
Subject: [PATCH] jbd2: recheck chechpointing non-dirty buffer
There is a long-standing metadata corruption issue that happens from
time to time, but it's very difficult to reproduce and analyse, benefit
from the JBD2_CYCLE_RECORD option, we found out that the problem is the
checkpointing process miss to write out some buffers which are raced by
another do_get_write_access(). Looks below for detail.
jbd2_log_do_checkpoint() //transaction X
//buffer A is dirty and not belones to any transaction
__buffer_relink_io() //move it to the IO list
__flush_batch()
write_dirty_buffer()
do_get_write_access()
clear_buffer_dirty
__jbd2_journal_file_buffer()
//add buffer A to a new transaction Y
lock_buffer(bh)
//doesn't write out
__jbd2_journal_remove_checkpoint()
//finish checkpoint except buffer A
//filesystem corrupt if the new transaction Y isn't fully write out.
Due to the t_checkpoint_list walking loop in jbd2_log_do_checkpoint()
have already handles waiting for buffers under IO and re-added new
transaction to complete commit, and it also removing cleaned buffers,
this makes sure the list will eventually get empty. So it's fine to
leave buffers on the t_checkpoint_list while flushing out and completely
stop using the t_checkpoint_io_list.
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Tested-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..25e3c20eb19f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -57,28 +57,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
}
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
/*
* Check a checkpoint buffer could be release or not.
*
@@ -183,6 +161,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,6 +221,11 @@ int jbd2_log_do_checkpoint(journal_t *journal)
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
+ /*
+ * The buffer may be writing back, or flushing out in the
+ * last couple of cycles, or re-adding into a new transaction,
+ * need to check it again until it's unlocked.
+ */
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
@@ -287,28 +271,32 @@ int jbd2_log_do_checkpoint(journal_t *journal)
}
if (!buffer_dirty(bh)) {
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +310,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c2d6fd9d6f35079f1669f0100f05b46708c74b7f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072456-thermal-relieving-1b68@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
c2d6fd9d6f35 ("jbd2: recheck chechpointing non-dirty buffer")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
2bf31d94423c ("jbd2: fix kernel-doc markups")
60ed633f51d0 ("jbd2: fix incorrect code style")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2d6fd9d6f35079f1669f0100f05b46708c74b7f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:23 +0800
Subject: [PATCH] jbd2: recheck chechpointing non-dirty buffer
There is a long-standing metadata corruption issue that happens from
time to time, but it's very difficult to reproduce and analyse, benefit
from the JBD2_CYCLE_RECORD option, we found out that the problem is the
checkpointing process miss to write out some buffers which are raced by
another do_get_write_access(). Looks below for detail.
jbd2_log_do_checkpoint() //transaction X
//buffer A is dirty and not belones to any transaction
__buffer_relink_io() //move it to the IO list
__flush_batch()
write_dirty_buffer()
do_get_write_access()
clear_buffer_dirty
__jbd2_journal_file_buffer()
//add buffer A to a new transaction Y
lock_buffer(bh)
//doesn't write out
__jbd2_journal_remove_checkpoint()
//finish checkpoint except buffer A
//filesystem corrupt if the new transaction Y isn't fully write out.
Due to the t_checkpoint_list walking loop in jbd2_log_do_checkpoint()
have already handles waiting for buffers under IO and re-added new
transaction to complete commit, and it also removing cleaned buffers,
this makes sure the list will eventually get empty. So it's fine to
leave buffers on the t_checkpoint_list while flushing out and completely
stop using the t_checkpoint_io_list.
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Tested-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..25e3c20eb19f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -57,28 +57,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
}
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
/*
* Check a checkpoint buffer could be release or not.
*
@@ -183,6 +161,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,6 +221,11 @@ int jbd2_log_do_checkpoint(journal_t *journal)
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
+ /*
+ * The buffer may be writing back, or flushing out in the
+ * last couple of cycles, or re-adding into a new transaction,
+ * need to check it again until it's unlocked.
+ */
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
@@ -287,28 +271,32 @@ int jbd2_log_do_checkpoint(journal_t *journal)
}
if (!buffer_dirty(bh)) {
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +310,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c2d6fd9d6f35079f1669f0100f05b46708c74b7f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072450-contour-slick-c3db@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
c2d6fd9d6f35 ("jbd2: recheck chechpointing non-dirty buffer")
4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
214eb5a4d8a2 ("jbd2: remove redundant buffer io error checks")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2d6fd9d6f35079f1669f0100f05b46708c74b7f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang(a)huawei.com>
Date: Tue, 6 Jun 2023 21:59:23 +0800
Subject: [PATCH] jbd2: recheck chechpointing non-dirty buffer
There is a long-standing metadata corruption issue that happens from
time to time, but it's very difficult to reproduce and analyse, benefit
from the JBD2_CYCLE_RECORD option, we found out that the problem is the
checkpointing process miss to write out some buffers which are raced by
another do_get_write_access(). Looks below for detail.
jbd2_log_do_checkpoint() //transaction X
//buffer A is dirty and not belones to any transaction
__buffer_relink_io() //move it to the IO list
__flush_batch()
write_dirty_buffer()
do_get_write_access()
clear_buffer_dirty
__jbd2_journal_file_buffer()
//add buffer A to a new transaction Y
lock_buffer(bh)
//doesn't write out
__jbd2_journal_remove_checkpoint()
//finish checkpoint except buffer A
//filesystem corrupt if the new transaction Y isn't fully write out.
Due to the t_checkpoint_list walking loop in jbd2_log_do_checkpoint()
have already handles waiting for buffers under IO and re-added new
transaction to complete commit, and it also removing cleaned buffers,
this makes sure the list will eventually get empty. So it's fine to
leave buffers on the t_checkpoint_list while flushing out and completely
stop using the t_checkpoint_io_list.
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Tested-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230606135928.434610-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..25e3c20eb19f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -57,28 +57,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
}
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
/*
* Check a checkpoint buffer could be release or not.
*
@@ -183,6 +161,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,6 +221,11 @@ int jbd2_log_do_checkpoint(journal_t *journal)
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
+ /*
+ * The buffer may be writing back, or flushing out in the
+ * last couple of cycles, or re-adding into a new transaction,
+ * need to check it again until it's unlocked.
+ */
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
@@ -287,28 +271,32 @@ int jbd2_log_do_checkpoint(journal_t *journal)
}
if (!buffer_dirty(bh)) {
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +310,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
From: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
[ Upstream commit 6bd6cd29c92401a101993290051fa55078238a52 ]
Returning early from stm32_usart_serial_remove() results in a resource
leak as several cleanup functions are not called. The driver core ignores
the return value and there is no possibility to clean up later.
uart_remove_one_port() only returns non-zero if there is some
inconsistency (i.e. stm32_usart_driver.state[port->line].uart_port == NULL).
This should never happen, and even if it does it's a bad idea to exit
early in the remove callback without cleaning up.
This prepares changing the prototype of struct platform_driver::remove to
return void. See commit 5c5a7680e67b ("platform: Provide a remove callback
that returns no value") for further details about this quest.
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
Link: https://lore.kernel.org/r/20230512173810.131447-2-u.kleine-koenig@pengutron…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/tty/serial/stm32-usart.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index 1e38fc9b10c11..e9e11a2596211 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -1755,13 +1755,10 @@ static int stm32_usart_serial_remove(struct platform_device *pdev)
struct uart_port *port = platform_get_drvdata(pdev);
struct stm32_port *stm32_port = to_stm32_port(port);
const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs;
- int err;
u32 cr3;
pm_runtime_get_sync(&pdev->dev);
- err = uart_remove_one_port(&stm32_usart_driver, port);
- if (err)
- return(err);
+ uart_remove_one_port(&stm32_usart_driver, port);
pm_runtime_disable(&pdev->dev);
pm_runtime_set_suspended(&pdev->dev);
--
2.39.2
I hope this message finds you well. I am currently downsizing my Late Husband Steinway Piano and have come to the decision of finding a new home for my late husband's piano. It holds sentimental value to me, and I want to ensure it goes to someone who will appreciate and cherish it as much as he did.
If you or anyone you know is interested in acquiring a Steinway piano, I would be delighted to hear from you. I believe that finding a suitable owner who will give it the care and attention it deserves is of utmost importance. The piano has been well-maintained over the years and is in excellent condition.
Please feel free to reach out to me at your earliest convenience. I send you more pictures of the pictures in my next email.
Cheers,
Linda Ramey
Email: rsharon1759(a)gmail.com
From: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
[ Upstream commit 6bd6cd29c92401a101993290051fa55078238a52 ]
Returning early from stm32_usart_serial_remove() results in a resource
leak as several cleanup functions are not called. The driver core ignores
the return value and there is no possibility to clean up later.
uart_remove_one_port() only returns non-zero if there is some
inconsistency (i.e. stm32_usart_driver.state[port->line].uart_port == NULL).
This should never happen, and even if it does it's a bad idea to exit
early in the remove callback without cleaning up.
This prepares changing the prototype of struct platform_driver::remove to
return void. See commit 5c5a7680e67b ("platform: Provide a remove callback
that returns no value") for further details about this quest.
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
Link: https://lore.kernel.org/r/20230512173810.131447-2-u.kleine-koenig@pengutron…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/tty/serial/stm32-usart.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index 28edbaf7bb329..2a9c4058824a8 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -1753,13 +1753,10 @@ static int stm32_usart_serial_remove(struct platform_device *pdev)
struct uart_port *port = platform_get_drvdata(pdev);
struct stm32_port *stm32_port = to_stm32_port(port);
const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs;
- int err;
u32 cr3;
pm_runtime_get_sync(&pdev->dev);
- err = uart_remove_one_port(&stm32_usart_driver, port);
- if (err)
- return(err);
+ uart_remove_one_port(&stm32_usart_driver, port);
pm_runtime_disable(&pdev->dev);
pm_runtime_set_suspended(&pdev->dev);
--
2.39.2
From: hackyzh002 <hackyzh002(a)gmail.com>
[ Upstream commit f828b681d0cd566f86351c0b913e6cb6ed8c7b9c ]
The type of size is unsigned, if size is 0x40000000, there will be an
integer overflow, size will be zero after size *= sizeof(uint32_t),
will cause uninitialized memory to be referenced later
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: hackyzh002 <hackyzh002(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/radeon/radeon_cs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 1ae31dbc61c64..5e61abb3dce5c 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -265,7 +265,8 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
{
struct drm_radeon_cs *cs = data;
uint64_t *chunk_array_ptr;
- unsigned size, i;
+ u64 size;
+ unsigned i;
u32 ring = RADEON_CS_RING_GFX;
s32 priority = 0;
--
2.39.2
From: hackyzh002 <hackyzh002(a)gmail.com>
[ Upstream commit f828b681d0cd566f86351c0b913e6cb6ed8c7b9c ]
The type of size is unsigned, if size is 0x40000000, there will be an
integer overflow, size will be zero after size *= sizeof(uint32_t),
will cause uninitialized memory to be referenced later
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: hackyzh002 <hackyzh002(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/radeon/radeon_cs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 7b54606783821..ba64dad1d7c9e 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -271,7 +271,8 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
{
struct drm_radeon_cs *cs = data;
uint64_t *chunk_array_ptr;
- unsigned size, i;
+ u64 size;
+ unsigned i;
u32 ring = RADEON_CS_RING_GFX;
s32 priority = 0;
--
2.39.2
From: hackyzh002 <hackyzh002(a)gmail.com>
[ Upstream commit f828b681d0cd566f86351c0b913e6cb6ed8c7b9c ]
The type of size is unsigned, if size is 0x40000000, there will be an
integer overflow, size will be zero after size *= sizeof(uint32_t),
will cause uninitialized memory to be referenced later
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: hackyzh002 <hackyzh002(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/radeon/radeon_cs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 9ed2b2700e0a5..e5fbe851ed930 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -270,7 +270,8 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
{
struct drm_radeon_cs *cs = data;
uint64_t *chunk_array_ptr;
- unsigned size, i;
+ u64 size;
+ unsigned i;
u32 ring = RADEON_CS_RING_GFX;
s32 priority = 0;
--
2.39.2
The Hyper-V host is queried to get the max transfer size that it
supports, and this value is used to set max_sectors for the synthetic
SCSI controller. However, this max transfer size may be too large
for virtual Fibre Channel devices, which are limited to 512 Kbytes.
If a larger transfer size is used with a vFC device, Hyper-V always
returns an error, and storvsc logs a message like this where the SRB
status and SCSI status are both zero:
hv_storvsc <GUID>: tag#197 cmd 0x8a status: scsi 0x0 srb 0x0 hv 0xc0000001
Add logic to limit the max transfer size to 512 Kbytes for vFC devices.
Fixes: 1d3e0980782f ("scsi: storvsc: Correct reporting of Hyper-V I/O size limits")
Cc: stable(a)vger.kernel.org
Signed-off-by: Michael Kelley <mikelley(a)microsoft.com>
---
drivers/scsi/storvsc_drv.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 7f12d93..f282321 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -366,6 +366,7 @@ enum storvsc_request_type {
#define STORVSC_FC_MAX_LUNS_PER_TARGET 255
#define STORVSC_FC_MAX_TARGETS 128
#define STORVSC_FC_MAX_CHANNELS 8
+#define STORVSC_FC_MAX_XFER_SIZE ((u32)(512 * 1024))
#define STORVSC_IDE_MAX_LUNS_PER_TARGET 64
#define STORVSC_IDE_MAX_TARGETS 1
@@ -2006,6 +2007,9 @@ static int storvsc_probe(struct hv_device *device,
* protecting it from any weird value.
*/
max_xfer_bytes = round_down(stor_device->max_transfer_bytes, HV_HYP_PAGE_SIZE);
+ if (is_fc)
+ max_xfer_bytes = min(max_xfer_bytes, STORVSC_FC_MAX_XFER_SIZE);
+
/* max_hw_sectors_kb */
host->max_sectors = max_xfer_bytes >> 9;
/*
--
1.8.3.1
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b321c31c9b7b309dcde5e8854b741c8e6a9a05f0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072322-posing-dispatch-863e@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b321c31c9b7b ("KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption")
0c2f9acf6ae7 ("KVM: arm64: PMU: Don't overwrite PMUSERENR with vcpu loaded")
8681f7175901 ("KVM: arm64: PMU: Restore the host's PMUSERENR_EL0")
009d6dc87a56 ("ARM: perf: Allow the use of the PMUv3 driver on 32bit ARM")
711432770f78 ("perf: pmuv3: Abstract PMU version checks")
df29ddf4f04b ("arm64: perf: Abstract system register accesses away")
7755cec63ade ("arm64: perf: Move PMUv3 driver to drivers/perf")
cc91b9481605 ("arm64/perf: Replace PMU version number '0' with ID_AA64DFR0_EL1_PMUVer_NI")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b321c31c9b7b309dcde5e8854b741c8e6a9a05f0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 13 Jul 2023 08:06:57 +0100
Subject: [PATCH] KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t
preemption
Xiang reports that VMs occasionally fail to boot on GICv4.1 systems when
running a preemptible kernel, as it is possible that a vCPU is blocked
without requesting a doorbell interrupt.
The issue is that any preemption that occurs between vgic_v4_put() and
schedule() on the block path will mark the vPE as nonresident and *not*
request a doorbell irq. This occurs because when the vcpu thread is
resumed on its way to block, vcpu_load() will make the vPE resident
again. Once the vcpu actually blocks, we don't request a doorbell
anymore, and the vcpu won't be woken up on interrupt delivery.
Fix it by tracking that we're entering WFI, and key the doorbell
request on that flag. This allows us not to make the vPE resident
when going through a preempt/schedule cycle, meaning we don't lose
any state.
Cc: stable(a)vger.kernel.org
Fixes: 8e01d9a396e6 ("KVM: arm64: vgic-v4: Move the GICv4 residency flow to be driven by vcpu_load/put")
Reported-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Suggested-by: Zenghui Yu <yuzenghui(a)huawei.com>
Tested-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Co-developed-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Zenghui Yu <yuzenghui(a)huawei.com>
Link: https://lore.kernel.org/r/20230713070657.3873244-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8b6096753740..d3dd05bbfe23 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -727,6 +727,8 @@ struct kvm_vcpu_arch {
#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5))
/* PMUSERENR for the guest EL0 is on physical CPU */
#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6))
+/* WFI instruction trapped */
+#define IN_WFI __vcpu_single_flag(sflags, BIT(7))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a402ea5511f3..72dc53a75d1c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -718,13 +718,15 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
*/
preempt_disable();
kvm_vgic_vmcr_sync(vcpu);
- vgic_v4_put(vcpu, true);
+ vcpu_set_flag(vcpu, IN_WFI);
+ vgic_v4_put(vcpu);
preempt_enable();
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
preempt_disable();
+ vcpu_clear_flag(vcpu, IN_WFI);
vgic_v4_load(vcpu);
preempt_enable();
}
@@ -792,7 +794,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
/* The distributor enable bits were changed */
preempt_disable();
- vgic_v4_put(vcpu, false);
+ vgic_v4_put(vcpu);
vgic_v4_load(vcpu);
preempt_enable();
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index c3b8e132d599..3dfc8b84e03e 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -749,7 +749,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- WARN_ON(vgic_v4_put(vcpu, false));
+ WARN_ON(vgic_v4_put(vcpu));
vgic_v3_vmcr_sync(vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index c1c28fe680ba..339a55194b2c 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -336,14 +336,14 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, need_db);
+ return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -354,6 +354,9 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
return 0;
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return 0;
+
/*
* Before making the VPE resident, make sure the redistributor
* corresponding to our current CPU expects us here. See the
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 402b545959af..5b27f94d4fad 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -431,7 +431,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq,
int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db);
+int vgic_v4_put(struct kvm_vcpu *vcpu);
/* CPU HP callbacks */
void kvm_vgic_cpu_up(void);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072359-overfull-ascent-520d@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
094d00f8ca58 ("KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance")
e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory")
82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2")
d6b4bd3f4897 ("KVM: arm64: Fixup hyp stage-1 refcount")
2ea2ff91e822 ("KVM: arm64: Refcount hyp stage-1 pgtable pages")
cf0c7125d578 ("Merge branch kvm-arm64/mmu/el2-tracking into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072357-trilogy-average-5fd9@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
094d00f8ca58 ("KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance")
e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory")
82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2")
d6b4bd3f4897 ("KVM: arm64: Fixup hyp stage-1 refcount")
2ea2ff91e822 ("KVM: arm64: Refcount hyp stage-1 pgtable pages")
cf0c7125d578 ("Merge branch kvm-arm64/mmu/el2-tracking into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072356-margin-freckled-cde8@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
094d00f8ca58 ("KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance")
e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory")
82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2")
d6b4bd3f4897 ("KVM: arm64: Fixup hyp stage-1 refcount")
2ea2ff91e822 ("KVM: arm64: Refcount hyp stage-1 pgtable pages")
cf0c7125d578 ("Merge branch kvm-arm64/mmu/el2-tracking into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072354-sublet-observing-2ff8@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
094d00f8ca58 ("KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance")
e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory")
82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2")
d6b4bd3f4897 ("KVM: arm64: Fixup hyp stage-1 refcount")
2ea2ff91e822 ("KVM: arm64: Refcount hyp stage-1 pgtable pages")
cf0c7125d578 ("Merge branch kvm-arm64/mmu/el2-tracking into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072352-ooze-childcare-fd84@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
094d00f8ca58 ("KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance")
e82edcc75c4e ("KVM: arm64: Implement do_share() helper for sharing memory")
82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2")
d6b4bd3f4897 ("KVM: arm64: Fixup hyp stage-1 refcount")
2ea2ff91e822 ("KVM: arm64: Refcount hyp stage-1 pgtable pages")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x df6556adf27b7372cfcd97e1c0afb0d516c8279f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072351-slackness-unlearned-6f45@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
df6556adf27b ("KVM: arm64: Correctly handle page aging notifiers for unaligned memslot")
ca5de2448c3b ("KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks")
6b91b8f95cad ("KVM: arm64: Use an opaque type for pteps")
fa002e8e79b3 ("KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data")
2a611c7f87f2 ("KVM: arm64: Pass mm_ops through the visitor context")
83844a2317ec ("KVM: arm64: Stash observed pte value in visitor context")
dfc7a7769ab7 ("KVM: arm64: Combine visitor arguments into a context structure")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df6556adf27b7372cfcd97e1c0afb0d516c8279f Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Tue, 27 Jun 2023 23:54:05 +0000
Subject: [PATCH] KVM: arm64: Correctly handle page aging notifiers for
unaligned memslot
Userspace is allowed to select any PAGE_SIZE aligned hva to back guest
memory. This is even the case with hugepages, although it is a rather
suboptimal configuration as PTE level mappings are used at stage-2.
The arm64 page aging handlers have an assumption that the specified
range is exactly one page/block of memory, which in the aforementioned
case is not necessarily true. All together this leads to the WARN() in
kvm_age_gfn() firing.
However, the WARN is only part of the issue as the table walkers visit
at most a single leaf PTE. For hugepage-backed memory in a memslot that
isn't hugepage-aligned, page aging entirely misses accesses to the
hugepage beyond the first page in the memslot.
Add a new walker dedicated to handling page aging MMU notifiers capable
of walking a range of PTEs. Convert kvm(_test)_age_gfn() over to the new
walker and drop the WARN that caught the issue in the first place. The
implementation of this walker was inspired by the test_clear_young()
implementation by Yu Zhao [*], but repurposed to address a bug in the
existing aging implementation.
Cc: stable(a)vger.kernel.org # v5.15
Fixes: 056aad67f836 ("kvm: arm/arm64: Rework gpa callback handlers")
Link: https://lore.kernel.org/kvmarm/20230526234435.662652-6-yuzhao@google.com/
Co-developed-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Reiji Watanabe <reijiw(a)google.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Link: https://lore.kernel.org/r/20230627235405.4069823-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8294a9a7e566..929d355eae0a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -608,22 +608,26 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
- * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+ * flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
+ * @size: Size of the address range to visit.
+ * @mkold: True if the access flag should be cleared.
*
* The offset of @addr within a page is ignored.
*
- * If there is a valid, leaf page-table entry used to translate @addr, then
- * clear the access flag in that entry.
+ * Tests and conditionally clears the access flag for every valid, leaf
+ * page-table entry used to translate the range [@addr, @addr + @size).
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
- * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ * Return: True if any of the visited PTEs had the access flag set.
*/
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
@@ -645,18 +649,6 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
-/**
- * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
- * access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr: Intermediate physical address to identify the page-table entry.
- *
- * The offset of @addr within a page is ignored.
- *
- * Return: True if the page-table entry has the access flag set, false otherwise.
- */
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
-
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index aa740a974e02..f7a93ef29250 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1195,25 +1195,54 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
return pte;
}
-kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+struct stage2_age_data {
+ bool mkold;
+ bool young;
+};
+
+static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
- &pte, NULL, 0);
+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data *data = ctx->arg;
+
+ if (!kvm_pte_valid(ctx->old) || new == ctx->old)
+ return 0;
+
+ data->young = true;
+
+ /*
+ * stage2_age_walker() is always called while holding the MMU lock for
+ * write, so this will always succeed. Nonetheless, this deliberately
+ * follows the race detection pattern of the other stage-2 walkers in
+ * case the locking mechanics of the MMU notifiers is ever changed.
+ */
+ if (data->mkold && !stage2_try_set_pte(ctx, new))
+ return -EAGAIN;
+
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
- return pte;
+ return 0;
}
-bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, bool mkold)
{
- kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
- return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ struct stage2_age_data data = {
+ .mkold = mkold,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_age_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+ return data.young;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..d3b4feed460c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1756,27 +1756,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
Recently Luiz Capitulino reported BPF test failure for kernel version
6.1.36 (see [7]). The following test_verifier test failed:
"precise: ST insn causing spi > allocated_stack".
After back-port of the following upstream commit:
ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
Investigation in [8] shows that test failure is not a bug, but a
difference in BPF verifier behavior between upstream, where commits
[1,2,3] by Andrii Nakryiko are present, and 6.1.36, where these
commits are absent. Both Luiz and Greg suggested back-porting [1,2,3]
from upstream to avoid divergences.
Commits [1,2,3] break test_progs selftest "align/packet variable offset",
commit [4] fixes this selftest.
I did some additional testing using the following compiler versions:
- Kernel compilation
- gcc version 11.3.0
- BPF tests compilation
- clang version 16.0.6
- clang version 17.0.0 (fa46feb31481)
And identified a few more failing BPF selftests:
- Tests failing with LLVM 16:
- test_verifier:
- precise: ST insn causing spi > allocated_stack FAIL (fixed by [1,2,3])
- test_progs:
- sk_assign (fixed by [6])
- Tests failing with LLVM 17:
- test_verifier:
- precise: ST insn causing spi > allocated_stack FAIL (fixed by [1,2,3])
- test_progs:
- fexit_bpf2bpf/func_replace_verify (fixed by [5])
- fexit_bpf2bpf/func_replace_return_code (fixed by [5])
- sk_assign (fixed by [6])
Commits [4,5,6] only apply to BPF selftests and don't change verifier
behavior.
After applying all of the listed commits I have test_verifier,
test_progs, test_progs-no_alu32 and test_maps passing on my x86 setup,
both for LLVM 16 and LLVM 17.
Upstream commits in chronological order:
[1] be2ef8161572 ("bpf: allow precision tracking for programs with subprogs")
[2] f63181b6ae79 ("bpf: stop setting precise in current state")
[3] 7a830b53c17b ("bpf: aggressively forget precise markings during state checkpointing")
[4] 4f999b767769 ("selftests/bpf: make test_align selftest more robust")
[5] 63d78b7e8ca2 ("selftests/bpf: Workaround verification failure for fexit_bpf2bpf/func_replace_return_code")
[6] 7ce878ca81bc ("selftests/bpf: Fix sk_assign on s390x")
Links:
[7] https://lore.kernel.org/stable/935c4751-d368-df29-33a6-9f4fcae720fa@amazon.…
[8] https://lore.kernel.org/stable/c9b10a8a551edafdfec855fbd35757c6238ad258.cam…
Reported-by: Luiz Capitulino <luizcap(a)amazon.com>
Andrii Nakryiko (4):
bpf: allow precision tracking for programs with subprogs
bpf: stop setting precise in current state
bpf: aggressively forget precise markings during state checkpointing
selftests/bpf: make test_align selftest more robust
Ilya Leoshkevich (1):
selftests/bpf: Fix sk_assign on s390x
Yonghong Song (1):
selftests/bpf: Workaround verification failure for
fexit_bpf2bpf/func_replace_return_code
kernel/bpf/verifier.c | 202 ++++++++++++++++--
.../testing/selftests/bpf/prog_tests/align.c | 38 ++--
.../selftests/bpf/prog_tests/sk_assign.c | 25 ++-
.../selftests/bpf/progs/connect4_prog.c | 2 +-
.../selftests/bpf/progs/test_sk_assign.c | 11 +
.../bpf/progs/test_sk_assign_libbpf.c | 3 +
6 files changed, 247 insertions(+), 34 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
--
2.41.0
Seen in arm:allmodconfig and arm64:allmodconfig builds with v5.4.249-278-g78f9a3d1c959.
sound/soc/mediatek/mt8173/mt8173-afe-pcm.c: In function 'mt8173_afe_pcm_dev_probe':
sound/soc/mediatek/mt8173/mt8173-afe-pcm.c:1159:17: error: label 'err_cleanup_components' used but not defined
Guenter
From: Mohamed Khalfella <mkhalfella(a)purestorage.com>
Commit 6018b585e8c6 ("tracing/histograms: Add histograms to hist_vars if
they have referenced variables") added a check to fail histogram creation
if save_hist_vars() failed to add histogram to hist_vars list. But the
commit failed to set ret to failed return code before jumping to
unregister histogram, fix it.
Link: https://lore.kernel.org/linux-trace-kernel/20230714203341.51396-1-mkhalfell…
Cc: stable(a)vger.kernel.org
Fixes: 6018b585e8c6 ("tracing/histograms: Add histograms to hist_vars if they have referenced variables")
Signed-off-by: Mohamed Khalfella <mkhalfella(a)purestorage.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events_hist.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c8c61381eba4..d06938ae0717 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6668,7 +6668,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_unreg;
if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
- if (save_hist_vars(hist_data))
+ ret = save_hist_vars(hist_data);
+ if (ret)
goto out_unreg;
}
--
2.40.1
`rustc` outputs by default the temporary files (i.e. the ones saved
by `-Csave-temps`, such as `*.rcgu*` files) in the current working
directory when `-o` and `--out-dir` are not given (even if
`--emit=x=path` is given, i.e. it does not use those for temporaries).
Since out-of-tree modules are compiled from the `linux` tree,
`rustc` then tries to create them there, which may not be accessible.
Thus pass `--out-dir` explicitly, even if it is just for the temporary
files.
Reported-by: Raphael Nestler <raphael.nestler(a)gmail.com>
Closes: https://github.com/Rust-for-Linux/linux/issues/1015
Reported-by: Andrea Righi <andrea.righi(a)canonical.com>
Tested-by: Raphael Nestler <raphael.nestler(a)gmail.com>
Tested-by: Andrea Righi <andrea.righi(a)canonical.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
scripts/Makefile.build | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 6413342a03f4..82e3fb19fdaf 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -264,6 +264,9 @@ $(obj)/%.lst: $(src)/%.c FORCE
rust_allowed_features := new_uninit
+# `--out-dir` is required to avoid temporaries being created by `rustc` in the
+# current working directory, which may be not accessible in the out-of-tree
+# modules case.
rust_common_cmd = \
RUST_MODFILE=$(modfile) $(RUSTC_OR_CLIPPY) $(rust_flags) \
-Zallow-features=$(rust_allowed_features) \
@@ -272,7 +275,7 @@ rust_common_cmd = \
--extern alloc --extern kernel \
--crate-type rlib -L $(objtree)/rust/ \
--crate-name $(basename $(notdir $@)) \
- --emit=dep-info=$(depfile)
+ --out-dir $(dir $@) --emit=dep-info=$(depfile)
# `--emit=obj`, `--emit=asm` and `--emit=llvm-ir` imply a single codegen unit
# will be used. We explicitly request `-Ccodegen-units=1` in any case, and
base-commit: 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5
--
2.41.0
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x e51df4f81b02bcdd828a04de7c1eb6a92988b61e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072307-art-revocable-bd1b@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
e51df4f81b02 ("ASoC: cs42l51: fix driver to properly autoload with automatic module loading")
4a4043456cb8 ("ASoC: cs*: use simple i2c probe function")
f8593e885400 ("ASoC: cs42l42: Handle system suspend")
5982b5a8ec7d ("ASoC: cs42l42: Change jack_detect_mutex to a lock of all IRQ handling")
8d06f797f844 ("ASoC: cs42l42: Report full jack status when plug is detected")
a319cb32e7cf ("ASoC: cs4265: Add a remove() function")
fdd535283779 ("ASoC: cs42l42: Report initial jack state")
c778c01d3e66 ("ASoC: cs42l42: Remove unused runtime_suspend/runtime_resume callbacks")
bfceb9c21601 ("Merge branch 'asoc-5.15' into asoc-5.16")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e51df4f81b02bcdd828a04de7c1eb6a92988b61e Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni(a)bootlin.com>
Date: Thu, 13 Jul 2023 13:21:12 +0200
Subject: [PATCH] ASoC: cs42l51: fix driver to properly autoload with automatic
module loading
In commit 2cb1e0259f50 ("ASoC: cs42l51: re-hook of_match_table
pointer"), 9 years ago, some random guy fixed the cs42l51 after it was
split into a core part and an I2C part to properly match based on a
Device Tree compatible string.
However, the fix in this commit is wrong: the MODULE_DEVICE_TABLE(of,
....) is in the core part of the driver, not the I2C part. Therefore,
automatic module loading based on module.alias, based on matching with
the DT compatible string, loads the core part of the driver, but not
the I2C part. And threfore, the i2c_driver is not registered, and the
codec is not known to the system, nor matched with a DT node with the
corresponding compatible string.
In order to fix that, we move the MODULE_DEVICE_TABLE(of, ...) into
the I2C part of the driver. The cs42l51_of_match[] array is also moved
as well, as it is not possible to have this definition in one file,
and the MODULE_DEVICE_TABLE(of, ...) invocation in another file, due
to how MODULE_DEVICE_TABLE works.
Thanks to this commit, the I2C part of the driver now properly
autoloads, and thanks to its dependency on the core part, the core
part gets autoloaded as well, resulting in a functional sound card
without having to manually load kernel modules.
Fixes: 2cb1e0259f50 ("ASoC: cs42l51: re-hook of_match_table pointer")
Cc: stable(a)vger.kernel.org
Signed-off-by: Thomas Petazzoni <thomas.petazzoni(a)bootlin.com>
Link: https://lore.kernel.org/r/20230713112112.778576-1-thomas.petazzoni@bootlin.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/codecs/cs42l51-i2c.c b/sound/soc/codecs/cs42l51-i2c.c
index b2106ff6a7cb..e7db7bcd0296 100644
--- a/sound/soc/codecs/cs42l51-i2c.c
+++ b/sound/soc/codecs/cs42l51-i2c.c
@@ -19,6 +19,12 @@ static struct i2c_device_id cs42l51_i2c_id[] = {
};
MODULE_DEVICE_TABLE(i2c, cs42l51_i2c_id);
+const struct of_device_id cs42l51_of_match[] = {
+ { .compatible = "cirrus,cs42l51", },
+ { }
+};
+MODULE_DEVICE_TABLE(of, cs42l51_of_match);
+
static int cs42l51_i2c_probe(struct i2c_client *i2c)
{
struct regmap_config config;
diff --git a/sound/soc/codecs/cs42l51.c b/sound/soc/codecs/cs42l51.c
index a67cd3ee84e0..a7079ae0ca09 100644
--- a/sound/soc/codecs/cs42l51.c
+++ b/sound/soc/codecs/cs42l51.c
@@ -823,13 +823,6 @@ int __maybe_unused cs42l51_resume(struct device *dev)
}
EXPORT_SYMBOL_GPL(cs42l51_resume);
-const struct of_device_id cs42l51_of_match[] = {
- { .compatible = "cirrus,cs42l51", },
- { }
-};
-MODULE_DEVICE_TABLE(of, cs42l51_of_match);
-EXPORT_SYMBOL_GPL(cs42l51_of_match);
-
MODULE_AUTHOR("Arnaud Patard <arnaud.patard(a)rtp-net.org>");
MODULE_DESCRIPTION("Cirrus Logic CS42L51 ALSA SoC Codec Driver");
MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/cs42l51.h b/sound/soc/codecs/cs42l51.h
index a79343e8a54e..125703ede113 100644
--- a/sound/soc/codecs/cs42l51.h
+++ b/sound/soc/codecs/cs42l51.h
@@ -16,7 +16,6 @@ int cs42l51_probe(struct device *dev, struct regmap *regmap);
void cs42l51_remove(struct device *dev);
int __maybe_unused cs42l51_suspend(struct device *dev);
int __maybe_unused cs42l51_resume(struct device *dev);
-extern const struct of_device_id cs42l51_of_match[];
#define CS42L51_CHIP_ID 0x1B
#define CS42L51_CHIP_REV_A 0x00
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2a9482e55968ed7368afaa9c2133404069117320
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072302-unengaged-strum-b2ad@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
2a9482e55968 ("drm/amd/display: Prevent vtotal from being set to 0")
1a4bcdbea431 ("drm/amd/display: Fix possible underflow for displays with large vblank")
469a62938a45 ("drm/amd/display: update extended blank for dcn314 onwards")
e3416e872f84 ("drm/amd/display: Add FAMS validation before trying to use it")
0db13eae41fc ("drm/amd/display: Add minimum Z8 residency debug option")
73dd4ca4b5a0 ("drm/amd/display: Fix Z8 support configurations")
db4107e92a81 ("drm/amd/display: fix dc/core/dc.c kernel-doc")
00812bfc7bcb ("drm/amd/display: Add debug option to skip PSR CRTC disable")
80676936805e ("drm/amd/display: Add Z8 allow states to z-state support list")
e366f36958f6 ("drm/amd/display: Rework comments on dc file")
bd829d570773 ("drm/amd/display: Refactor eDP PSR codes")
6f4f8ff567c4 ("drm/amd/display: Display does not light up after S4 resume")
4f5bdde386d3 ("drm/amd/display: Update PMFW z-state interface for DCN314")
8cab4ef0ad95 ("drm/amd/display: Keep OTG on when Z10 is disable")
1178ac68dc28 ("drm/amd/display: Refactor edp ILR caps codes")
5d4b59146078 ("drm/amd/display: Add ABM control to panel_config struct.")
936675464b1f ("drm/amd/display: Add debug option for exiting idle optimizations on cursor updates")
eccff6cdde6f ("drm/amd/display: Refactor edp panel power sequencer(PPS) codes")
9f6f6be163df ("drm/amd/display: Add comments.")
c17a34e0526f ("drm/amd/display: Refactor edp dsc codes.")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2a9482e55968ed7368afaa9c2133404069117320 Mon Sep 17 00:00:00 2001
From: Daniel Miess <daniel.miess(a)amd.com>
Date: Thu, 22 Jun 2023 08:11:48 -0400
Subject: [PATCH] drm/amd/display: Prevent vtotal from being set to 0
[Why]
In dcn314 DML the destination pipe vtotal was being set
to the crtc adjustment vtotal_min value even in cases
where that value is 0.
[How]
Only set vtotal to the crtc adjustment vtotal_min value
in cases where the value is non-zero.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Acked-by: Alan Liu <haoping.liu(a)amd.com>
Signed-off-by: Daniel Miess <daniel.miess(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
index d9e049e7ff0a..ed8ddb75b333 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
@@ -295,7 +295,11 @@ int dcn314_populate_dml_pipes_from_context_fpu(struct dc *dc, struct dc_state *c
pipe = &res_ctx->pipe_ctx[i];
timing = &pipe->stream->timing;
- pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ if (pipe->stream->adjust.v_total_min != 0)
+ pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ else
+ pipes[pipe_cnt].pipe.dest.vtotal = timing->v_total;
+
pipes[pipe_cnt].pipe.dest.vblank_nom = timing->v_total - pipes[pipe_cnt].pipe.dest.vactive;
pipes[pipe_cnt].pipe.dest.vblank_nom = min(pipes[pipe_cnt].pipe.dest.vblank_nom, dcn3_14_ip.VBlankNomDefaultUS);
pipes[pipe_cnt].pipe.dest.vblank_nom = max(pipes[pipe_cnt].pipe.dest.vblank_nom, timing->v_sync_width);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 2a9482e55968ed7368afaa9c2133404069117320
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072301-urging-upheld-1851@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
2a9482e55968 ("drm/amd/display: Prevent vtotal from being set to 0")
1a4bcdbea431 ("drm/amd/display: Fix possible underflow for displays with large vblank")
469a62938a45 ("drm/amd/display: update extended blank for dcn314 onwards")
e3416e872f84 ("drm/amd/display: Add FAMS validation before trying to use it")
0db13eae41fc ("drm/amd/display: Add minimum Z8 residency debug option")
73dd4ca4b5a0 ("drm/amd/display: Fix Z8 support configurations")
db4107e92a81 ("drm/amd/display: fix dc/core/dc.c kernel-doc")
00812bfc7bcb ("drm/amd/display: Add debug option to skip PSR CRTC disable")
80676936805e ("drm/amd/display: Add Z8 allow states to z-state support list")
e366f36958f6 ("drm/amd/display: Rework comments on dc file")
bd829d570773 ("drm/amd/display: Refactor eDP PSR codes")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2a9482e55968ed7368afaa9c2133404069117320 Mon Sep 17 00:00:00 2001
From: Daniel Miess <daniel.miess(a)amd.com>
Date: Thu, 22 Jun 2023 08:11:48 -0400
Subject: [PATCH] drm/amd/display: Prevent vtotal from being set to 0
[Why]
In dcn314 DML the destination pipe vtotal was being set
to the crtc adjustment vtotal_min value even in cases
where that value is 0.
[How]
Only set vtotal to the crtc adjustment vtotal_min value
in cases where the value is non-zero.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Acked-by: Alan Liu <haoping.liu(a)amd.com>
Signed-off-by: Daniel Miess <daniel.miess(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
index d9e049e7ff0a..ed8ddb75b333 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
@@ -295,7 +295,11 @@ int dcn314_populate_dml_pipes_from_context_fpu(struct dc *dc, struct dc_state *c
pipe = &res_ctx->pipe_ctx[i];
timing = &pipe->stream->timing;
- pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ if (pipe->stream->adjust.v_total_min != 0)
+ pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ else
+ pipes[pipe_cnt].pipe.dest.vtotal = timing->v_total;
+
pipes[pipe_cnt].pipe.dest.vblank_nom = timing->v_total - pipes[pipe_cnt].pipe.dest.vactive;
pipes[pipe_cnt].pipe.dest.vblank_nom = min(pipes[pipe_cnt].pipe.dest.vblank_nom, dcn3_14_ip.VBlankNomDefaultUS);
pipes[pipe_cnt].pipe.dest.vblank_nom = max(pipes[pipe_cnt].pipe.dest.vblank_nom, timing->v_sync_width);
The patch below does not apply to the 6.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2a9482e55968ed7368afaa9c2133404069117320
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072300-thwarting-luckiness-18bd@gregkh' --subject-prefix 'PATCH 6.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2a9482e55968ed7368afaa9c2133404069117320 Mon Sep 17 00:00:00 2001
From: Daniel Miess <daniel.miess(a)amd.com>
Date: Thu, 22 Jun 2023 08:11:48 -0400
Subject: [PATCH] drm/amd/display: Prevent vtotal from being set to 0
[Why]
In dcn314 DML the destination pipe vtotal was being set
to the crtc adjustment vtotal_min value even in cases
where that value is 0.
[How]
Only set vtotal to the crtc adjustment vtotal_min value
in cases where the value is non-zero.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Acked-by: Alan Liu <haoping.liu(a)amd.com>
Signed-off-by: Daniel Miess <daniel.miess(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
index d9e049e7ff0a..ed8ddb75b333 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c
@@ -295,7 +295,11 @@ int dcn314_populate_dml_pipes_from_context_fpu(struct dc *dc, struct dc_state *c
pipe = &res_ctx->pipe_ctx[i];
timing = &pipe->stream->timing;
- pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ if (pipe->stream->adjust.v_total_min != 0)
+ pipes[pipe_cnt].pipe.dest.vtotal = pipe->stream->adjust.v_total_min;
+ else
+ pipes[pipe_cnt].pipe.dest.vtotal = timing->v_total;
+
pipes[pipe_cnt].pipe.dest.vblank_nom = timing->v_total - pipes[pipe_cnt].pipe.dest.vactive;
pipes[pipe_cnt].pipe.dest.vblank_nom = min(pipes[pipe_cnt].pipe.dest.vblank_nom, dcn3_14_ip.VBlankNomDefaultUS);
pipes[pipe_cnt].pipe.dest.vblank_nom = max(pipes[pipe_cnt].pipe.dest.vblank_nom, timing->v_sync_width);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b42ae87a7b3878afaf4c3852ca66c025a5b996e0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072344-gyration-coveted-1049@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b42ae87a7b38 ("drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel")
deefd07eedb7 ("drm/amdgpu: fix vkms crtc settings")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b42ae87a7b3878afaf4c3852ca66c025a5b996e0 Mon Sep 17 00:00:00 2001
From: Guchun Chen <guchun.chen(a)amd.com>
Date: Thu, 6 Jul 2023 15:57:21 +0800
Subject: [PATCH] drm/amdgpu/vkms: relax timer deactivation by
hrtimer_try_to_cancel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
In below thousands of screen rotation loop tests with virtual display
enabled, a CPU hard lockup issue may happen, leading system to unresponsive
and crash.
do {
xrandr --output Virtual --rotate inverted
xrandr --output Virtual --rotate right
xrandr --output Virtual --rotate left
xrandr --output Virtual --rotate normal
} while (1);
NMI watchdog: Watchdog detected hard LOCKUP on cpu 1
? hrtimer_run_softirq+0x140/0x140
? store_vblank+0xe0/0xe0 [drm]
hrtimer_cancel+0x15/0x30
amdgpu_vkms_disable_vblank+0x15/0x30 [amdgpu]
drm_vblank_disable_and_save+0x185/0x1f0 [drm]
drm_crtc_vblank_off+0x159/0x4c0 [drm]
? record_print_text.cold+0x11/0x11
? wait_for_completion_timeout+0x232/0x280
? drm_crtc_wait_one_vblank+0x40/0x40 [drm]
? bit_wait_io_timeout+0xe0/0xe0
? wait_for_completion_interruptible+0x1d7/0x320
? mutex_unlock+0x81/0xd0
amdgpu_vkms_crtc_atomic_disable
It's caused by a stuck in lock dependency in such scenario on different
CPUs.
CPU1 CPU2
drm_crtc_vblank_off hrtimer_interrupt
grab event_lock (irq disabled) __hrtimer_run_queues
grab vbl_lock/vblank_time_block amdgpu_vkms_vblank_simulate
amdgpu_vkms_disable_vblank drm_handle_vblank
hrtimer_cancel grab dev->event_lock
So CPU1 stucks in hrtimer_cancel as timer callback is running endless on
current clock base, as that timer queue on CPU2 has no chance to finish it
because of failing to hold the lock. So NMI watchdog will throw the errors
after its threshold, and all later CPUs are impacted/blocked.
So use hrtimer_try_to_cancel to fix this, as disable_vblank callback
does not need to wait the handler to finish. And also it's not necessary
to check the return value of hrtimer_try_to_cancel, because even if it's
-1 which means current timer callback is running, it will be reprogrammed
in hrtimer_start with calling enable_vblank to make it works.
v2: only re-arm timer when vblank is enabled (Christian) and add a Fixes
tag as well
v3: drop warn printing (Christian)
v4: drop superfluous check of blank->enabled in timer function, as it's
guaranteed in drm_handle_vblank (Christian)
Fixes: 84ec374bd580 ("drm/amdgpu: create amdgpu_vkms (v4)")
Cc: stable(a)vger.kernel.org
Suggested-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Guchun Chen <guchun.chen(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..d0748bcfad16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -55,8 +55,9 @@ static enum hrtimer_restart amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
DRM_WARN("%s: vblank timer overrun\n", __func__);
ret = drm_crtc_handle_vblank(crtc);
+ /* Don't queue timer again when vblank is disabled. */
if (!ret)
- DRM_ERROR("amdgpu_vkms failure on handling vblank");
+ return HRTIMER_NORESTART;
return HRTIMER_RESTART;
}
@@ -81,7 +82,7 @@ static void amdgpu_vkms_disable_vblank(struct drm_crtc *crtc)
{
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
- hrtimer_cancel(&amdgpu_crtc->vblank_timer);
+ hrtimer_try_to_cancel(&amdgpu_crtc->vblank_timer);
}
static bool amdgpu_vkms_get_vblank_timestamp(struct drm_crtc *crtc,