In case of the COW file, new updates and GC writes are already
separated to page caches of the atomic file and COW file. As some cases
that use the meta inode for GC, there are some race issues between a
foreground thread and GC thread.
To handle them, we need to take care when to invalidate and wait
writeback of GC pages in COW files as the case of using the meta inode.
Also, a pointer from the COW inode to the original inode is required to
check the state of original pages.
For the former, we can solve the problem by using the meta inode for GC
of COW files. Then let's get a page from the original inode in
move_data_block when GCing the COW file to avoid race condition.
Fixes: 3db1de0e582c ("f2fs: change the current atomic write way")
Cc: stable(a)vger.kernel.org #v5.19+
Reviewed-by: Sungjong Seo <sj1557.seo(a)samsung.com>
Reviewed-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sunmin Jeong <s_min.jeong(a)samsung.com>
---
fs/f2fs/data.c | 2 +-
fs/f2fs/f2fs.h | 7 ++++++-
fs/f2fs/file.c | 3 +++
fs/f2fs/gc.c | 12 ++++++++++--
fs/f2fs/inline.c | 2 +-
fs/f2fs/inode.c | 3 ++-
6 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 05158f89ef32..90ff0f6f7f7f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2651,7 +2651,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (IS_NOQUOTA(inode))
return true;
- if (f2fs_is_atomic_file(inode))
+ if (f2fs_used_in_atomic_write(inode))
return true;
/* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */
if (f2fs_compressed_file(inode) &&
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 59c5117e54b1..4f9fd1c1d024 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4267,9 +4267,14 @@ static inline bool f2fs_post_read_required(struct inode *inode)
f2fs_compressed_file(inode);
}
+static inline bool f2fs_used_in_atomic_write(struct inode *inode)
+{
+ return f2fs_is_atomic_file(inode) || f2fs_is_cow_file(inode);
+}
+
static inline bool f2fs_meta_inode_gc_required(struct inode *inode)
{
- return f2fs_post_read_required(inode) || f2fs_is_atomic_file(inode);
+ return f2fs_post_read_required(inode) || f2fs_used_in_atomic_write(inode);
}
/*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 25b119cf3499..c9f0ba658cfd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2116,6 +2116,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+
+ /* Set the COW inode's cow_inode to the atomic inode */
+ F2FS_I(fi->cow_inode)->cow_inode = inode;
} else {
/* Reuse the already created COW inode */
ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 136b9e8180a3..76854e732b35 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1188,7 +1188,11 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
};
int err;
- page = f2fs_grab_cache_page(mapping, index, true);
+ if (f2fs_is_cow_file(inode))
+ page = f2fs_grab_cache_page(F2FS_I(inode)->cow_inode->i_mapping,
+ index, true);
+ else
+ page = f2fs_grab_cache_page(mapping, index, true);
if (!page)
return -ENOMEM;
@@ -1287,7 +1291,11 @@ static int move_data_block(struct inode *inode, block_t bidx,
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
/* do not read out */
- page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
+ if (f2fs_is_cow_file(inode))
+ page = f2fs_grab_cache_page(F2FS_I(inode)->cow_inode->i_mapping,
+ bidx, false);
+ else
+ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return -ENOMEM;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index ac00423f117b..0186ec049db6 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -16,7 +16,7 @@
static bool support_inline_data(struct inode *inode)
{
- if (f2fs_is_atomic_file(inode))
+ if (f2fs_used_in_atomic_write(inode))
return false;
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index c26effdce9aa..c810304e2681 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -807,8 +807,9 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_abort_atomic_write(inode, true);
- if (fi->cow_inode) {
+ if (fi->cow_inode && f2fs_is_cow_file(fi->cow_inode)) {
clear_inode_flag(fi->cow_inode, FI_COW_FILE);
+ F2FS_I(fi->cow_inode)->cow_inode = NULL;
iput(fi->cow_inode);
fi->cow_inode = NULL;
}
--
2.25.1
The page cache of the atomic file keeps new data pages which will be
stored in the COW file. It can also keep old data pages when GCing the
atomic file. In this case, new data can be overwritten by old data if a
GC thread sets the old data page as dirty after new data page was
evicted.
Also, since all writes to the atomic file are redirected to COW inodes,
GC for the atomic file is not working well as below.
f2fs_gc(gc_type=FG_GC)
- select A as a victim segment
do_garbage_collect
- iget atomic file's inode for block B
move_data_page
f2fs_do_write_data_page
- use dn of cow inode
- set fio->old_blkaddr from cow inode
- seg_freed is 0 since block B is still valid
- goto gc_more and A is selected as victim again
To solve the problem, let's separate GC writes and updates in the atomic
file by using the meta inode for GC writes.
Fixes: 3db1de0e582c ("f2fs: change the current atomic write way")
Cc: stable(a)vger.kernel.org #v5.19+
Reviewed-by: Sungjong Seo <sj1557.seo(a)samsung.com>
Reviewed-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sunmin Jeong <s_min.jeong(a)samsung.com>
---
fs/f2fs/f2fs.h | 5 +++++
fs/f2fs/gc.c | 6 +++---
fs/f2fs/segment.c | 4 ++--
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a000cb024dbe..59c5117e54b1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4267,6 +4267,11 @@ static inline bool f2fs_post_read_required(struct inode *inode)
f2fs_compressed_file(inode);
}
+static inline bool f2fs_meta_inode_gc_required(struct inode *inode)
+{
+ return f2fs_post_read_required(inode) || f2fs_is_atomic_file(inode);
+}
+
/*
* compress.c
*/
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a079eebfb080..136b9e8180a3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1580,7 +1580,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
ofs_in_node;
- if (f2fs_post_read_required(inode)) {
+ if (f2fs_meta_inode_gc_required(inode)) {
int err = ra_data_block(inode, start_bidx);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1631,7 +1631,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
start_bidx = f2fs_start_bidx_of_node(nofs, inode)
+ ofs_in_node;
- if (f2fs_post_read_required(inode))
+ if (f2fs_meta_inode_gc_required(inode))
err = move_data_block(inode, start_bidx,
gc_type, segno, off);
else
@@ -1639,7 +1639,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
segno, off);
if (!err && (gc_type == FG_GC ||
- f2fs_post_read_required(inode)))
+ f2fs_meta_inode_gc_required(inode)))
submitted++;
if (locked) {
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7e47b8054413..b55fc4bd416a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3823,7 +3823,7 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *cpage;
- if (!f2fs_post_read_required(inode))
+ if (!f2fs_meta_inode_gc_required(inode))
return;
if (!__is_valid_data_blkaddr(blkaddr))
@@ -3842,7 +3842,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
block_t i;
- if (!f2fs_post_read_required(inode))
+ if (!f2fs_meta_inode_gc_required(inode))
return;
for (i = 0; i < len; i++)
--
2.25.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b321c31c9b7b309dcde5e8854b741c8e6a9a05f0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072323-trident-unturned-7999@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b321c31c9b7b ("KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption")
0c2f9acf6ae7 ("KVM: arm64: PMU: Don't overwrite PMUSERENR with vcpu loaded")
8681f7175901 ("KVM: arm64: PMU: Restore the host's PMUSERENR_EL0")
009d6dc87a56 ("ARM: perf: Allow the use of the PMUv3 driver on 32bit ARM")
711432770f78 ("perf: pmuv3: Abstract PMU version checks")
df29ddf4f04b ("arm64: perf: Abstract system register accesses away")
7755cec63ade ("arm64: perf: Move PMUv3 driver to drivers/perf")
cc91b9481605 ("arm64/perf: Replace PMU version number '0' with ID_AA64DFR0_EL1_PMUVer_NI")
4151bb636acf ("KVM: arm64: Fix SMPRI_EL1/TPIDR2_EL0 trapping on VHE")
bb0cca240a16 ("Merge branch kvm-arm64/single-step-async-exception into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b321c31c9b7b309dcde5e8854b741c8e6a9a05f0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 13 Jul 2023 08:06:57 +0100
Subject: [PATCH] KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t
preemption
Xiang reports that VMs occasionally fail to boot on GICv4.1 systems when
running a preemptible kernel, as it is possible that a vCPU is blocked
without requesting a doorbell interrupt.
The issue is that any preemption that occurs between vgic_v4_put() and
schedule() on the block path will mark the vPE as nonresident and *not*
request a doorbell irq. This occurs because when the vcpu thread is
resumed on its way to block, vcpu_load() will make the vPE resident
again. Once the vcpu actually blocks, we don't request a doorbell
anymore, and the vcpu won't be woken up on interrupt delivery.
Fix it by tracking that we're entering WFI, and key the doorbell
request on that flag. This allows us not to make the vPE resident
when going through a preempt/schedule cycle, meaning we don't lose
any state.
Cc: stable(a)vger.kernel.org
Fixes: 8e01d9a396e6 ("KVM: arm64: vgic-v4: Move the GICv4 residency flow to be driven by vcpu_load/put")
Reported-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Suggested-by: Zenghui Yu <yuzenghui(a)huawei.com>
Tested-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Co-developed-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Zenghui Yu <yuzenghui(a)huawei.com>
Link: https://lore.kernel.org/r/20230713070657.3873244-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8b6096753740..d3dd05bbfe23 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -727,6 +727,8 @@ struct kvm_vcpu_arch {
#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5))
/* PMUSERENR for the guest EL0 is on physical CPU */
#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6))
+/* WFI instruction trapped */
+#define IN_WFI __vcpu_single_flag(sflags, BIT(7))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a402ea5511f3..72dc53a75d1c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -718,13 +718,15 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
*/
preempt_disable();
kvm_vgic_vmcr_sync(vcpu);
- vgic_v4_put(vcpu, true);
+ vcpu_set_flag(vcpu, IN_WFI);
+ vgic_v4_put(vcpu);
preempt_enable();
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
preempt_disable();
+ vcpu_clear_flag(vcpu, IN_WFI);
vgic_v4_load(vcpu);
preempt_enable();
}
@@ -792,7 +794,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
/* The distributor enable bits were changed */
preempt_disable();
- vgic_v4_put(vcpu, false);
+ vgic_v4_put(vcpu);
vgic_v4_load(vcpu);
preempt_enable();
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index c3b8e132d599..3dfc8b84e03e 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -749,7 +749,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- WARN_ON(vgic_v4_put(vcpu, false));
+ WARN_ON(vgic_v4_put(vcpu));
vgic_v3_vmcr_sync(vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index c1c28fe680ba..339a55194b2c 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -336,14 +336,14 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, need_db);
+ return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -354,6 +354,9 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
return 0;
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return 0;
+
/*
* Before making the VPE resident, make sure the redistributor
* corresponding to our current CPU expects us here. See the
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 402b545959af..5b27f94d4fad 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -431,7 +431,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq,
int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db);
+int vgic_v4_put(struct kvm_vcpu *vcpu);
/* CPU HP callbacks */
void kvm_vgic_cpu_up(void);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 75dde792d6f6c2d0af50278bd374bf0c512fe196
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062455-glazing-flask-cf0c@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
75dde792d6f6 ("efi/x86: Free EFI memory map only when installing a new one.")
d85e3e349407 ("efi: xen: Set EFI_PARAVIRT for Xen dom0 boot on all architectures")
fdc6d38d64a2 ("efi: memmap: Move manipulation routines into x86 arch tree")
1df4d1724baa ("drivers: fix typo in firmware/efi/memmap.c")
db01ea882bf6 ("efi: Correct comment on efi_memmap_alloc")
3ecc68349bba ("memblock: rename memblock_free to memblock_phys_free")
fa27717110ae ("memblock: drop memblock_free_early_nid() and memblock_free_early()")
658aafc8139c ("memblock: exclude MEMBLOCK_NOMAP regions from kmemleak")
c6460daea23d ("Merge tag 'for-linus-5.15b-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 75dde792d6f6c2d0af50278bd374bf0c512fe196 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb(a)kernel.org>
Date: Mon, 10 Jun 2024 16:02:13 +0200
Subject: [PATCH] efi/x86: Free EFI memory map only when installing a new one.
The logic in __efi_memmap_init() is shared between two different
execution flows:
- mapping the EFI memory map early or late into the kernel VA space, so
that its entries can be accessed;
- the x86 specific cloning of the EFI memory map in order to insert new
entries that are created as a result of making a memory reservation
via a call to efi_mem_reserve().
In the former case, the underlying memory containing the kernel's view
of the EFI memory map (which may be heavily modified by the kernel
itself on x86) is not modified at all, and the only thing that changes
is the virtual mapping of this memory, which is different between early
and late boot.
In the latter case, an entirely new allocation is created that carries a
new, updated version of the kernel's view of the EFI memory map. When
installing this new version, the old version will no longer be
referenced, and if the memory was allocated by the kernel, it will leak
unless it gets freed.
The logic that implements this freeing currently lives on the code path
that is shared between these two use cases, but it should only apply to
the latter. So move it to the correct spot.
While at it, drop the dummy definition for non-x86 architectures, as
that is no longer needed.
Cc: <stable(a)vger.kernel.org>
Fixes: f0ef6523475f ("efi: Fix efi_memmap_alloc() leaks")
Tested-by: Ashish Kalra <Ashish.Kalra(a)amd.com>
Link: https://lore.kernel.org/all/36ad5079-4326-45ed-85f6-928ff76483d3@amd.com
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 1dc600fa3ba5..481096177500 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -401,7 +401,6 @@ extern int __init efi_memmap_alloc(unsigned int num_entries,
struct efi_memory_map_data *data);
extern void __efi_memmap_free(u64 phys, unsigned long size,
unsigned long flags);
-#define __efi_memmap_free __efi_memmap_free
extern int __init efi_memmap_install(struct efi_memory_map_data *data);
extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index 4ef20b49eb5e..6ed1935504b9 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -92,12 +92,22 @@ int __init efi_memmap_alloc(unsigned int num_entries,
*/
int __init efi_memmap_install(struct efi_memory_map_data *data)
{
+ unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map;
+ unsigned long flags = efi.memmap.flags;
+ u64 phys = efi.memmap.phys_map;
+ int ret;
+
efi_memmap_unmap();
if (efi_enabled(EFI_PARAVIRT))
return 0;
- return __efi_memmap_init(data);
+ ret = __efi_memmap_init(data);
+ if (ret)
+ return ret;
+
+ __efi_memmap_free(phys, size, flags);
+ return 0;
}
/**
diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c
index 3365944f7965..34109fd86c55 100644
--- a/drivers/firmware/efi/memmap.c
+++ b/drivers/firmware/efi/memmap.c
@@ -15,10 +15,6 @@
#include <asm/early_ioremap.h>
#include <asm/efi.h>
-#ifndef __efi_memmap_free
-#define __efi_memmap_free(phys, size, flags) do { } while (0)
-#endif
-
/**
* __efi_memmap_init - Common code for mapping the EFI memory map
* @data: EFI memory map data
@@ -51,11 +47,6 @@ int __init __efi_memmap_init(struct efi_memory_map_data *data)
return -ENOMEM;
}
- if (efi.memmap.flags & (EFI_MEMMAP_MEMBLOCK | EFI_MEMMAP_SLAB))
- __efi_memmap_free(efi.memmap.phys_map,
- efi.memmap.desc_size * efi.memmap.nr_map,
- efi.memmap.flags);
-
map.phys_map = data->phys_map;
map.nr_map = data->size / data->desc_size;
map.map_end = map.map + data->size;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 75dde792d6f6c2d0af50278bd374bf0c512fe196
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062442-bonfire-detonator-1b17@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
75dde792d6f6 ("efi/x86: Free EFI memory map only when installing a new one.")
d85e3e349407 ("efi: xen: Set EFI_PARAVIRT for Xen dom0 boot on all architectures")
fdc6d38d64a2 ("efi: memmap: Move manipulation routines into x86 arch tree")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 75dde792d6f6c2d0af50278bd374bf0c512fe196 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb(a)kernel.org>
Date: Mon, 10 Jun 2024 16:02:13 +0200
Subject: [PATCH] efi/x86: Free EFI memory map only when installing a new one.
The logic in __efi_memmap_init() is shared between two different
execution flows:
- mapping the EFI memory map early or late into the kernel VA space, so
that its entries can be accessed;
- the x86 specific cloning of the EFI memory map in order to insert new
entries that are created as a result of making a memory reservation
via a call to efi_mem_reserve().
In the former case, the underlying memory containing the kernel's view
of the EFI memory map (which may be heavily modified by the kernel
itself on x86) is not modified at all, and the only thing that changes
is the virtual mapping of this memory, which is different between early
and late boot.
In the latter case, an entirely new allocation is created that carries a
new, updated version of the kernel's view of the EFI memory map. When
installing this new version, the old version will no longer be
referenced, and if the memory was allocated by the kernel, it will leak
unless it gets freed.
The logic that implements this freeing currently lives on the code path
that is shared between these two use cases, but it should only apply to
the latter. So move it to the correct spot.
While at it, drop the dummy definition for non-x86 architectures, as
that is no longer needed.
Cc: <stable(a)vger.kernel.org>
Fixes: f0ef6523475f ("efi: Fix efi_memmap_alloc() leaks")
Tested-by: Ashish Kalra <Ashish.Kalra(a)amd.com>
Link: https://lore.kernel.org/all/36ad5079-4326-45ed-85f6-928ff76483d3@amd.com
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 1dc600fa3ba5..481096177500 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -401,7 +401,6 @@ extern int __init efi_memmap_alloc(unsigned int num_entries,
struct efi_memory_map_data *data);
extern void __efi_memmap_free(u64 phys, unsigned long size,
unsigned long flags);
-#define __efi_memmap_free __efi_memmap_free
extern int __init efi_memmap_install(struct efi_memory_map_data *data);
extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index 4ef20b49eb5e..6ed1935504b9 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -92,12 +92,22 @@ int __init efi_memmap_alloc(unsigned int num_entries,
*/
int __init efi_memmap_install(struct efi_memory_map_data *data)
{
+ unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map;
+ unsigned long flags = efi.memmap.flags;
+ u64 phys = efi.memmap.phys_map;
+ int ret;
+
efi_memmap_unmap();
if (efi_enabled(EFI_PARAVIRT))
return 0;
- return __efi_memmap_init(data);
+ ret = __efi_memmap_init(data);
+ if (ret)
+ return ret;
+
+ __efi_memmap_free(phys, size, flags);
+ return 0;
}
/**
diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c
index 3365944f7965..34109fd86c55 100644
--- a/drivers/firmware/efi/memmap.c
+++ b/drivers/firmware/efi/memmap.c
@@ -15,10 +15,6 @@
#include <asm/early_ioremap.h>
#include <asm/efi.h>
-#ifndef __efi_memmap_free
-#define __efi_memmap_free(phys, size, flags) do { } while (0)
-#endif
-
/**
* __efi_memmap_init - Common code for mapping the EFI memory map
* @data: EFI memory map data
@@ -51,11 +47,6 @@ int __init __efi_memmap_init(struct efi_memory_map_data *data)
return -ENOMEM;
}
- if (efi.memmap.flags & (EFI_MEMMAP_MEMBLOCK | EFI_MEMMAP_SLAB))
- __efi_memmap_free(efi.memmap.phys_map,
- efi.memmap.desc_size * efi.memmap.nr_map,
- efi.memmap.flags);
-
map.phys_map = data->phys_map;
map.nr_map = data->size / data->desc_size;
map.map_end = map.map + data->size;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x bf14ed81f571f8dba31cd72ab2e50fbcc877cc31
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024070129-movable-commend-1b2a@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
bf14ed81f571 ("mm/page_alloc: Separate THP PCP into movable and non-movable categories")
6303d1c553c8 ("mm: page_alloc: use the correct THP order for THP PCP")
c1dc69e6ce65 ("mm/page_alloc: remove unneeded variable base")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bf14ed81f571f8dba31cd72ab2e50fbcc877cc31 Mon Sep 17 00:00:00 2001
From: yangge <yangge1116(a)126.com>
Date: Thu, 20 Jun 2024 08:59:50 +0800
Subject: [PATCH] mm/page_alloc: Separate THP PCP into movable and non-movable
categories
Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for
THP-sized allocations") no longer differentiates the migration type of
pages in THP-sized PCP list, it's possible that non-movable allocation
requests may get a CMA page from the list, in some cases, it's not
acceptable.
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
machine with device passthrough will get stuck. During starting the
virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM,
...) to pin memory. Normally if a page is present and in CMA area,
pin_user_pages_remote() will migrate the page from CMA area to non-CMA
area because of FOLL_LONGTERM flag. But if non-movable allocation
requests return CMA memory, migrate_longterm_unpinnable_pages() will
migrate a CMA page to another CMA page, which will fail to pass the check
in check_and_migrate_movable_pages() and cause migration endless.
Call trace:
pin_user_pages_remote
--__gup_longterm_locked // endless loops in this function
----_get_user_pages_locked
----check_and_migrate_movable_pages
------migrate_longterm_unpinnable_pages
--------alloc_migration_target
This problem will also have a negative impact on CMA itself. For example,
when CMA is borrowed by THP, and we need to reclaim it through cma_alloc()
or dma_alloc_coherent(), we must move those pages out to ensure CMA's
users can retrieve that contigous memory. Currently, CMA's memory is
occupied by non-movable pages, meaning we can't relocate them. As a
result, cma_alloc() is more likely to fail.
To fix the problem above, we add one PCP list for THP, which will not
introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP
lists, one PCP list is used by MOVABLE allocation, and the other PCP list
is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE,
and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE.
Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.c…
Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations")
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8f9c9590a42c..586a8f0104d7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -654,13 +654,12 @@ enum zone_watermarks {
};
/*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
- * for THP which will usually be GFP_MOVABLE. Even if it is another type,
- * it should not contribute to serious fragmentation causing THP allocation
- * failures.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
+ * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list
+ * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_PCP_THP 1
+#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7300aa9f14b0..9ecf99190ea2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -504,10 +504,15 @@ static void bad_page(struct page *page, const char *reason)
static inline unsigned int order_to_pindex(int migratetype, int order)
{
+ bool __maybe_unused movable;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
- return NR_LOWORDER_PCP_LISTS;
+
+ movable = migratetype == MIGRATE_MOVABLE;
+
+ return NR_LOWORDER_PCP_LISTS + movable;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -521,7 +526,7 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pindex == NR_LOWORDER_PCP_LISTS)
+ if (pindex >= NR_LOWORDER_PCP_LISTS)
order = HPAGE_PMD_ORDER;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
commit 0b768a9610c6de9811c6d33900bebfb665192ee1 upstream
The pagecache handles readpage failing by itself; it doesn't want
filesystems to remove pages from under it.
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Signed-off-by: Kuniyuki Iwashima <kuniyu(a)amazon.com>
---
When NFS server returns NFS4ERR_SERVERFAULT, the client returned
Remote I/O error immediately on 4.14 and 6.1, but on 5.4/5.10/5.15,
the client retries forever and get stuck until userspace aborts it.
The patch fixed the issue but did not have Fixes: tag.
Please backport this to 5.4/5.10/5.15.
---
fs/nfs/read.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 08d6cc57cbc3..b02372ec07a5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -120,12 +120,8 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
SetPageError(page);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- struct address_space *mapping = page_file_mapping(page);
-
if (PageUptodate(page))
nfs_readpage_to_fscache(inode, page, 0);
- else if (!PageError(page) && !PagePrivate(page))
- generic_error_remove_page(mapping, page);
unlock_page(page);
}
nfs_release_request(req);
--
2.30.2
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062455-green-reach-3f21@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
Recently Luiz Capitulino reported BPF test failure for kernel version
6.1.36 (see [7]). The following test_verifier test failed:
"precise: ST insn causing spi > allocated_stack".
After back-port of the following upstream commit:
ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
Investigation in [8] shows that test failure is not a bug, but a
difference in BPF verifier behavior between upstream, where commits
[1,2,3] by Andrii Nakryiko are present, and 6.1.36, where these
commits are absent. Both Luiz and Greg suggested back-porting [1,2,3]
from upstream to avoid divergences.
Commits [1,2,3] break test_progs selftest "align/packet variable offset",
commit [4] fixes this selftest.
I did some additional testing using the following compiler versions:
- Kernel compilation
- gcc version 11.3.0
- BPF tests compilation
- clang version 16.0.6
- clang version 17.0.0 (fa46feb31481)
And identified a few more failing BPF selftests:
- Tests failing with LLVM 16:
- test_verifier:
- precise: ST insn causing spi > allocated_stack FAIL (fixed by [1,2,3])
- test_progs:
- sk_assign (fixed by [6])
- Tests failing with LLVM 17:
- test_verifier:
- precise: ST insn causing spi > allocated_stack FAIL (fixed by [1,2,3])
- test_progs:
- fexit_bpf2bpf/func_replace_verify (fixed by [5])
- fexit_bpf2bpf/func_replace_return_code (fixed by [5])
- sk_assign (fixed by [6])
Commits [4,5,6] only apply to BPF selftests and don't change verifier
behavior.
After applying all of the listed commits I have test_verifier,
test_progs, test_progs-no_alu32 and test_maps passing on my x86 setup,
both for LLVM 16 and LLVM 17.
Upstream commits in chronological order:
[1] be2ef8161572 ("bpf: allow precision tracking for programs with subprogs")
[2] f63181b6ae79 ("bpf: stop setting precise in current state")
[3] 7a830b53c17b ("bpf: aggressively forget precise markings during state checkpointing")
[4] 4f999b767769 ("selftests/bpf: make test_align selftest more robust")
[5] 63d78b7e8ca2 ("selftests/bpf: Workaround verification failure for fexit_bpf2bpf/func_replace_return_code")
[6] 7ce878ca81bc ("selftests/bpf: Fix sk_assign on s390x")
Links:
[7] https://lore.kernel.org/stable/935c4751-d368-df29-33a6-9f4fcae720fa@amazon.…
[8] https://lore.kernel.org/stable/c9b10a8a551edafdfec855fbd35757c6238ad258.cam…
Changelog:
V1 -> V2: added missing signed-off-by tags
V1: https://lore.kernel.org/stable/20230722004514.767618-1-eddyz87@gmail.com/
Reported-by: Luiz Capitulino <luizcap(a)amazon.com>
Andrii Nakryiko (4):
bpf: allow precision tracking for programs with subprogs
bpf: stop setting precise in current state
bpf: aggressively forget precise markings during state checkpointing
selftests/bpf: make test_align selftest more robust
Ilya Leoshkevich (1):
selftests/bpf: Fix sk_assign on s390x
Yonghong Song (1):
selftests/bpf: Workaround verification failure for
fexit_bpf2bpf/func_replace_return_code
kernel/bpf/verifier.c | 202 ++++++++++++++++--
.../testing/selftests/bpf/prog_tests/align.c | 38 ++--
.../selftests/bpf/prog_tests/sk_assign.c | 25 ++-
.../selftests/bpf/progs/connect4_prog.c | 2 +-
.../selftests/bpf/progs/test_sk_assign.c | 11 +
.../bpf/progs/test_sk_assign_libbpf.c | 3 +
6 files changed, 247 insertions(+), 34 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
--
2.41.0