commit c9b528c35795b711331ed36dc3dbee90d5812d4e upstream.
This mostly reverts commit 6bd97bf273bd ("ext4: remove redundant
mb_regenerate_buddy()") and reintroduces mb_regenerate_buddy(). Based on
code in mb_free_blocks(), fast commit replay can end up marking as free
blocks that are already marked as such. This causes corruption of the
buddy bitmap so we need to regenerate it in that case.
Reported-by: Jan Kara <jack(a)suse.cz>
Fixes: 6bd97bf273bd ("ext4: remove redundant mb_regenerate_buddy()")
CVE: CVE-2024-26601
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240104142040.2835097-4-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/mballoc.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 762c2f8b5b2a..63e4c3b9e608 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1168,6 +1168,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
mb_update_avg_fragment_size(sb, grp);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count)))
+ ext4_set_bits(buddy, 0, count);
+
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -1846,6 +1864,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
ext4_mark_group_bitmap_corrupted(
sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ } else {
+ mb_regenerate_buddy(e4b);
}
goto done;
}
--
2.31.1
From: Conor Dooley <conor.dooley(a)microchip.com>
On RISC-V, and presumably x86/arm64, if CFI_CLANG is enabled loading a
rust module will trigger a kernel panic. Support for sanitisers,
including kcfi (CFI_CLANG), is in the works, but for now they're
nightly-only options in rustc. Make RUST depend on !CFI_CLANG to prevent
configuring a kernel without symmetrical support for kfi.
Fixes: 2f7ab1267dc9 ("Kbuild: add Rust support")
cc: stable(a)vger.kernel.org
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
---
This probably needs to go to stable. The correct fixes tag for that I am
not sure of however, but since CFI_CLANG predates RUST, I blamed the
commit adding rust support.
---
init/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/init/Kconfig b/init/Kconfig
index 8d4e836e1b6b..6cf05824859e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1895,6 +1895,7 @@ config RUST
bool "Rust support"
depends on HAVE_RUST
depends on RUST_IS_AVAILABLE
+ depends on !CFI_CLANG
depends on !MODVERSIONS
depends on !GCC_PLUGINS
depends on !RANDSTRUCT
--
2.43.0
commit c9b528c35795b711331ed36dc3dbee90d5812d4e upstream.
This mostly reverts commit 6bd97bf273bd ("ext4: remove redundant
mb_regenerate_buddy()") and reintroduces mb_regenerate_buddy(). Based on
code in mb_free_blocks(), fast commit replay can end up marking as free
blocks that are already marked as such. This causes corruption of the
buddy bitmap so we need to regenerate it in that case.
Reported-by: Jan Kara <jack(a)suse.cz>
Fixes: 6bd97bf273bd ("ext4: remove redundant mb_regenerate_buddy()")
CVE: CVE-2024-26601
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240104142040.2835097-4-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/mballoc.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9bec75847b85..5799706e20cc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -823,6 +823,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
atomic64_add(period, &sbi->s_mb_generation_time);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count)))
+ ext4_set_bits(buddy, 0, count);
+
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -1505,6 +1523,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
ext4_mark_group_bitmap_corrupted(
sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ } else {
+ mb_regenerate_buddy(e4b);
}
goto done;
}
--
2.31.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f1796544a0ca0f14386a679d3d05fbc69235015e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022702-ignition-astonish-a4f1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f1796544a0ca ("memcg: fix use-after-free in uncharge_batch")
1a3e1f40962c ("mm: memcontrol: decouple reference counting from page accounting")
8d22a9351035 ("mm/memcg: fix refcount error while moving and swapping")
d9eb1ea2bf87 ("mm: memcontrol: delete unused lrucare handling")
4c6355b25e8b ("mm: memcontrol: charge swapin pages on instantiation")
f0e45fb4da29 ("mm: memcontrol: drop unused try/commit/cancel charge API")
9d82c69438d0 ("mm: memcontrol: convert anon and file-thp to new mem_cgroup_charge() API")
468c398233da ("mm: memcontrol: switch to native NR_ANON_THPS counter")
be5d0a74c62d ("mm: memcontrol: switch to native NR_ANON_MAPPED counter")
0d1c20722ab3 ("mm: memcontrol: switch to native NR_FILE_PAGES and NR_SHMEM counters")
49e50d277ba2 ("mm: memcontrol: prepare move_account for removal of private page type counters")
9f762dbe19b9 ("mm: memcontrol: prepare uncharging for removal of private page type counters")
3fea5a499d57 ("mm: memcontrol: convert page cache to a new mem_cgroup_charge() API")
6caa6a0703e0 ("mm: memcontrol: move out cgroup swaprate throttling")
14235ab36019 ("mm: shmem: remove rare optimization when swapin races with hole punching")
3fba69a56e16 ("mm: memcontrol: drop @compound parameter from memcg charging API")
abb242f57196 ("mm: memcontrol: fix stat-corrupting race in charge moving")
f4129ea3591a ("mm: fix NUMA node file count error in replace_page_cache()")
ffe945e633b5 ("khugepaged: do not stop collapse if less than half PTEs are referenced")
396bcc5299c2 ("mm: remove CONFIG_TRANSPARENT_HUGE_PAGECACHE")
85b9f46e8ea4 ("mm, thp: track fallbacks due to failed memcg charges separately")
dcdf11ee1441 ("mm, shmem: add vmstat for hugepage fallback")
9c315e4d7d8c ("mm: memcg/slab: cache page number in memcg_(un)charge_slab()")
92d0510c3585 ("mm: kmem: switch to nr_pages in (__)memcg_kmem_charge_memcg()")
f4b00eab5004 ("mm: kmem: rename memcg_kmem_(un)charge() into memcg_kmem_(un)charge_page()")
50591183fa86 ("mm: kmem: cleanup memcg_kmem_uncharge_memcg() arguments")
10eaec2f63b6 ("mm: kmem: cleanup (__)memcg_kmem_charge_memcg() arguments")
47e29d32afba ("mm/gup: page->hpage_pinned_refcount: exact pin counts for huge pages")
3faa52c03f44 ("mm/gup: track FOLL_PIN pages")
3b78d8347d31 ("mm/gup: pass gup flags to two more routines")
c23a0c99793f ("mm/migrate: clean up some minor coding style")
92855270ff08 ("mm/memcontrol.c: cleanup some useless code")
f1f6a7dd9b53 ("mm, tree-wide: rename put_user_page*() to unpin_user_page*()")
aa4b87fe9ea3 ("powerpc: book3s64: convert to pin_user_pages() and put_user_page()")
19fed0dae94d ("vfio, mm: pin_user_pages (FOLL_PIN) and put_user_page() conversion")
1f815afcfca7 ("media/v4l2-core: pin_user_pages (FOLL_PIN) and put_user_page() conversion")
803e4572d7c5 ("mm/process_vm_access: set FOLL_PIN via pin_user_pages_remote()")
57459435cff5 ("goldish_pipe: convert to pin_user_pages() and put_user_page()")
eddb1c228f79 ("mm/gup: introduce pin_user_pages*() and FOLL_PIN")
3c7470b6f684 ("media/v4l2-core: set pages dirty upon releasing DMA buffers")
f4000fdf435b ("mm/gup: allow FOLL_FORCE for get_user_pages_fast()")
3567813eae5e ("vfio: fix FOLL_LONGTERM use, simplify get_user_pages_remote() call")
c4237f8b1f4f ("mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM")
a707cdd55f0f ("mm/gup: move try_get_compound_head() to top, fix minor issues")
a43e982082c2 ("mm/gup: factor out duplicate code from four routines")
fac0516b5534 ("mm: thp: don't need care deferred split queue in memcg charge move path")
f1fe80d4ae33 ("mm, thp: do not queue fully unmapped pages for deferred split")
acbfb087e3b1 ("mm/hugetlb: avoid looping to the same hugepage if !pages and !vmas")
867e5e1de14b ("mm: clean up and clarify lruvec lookup procedure")
242c37b459ce ("include/linux/memcontrol.h: fix comments based on per-node memcg")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1796544a0ca0f14386a679d3d05fbc69235015e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko(a)suse.com>
Date: Fri, 4 Sep 2020 16:35:24 -0700
Subject: [PATCH] memcg: fix use-after-free in uncharge_batch
syzbot has reported an use-after-free in the uncharge_batch path
BUG: KASAN: use-after-free in instrument_atomic_write include/linux/instrumented.h:71 [inline]
BUG: KASAN: use-after-free in atomic64_sub_return include/asm-generic/atomic-instrumented.h:970 [inline]
BUG: KASAN: use-after-free in atomic_long_sub_return include/asm-generic/atomic-long.h:113 [inline]
BUG: KASAN: use-after-free in page_counter_cancel mm/page_counter.c:54 [inline]
BUG: KASAN: use-after-free in page_counter_uncharge+0x3d/0xc0 mm/page_counter.c:155
Write of size 8 at addr ffff8880371c0148 by task syz-executor.0/9304
CPU: 0 PID: 9304 Comm: syz-executor.0 Not tainted 5.8.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1f0/0x31e lib/dump_stack.c:118
print_address_description+0x66/0x620 mm/kasan/report.c:383
__kasan_report mm/kasan/report.c:513 [inline]
kasan_report+0x132/0x1d0 mm/kasan/report.c:530
check_memory_region_inline mm/kasan/generic.c:183 [inline]
check_memory_region+0x2b5/0x2f0 mm/kasan/generic.c:192
instrument_atomic_write include/linux/instrumented.h:71 [inline]
atomic64_sub_return include/asm-generic/atomic-instrumented.h:970 [inline]
atomic_long_sub_return include/asm-generic/atomic-long.h:113 [inline]
page_counter_cancel mm/page_counter.c:54 [inline]
page_counter_uncharge+0x3d/0xc0 mm/page_counter.c:155
uncharge_batch+0x6c/0x350 mm/memcontrol.c:6764
uncharge_page+0x115/0x430 mm/memcontrol.c:6796
uncharge_list mm/memcontrol.c:6835 [inline]
mem_cgroup_uncharge_list+0x70/0xe0 mm/memcontrol.c:6877
release_pages+0x13a2/0x1550 mm/swap.c:911
tlb_batch_pages_flush mm/mmu_gather.c:49 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:242 [inline]
tlb_flush_mmu+0x780/0x910 mm/mmu_gather.c:249
tlb_finish_mmu+0xcb/0x200 mm/mmu_gather.c:328
exit_mmap+0x296/0x550 mm/mmap.c:3185
__mmput+0x113/0x370 kernel/fork.c:1076
exit_mm+0x4cd/0x550 kernel/exit.c:483
do_exit+0x576/0x1f20 kernel/exit.c:793
do_group_exit+0x161/0x2d0 kernel/exit.c:903
get_signal+0x139b/0x1d30 kernel/signal.c:2743
arch_do_signal+0x33/0x610 arch/x86/kernel/signal.c:811
exit_to_user_mode_loop kernel/entry/common.c:135 [inline]
exit_to_user_mode_prepare+0x8d/0x1b0 kernel/entry/common.c:166
syscall_exit_to_user_mode+0x5e/0x1a0 kernel/entry/common.c:241
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Commit 1a3e1f40962c ("mm: memcontrol: decouple reference counting from
page accounting") reworked the memcg lifetime to be bound the the struct
page rather than charges. It also removed the css_put_many from
uncharge_batch and that is causing the above splat.
uncharge_batch() is supposed to uncharge accumulated charges for all
pages freed from the same memcg. The queuing is done by uncharge_page
which however drops the memcg reference after it adds charges to the
batch. If the current page happens to be the last one holding the
reference for its memcg then the memcg is OK to go and the next page to
be freed will trigger batched uncharge which needs to access the memcg
which is gone already.
Fix the issue by taking a reference for the memcg in the current batch.
Fixes: 1a3e1f40962c ("mm: memcontrol: decouple reference counting from page accounting")
Reported-by: syzbot+b305848212deec86eabe(a)syzkaller.appspotmail.com
Reported-by: syzbot+b5ea6fb6f139c8b9482b(a)syzkaller.appspotmail.com
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb(a)google.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: Hugh Dickins <hughd(a)google.com>
Link: https://lkml.kernel.org/r/20200820090341.GC5033@dhcp22.suse.cz
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b807952b4d43..cfa6cbad21d5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6774,6 +6774,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
+
+ /* drop reference from uncharge_page */
+ css_put(&ug->memcg->css);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6797,6 +6800,9 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
+
+ /* pairs with css_put in uncharge_batch */
+ css_get(&ug->memcg->css);
}
nr_pages = compound_nr(page);
Hi,
this series does basically two things:
1. Disables automatic load balancing as adviced by the hardware
workaround.
2. Forces the sharing of the load submitted to CCS among all the
CCS available (as of now only DG2 has more than one CCS). This
way the user, when sending a query, will see only one CCS
available.
Andi
Andi Shyti (2):
drm/i915/gt: Disable HW load balancing for CCS
drm/i915/gt: Set default CCS mode '1'
drivers/gpu/drm/i915/gt/intel_gt.c | 11 +++++++++++
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 3 +++
drivers/gpu/drm/i915/gt/intel_workarounds.c | 6 ++++++
drivers/gpu/drm/i915/i915_drv.h | 17 +++++++++++++++++
drivers/gpu/drm/i915/i915_query.c | 5 +++--
5 files changed, 40 insertions(+), 2 deletions(-)
--
2.43.0
From: David Woodhouse <dwmw(a)amazon.co.uk>
Linux guests since commit b1c3497e604d ("x86/xen: Add support for
HVMOP_set_evtchn_upcall_vector") in v6.0 onwards will use the per-vCPU
upcall vector when it's advertised in the Xen CPUID leaves.
This upcall is injected through the guest's local APIC as an MSI, unlike
the older system vector which was merely injected by the hypervisor any
time the CPU was able to receive an interrupt and the upcall_pending
flags is set in its vcpu_info.
Effectively, that makes the per-CPU upcall edge triggered instead of
level triggered, which results in the upcall being lost if the MSI is
delivered when the local APIC is *disabled*.
Xen checks the vcpu_info->evtchn_upcall_pending flag when the local APIC
for a vCPU is software enabled (in fact, on any write to the SPIV
register which doesn't disable the APIC). Do the same in KVM since KVM
doesn't provide a way for userspace to intervene and trap accesses to
the SPIV register of a local APIC emulated by KVM.
Fixes: fde0451be8fb3 ("KVM: x86/xen: Support per-vCPU event channel upcall via local APIC")
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Reviewed-by: Paul Durrant <paul(a)xen.org>
Cc: stable(a)vger.kernel.org
---
arch/x86/kvm/lapic.c | 5 ++++-
arch/x86/kvm/xen.c | 2 +-
arch/x86/kvm/xen.h | 18 ++++++++++++++++++
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3242f3da2457..75bc7d3f0022 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -499,8 +500,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index ccd2dc753fd6..06904696759c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -568,7 +568,7 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
int r;
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d090..f5841d9000ae 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue);
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
--
2.43.0
Because sandboxing can be used as an opportunistic security measure,
user space may not log unsupported features. Let the system
administrator know if an application tries to use Landlock but failed
because it isn't enabled at boot time. This may be caused by bootloader
configurations with outdated "lsm" kernel's command-line parameter.
Cc: stable(a)vger.kernel.org
Fixes: 265885daf3e5 ("landlock: Add syscall implementations")
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Günther Noack <gnoack3000(a)gmail.com>
Signed-off-by: Mickaël Salaün <mic(a)digikod.net>
---
Changes since v1:
* Add Kees's and Günther's Reviewed-by.
* Rename is_not_initialized() to not_initialized() and invert the logic,
as suggested by Günther. This is a cosmetic change without global
behavioral changed.
* Update link to point to a new subsection.
---
security/landlock/syscalls.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 898358f57fa0..6788e73b6681 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -33,6 +33,18 @@
#include "ruleset.h"
#include "setup.h"
+static bool is_initialized(void)
+{
+ if (likely(landlock_initialized))
+ return true;
+
+ pr_warn_once(
+ "Disabled but requested by user space. "
+ "You should enable Landlock at boot time: "
+ "https://docs.kernel.org/userspace-api/landlock.html#boot-time-configuration…");
+ return false;
+}
+
/**
* copy_min_struct_from_user - Safe future-proof argument copying
*
@@ -173,7 +185,7 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
/* Build-time checks. */
build_check_abi();
- if (!landlock_initialized)
+ if (!is_initialized())
return -EOPNOTSUPP;
if (flags) {
@@ -398,7 +410,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
struct landlock_ruleset *ruleset;
int err;
- if (!landlock_initialized)
+ if (!is_initialized())
return -EOPNOTSUPP;
/* No flag for now. */
@@ -458,7 +470,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
struct landlock_cred_security *new_llcred;
int err;
- if (!landlock_initialized)
+ if (!is_initialized())
return -EOPNOTSUPP;
/*
--
2.44.0
After a couple recent changes in LLVM, there is a warning (or error with
CONFIG_WERROR=y or W=e) from the compile time fortify source routines,
specifically the memset() in copy_to_user_tmpl().
In file included from net/xfrm/xfrm_user.c:14:
...
include/linux/fortify-string.h:438:4: error: call to '__write_overflow_field' declared with 'warning' attribute: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror,-Wattribute-warning]
438 | __write_overflow_field(p_size_field, size);
| ^
1 error generated.
While ->xfrm_nr has been validated against XFRM_MAX_DEPTH when its value
is first assigned in copy_templates() by calling validate_tmpl() first
(so there should not be any issue in practice), LLVM/clang cannot really
deduce that across the boundaries of these functions. Without that
knowledge, it cannot assume that the loop stops before i is greater than
XFRM_MAX_DEPTH, which would indeed result a stack buffer overflow in the
memset().
To make the bounds of ->xfrm_nr clear to the compiler and add additional
defense in case copy_to_user_tmpl() is ever used in a path where
->xfrm_nr has not been properly validated against XFRM_MAX_DEPTH first,
add an explicit bound check and early return, which clears up the
warning.
Cc: stable(a)vger.kernel.org
Link: https://github.com/ClangBuiltLinux/linux/issues/1985
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
net/xfrm/xfrm_user.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index f037be190bae..912c1189ba41 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2017,6 +2017,9 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
if (xp->xfrm_nr == 0)
return 0;
+ if (xp->xfrm_nr > XFRM_MAX_DEPTH)
+ return -ENOBUFS;
+
for (i = 0; i < xp->xfrm_nr; i++) {
struct xfrm_user_tmpl *up = &vec[i];
struct xfrm_tmpl *kp = &xp->xfrm_vec[i];
---
base-commit: 14dec56fdd4c70a0ebe40077368e367421ea6fef
change-id: 20240221-xfrm-avoid-clang-fortify-warning-copy_to_user_tmpl-40cb10b003e3
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
Svacer reports a potential division by zero at rcu_torture_writer() in
5.15 stable release. The problem has been fixed by the following patch
that can be cleanly applied to 5.15 branch.
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x e3b63e966cac0bf78aaa1efede1827a252815a1d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022612-uncloak-pretext-f4a2@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
e3b63e966cac ("mm: zswap: fix missing folio cleanup in writeback race path")
96c7b0b42239 ("mm: return the folio from __read_swap_cache_async()")
e947ba0bbf47 ("mm/zswap: cleanup zswap_writeback_entry()")
32acba4c0483 ("mm/zswap: refactor out __zswap_load()")
c75f5c1e0f1d ("mm/zswap: reuse dstmem when decompress")
b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure")
a65b0e7607cc ("zswap: make shrinking memcg-aware")
ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma")
23e4883248f0 ("mm: add page_rmappable_folio() wrapper")
c36f6e6dff4d ("mempolicy trivia: slightly more consistent naming")
7f1ee4e20708 ("mempolicy trivia: delete those ancient pr_debug()s")
1cb5d11a370f ("mempolicy: fix migrate_pages(2) syscall return nr_failed")
3657fdc2451a ("mm: move vma_policy() and anon_vma_name() decls to mm_types.h")
3022fd7af960 ("shmem: _add_to_page_cache() before shmem_inode_acct_blocks()")
054a9f7ccd0a ("shmem: move memcg charge out of shmem_add_to_page_cache()")
4199f51a7eb2 ("shmem: shmem_acct_blocks() and shmem_inode_acct_blocks()")
e3e1a5067fd2 ("shmem: remove vma arg from shmem_get_folio_gfp()")
75c70128a673 ("mm: mempolicy: make mpol_misplaced() to take a folio")
cda6d93672ac ("mm: memory: make numa_migrate_prep() to take a folio")
6695cf68b15c ("mm: memory: use a folio in do_numa_page()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e3b63e966cac0bf78aaa1efede1827a252815a1d Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Thu, 25 Jan 2024 08:51:27 +0000
Subject: [PATCH] mm: zswap: fix missing folio cleanup in writeback race path
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path handles
a race condition that should not happen often, and the bug would not crash
the system, it will only strand the folio indefinitely.
Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 350dd2fc8159..d2423247acfd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x e3b63e966cac0bf78aaa1efede1827a252815a1d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022611-tropics-deferred-2483@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
e3b63e966cac ("mm: zswap: fix missing folio cleanup in writeback race path")
96c7b0b42239 ("mm: return the folio from __read_swap_cache_async()")
e947ba0bbf47 ("mm/zswap: cleanup zswap_writeback_entry()")
32acba4c0483 ("mm/zswap: refactor out __zswap_load()")
c75f5c1e0f1d ("mm/zswap: reuse dstmem when decompress")
b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure")
a65b0e7607cc ("zswap: make shrinking memcg-aware")
ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma")
23e4883248f0 ("mm: add page_rmappable_folio() wrapper")
c36f6e6dff4d ("mempolicy trivia: slightly more consistent naming")
7f1ee4e20708 ("mempolicy trivia: delete those ancient pr_debug()s")
1cb5d11a370f ("mempolicy: fix migrate_pages(2) syscall return nr_failed")
3657fdc2451a ("mm: move vma_policy() and anon_vma_name() decls to mm_types.h")
3022fd7af960 ("shmem: _add_to_page_cache() before shmem_inode_acct_blocks()")
054a9f7ccd0a ("shmem: move memcg charge out of shmem_add_to_page_cache()")
4199f51a7eb2 ("shmem: shmem_acct_blocks() and shmem_inode_acct_blocks()")
e3e1a5067fd2 ("shmem: remove vma arg from shmem_get_folio_gfp()")
75c70128a673 ("mm: mempolicy: make mpol_misplaced() to take a folio")
cda6d93672ac ("mm: memory: make numa_migrate_prep() to take a folio")
6695cf68b15c ("mm: memory: use a folio in do_numa_page()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e3b63e966cac0bf78aaa1efede1827a252815a1d Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Thu, 25 Jan 2024 08:51:27 +0000
Subject: [PATCH] mm: zswap: fix missing folio cleanup in writeback race path
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path handles
a race condition that should not happen often, and the bug would not crash
the system, it will only strand the folio indefinitely.
Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 350dd2fc8159..d2423247acfd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 678e54d4bb9a4822f8ae99690ac131c5d490cdb1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022622-agony-salvaging-5082@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
678e54d4bb9a ("mm/zswap: invalidate duplicate entry when !zswap_enabled")
a65b0e7607cc ("zswap: make shrinking memcg-aware")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 678e54d4bb9a4822f8ae99690ac131c5d490cdb1 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming(a)bytedance.com>
Date: Thu, 8 Feb 2024 02:32:54 +0000
Subject: [PATCH] mm/zswap: invalidate duplicate entry when !zswap_enabled
We have to invalidate any duplicate entry even when !zswap_enabled since
zswap can be disabled anytime. If the folio store success before, then
got dirtied again but zswap disabled, we won't invalidate the old
duplicate entry in the zswap_store(). So later lru writeback may
overwrite the new data in swapfile.
Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev
Fixes: 42c06a0e8ebe ("mm: kill frontswap")
Signed-off-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 36903d938c15..db4625af65fb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!zswap_enabled || !tree)
+ if (!tree)
return false;
/*
@@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio)
zswap_invalidate_entry(tree, dupentry);
}
spin_unlock(&tree->lock);
+
+ if (!zswap_enabled)
+ return false;
+
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
This is the backport of recently upstreamed series that moves VERW
execution to a later point in exit-to-user path. This is needed because
in some cases it may be possible for data accessed after VERW executions
may end into MDS affected CPU buffers. Moving VERW closer to ring
transition reduces the attack surface.
Patch 1/6 includes a minor fix that is queued for upstream:
https://lore.kernel.org/lkml/170899674562.398.6398007479766564897.tip-bot2@…
Patch 2/6 needed a conflict to be resolved for the hunk
swapgs_restore_regs_and_return_to_usermode.
Signed-off-by: Pawan Gupta <pawan.kumar.gupta(a)linux.intel.com>
---
Pawan Gupta (5):
x86/bugs: Add asm helpers for executing VERW
x86/entry_64: Add VERW just before userspace transition
x86/entry_32: Add VERW just before userspace transition
x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key
KVM/VMX: Move VERW closer to VMentry for MDS mitigation
Sean Christopherson (1):
KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH
Documentation/arch/x86/mds.rst | 38 +++++++++++++++++++++++++-----------
arch/x86/entry/entry.S | 23 ++++++++++++++++++++++
arch/x86/entry/entry_32.S | 3 +++
arch/x86/entry/entry_64.S | 11 +++++++++++
arch/x86/entry/entry_64_compat.S | 1 +
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/entry-common.h | 1 -
arch/x86/include/asm/nospec-branch.h | 25 ++++++++++++------------
arch/x86/kernel/cpu/bugs.c | 15 ++++++--------
arch/x86/kernel/nmi.c | 3 ---
arch/x86/kvm/vmx/run_flags.h | 7 +++++--
arch/x86/kvm/vmx/vmenter.S | 9 ++++++---
arch/x86/kvm/vmx/vmx.c | 20 +++++++++++++++----
13 files changed, 112 insertions(+), 46 deletions(-)
---
base-commit: b631f5b445dc3379f67ff63a2e4c58f22d4975dc
change-id: 20240226-delay-verw-backport-6-7-y-a2cb3f26bb90
Best regards,
--
Thanks,
Pawan
Hi,
This series fixes errors during module removal. It also
implements PHY core voltage selection as per TI recommendation
and workaround for Errata i2409 [1].
The workaround needs PHY2 region to be present in device node.
The device tree patch will be sent later after the DT binding doc
is merged.
[1] - https://www.ti.com/lit/er/sprz487d/sprz487d.pdf
Signed-off-by: Roger Quadros <rogerq(a)kernel.org>
---
Changes in v4:
- re-arranged patches into first 2 bug-fixes and added Cc stable for them
- Added Acked-by
- Link to v3: https://lore.kernel.org/r/20240214-for-v6-9-am62-usb-errata-3-0-v3-0-147ec5…
---
Roger Quadros (4):
usb: dwc3-am62: fix module unload/reload behavior
usb: dwc3-am62: Disable wakeup at remove
usb: dwc3-am62: Fix PHY core voltage selection
usb: dwc3-am62: add workaround for Errata i2409
drivers/usb/dwc3/dwc3-am62.c | 42 ++++++++++++++++++++++++++++++------------
1 file changed, 30 insertions(+), 12 deletions(-)
---
base-commit: 6613476e225e090cc9aad49be7fa504e290dd33d
change-id: 20240206-for-v6-9-am62-usb-errata-3-0-233024ea8e9d
Best regards,
--
Roger Quadros <rogerq(a)kernel.org>
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x b820de741ae48ccf50dd95e297889c286ff4f760
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022602-unwrapped-haggler-daae@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
b820de741ae4 ("fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio")
9cf3516c29e6 ("fs: add IOCB flags related to passing back dio completions")
f6c73a11133e ("fs.h: Add TRACE_IOCB_STRINGS for use in trace points")
1da8cf961bb1 ("Merge tag 'io_uring-6.0-2022-08-13' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Thu, 15 Feb 2024 12:47:38 -0800
Subject: [PATCH] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via
libaio
If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the
following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
kiocb_set_cancel_fn+0x9c/0xa8
ffs_epfile_read_iter+0x144/0x1d0
io_read+0x19c/0x498
io_issue_sqe+0x118/0x27c
io_submit_sqes+0x25c/0x5fc
__arm64_sys_io_uring_enter+0x104/0xab0
invoke_syscall+0x58/0x11c
el0_svc_common+0xb4/0xf4
do_el0_svc+0x2c/0xb0
el0_svc+0x2c/0xa4
el0t_64_sync_handler+0x68/0xb4
el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0ca4e4ff10a2c8402e2cf70132c683e1c772e4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022643-scorn-filtrate-8677@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
1b0ca4e4ff10 ("mm/damon/reclaim: fix quota stauts loss due to online tunings")
66d9faec0745 ("mm/damon/reclaim: add a parameter called skip_anon for avoiding anonymous pages reclamation")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0ca4e4ff10a2c8402e2cf70132c683e1c772e4 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj(a)kernel.org>
Date: Fri, 16 Feb 2024 11:40:24 -0800
Subject: [PATCH] mm/damon/reclaim: fix quota stauts loss due to online tunings
Patch series "mm/damon: fix quota status loss due to online tunings".
DAMON_RECLAIM and DAMON_LRU_SORT is not preserving internal quota status
when applying new user parameters, and hence could cause temporal quota
accuracy degradation. Fix it by preserving the status.
This patch (of 2):
For online parameters change, DAMON_RECLAIM creates new scheme based on
latest values of the parameters and replaces the old scheme with the new
one. When creating it, the internal status of the quota of the old
scheme is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation should start from the scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org
Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index ab974e477d2f..66e190f0374a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_scheme(void)
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
Hi Greg,
the issue might be due to this patch:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tre…
2024-02-23T15:39:05.6297767Z CC kernel/sys_ni.o
2024-02-23T15:39:05.7583048Z security/apparmor/af_unix.c: In function
‘unix_state_double_lock’:
2024-02-23T15:39:05.7586076Z security/apparmor/af_unix.c:583:17: error:
too few arguments to function ‘unix_state_lock_nested’
2024-02-23T15:39:05.7588374Z 583 |
unix_state_lock_nested(sk2);
2024-02-23T15:39:05.7589913Z | ^~~~~~~~~~~~~~~~~~~~~~
2024-02-23T15:39:05.7591564Z In file included from
security/apparmor/include/af_unix.h:15,
2024-02-23T15:39:05.7593341Z from
security/apparmor/af_unix.c:17:
2024-02-23T15:39:05.7594989Z ./include/net/af_unix.h:77:20: note:
declared here
2024-02-23T15:39:05.7596733Z 77 | static inline void
unix_state_lock_nested(struct sock *sk,
2024-02-23T15:39:05.7598516Z |
^~~~~~~~~~~~~~~~~~~~~~
2024-02-23T15:39:05.7600862Z security/apparmor/af_unix.c:586:17: error:
too few arguments to function ‘unix_state_lock_nested’
2024-02-23T15:39:05.7603177Z 586 |
unix_state_lock_nested(sk1);
2024-02-23T15:39:05.7605189Z | ^~~~~~~~~~~~~~~~~~~~~~
2024-02-23T15:39:05.7606765Z ./include/net/af_unix.h:77:20: note:
declared here
2024-02-23T15:39:05.7608497Z 77 | static inline void
unix_state_lock_nested(struct sock *sk,
2024-02-23T15:39:05.7610208Z |
^~~~~~~~~~~~~~~~~~~~~~
2024-02-23T15:39:05.8002385Z make[2]: *** [scripts/Makefile.build:262:
security/apparmor/af_unix.o] Error 1
2024-02-23T15:39:05.8005077Z make[2]: *** Waiting for unfinished jobs....
2024-02-23T15:39:05.8094726Z CC crypto/scatterwalk.o
2024-02-23T15:39:05.9082621Z CC [M] fs/btrfs/sysfs.o
2024-02-23T15:39:06.2502316Z CC kernel/nsproxy.o
2024-02-23T15:39:06.4094246Z make[1]: *** [scripts/Makefile.build:497:
security/apparmor] Error 2
2024-02-23T15:39:06.4207119Z make: *** [Makefile:1750: security] Error 2
2024-02-23T15:39:06.4208636Z CC kernel/notifier.o
2024-02-23T15:39:06.4210296Z make: *** Waiting for unfinished jobs....
2024-02-23T15:39:06.8604827Z CC crypto/proc.o
--
Best, Philip
This is the backport of recently upstreamed series that moves VERW
execution to a later point in exit-to-user path. This is needed because
in some cases it may be possible for data accessed after VERW executions
may end into MDS affected CPU buffers. Moving VERW closer to ring
transition reduces the attack surface.
Patch 1/6 includes a minor fix that is queued for upstream:
https://lore.kernel.org/lkml/170899674562.398.6398007479766564897.tip-bot2@…
Patch 2/6 needed a conflict to be resolved for the hunk
swapgs_restore_regs_and_return_to_usermode.
This is only compile and boot tested on qemu.
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
To: stable(a)vger.kernel.org
Signed-off-by: Pawan Gupta <pawan.kumar.gupta(a)linux.intel.com>
---
Pawan Gupta (5):
x86/bugs: Add asm helpers for executing VERW
x86/entry_64: Add VERW just before userspace transition
x86/entry_32: Add VERW just before userspace transition
x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key
KVM/VMX: Move VERW closer to VMentry for MDS mitigation
Sean Christopherson (1):
KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH
Documentation/arch/x86/mds.rst | 38 +++++++++++++++++++++++++-----------
arch/x86/entry/entry.S | 23 ++++++++++++++++++++++
arch/x86/entry/entry_32.S | 3 +++
arch/x86/entry/entry_64.S | 11 +++++++++++
arch/x86/entry/entry_64_compat.S | 1 +
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/entry-common.h | 1 -
arch/x86/include/asm/nospec-branch.h | 25 ++++++++++++------------
arch/x86/kernel/cpu/bugs.c | 15 ++++++--------
arch/x86/kernel/nmi.c | 3 ---
arch/x86/kvm/vmx/run_flags.h | 7 +++++--
arch/x86/kvm/vmx/vmenter.S | 9 ++++++---
arch/x86/kvm/vmx/vmx.c | 20 +++++++++++++++----
13 files changed, 112 insertions(+), 46 deletions(-)
---
base-commit: d8a27ea2c98685cdaa5fa66c809c7069a4ff394b
change-id: 20240226-delay-verw-backport-6-6-y-2cda3298e600
Oliver Upton (2):
KVM: arm64: vgic-its: Test for valid IRQ in
its_sync_lpi_pending_table()
KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler
virt/kvm/arm/vgic/vgic-its.c | 5 +++++
1 file changed, 5 insertions(+)
base-commit: ab219d38aef198d26083cc800954d352acd5137b
--
2.44.0.rc1.240.g4c46232300-goog
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 678e54d4bb9a4822f8ae99690ac131c5d490cdb1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022622-resent-ripeness-43f1@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
678e54d4bb9a ("mm/zswap: invalidate duplicate entry when !zswap_enabled")
a65b0e7607cc ("zswap: make shrinking memcg-aware")
ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma")
23e4883248f0 ("mm: add page_rmappable_folio() wrapper")
c36f6e6dff4d ("mempolicy trivia: slightly more consistent naming")
7f1ee4e20708 ("mempolicy trivia: delete those ancient pr_debug()s")
1cb5d11a370f ("mempolicy: fix migrate_pages(2) syscall return nr_failed")
3657fdc2451a ("mm: move vma_policy() and anon_vma_name() decls to mm_types.h")
3022fd7af960 ("shmem: _add_to_page_cache() before shmem_inode_acct_blocks()")
054a9f7ccd0a ("shmem: move memcg charge out of shmem_add_to_page_cache()")
4199f51a7eb2 ("shmem: shmem_acct_blocks() and shmem_inode_acct_blocks()")
e3e1a5067fd2 ("shmem: remove vma arg from shmem_get_folio_gfp()")
75c70128a673 ("mm: mempolicy: make mpol_misplaced() to take a folio")
cda6d93672ac ("mm: memory: make numa_migrate_prep() to take a folio")
6695cf68b15c ("mm: memory: use a folio in do_numa_page()")
667ffc31aa95 ("mm: huge_memory: use a folio in do_huge_pmd_numa_page()")
73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()")
2ac9e99f3b21 ("mm: migrate: convert numamigrate_isolate_page() to numamigrate_isolate_folio()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 678e54d4bb9a4822f8ae99690ac131c5d490cdb1 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming(a)bytedance.com>
Date: Thu, 8 Feb 2024 02:32:54 +0000
Subject: [PATCH] mm/zswap: invalidate duplicate entry when !zswap_enabled
We have to invalidate any duplicate entry even when !zswap_enabled since
zswap can be disabled anytime. If the folio store success before, then
got dirtied again but zswap disabled, we won't invalidate the old
duplicate entry in the zswap_store(). So later lru writeback may
overwrite the new data in swapfile.
Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev
Fixes: 42c06a0e8ebe ("mm: kill frontswap")
Signed-off-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 36903d938c15..db4625af65fb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!zswap_enabled || !tree)
+ if (!tree)
return false;
/*
@@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio)
zswap_invalidate_entry(tree, dupentry);
}
spin_unlock(&tree->lock);
+
+ if (!zswap_enabled)
+ return false;
+
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x db744ddd59be798c2627efbfc71f707f5a935a40
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022609-womanless-imprison-678c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
db744ddd59be ("PCI/MSI: Prevent MSI hardware interrupt number truncation")
aa423ac4221a ("PCI/MSI: Split out irqdomain code")
a01e09ef1237 ("PCI/MSI: Split out !IRQDOMAIN code")
54324c2f3d72 ("PCI/MSI: Split out CONFIG_PCI_MSI independent part")
288c81ce4be7 ("PCI/MSI: Move code into a separate directory")
29a03ada4a00 ("PCI/MSI: Cleanup include zoo")
ae72f3156729 ("PCI/MSI: Make arch_restore_msi_irqs() less horrible.")
e58f2259b91c ("genirq/msi, treewide: Use a named struct for PCI/MSI attributes")
ade044a3d0f0 ("PCI/MSI: Remove msi_desc_to_pci_sysdata()")
9e8688c5f299 ("PCI/MSI: Make pci_msi_domain_write_msg() static")
29bbc35e29d9 ("PCI/MSI: Fix pci_irq_vector()/pci_irq_get_affinity()")
c36e33e2f477 ("Merge tag 'irq-urgent-2021-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From db744ddd59be798c2627efbfc71f707f5a935a40 Mon Sep 17 00:00:00 2001
From: Vidya Sagar <vidyas(a)nvidia.com>
Date: Mon, 15 Jan 2024 19:26:49 +0530
Subject: [PATCH] PCI/MSI: Prevent MSI hardware interrupt number truncation
While calculating the hardware interrupt number for a MSI interrupt, the
higher bits (i.e. from bit-5 onwards a.k.a domain_nr >= 32) of the PCI
domain number gets truncated because of the shifted value casting to return
type of pci_domain_nr() which is 'int'. This for example is resulting in
same hardware interrupt number for devices 0019:00:00.0 and 0039:00:00.0.
To address this cast the PCI domain number to 'irq_hw_number_t' before left
shifting it to calculate the hardware interrupt number.
Please note that this fixes the issue only on 64-bit systems and doesn't
change the behavior for 32-bit systems i.e. the 32-bit systems continue to
have the issue. Since the issue surfaces only if there are too many PCIe
controllers in the system which usually is the case in modern server
systems and they don't tend to run 32-bit kernels.
Fixes: 3878eaefb89a ("PCI/MSI: Enhance core to support hierarchy irqdomain")
Signed-off-by: Vidya Sagar <vidyas(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Shanker Donthineni <sdonthineni(a)nvidia.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240115135649.708536-1-vidyas@nvidia.com
diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index c8be056c248d..cfd84a899c82 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -61,7 +61,7 @@ static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc)
return (irq_hw_number_t)desc->msi_index |
pci_dev_id(dev) << 11 |
- (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 27;
+ ((irq_hw_number_t)(pci_domain_nr(dev->bus) & 0xFFFFFFFF)) << 27;
}
static void pci_msi_domain_set_desc(msi_alloc_info_t *arg,
From: Shyam Prasad N <nspmangalore(a)gmail.com>
commit 69cba9d3c1284e0838ae408830a02c4a063104bc upstream.
When the number of responses with status of STATUS_IO_TIMEOUT
exceeds a specified threshold (NUM_STATUS_IO_TIMEOUT), we reconnect
the connection. But we do not return the mid, or the credits
returned for the mid, or reduce the number of in-flight requests.
This bug could result in the server->in_flight count to go bad,
and also cause a leak in the mids.
This change moves the check to a few lines below where the
response is decrypted, even of the response is read from the
transform header. This way, the code for returning the mids
can be reused.
Also, the cifs_reconnect was reconnecting just the transport
connection before. In case of multi-channel, this may not be
what we want to do after several timeouts. Changed that to
reconnect the session and the tree too.
Also renamed NUM_STATUS_IO_TIMEOUT to a more appropriate name
MAX_STATUS_IO_TIMEOUT.
Fixes: 8e670f77c4a5 ("Handle STATUS_IO_TIMEOUT gracefully")
Signed-off-by: Shyam Prasad N <sprasad(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
[Harshit: Backport to 5.15.y]
Conflicts:
fs/cifs/connect.c -- 5.15.y doesn't have commit 183eea2ee5ba
("cifs: reconnect only the connection and not smb session where
possible") -- User cifs_reconnect(server) instead of
cifs_reconnect(server, true)
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
---
Would be nice to get a review from author/maintainer of the upstream patch.
A backport request was made previously but the patch didnot apply
cleanly then:
https://lore.kernel.org/all/CANT5p=oPGnCd4H5ppMbAiHsAKMor3LT_aQRqU7tKu=q6q1…
xfstests with cifs done: before and after patching with this patch on 5.15.149.
There is no change in test results before and after the patch.
Ran: cifs/001 generic/001 generic/002 generic/005 generic/006 generic/007
generic/010 generic/011 generic/013 generic/014 generic/023 generic/024
generic/028 generic/029 generic/030 generic/036 generic/069 generic/074
generic/075 generic/084 generic/091 generic/095 generic/098 generic/100
generic/109 generic/112 generic/113 generic/124 generic/127 generic/129
generic/130 generic/132 generic/133 generic/135 generic/141 generic/169
generic/198 generic/207 generic/208 generic/210 generic/211 generic/212
generic/221 generic/239 generic/241 generic/245 generic/246 generic/247
generic/248 generic/249 generic/257 generic/263 generic/285 generic/286
generic/308 generic/309 generic/310 generic/315 generic/323 generic/339
generic/340 generic/344 generic/345 generic/346 generic/354 generic/360
generic/393 generic/394
Not run: generic/010 generic/286 generic/315
Failures: generic/075 generic/112 generic/127 generic/285
Failed 4 of 68 tests
SECTION -- smb3
=========================
Ran: cifs/001 generic/001 generic/002 generic/005 generic/006 generic/007
generic/010 generic/011 generic/013 generic/014 generic/023 generic/024
generic/028 generic/029 generic/030 generic/036 generic/069 generic/074
generic/075 generic/084 generic/091 generic/095 generic/098 generic/100
generic/109 generic/112 generic/113 generic/124 generic/127 generic/129
generic/130 generic/132 generic/133 generic/135 generic/141 generic/169
generic/198 generic/207 generic/208 generic/210 generic/211 generic/212
generic/221 generic/239 generic/241 generic/245 generic/246 generic/247
generic/248 generic/249 generic/257 generic/263 generic/285 generic/286
generic/308 generic/309 generic/310 generic/315 generic/323 generic/339
generic/340 generic/344 generic/345 generic/346 generic/354 generic/360
generic/393 generic/394
Not run: generic/010 generic/014 generic/129 generic/130 generic/239
Failures: generic/075 generic/091 generic/112 generic/127 generic/263 generic/285 generic/286
Failed 7 of 68 tests
SECTION -- smb21
=========================
Ran: cifs/001 generic/001 generic/002 generic/005 generic/006 generic/007
generic/010 generic/011 generic/013 generic/014 generic/023 generic/024
generic/028 generic/029 generic/030 generic/036 generic/069 generic/074
generic/075 generic/084 generic/091 generic/095 generic/098 generic/100
generic/109 generic/112 generic/113 generic/124 generic/127 generic/129
generic/130 generic/132 generic/133 generic/135 generic/141 generic/169
generic/198 generic/207 generic/208 generic/210 generic/211 generic/212
generic/221 generic/239 generic/241 generic/245 generic/246 generic/247
generic/248 generic/249 generic/257 generic/263 generic/285 generic/286
generic/308 generic/309 generic/310 generic/315 generic/323 generic/339
generic/340 generic/344 generic/345 generic/346 generic/354 generic/360
generic/393 generic/394
Not run: generic/010 generic/014 generic/129 generic/130 generic/239 generic/286 generic/315
Failures: generic/075 generic/112 generic/127 generic/285
Failed 4 of 68 tests
SECTION -- smb2
=========================
Ran: cifs/001 generic/001 generic/002 generic/005 generic/006 generic/007
generic/010 generic/011 generic/013 generic/014 generic/023 generic/024
generic/028 generic/029 generic/030 generic/036 generic/069 generic/074
generic/075 generic/084 generic/091 generic/095 generic/098 generic/100
generic/109 generic/112 generic/113 generic/124 generic/127 generic/129
generic/130 generic/132 generic/133 generic/135 generic/141 generic/169
generic/198 generic/207 generic/208 generic/210 generic/211 generic/212
generic/221 generic/239 generic/241 generic/245 generic/246 generic/247
generic/248 generic/249 generic/257 generic/263 generic/285 generic/286
generic/308 generic/309 generic/310 generic/315 generic/323 generic/339
generic/340 generic/344 generic/345 generic/346 generic/354 generic/360
generic/393 generic/394
Not run: generic/010 generic/286 generic/315
Failures: generic/075 generic/112 generic/127 generic/285
Failed 4 of 68 tests
---
fs/cifs/connect.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a521c705b0d7..a3e4811b7871 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -59,7 +59,7 @@ extern bool disable_legacy_dialects;
#define TLINK_IDLE_EXPIRE (600 * HZ)
/* Drop the connection to not overload the server */
-#define NUM_STATUS_IO_TIMEOUT 5
+#define MAX_STATUS_IO_TIMEOUT 5
struct mount_ctx {
struct cifs_sb_info *cifs_sb;
@@ -965,6 +965,7 @@ cifs_demultiplex_thread(void *p)
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
unsigned int noreclaim_flag, num_io_timeout = 0;
+ bool pending_reconnect = false;
noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
@@ -1004,6 +1005,8 @@ cifs_demultiplex_thread(void *p)
cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
+
+ pending_reconnect = false;
next_pdu:
server->pdu_size = pdu_length;
@@ -1063,10 +1066,13 @@ cifs_demultiplex_thread(void *p)
if (server->ops->is_status_io_timeout &&
server->ops->is_status_io_timeout(buf)) {
num_io_timeout++;
- if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) {
- cifs_reconnect(server);
+ if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) {
+ cifs_server_dbg(VFS,
+ "Number of request timeouts exceeded %d. Reconnecting",
+ MAX_STATUS_IO_TIMEOUT);
+
+ pending_reconnect = true;
num_io_timeout = 0;
- continue;
}
}
@@ -1113,6 +1119,11 @@ cifs_demultiplex_thread(void *p)
buf = server->smallbuf;
goto next_pdu;
}
+
+ /* do this reconnect at the very end after processing all MIDs */
+ if (pending_reconnect)
+ cifs_reconnect(server);
+
} /* end while !EXITING */
/* buffer usually freed in free_mid - need to free it here on exit */
--
2.43.0
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x e3b63e966cac0bf78aaa1efede1827a252815a1d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022610-amino-basically-add3@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
e3b63e966cac ("mm: zswap: fix missing folio cleanup in writeback race path")
96c7b0b42239 ("mm: return the folio from __read_swap_cache_async()")
e947ba0bbf47 ("mm/zswap: cleanup zswap_writeback_entry()")
32acba4c0483 ("mm/zswap: refactor out __zswap_load()")
c75f5c1e0f1d ("mm/zswap: reuse dstmem when decompress")
b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure")
a65b0e7607cc ("zswap: make shrinking memcg-aware")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e3b63e966cac0bf78aaa1efede1827a252815a1d Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Thu, 25 Jan 2024 08:51:27 +0000
Subject: [PATCH] mm: zswap: fix missing folio cleanup in writeback race path
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path handles
a race condition that should not happen often, and the bug would not crash
the system, it will only strand the folio indefinitely.
Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 350dd2fc8159..d2423247acfd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
The following patch accidentally removed the code for delivering
completions for cancelled reads and writes to user space: "[PATCH 04/33]
aio: remove retry-based AIO"
(https://lore.kernel.org/all/1363883754-27966-5-git-send-email-koverstreet@g…)
From that patch:
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- goto out;
- }
This leads to a leak in user space of a struct iocb. Hence this patch
that restores the code that reports to user space that a read or write
has been cancelled successfully.
Fixes: 41003a7bcfed ("aio: remove retry-based AIO")
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
---
fs/aio.c | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index da18dbcfcb22..28223f511931 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2165,14 +2165,11 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
#endif
/* sys_io_cancel:
- * Attempts to cancel an iocb previously passed to io_submit. If
- * the operation is successfully cancelled, the resulting event is
- * copied into the memory pointed to by result without being placed
- * into the completion queue and 0 is returned. May fail with
- * -EFAULT if any of the data structures pointed to are invalid.
- * May fail with -EINVAL if aio_context specified by ctx_id is
- * invalid. May fail with -EAGAIN if the iocb specified was not
- * cancelled. Will fail with -ENOSYS if not implemented.
+ * Attempts to cancel an iocb previously passed to io_submit(). If the
+ * operation is successfully cancelled 0 is returned. May fail with
+ * -EFAULT if any of the data structures pointed to are invalid. May
+ * fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
+ * fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
@@ -2203,14 +2200,12 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
}
spin_unlock_irq(&ctx->ctx_lock);
- if (!ret) {
- /*
- * The result argument is no longer used - the io_event is
- * always delivered via the ring buffer. -EINPROGRESS indicates
- * cancellation is progress:
- */
- ret = -EINPROGRESS;
- }
+ /*
+ * The result argument is no longer used - the io_event is always
+ * delivered via the ring buffer.
+ */
+ if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
+ aio_complete_rw(&kiocb->rw, -EINTR);
percpu_ref_put(&ctx->users);
reg_read() callback registered with nvmem core expects an integer error
as a return value but rmem_read() returns the number of bytes read, as a
result error checks in nvmem core fail even when they shouldn't.
Return 0 on success where number of bytes read match the number of bytes
requested and a negative error -EINVAL on all other cases.
Fixes: 5a3fa75a4d9c ("nvmem: Add driver to expose reserved memory as nvmem")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/nvmem/rmem.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/nvmem/rmem.c b/drivers/nvmem/rmem.c
index 752d0bf4445e..a74dfa279ff4 100644
--- a/drivers/nvmem/rmem.c
+++ b/drivers/nvmem/rmem.c
@@ -46,7 +46,12 @@ static int rmem_read(void *context, unsigned int offset,
memunmap(addr);
- return count;
+ if (count != bytes) {
+ dev_err(priv->dev, "Failed read memory (%d)\n", count);
+ return -EINVAL;
+ }
+
+ return 0;
}
static int rmem_probe(struct platform_device *pdev)
--
2.43.0.594.gd9cf4e227d-goog
Hi,
I wanted to check with you if you had a time to go through my previous
email,
Let me know your thoughts about acquiring this email list
Regards,
*Olivia*
______________________________________________________________________________________________
Hi,
I hope you are the right person to discuss about *Healthcare Leads*, which
includes complete contact details, and tele-verified email addresses.
Please find the Leads Breakdown Chart below:
*Criteria*
*Counts*
*Criteria*
*Counts*
*Criteria*
*Counts*
Allergy immunology
5,064
Healthcare Technology
20,540
Plastic surgery
8,371
Anesthesiology
30,155
Nephrology
6,606
Preventive medicine
6,642
Cardiology
24,577
Neurological surgery
7,066
Psychiatry
4,315
Dermatology
8,467
Neurology
13,354
Radiology
32,763
Emergency medicine
22,300
Obgyn
35,163
Surgery
39,517
Endocrinology diabetes metabolism
3,756
Oncology
17,881
Urology
10,135
Family practice1
62,544
Ophthalmology
15,237
Physician
100,000
Gastroenterology
11,913
Orthopedics
22,145
Doctors
128,000
General practice
12,957
Other
15,559
Dentists
150,200
Geriatrics Doctors
9,634
Otolaryngology
9,539
Osteopathic
25,000
Infectious disease
5,677
Pathology
15,467
Acupuncture
5,000
Internal medicine1
120,029
Pediatrics
55,684
Chiropractors
11,000
Haematology Doctors
12,850
Physical medicine
8,437
Rheumatology
5,000
*Data Fields:* Every lead includes Name, Company, Job Title, Website,
Physical Address, Industry, *Phone Number and Verified/Opt-In Email
Address.* Please let me know if you have any queries about our custom
opt-in list and I would love to answer them.
Kindly share your thoughts.
Warm Regards,
*Olivia Stewart*
*Marketing Executive *
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
We respect your privacy, if you want to remove it from this list. Please
reply with the subject line as “Leave Out”.
This series includes 6 types of fixes:
- Patch 1 fixes v4 mapped in v6 addresses support for the userspace PM,
when asking to delete a subflow. It was done everywhere else, but not
there. Patch 2 validates the modification, thanks to a subtest in
mptcp_join.sh. These patches can be backported up to v5.19.
- Patch 3 is a small fix for a recent bug-fix patch, just to avoid
printing an irrelevant warning (pr_warn()) once. It can be backported
up to v5.6, alongside the bug-fix that has been introduced in the
v6.8-rc5.
- Patches 4 to 6 are fixes for bugs found by Paolo while working on
TCP_NOTSENT_LOWAT support for MPTCP. These fixes can improve the
performances in some cases. Patches can be backported up to v5.6,
v5.11 and v6.7 respectively.
- Patch 7 makes sure 'ss -M' is available when starting MPTCP Join
selftest as it is required for some subtests since v5.18.
- Patch 8 fixes a possible double-free on socket dismantle. The issue
always existed, but was unnoticed because it was not causing any
problem so far. This fix can be backported up to v5.6.
- Patch 9 is a fix for a very recent patch causing lockdep warnings in
subflow diag. The patch causing the regression -- which fixes another
issue present since v5.7 -- should be part of the future v6.8-rc6.
Patch 10 validates the modification, thanks to a new subtest in
diag.sh.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Davide Caratti (1):
mptcp: fix double-free on socket dismantle
Geliang Tang (3):
mptcp: map v4 address to v6 when destroying subflow
selftests: mptcp: rm subflow with v4/v4mapped addr
selftests: mptcp: join: add ss mptcp support check
Matthieu Baerts (NGI0) (1):
mptcp: avoid printing warning once on client side
Paolo Abeni (5):
mptcp: push at DSS boundaries
mptcp: fix snd_wnd initialization for passive socket
mptcp: fix potential wake-up event loss
mptcp: fix possible deadlock in subflow diag
selftests: mptcp: explicitly trigger the listener diag code-path
net/mptcp/diag.c | 3 ++
net/mptcp/options.c | 2 +-
net/mptcp/pm_userspace.c | 10 +++++
net/mptcp/protocol.c | 52 ++++++++++++++++++++++++-
net/mptcp/protocol.h | 21 +++++-----
tools/testing/selftests/net/mptcp/diag.sh | 30 +++++++++++++-
tools/testing/selftests/net/mptcp/mptcp_join.sh | 33 ++++++++++------
tools/testing/selftests/net/mptcp/mptcp_lib.sh | 4 +-
8 files changed, 128 insertions(+), 27 deletions(-)
---
base-commit: b0b1210bc150fbd741b4b9fce8a24541306b40fc
change-id: 20240223-upstream-net-20240223-misc-fixes-1630cd6b3b0a
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
> -----Original Message-----
> From: Larry Finger <Larry.Finger(a)gmail.com>
> Sent: Tuesday, February 27, 2024 9:41 AM
> To: Kalle Valo <kvalo(a)kernel.org>
> Cc: Johannes Berg <johannes(a)sipsolutions.net>; linux-wireless(a)vger.kernel.org; Nick Morrow
> <morrownr(a)gmail.com>; Larry Finger <Larry.Finger(a)lwfinger.net>; stable(a)vger.kernel.org
> Subject: [PATCH] wifi:rtw88: Add missing VID/PIDs
Missing a space between "wifi:" and "rtw88:", and suggest to mention 8811CU
and 8821CU in subject. Others look good to me.
>
> From: Nick Morrow <morrownr(a)gmail.com>
>
> Purpose: Add VID/PIDs that are known to be missing for this driver.
> - removed /* 8811CU */ and /* 8821CU */ as they are redundant
> since the file is specific to those chips.
> - removed /* TOTOLINK A650UA v3 */ as the manufacturer. It has a REALTEK
> VID so it may not be specific to this adapter.
>
> Source is
> https://1EHFQ.trk.elasticemail.com/tracking/click?d=I82H0YR_W_h175Lb3Nkb0D8…
> 0SPxd1Olp3PNJEm7h1Gft8lKFiXqYf1jEjniUnBHTdCi0Ypi2Y9ugy88eGHqb5MB9U0M7ZbBBaOwoaG0eHpd73dxUfRcicgS3TFBvw
> 066sdoIh1JxdrADO_ro60
> Verified and tested.
>
> Signed-off-by: Nick Morrow <morrownr(a)gmail.com>
> Signed-off-by: Larry Finger <Larry.Finger(a)lwfinger.net>
> Cc: stable(a)vger.kernel.org
Acked-by: Ping-Ke Shih <pkshih(a)realtek.com>
It looks like both 5.15.146 and 5.10.206 are impacted by this regression as they both have the
bad commit 33eae65c6f (smb: client: fix OOB in SMB2_query_info_init()). We tried to
apply the proposed fix eb3e28c1e89b ("smb3: Replace smb2pdu 1-element
arrays with flex-arrays”) but there are a lot of dependencies required to do the backport.
Is it possible to consider the simple fix that Paulo proposed as a solution for 5.10 and 5.15.
We were lucky with 5.4 as it doesn’t have the bad commit because of merge conflict reported
in https://lore.kernel.org/all/2023122857-doubling-crazed-27f4@gregkh/T/#m3aa0…
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 05ff8a457a3d..aed5067661de 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -3556,7 +3556,7 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
- iov[0].iov_len = len;
+ iov[0].iov_len = len - 1;
return 0;
}
Hazem
The following commit has been merged into the x86/misc branch of tip:
Commit-ID: d54e56f31a34fa38fcb5e91df609f9633419a79a
Gitweb: https://git.kernel.org/tip/d54e56f31a34fa38fcb5e91df609f9633419a79a
Author: Breno Leitao <leitao(a)debian.org>
AuthorDate: Wed, 07 Feb 2024 08:52:35 -08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Mon, 26 Feb 2024 23:41:30 +01:00
x86/nmi: Fix the inverse "in NMI handler" check
Commit 344da544f177 ("x86/nmi: Print reasons why backtrace NMIs are
ignored") creates a super nice framework to diagnose NMIs.
Every time nmi_exc() is called, it increments a per_cpu counter
(nsp->idt_nmi_seq). At its exit, it also increments the same counter. By
reading this counter it can be seen how many times that function was called
(dividing by 2), and, if the function is still being executed, by checking
the idt_nmi_seq's least significant bit.
On the check side (nmi_backtrace_stall_check()), that variable is queried
to check if the NMI is still being executed, but, there is a mistake in the
bitwise operation. That code wants to check if the least significant bit of
the idt_nmi_seq is set or not, but does the opposite, and checks for all
the other bits, which will always be true after the first exc_nmi()
executed successfully.
This appends the misleading string to the dump "(CPU currently in NMI
handler function)"
Fix it by checking the least significant bit, and if it is set, append the
string.
Fixes: 344da544f177 ("x86/nmi: Print reasons why backtrace NMIs are ignored")
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Paul E. McKenney <paulmck(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240207165237.1048837-1-leitao@debian.org
---
arch/x86/kernel/nmi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d238679..c95dc1b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -639,7 +639,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & ~0x1)
+ if (nmi_seq & 0x1)
msghp = " (CPU currently in NMI handler function)";
else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
On Mon, Feb 26, 2024 at 05:27:50PM +0200, Радослав Ненчовски wrote:
> Hi. IDK how more clear to write it in the title, so let me explain what the
> problem is.
I'm sending your message to stable instead, because helpdesk is only for
requesting help with kernel.org infrastructure.
Stable folks, please see below.
-K
> In the past 4 or 5 years I've been using this script (with an alias) to
> compress a single folder:
> 7z a "$1.7z" "$1"/ -mx=0 -mmt=8
>
> I know it doesn't look like much but essentially it creates a 7z archive
> (with "store" level of compression) with a name I've entered right after the
> alias. For instance: 7z0 "my dir" will create "my dir.7z".
> And in the past 4 or 5 years this script was working just fine because it
> was recognizing the slash as an indication that the target to compress is a
> directory.
> However, ever since 6.6.17-LTS arrived (altough I've heard the same
> complaints from people who use the regular rolling kernel, but they didn't
> tell me which version) bash stopped recognizing the slash as an indication
> for directory and thinks of it as the entire root directory, thus it
> attempts to compress not only "my dir" but also the whole root (/)
> directory. And it doesn't matter whether I'll put the slash between the
> quotes or outside of them - the result is the same. And, naturally, it
> throws out an unlimited number of errors about "access denied" to everything
> in root. I can't even begin to comprehend why on Earth you or whoever writes
> the kernel would make this change. Forget about me but ALL linux sysadmins I
> know use all kinds of scripts and changing the slash at the end of a word to
> mean "root" instead of a sign for directory is a rude way to ruin their
> work. Since this change occurred, I can no longer put a directory in an
> archive through CLI and I have to do it through GUI, which is about 10 times
> slower. I have a DE and I can do that but what about the sysadmins who
> usually use linux without a DE or directly SSH into the distro they're
> admins of? With this change you're literally hindering their job!
>
> I downgraded the kernel to 6.6.15-LTS and the problem disappeared - now the
> slash is properly recognized as a sign for directory.
>
> The point is: *it is urgent that you undo this change back to the way it
> was! I'm pretty sure sysadmins will begin to email you about this, if they
> haven't already.
> *
Oliver Upton (2):
KVM: arm64: vgic-its: Test for valid IRQ in
its_sync_lpi_pending_table()
KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler
virt/kvm/arm/vgic/vgic-its.c | 5 +++++
1 file changed, 5 insertions(+)
base-commit: 6e1f54a4985b63bc1b55a09e5e75a974c5d6719b
--
2.44.0.rc1.240.g4c46232300-goog
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 04b57c9e096a9479fe0ad31e3956e336fa589cb2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021924-setback-disinfect-0bd6@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
04b57c9e096a ("selftests: mptcp: join: stop transfer when check is done (part 2)")
b9fb176081fb ("selftests: mptcp: userspace pm send RM_ADDR for ID 0")
e3b47e460b4b ("selftests: mptcp: userspace pm remove initial subflow")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 04b57c9e096a9479fe0ad31e3956e336fa589cb2 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Wed, 31 Jan 2024 22:49:54 +0100
Subject: [PATCH] selftests: mptcp: join: stop transfer when check is done
(part 2)
Since the "Fixes" commits mentioned below, the newly added "userspace
pm" subtests of mptcp_join selftests are launching the whole transfer in
the background, do the required checks, then wait for the end of
transfer.
There is no need to wait longer, especially because the checks at the
end of the transfer are ignored (which is fine). This saves quite a few
seconds on slow environments.
While at it, use 'mptcp_lib_kill_wait()' helper everywhere, instead of
on a specific one with 'kill_tests_wait()'.
Fixes: b2e2248f365a ("selftests: mptcp: userspace pm create id 0 subflow")
Fixes: e3b47e460b4b ("selftests: mptcp: userspace pm remove initial subflow")
Fixes: b9fb176081fb ("selftests: mptcp: userspace pm send RM_ADDR for ID 0")
Cc: stable(a)vger.kernel.org
Reviewed-and-tested-by: Geliang Tang <geliang(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 85bcc95f4ede..c07386e21e0a 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -643,13 +643,6 @@ kill_events_pids()
mptcp_lib_kill_wait $evts_ns2_pid
}
-kill_tests_wait()
-{
- #shellcheck disable=SC2046
- kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1)
- wait
-}
-
pm_nl_set_limits()
{
local ns=$1
@@ -3494,7 +3487,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 2 2
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm remove initial subflow
@@ -3518,7 +3511,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm send RM_ADDR for ID 0
@@ -3544,7 +3537,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
}
@@ -3558,7 +3551,8 @@ endpoint_tests()
pm_nl_set_limits $ns2 2 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
speed=slow \
- run_tests $ns1 $ns2 10.0.1.1 2>/dev/null &
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
wait_mpj $ns1
pm_nl_check_endpoint "creation" \
@@ -3573,7 +3567,7 @@ endpoint_tests()
pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
pm_nl_check_endpoint "modif is allowed" \
$ns2 10.0.2.2 id 1 flags signal
- kill_tests_wait
+ mptcp_lib_kill_wait $tests_pid
fi
if reset "delete and re-add" &&
@@ -3582,7 +3576,8 @@ endpoint_tests()
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
test_linkfail=4 speed=20 \
- run_tests $ns1 $ns2 10.0.1.1 2>/dev/null &
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
wait_mpj $ns2
chk_subflow_nr "before delete" 2
@@ -3597,7 +3592,7 @@ endpoint_tests()
wait_mpj $ns2
chk_subflow_nr "after re-add" 2
chk_mptcp_info subflows 1 subflows 1
- kill_tests_wait
+ mptcp_lib_kill_wait $tests_pid
fi
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 31ee4ad86afd6ed6f4bb1b38c43011216080c42a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021917-nuzzle-magenta-7de4@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
31ee4ad86afd ("selftests: mptcp: join: stop transfer when check is done (part 1)")
80775412882e ("selftests: mptcp: add chk_subflows_total helper")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 31ee4ad86afd6ed6f4bb1b38c43011216080c42a Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Wed, 31 Jan 2024 22:49:53 +0100
Subject: [PATCH] selftests: mptcp: join: stop transfer when check is done
(part 1)
Since the "Fixes" commit mentioned below, "userspace pm" subtests of
mptcp_join selftests introduced in v6.5 are launching the whole transfer
in the background, do the required checks, then wait for the end of
transfer.
There is no need to wait longer, especially because the checks at the
end of the transfer are ignored (which is fine). This saves quite a few
seconds in slow environments.
Note that old versions will need commit bdbef0a6ff10 ("selftests: mptcp:
add mptcp_lib_kill_wait") as well to get 'mptcp_lib_kill_wait()' helper.
Fixes: 4369c198e599 ("selftests: mptcp: test userspace pm out of transfer")
Cc: stable(a)vger.kernel.org # 6.5.x: bdbef0a6ff10: selftests: mptcp: add mptcp_lib_kill_wait
Cc: stable(a)vger.kernel.org # 6.5.x
Reviewed-and-tested-by: Geliang Tang <geliang(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 3a5b63026191..85bcc95f4ede 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3453,7 +3453,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm create destroy subflow
@@ -3475,7 +3475,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm create id 0 subflow
From: Geliang Tang <geliang.tang(a)suse.com>
This patch adds the ability to send RM_ADDR for local ID 0. Check
whether id 0 address is removed, if not, put id 0 into a removing
list, pass it to mptcp_pm_remove_addr() to remove id 0 address.
There is no reason not to allow the userspace to remove the initial
address (ID 0). This special case was not taken into account not
letting the userspace to delete all addresses as announced.
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/379
Reviewed-by: Matthieu Baerts <matttbe(a)kernel.org>
Signed-off-by: Geliang Tang <geliang.tang(a)suse.com>
Signed-off-by: Mat Martineau <martineau(a)kernel.org>
Link: https://lore.kernel.org/r/20231025-send-net-next-20231025-v1-3-db8f25f798eb…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
(cherry picked from commit 84c531f54ad9a124a924c9505d74e33d16965146)
Fixes: d9a4594edabf ("mptcp: netlink: Add MPTCP_PM_CMD_REMOVE")
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Notes:
- As mentioned in [1], the 'Fixes' tag has been accidentally dropped:
[1] https://lore.kernel.org/stable/a7a3675a-4531-4559-bea2-c7689317764a@kernel.…
- Conflict in pm_userspace.c because the new helper function expected
to be on top of mptcp_pm_nl_remove_doit() which has been recently
renamed in commit 1e07938e29c5 ("net: mptcp: rename netlink handlers
to mptcp_pm_nl_<blah>_{doit,dumpit}").
---
net/mptcp/pm_userspace.c | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 3b34b7cf56c9..ecd166ce047d 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -220,6 +220,40 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk,
+ struct genl_info *info)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ bool has_id_0 = false;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (subflow->local_id == 0) {
+ has_id_0 = true;
+ break;
+ }
+ }
+ if (!has_id_0) {
+ GENL_SET_ERR_MSG(info, "address with id 0 not found");
+ goto remove_err;
+ }
+
+ list.ids[list.nr++] = 0;
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ err = 0;
+
+remove_err:
+ release_sock(sk);
+ return err;
+}
+
int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
@@ -251,6 +285,11 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info)
goto remove_err;
}
+ if (id_val == 0) {
+ err = mptcp_userspace_pm_remove_id_zero_address(msk, info);
+ goto remove_err;
+ }
+
lock_sock((struct sock *)msk);
list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
--
2.43.0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x b820de741ae48ccf50dd95e297889c286ff4f760
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022601-stem-comfort-1bb5@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
b820de741ae4 ("fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio")
9cf3516c29e6 ("fs: add IOCB flags related to passing back dio completions")
f6c73a11133e ("fs.h: Add TRACE_IOCB_STRINGS for use in trace points")
1da8cf961bb1 ("Merge tag 'io_uring-6.0-2022-08-13' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Thu, 15 Feb 2024 12:47:38 -0800
Subject: [PATCH] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via
libaio
If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the
following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
kiocb_set_cancel_fn+0x9c/0xa8
ffs_epfile_read_iter+0x144/0x1d0
io_read+0x19c/0x498
io_issue_sqe+0x118/0x27c
io_submit_sqes+0x25c/0x5fc
__arm64_sys_io_uring_enter+0x104/0xab0
invoke_syscall+0x58/0x11c
el0_svc_common+0xb4/0xf4
do_el0_svc+0x2c/0xb0
el0_svc+0x2c/0xa4
el0t_64_sync_handler+0x68/0xb4
el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b820de741ae48ccf50dd95e297889c286ff4f760
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022654-stainless-aground-196f@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
b820de741ae4 ("fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio")
9cf3516c29e6 ("fs: add IOCB flags related to passing back dio completions")
f6c73a11133e ("fs.h: Add TRACE_IOCB_STRINGS for use in trace points")
1da8cf961bb1 ("Merge tag 'io_uring-6.0-2022-08-13' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Thu, 15 Feb 2024 12:47:38 -0800
Subject: [PATCH] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via
libaio
If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the
following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
kiocb_set_cancel_fn+0x9c/0xa8
ffs_epfile_read_iter+0x144/0x1d0
io_read+0x19c/0x498
io_issue_sqe+0x118/0x27c
io_submit_sqes+0x25c/0x5fc
__arm64_sys_io_uring_enter+0x104/0xab0
invoke_syscall+0x58/0x11c
el0_svc_common+0xb4/0xf4
do_el0_svc+0x2c/0xb0
el0_svc+0x2c/0xa4
el0t_64_sync_handler+0x68/0xb4
el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b820de741ae48ccf50dd95e297889c286ff4f760
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022653-schedule-unloaded-e4ed@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b820de741ae4 ("fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio")
9cf3516c29e6 ("fs: add IOCB flags related to passing back dio completions")
f6c73a11133e ("fs.h: Add TRACE_IOCB_STRINGS for use in trace points")
1da8cf961bb1 ("Merge tag 'io_uring-6.0-2022-08-13' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Thu, 15 Feb 2024 12:47:38 -0800
Subject: [PATCH] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via
libaio
If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the
following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
kiocb_set_cancel_fn+0x9c/0xa8
ffs_epfile_read_iter+0x144/0x1d0
io_read+0x19c/0x498
io_issue_sqe+0x118/0x27c
io_submit_sqes+0x25c/0x5fc
__arm64_sys_io_uring_enter+0x104/0xab0
invoke_syscall+0x58/0x11c
el0_svc_common+0xb4/0xf4
do_el0_svc+0x2c/0xb0
el0_svc+0x2c/0xa4
el0t_64_sync_handler+0x68/0xb4
el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b820de741ae48ccf50dd95e297889c286ff4f760
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022651-shrimp-freezing-6b17@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b820de741ae4 ("fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via libaio")
9cf3516c29e6 ("fs: add IOCB flags related to passing back dio completions")
f6c73a11133e ("fs.h: Add TRACE_IOCB_STRINGS for use in trace points")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Thu, 15 Feb 2024 12:47:38 -0800
Subject: [PATCH] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted via
libaio
If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the
following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
kiocb_set_cancel_fn+0x9c/0xa8
ffs_epfile_read_iter+0x144/0x1d0
io_read+0x19c/0x498
io_issue_sqe+0x118/0x27c
io_submit_sqes+0x25c/0x5fc
__arm64_sys_io_uring_enter+0x104/0xab0
invoke_syscall+0x58/0x11c
el0_svc_common+0xb4/0xf4
do_el0_svc+0x2c/0xb0
el0_svc+0x2c/0xa4
el0t_64_sync_handler+0x68/0xb4
el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Avi Kivity <avi(a)scylladb.com>
Cc: Sandeep Dhavale <dhavale(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 04b57c9e096a9479fe0ad31e3956e336fa589cb2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021925-saloon-pursuit-2736@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
04b57c9e096a ("selftests: mptcp: join: stop transfer when check is done (part 2)")
b9fb176081fb ("selftests: mptcp: userspace pm send RM_ADDR for ID 0")
e3b47e460b4b ("selftests: mptcp: userspace pm remove initial subflow")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 04b57c9e096a9479fe0ad31e3956e336fa589cb2 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Wed, 31 Jan 2024 22:49:54 +0100
Subject: [PATCH] selftests: mptcp: join: stop transfer when check is done
(part 2)
Since the "Fixes" commits mentioned below, the newly added "userspace
pm" subtests of mptcp_join selftests are launching the whole transfer in
the background, do the required checks, then wait for the end of
transfer.
There is no need to wait longer, especially because the checks at the
end of the transfer are ignored (which is fine). This saves quite a few
seconds on slow environments.
While at it, use 'mptcp_lib_kill_wait()' helper everywhere, instead of
on a specific one with 'kill_tests_wait()'.
Fixes: b2e2248f365a ("selftests: mptcp: userspace pm create id 0 subflow")
Fixes: e3b47e460b4b ("selftests: mptcp: userspace pm remove initial subflow")
Fixes: b9fb176081fb ("selftests: mptcp: userspace pm send RM_ADDR for ID 0")
Cc: stable(a)vger.kernel.org
Reviewed-and-tested-by: Geliang Tang <geliang(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 85bcc95f4ede..c07386e21e0a 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -643,13 +643,6 @@ kill_events_pids()
mptcp_lib_kill_wait $evts_ns2_pid
}
-kill_tests_wait()
-{
- #shellcheck disable=SC2046
- kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1)
- wait
-}
-
pm_nl_set_limits()
{
local ns=$1
@@ -3494,7 +3487,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 2 2
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm remove initial subflow
@@ -3518,7 +3511,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm send RM_ADDR for ID 0
@@ -3544,7 +3537,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
}
@@ -3558,7 +3551,8 @@ endpoint_tests()
pm_nl_set_limits $ns2 2 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
speed=slow \
- run_tests $ns1 $ns2 10.0.1.1 2>/dev/null &
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
wait_mpj $ns1
pm_nl_check_endpoint "creation" \
@@ -3573,7 +3567,7 @@ endpoint_tests()
pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
pm_nl_check_endpoint "modif is allowed" \
$ns2 10.0.2.2 id 1 flags signal
- kill_tests_wait
+ mptcp_lib_kill_wait $tests_pid
fi
if reset "delete and re-add" &&
@@ -3582,7 +3576,8 @@ endpoint_tests()
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
test_linkfail=4 speed=20 \
- run_tests $ns1 $ns2 10.0.1.1 2>/dev/null &
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
wait_mpj $ns2
chk_subflow_nr "before delete" 2
@@ -3597,7 +3592,7 @@ endpoint_tests()
wait_mpj $ns2
chk_subflow_nr "after re-add" 2
chk_mptcp_info subflows 1 subflows 1
- kill_tests_wait
+ mptcp_lib_kill_wait $tests_pid
fi
}
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 31ee4ad86afd6ed6f4bb1b38c43011216080c42a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021916-striking-evoke-4847@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
31ee4ad86afd ("selftests: mptcp: join: stop transfer when check is done (part 1)")
80775412882e ("selftests: mptcp: add chk_subflows_total helper")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 31ee4ad86afd6ed6f4bb1b38c43011216080c42a Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Wed, 31 Jan 2024 22:49:53 +0100
Subject: [PATCH] selftests: mptcp: join: stop transfer when check is done
(part 1)
Since the "Fixes" commit mentioned below, "userspace pm" subtests of
mptcp_join selftests introduced in v6.5 are launching the whole transfer
in the background, do the required checks, then wait for the end of
transfer.
There is no need to wait longer, especially because the checks at the
end of the transfer are ignored (which is fine). This saves quite a few
seconds in slow environments.
Note that old versions will need commit bdbef0a6ff10 ("selftests: mptcp:
add mptcp_lib_kill_wait") as well to get 'mptcp_lib_kill_wait()' helper.
Fixes: 4369c198e599 ("selftests: mptcp: test userspace pm out of transfer")
Cc: stable(a)vger.kernel.org # 6.5.x: bdbef0a6ff10: selftests: mptcp: add mptcp_lib_kill_wait
Cc: stable(a)vger.kernel.org # 6.5.x
Reviewed-and-tested-by: Geliang Tang <geliang(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 3a5b63026191..85bcc95f4ede 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3453,7 +3453,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm create destroy subflow
@@ -3475,7 +3475,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- wait $tests_pid
+ mptcp_lib_kill_wait $tests_pid
fi
# userspace pm create id 0 subflow
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 9a458198eba98b7207669a166e64d04b04cb651b
Gitweb: https://git.kernel.org/tip/9a458198eba98b7207669a166e64d04b04cb651b
Author: Paolo Bonzini <pbonzini(a)redhat.com>
AuthorDate: Thu, 01 Feb 2024 00:09:01 +01:00
Committer: Dave Hansen <dave.hansen(a)linux.intel.com>
CommitterDate: Mon, 26 Feb 2024 08:16:15 -08:00
x86/cpu: Allow reducing x86_phys_bits during early_identify_cpu()
In commit fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct
value straight away, instead of a two-phase approach"), the initialization
of c->x86_phys_bits was moved after this_cpu->c_early_init(c). This is
incorrect because early_init_amd() expected to be able to reduce the
value according to the contents of CPUID leaf 0x8000001f.
Fortunately, the bug was negated by init_amd()'s call to early_init_amd(),
which does reduce x86_phys_bits in the end. However, this is very
late in the boot process and, most notably, the wrong value is used for
x86_phys_bits when setting up MTRRs.
To fix this, call get_cpu_address_sizes() as soon as X86_FEATURE_CPUID is
set/cleared, and c->extended_cpuid_level is retrieved.
Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach")
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20240131230902.1867092-2-pbonzini%40redhat.com
---
arch/x86/kernel/cpu/common.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0b97bcd..fbc4e60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1589,6 +1589,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
get_cpu_vendor(c);
get_cpu_cap(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
cpu_parse_early_param();
if (this_cpu->c_early_init)
@@ -1601,10 +1602,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_bsp_init(c);
} else {
setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
}
- get_cpu_address_sizes(c);
-
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c);
Because sandboxing can be used as an opportunistic security measure,
user space may not log unsupported features. Let the system
administrator know if an application tries to use Landlock but failed
because it isn't enabled at boot time. This may be caused by bootloader
configurations with outdated "lsm" kernel's command-line parameter.
Cc: Günther Noack <gnoack(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: 265885daf3e5 ("landlock: Add syscall implementations")
Signed-off-by: Mickaël Salaün <mic(a)digikod.net>
---
security/landlock/syscalls.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index f0bc50003b46..b5b424819dee 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -33,6 +33,18 @@
#include "ruleset.h"
#include "setup.h"
+static bool is_not_initialized(void)
+{
+ if (likely(landlock_initialized))
+ return false;
+
+ pr_warn_once(
+ "Disabled but requested by user space. "
+ "You should enable Landlock at boot time: "
+ "https://docs.kernel.org/userspace-api/landlock.html#kernel-support\n");
+ return true;
+}
+
/**
* copy_min_struct_from_user - Safe future-proof argument copying
*
@@ -173,7 +185,7 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
/* Build-time checks. */
build_check_abi();
- if (!landlock_initialized)
+ if (is_not_initialized())
return -EOPNOTSUPP;
if (flags) {
@@ -407,7 +419,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
struct landlock_ruleset *ruleset;
int err;
- if (!landlock_initialized)
+ if (is_not_initialized())
return -EOPNOTSUPP;
/* No flag for now. */
@@ -467,7 +479,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
struct landlock_cred_security *new_llcred;
int err;
- if (!landlock_initialized)
+ if (is_not_initialized())
return -EOPNOTSUPP;
/*
--
2.43.0
Svacer reports a potential division by zero at rcu_torture_writer() in
5.10 stable release. The problem has been fixed by the following patch
that can be cleanly applied to 5.10 branches.
From: Bjorn Helgaas <bhelgaas(a)google.com>
When booting with "pci=noaer", we don't request control of AER, but we
previously *did* request control of DPC, as in the dmesg log attached at
the bugzilla below:
Command line: ... pci=noaer
acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI EDR HPX-Type3]
acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug SHPCHotplug PME PCIeCapability LTR DPC]
That's illegal per PCI Firmware Spec, r3.3, sec 4.5.1, table 4-5, which
says:
If the operating system sets this bit [OSC_PCI_EXPRESS_DPC_CONTROL], it
must also set bit 7 of the Support field (indicating support for Error
Disconnect Recover notifications) and bits 3 and 4 of the Control field
(requesting control of PCI Express Advanced Error Reporting and the PCI
Express Capability Structure).
Request DPC control only if we have also requested AER control.
Fixes: ac1c8e35a326 ("PCI/DPC: Add Error Disconnect Recover (EDR) support")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218491#c12
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: <stable(a)vger.kernel.org> # v5.7+
Cc: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Cc: Matthew W Carlis <mattc(a)purestorage.com>
Cc: Keith Busch <kbusch(a)kernel.org>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Cc: Jesse Brandeburg <jesse.brandeburg(a)intel.com>
---
drivers/acpi/pci_root.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 58b89b8d950e..efc292b6214e 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -518,17 +518,19 @@ static u32 calculate_control(void)
if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC))
control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL;
- if (pci_aer_available())
+ if (pci_aer_available()) {
control |= OSC_PCI_EXPRESS_AER_CONTROL;
- /*
- * Per the Downstream Port Containment Related Enhancements ECN to
- * the PCI Firmware Spec, r3.2, sec 4.5.1, table 4-5,
- * OSC_PCI_EXPRESS_DPC_CONTROL indicates the OS supports both DPC
- * and EDR.
- */
- if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR))
- control |= OSC_PCI_EXPRESS_DPC_CONTROL;
+ /*
+ * Per PCI Firmware Spec, r3.3, sec 4.5.1, table 4-5, the
+ * OS can request DPC control only if it has advertised
+ * OSC_PCI_EDR_SUPPORT and requested both
+ * OSC_PCI_EXPRESS_CAPABILITY_CONTROL and
+ * OSC_PCI_EXPRESS_AER_CONTROL.
+ */
+ if (IS_ENABLED(CONFIG_PCIE_DPC))
+ control |= OSC_PCI_EXPRESS_DPC_CONTROL;
+ }
return control;
}
--
2.34.1
This bug was found by syzkaller. This series of patches
is fix for this particular bug. Both of these patches were taken
from upstream and applied clearly without any conflicts.
First one is the fix for the problem
and another one is for fix first patch.
Luiz Augusto von Dentz (1):
Bluetooth: SCO: Fix possible circular locking dependency on
sco_connect_cfm
Pauli Virtanen (1):
Bluetooth: SCO: fix sco_conn related locking and validity issues
net/bluetooth/sco.c | 76 ++++++++++++++++++++++++++-------------------
1 file changed, 44 insertions(+), 32 deletions(-)
--
2.42.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 1c9be13846c0b2abc2480602f8ef421360e1ad9e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022632-wise-dose-46ed@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
1c9be13846c0 ("usb: roles: fix NULL pointer issue when put module's reference")
044a61158b9e ("USB: roles: make role_class a static const structure")
1aaba11da9aa ("driver core: class: remove module * from class_create()")
6e30a66433af ("driver core: class: remove struct module owner out of struct class")
0b2a1a3938aa ("driver core: class: Clear private pointer on registration failures")
71a7507afbc3 ("Merge tag 'driver-core-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c9be13846c0b2abc2480602f8ef421360e1ad9e Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2(a)nxp.com>
Date: Mon, 29 Jan 2024 17:37:38 +0800
Subject: [PATCH] usb: roles: fix NULL pointer issue when put module's
reference
In current design, usb role class driver will get usb_role_switch parent's
module reference after the user get usb_role_switch device and put the
reference after the user put the usb_role_switch device. However, the
parent device of usb_role_switch may be removed before the user put the
usb_role_switch. If so, then, NULL pointer issue will be met when the user
put the parent module's reference.
This will save the module pointer in structure of usb_role_switch. Then,
we don't need to find module by iterating long relations.
Fixes: 5c54fcac9a9d ("usb: roles: Take care of driver module reference counting")
cc: stable(a)vger.kernel.org
Signed-off-by: Xu Yang <xu.yang_2(a)nxp.com>
Acked-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Link: https://lore.kernel.org/r/20240129093739.2371530-1-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c
index ae41578bd014..2bad038fb9ad 100644
--- a/drivers/usb/roles/class.c
+++ b/drivers/usb/roles/class.c
@@ -21,6 +21,7 @@ static const struct class role_class = {
struct usb_role_switch {
struct device dev;
struct mutex lock; /* device lock*/
+ struct module *module; /* the module this device depends on */
enum usb_role role;
/* From descriptor */
@@ -135,7 +136,7 @@ struct usb_role_switch *usb_role_switch_get(struct device *dev)
usb_role_switch_match);
if (!IS_ERR_OR_NULL(sw))
- WARN_ON(!try_module_get(sw->dev.parent->driver->owner));
+ WARN_ON(!try_module_get(sw->module));
return sw;
}
@@ -157,7 +158,7 @@ struct usb_role_switch *fwnode_usb_role_switch_get(struct fwnode_handle *fwnode)
sw = fwnode_connection_find_match(fwnode, "usb-role-switch",
NULL, usb_role_switch_match);
if (!IS_ERR_OR_NULL(sw))
- WARN_ON(!try_module_get(sw->dev.parent->driver->owner));
+ WARN_ON(!try_module_get(sw->module));
return sw;
}
@@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(fwnode_usb_role_switch_get);
void usb_role_switch_put(struct usb_role_switch *sw)
{
if (!IS_ERR_OR_NULL(sw)) {
- module_put(sw->dev.parent->driver->owner);
+ module_put(sw->module);
put_device(&sw->dev);
}
}
@@ -189,15 +190,18 @@ struct usb_role_switch *
usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode)
{
struct device *dev;
+ struct usb_role_switch *sw = NULL;
if (!fwnode)
return NULL;
dev = class_find_device_by_fwnode(&role_class, fwnode);
- if (dev)
- WARN_ON(!try_module_get(dev->parent->driver->owner));
+ if (dev) {
+ sw = to_role_switch(dev);
+ WARN_ON(!try_module_get(sw->module));
+ }
- return dev ? to_role_switch(dev) : NULL;
+ return sw;
}
EXPORT_SYMBOL_GPL(usb_role_switch_find_by_fwnode);
@@ -338,6 +342,7 @@ usb_role_switch_register(struct device *parent,
sw->set = desc->set;
sw->get = desc->get;
+ sw->module = parent->driver->owner;
sw->dev.parent = parent;
sw->dev.fwnode = desc->fwnode;
sw->dev.class = &role_class;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 1c9be13846c0b2abc2480602f8ef421360e1ad9e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022630-streak-bleep-1f75@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
1c9be13846c0 ("usb: roles: fix NULL pointer issue when put module's reference")
044a61158b9e ("USB: roles: make role_class a static const structure")
1aaba11da9aa ("driver core: class: remove module * from class_create()")
6e30a66433af ("driver core: class: remove struct module owner out of struct class")
0b2a1a3938aa ("driver core: class: Clear private pointer on registration failures")
71a7507afbc3 ("Merge tag 'driver-core-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c9be13846c0b2abc2480602f8ef421360e1ad9e Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2(a)nxp.com>
Date: Mon, 29 Jan 2024 17:37:38 +0800
Subject: [PATCH] usb: roles: fix NULL pointer issue when put module's
reference
In current design, usb role class driver will get usb_role_switch parent's
module reference after the user get usb_role_switch device and put the
reference after the user put the usb_role_switch device. However, the
parent device of usb_role_switch may be removed before the user put the
usb_role_switch. If so, then, NULL pointer issue will be met when the user
put the parent module's reference.
This will save the module pointer in structure of usb_role_switch. Then,
we don't need to find module by iterating long relations.
Fixes: 5c54fcac9a9d ("usb: roles: Take care of driver module reference counting")
cc: stable(a)vger.kernel.org
Signed-off-by: Xu Yang <xu.yang_2(a)nxp.com>
Acked-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Link: https://lore.kernel.org/r/20240129093739.2371530-1-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c
index ae41578bd014..2bad038fb9ad 100644
--- a/drivers/usb/roles/class.c
+++ b/drivers/usb/roles/class.c
@@ -21,6 +21,7 @@ static const struct class role_class = {
struct usb_role_switch {
struct device dev;
struct mutex lock; /* device lock*/
+ struct module *module; /* the module this device depends on */
enum usb_role role;
/* From descriptor */
@@ -135,7 +136,7 @@ struct usb_role_switch *usb_role_switch_get(struct device *dev)
usb_role_switch_match);
if (!IS_ERR_OR_NULL(sw))
- WARN_ON(!try_module_get(sw->dev.parent->driver->owner));
+ WARN_ON(!try_module_get(sw->module));
return sw;
}
@@ -157,7 +158,7 @@ struct usb_role_switch *fwnode_usb_role_switch_get(struct fwnode_handle *fwnode)
sw = fwnode_connection_find_match(fwnode, "usb-role-switch",
NULL, usb_role_switch_match);
if (!IS_ERR_OR_NULL(sw))
- WARN_ON(!try_module_get(sw->dev.parent->driver->owner));
+ WARN_ON(!try_module_get(sw->module));
return sw;
}
@@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(fwnode_usb_role_switch_get);
void usb_role_switch_put(struct usb_role_switch *sw)
{
if (!IS_ERR_OR_NULL(sw)) {
- module_put(sw->dev.parent->driver->owner);
+ module_put(sw->module);
put_device(&sw->dev);
}
}
@@ -189,15 +190,18 @@ struct usb_role_switch *
usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode)
{
struct device *dev;
+ struct usb_role_switch *sw = NULL;
if (!fwnode)
return NULL;
dev = class_find_device_by_fwnode(&role_class, fwnode);
- if (dev)
- WARN_ON(!try_module_get(dev->parent->driver->owner));
+ if (dev) {
+ sw = to_role_switch(dev);
+ WARN_ON(!try_module_get(sw->module));
+ }
- return dev ? to_role_switch(dev) : NULL;
+ return sw;
}
EXPORT_SYMBOL_GPL(usb_role_switch_find_by_fwnode);
@@ -338,6 +342,7 @@ usb_role_switch_register(struct device *parent,
sw->set = desc->set;
sw->get = desc->get;
+ sw->module = parent->driver->owner;
sw->dev.parent = parent;
sw->dev.fwnode = desc->fwnode;
sw->dev.class = &role_class;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x ec4308ecfc887128a468f03fb66b767559c57c23
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022602-daunting-dreamland-882c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
ec4308ecfc88 ("irqchip/gic-v3-its: Do not assume vPE tables are preallocated")
c0cdc89072a3 ("irqchip/gic-v3-its: Give the percpu rdist struct its own flags field")
5e5168461c22 ("irqchip/gic-v4.1: VPE table (aka GICR_VPROPBASER) allocation")
b25319d279b6 ("irqchip/gic-v3: Detect GICv4.1 supporting RVPEID")
576a83429757 ("irqchip/gic-v3-its: Kill its->device_ids and use TYPER copy instead")
ffedbf0cba15 ("irqchip/gic-v3-its: Kill its->ite_size and use TYPER copy instead")
0dd57fed6b46 ("irqchip/gic-v3-its: Make is_v4 use a TYPER copy")
8424312516e5 ("irqchip/gic-v3-its: Use the exact ITSList for VMOVP")
5f51f803826e ("irqchip/gic-v3: Add EPPI range support")
81a43273045b ("irqchip/gic-v3: Dynamically allocate PPI NMI refcounts")
1a60e1e64391 ("irqchip/gic: Prepare for more than 16 PPIs")
211bddd210a6 ("irqchip/gic-v3: Add ESPI range support")
e91b036e1c20 ("irqchip/gic-v3: Add INTID range and convertion primitives")
13d22e2e1f35 ("irqchip/gic: Rework gic_configure_irq to take the full ICFGR base")
3d8dfe75ef69 ("Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4308ecfc887128a468f03fb66b767559c57c23 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Mon, 19 Feb 2024 18:58:06 +0000
Subject: [PATCH] irqchip/gic-v3-its: Do not assume vPE tables are preallocated
The GIC/ITS code is designed to ensure to pick up any preallocated LPI
tables on the redistributors, as enabling LPIs is a one-way switch. There
is no such restriction for vLPIs, and for GICv4.1 it is expected to
allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up a
redistributor in cpu_init_lpis() the early return for preallocated RD
tables skips straight past the GICv4 setup. This all comes to a head when
trying to kexec() into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs.
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240219185809.286724-2-oliver.upton@linux.dev
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 53abd4779914..b822752c4261 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3181,6 +3181,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3216,7 +3217,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x ec4308ecfc887128a468f03fb66b767559c57c23
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022658-brethren-stopper-8b5e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
ec4308ecfc88 ("irqchip/gic-v3-its: Do not assume vPE tables are preallocated")
c0cdc89072a3 ("irqchip/gic-v3-its: Give the percpu rdist struct its own flags field")
5e5168461c22 ("irqchip/gic-v4.1: VPE table (aka GICR_VPROPBASER) allocation")
b25319d279b6 ("irqchip/gic-v3: Detect GICv4.1 supporting RVPEID")
576a83429757 ("irqchip/gic-v3-its: Kill its->device_ids and use TYPER copy instead")
ffedbf0cba15 ("irqchip/gic-v3-its: Kill its->ite_size and use TYPER copy instead")
0dd57fed6b46 ("irqchip/gic-v3-its: Make is_v4 use a TYPER copy")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4308ecfc887128a468f03fb66b767559c57c23 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Mon, 19 Feb 2024 18:58:06 +0000
Subject: [PATCH] irqchip/gic-v3-its: Do not assume vPE tables are preallocated
The GIC/ITS code is designed to ensure to pick up any preallocated LPI
tables on the redistributors, as enabling LPIs is a one-way switch. There
is no such restriction for vLPIs, and for GICv4.1 it is expected to
allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up a
redistributor in cpu_init_lpis() the early return for preallocated RD
tables skips straight past the GICv4 setup. This all comes to a head when
trying to kexec() into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs.
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240219185809.286724-2-oliver.upton@linux.dev
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 53abd4779914..b822752c4261 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3181,6 +3181,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3216,7 +3217,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ec4308ecfc887128a468f03fb66b767559c57c23
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022654-drained-afterglow-ddb6@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
ec4308ecfc88 ("irqchip/gic-v3-its: Do not assume vPE tables are preallocated")
c0cdc89072a3 ("irqchip/gic-v3-its: Give the percpu rdist struct its own flags field")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4308ecfc887128a468f03fb66b767559c57c23 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Mon, 19 Feb 2024 18:58:06 +0000
Subject: [PATCH] irqchip/gic-v3-its: Do not assume vPE tables are preallocated
The GIC/ITS code is designed to ensure to pick up any preallocated LPI
tables on the redistributors, as enabling LPIs is a one-way switch. There
is no such restriction for vLPIs, and for GICv4.1 it is expected to
allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up a
redistributor in cpu_init_lpis() the early return for preallocated RD
tables skips straight past the GICv4 setup. This all comes to a head when
trying to kexec() into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs.
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240219185809.286724-2-oliver.upton@linux.dev
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 53abd4779914..b822752c4261 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3181,6 +3181,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3216,7 +3217,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ec4308ecfc887128a468f03fb66b767559c57c23
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022650-washroom-undusted-2aff@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
ec4308ecfc88 ("irqchip/gic-v3-its: Do not assume vPE tables are preallocated")
c0cdc89072a3 ("irqchip/gic-v3-its: Give the percpu rdist struct its own flags field")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4308ecfc887128a468f03fb66b767559c57c23 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Mon, 19 Feb 2024 18:58:06 +0000
Subject: [PATCH] irqchip/gic-v3-its: Do not assume vPE tables are preallocated
The GIC/ITS code is designed to ensure to pick up any preallocated LPI
tables on the redistributors, as enabling LPIs is a one-way switch. There
is no such restriction for vLPIs, and for GICv4.1 it is expected to
allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up a
redistributor in cpu_init_lpis() the early return for preallocated RD
tables skips straight past the GICv4 setup. This all comes to a head when
trying to kexec() into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs.
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240219185809.286724-2-oliver.upton@linux.dev
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 53abd4779914..b822752c4261 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3181,6 +3181,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3216,7 +3217,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 321da3dc1f3c92a12e3c5da934090d2992a8814c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022657-skintight-fetal-ec4b@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
321da3dc1f3c ("scsi: sd: usb_storage: uas: Access media prior to querying device properties")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 321da3dc1f3c92a12e3c5da934090d2992a8814c Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen(a)oracle.com>
Date: Tue, 13 Feb 2024 09:33:06 -0500
Subject: [PATCH] scsi: sd: usb_storage: uas: Access media prior to querying
device properties
It has been observed that some USB/UAS devices return generic properties
hardcoded in firmware for mode pages for a period of time after a device
has been discovered. The reported properties are either garbage or they do
not accurately reflect the characteristics of the physical storage device
attached in the case of a bridge.
Prior to commit 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to
avoid calling revalidate twice") we would call revalidate several
times during device discovery. As a result, incorrect values would
eventually get replaced with ones accurately describing the attached
storage. When we did away with the redundant revalidate pass, several
cases were reported where devices reported nonsensical values or would
end up in write-protected state.
An initial attempt at addressing this issue involved introducing a
delayed second revalidate invocation. However, this approach still
left some devices reporting incorrect characteristics.
Tasos Sahanidis debugged the problem further and identified that
introducing a READ operation prior to MODE SENSE fixed the problem and that
it wasn't a timing issue. Issuing a READ appears to cause the devices to
update their state to reflect the actual properties of the storage
media. Device properties like vendor, model, and storage capacity appear to
be correctly reported from the get-go. It is unclear why these devices
defer populating the remaining characteristics.
Match the behavior of a well known commercial operating system and
trigger a READ operation prior to querying device characteristics to
force the device to populate the mode pages.
The additional READ is triggered by a flag set in the USB storage and
UAS drivers. We avoid issuing the READ for other transport classes
since some storage devices identify Linux through our particular
discovery command sequence.
Link: https://lore.kernel.org/r/20240213143306.2194237-1-martin.petersen@oracle.c…
Fixes: 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice")
Cc: stable(a)vger.kernel.org
Reported-by: Tasos Sahanidis <tasos(a)tasossah.com>
Reviewed-by: Ewan D. Milne <emilne(a)redhat.com>
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Tested-by: Tasos Sahanidis <tasos(a)tasossah.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0833b3e6aa6e..bdd0acf7fa3c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3407,6 +3407,24 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp,
return true;
}
+static void sd_read_block_zero(struct scsi_disk *sdkp)
+{
+ unsigned int buf_len = sdkp->device->sector_size;
+ char *buffer, cmd[10] = { };
+
+ buffer = kmalloc(buf_len, GFP_KERNEL);
+ if (!buffer)
+ return;
+
+ cmd[0] = READ_10;
+ put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */
+ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */
+
+ scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len,
+ SD_TIMEOUT, sdkp->max_retries, NULL);
+ kfree(buffer);
+}
+
/**
* sd_revalidate_disk - called the first time a new disk is seen,
* performs disk spin up, read_capacity, etc.
@@ -3446,7 +3464,13 @@ static int sd_revalidate_disk(struct gendisk *disk)
*/
if (sdkp->media_present) {
sd_read_capacity(sdkp, buffer);
-
+ /*
+ * Some USB/UAS devices return generic values for mode pages
+ * until the media has been accessed. Trigger a READ operation
+ * to force the device to populate mode pages.
+ */
+ if (sdp->read_before_ms)
+ sd_read_block_zero(sdkp);
/*
* set the default to rotational. All non-rotational devices
* support the block characteristics VPD page, which will
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index c54e9805da53..12cf9940e5b6 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -179,6 +179,13 @@ static int slave_configure(struct scsi_device *sdev)
*/
sdev->use_192_bytes_for_3f = 1;
+ /*
+ * Some devices report generic values until the media has been
+ * accessed. Force a READ(10) prior to querying device
+ * characteristics.
+ */
+ sdev->read_before_ms = 1;
+
/*
* Some devices don't like MODE SENSE with page=0x3f,
* which is the command used for checking if a device
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 9707f53cfda9..71ace274761f 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -878,6 +878,13 @@ static int uas_slave_configure(struct scsi_device *sdev)
if (devinfo->flags & US_FL_CAPACITY_HEURISTICS)
sdev->guess_capacity = 1;
+ /*
+ * Some devices report generic values until the media has been
+ * accessed. Force a READ(10) prior to querying device
+ * characteristics.
+ */
+ sdev->read_before_ms = 1;
+
/*
* Some devices don't like MODE SENSE with page=0x3f,
* which is the command used for checking if a device
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 5ec1e71a09de..01c02cb76ea6 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -208,6 +208,7 @@ struct scsi_device {
unsigned use_10_for_rw:1; /* first try 10-byte read / write */
unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */
+ unsigned read_before_ms:1; /* perform a READ before MODE SENSE */
unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */
unsigned no_write_same:1; /* no WRITE SAME command */
unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
During the handoff from earlycon to the real console driver, we have
two separate drivers operating on the same device concurrently. In the
case of the 8250 driver these concurrent accesses cause problems due
to the driver's use of banked registers, controlled by LCR.DLAB. It is
possible for the setup(), config_port(), pm() and set_mctrl() callbacks
to set DLAB, which can cause the earlycon code that intends to access
TX to instead access DLL, leading to missed output and corruption on
the serial line due to unintended modifications to the baud rate.
In particular, for setup() we have:
univ8250_console_setup()
-> serial8250_console_setup()
-> uart_set_options()
-> serial8250_set_termios()
-> serial8250_do_set_termios()
-> serial8250_do_set_divisor()
For config_port() we have:
serial8250_config_port()
-> autoconfig()
For pm() we have:
serial8250_pm()
-> serial8250_do_pm()
-> serial8250_set_sleep()
For set_mctrl() we have (for some devices):
serial8250_set_mctrl()
-> omap8250_set_mctrl()
-> __omap8250_set_mctrl()
To avoid such problems, let's make it so that the console is locked
during pre-registration calls to these callbacks, which will prevent
the earlycon driver from running concurrently.
Remove the partial solution to this problem in the 8250 driver
that locked the console only during autoconfig_irq(), as this would
result in a deadlock with the new approach. The console continues
to be locked during autoconfig_irq() because it can only be called
through uart_configure_port().
Although this patch introduces more locking than strictly necessary
(and in particular it also locks during the call to rs485_config()
which is not affected by this issue as far as I can tell), it follows
the principle that it is the responsibility of the generic console
code to manage the earlycon handoff by ensuring that earlycon and real
console driver code cannot run concurrently, and not the individual
drivers.
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Link: https://linux-review.googlesource.com/id/I7cf8124dcebf8618e6b2ee543fa5b2553…
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
---
drivers/tty/serial/8250/8250_port.c | 6 ------
drivers/tty/serial/serial_core.c | 10 ++++++++++
kernel/printk/printk.c | 20 +++++++++++++++++---
3 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 8ca061d3bbb9..1d65055dde27 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1329,9 +1329,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
inb_p(ICP);
}
- if (uart_console(port))
- console_lock();
-
/* forget possible initially masked and pending IRQ */
probe_irq_off(probe_irq_on());
save_mcr = serial8250_in_MCR(up);
@@ -1371,9 +1368,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
if (port->flags & UPF_FOURPORT)
outb_p(save_ICP, ICP);
- if (uart_console(port))
- console_unlock();
-
port->irq = (irq > 0) ? irq : 0;
}
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index d6a58a9e072a..128aa0e0ae24 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2608,7 +2608,11 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
port->type = PORT_UNKNOWN;
flags |= UART_CONFIG_TYPE;
}
+ if (uart_console(port))
+ console_lock();
port->ops->config_port(port, flags);
+ if (uart_console(port))
+ console_unlock();
}
if (port->type != PORT_UNKNOWN) {
@@ -2616,6 +2620,9 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_report_port(drv, port);
+ if (uart_console(port))
+ console_lock();
+
/* Power up port for set_mctrl() */
uart_change_pm(state, UART_PM_STATE_ON);
@@ -2632,6 +2639,9 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_rs485_config(port);
+ if (uart_console(port))
+ console_unlock();
+
/*
* If this driver supports console, and it hasn't been
* successfully registered yet, try to re-register it.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581e16..db69545e6250 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3263,6 +3263,20 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3298,8 +3312,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err != 0)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3325,7 +3339,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
--
2.44.0.rc1.240.g4c46232300-goog
On Okt 29 2023, Peter Ujfalusi wrote:
> The core twl chip is probed via i2c and the dev->driver->of_match_table is
> NULL, causing the driver to fail to probe.
>
> This partially reverts commit 1e0c866887f4.
>
> Fixes: 1e0c866887f4 ("mfd: Use device_get_match_data() in a bunch of drivers")
That commit id does not exist, which is why it hasn't been picked up by
stable. The correct commit id is 830fafce06e6f.
--
Andreas Schwab, SUSE Labs, schwab(a)suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE 1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."
Here's the recently merged mds improvement patches adapted to latest stable tree.
I've only compile tested them, but since I have also done similar backports for
older kernels I'm sure they should work.
The main difference is in the definition of the CLEAR_CPU_BUFFERS macro since
5.4 doesn't contains the alternative relocation handling logic hence the verw
instruction is moved out of the alternative definition and instead we have a jump which
skips the verw instruction there. That way the relocation will be handled by the
toolchain rather than the kernel.
H. Peter Anvin (Intel) (1):
x86/asm: Add _ASM_RIP() macro for x86-64 (%rip) suffix
Pawan Gupta (5):
x86/bugs: Add asm helpers for executing VERW
x86/entry_64: Add VERW just before userspace transition
x86/entry_32: Add VERW just before userspace transition
x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key
KVM/VMX: Move VERW closer to VMentry for MDS mitigation
Sean Christopherson (1):
KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH
Documentation/x86/mds.rst | 38 ++++++++++++++++++++--------
arch/x86/entry/Makefile | 2 +-
arch/x86/entry/common.c | 2 --
arch/x86/entry/entry.S | 23 +++++++++++++++++
arch/x86/entry/entry_32.S | 3 +++
arch/x86/entry/entry_64.S | 10 ++++++++
arch/x86/entry/entry_64_compat.S | 1 +
arch/x86/include/asm/asm.h | 6 ++++-
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/irqflags.h | 1 +
arch/x86/include/asm/nospec-branch.h | 26 ++++++++++---------
arch/x86/kernel/cpu/bugs.c | 15 +++++------
arch/x86/kernel/nmi.c | 3 ---
arch/x86/kvm/vmx/run_flags.h | 7 +++--
arch/x86/kvm/vmx/vmenter.S | 9 ++++---
arch/x86/kvm/vmx/vmx.c | 12 ++++++---
16 files changed, 111 insertions(+), 49 deletions(-)
create mode 100644 arch/x86/entry/entry.S
--
2.34.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 8d3a7dfb801d157ac423261d7cd62c33e95375f8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022635-thinner-disinfect-f761@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
8d3a7dfb801d ("KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table()")
9ed24f4b712b ("KVM: arm64: Move virt/kvm/arm to arch/arm64")
3b50142d8528 ("MAINTAINERS: sort field names for all entries")
4400b7d68f6e ("MAINTAINERS: sort entries by entry name")
b032227c6293 ("Merge tag 'nios2-v5.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/lftan/nios2")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Wed, 21 Feb 2024 09:27:31 +0000
Subject: [PATCH] KVM: arm64: vgic-its: Test for valid IRQ in
its_sync_lpi_pending_table()
vgic_get_irq() may not return a valid descriptor if there is no ITS that
holds a valid translation for the specified INTID. If that is the case,
it is safe to silently ignore it and continue processing the LPI pending
table.
Cc: stable(a)vger.kernel.org
Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e2764d0ffa9f..082448de27ed 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 8d3a7dfb801d157ac423261d7cd62c33e95375f8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022634-rut-premises-24cc@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
8d3a7dfb801d ("KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Wed, 21 Feb 2024 09:27:31 +0000
Subject: [PATCH] KVM: arm64: vgic-its: Test for valid IRQ in
its_sync_lpi_pending_table()
vgic_get_irq() may not return a valid descriptor if there is no ITS that
holds a valid translation for the specified INTID. If that is the case,
it is safe to silently ignore it and continue processing the LPI pending
table.
Cc: stable(a)vger.kernel.org
Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e2764d0ffa9f..082448de27ed 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 85a71ee9a0700f6c18862ef3b0011ed9dad99aca
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022619-opulently-accustom-dbe1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
85a71ee9a070 ("KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Wed, 21 Feb 2024 09:27:32 +0000
Subject: [PATCH] KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler
It is possible that an LPI mapped in a different ITS gets unmapped while
handling the MOVALL command. If that is the case, there is no state that
can be migrated to the destination. Silently ignore it and continue
migrating other LPIs.
Cc: stable(a)vger.kernel.org
Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 082448de27ed..28a93074eca1 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
for (i = 0; i < irq_count; i++) {
irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
update_affinity(irq, vcpu2);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 85a71ee9a0700f6c18862ef3b0011ed9dad99aca
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022618-maternal-runny-28b5@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
85a71ee9a070 ("KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton(a)linux.dev>
Date: Wed, 21 Feb 2024 09:27:32 +0000
Subject: [PATCH] KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler
It is possible that an LPI mapped in a different ITS gets unmapped while
handling the MOVALL command. If that is the case, there is no state that
can be migrated to the destination. Silently ignore it and continue
migrating other LPIs.
Cc: stable(a)vger.kernel.org
Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 082448de27ed..28a93074eca1 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
for (i = 0; i < irq_count; i++) {
irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
update_affinity(irq, vcpu2);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x e21a2f17566cbd64926fb8f16323972f7a064444
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022639-selection-angrily-d6ff@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
e21a2f17566c ("cachefiles: fix memory leak in cachefiles_add_cache()")
d1065b0a6fd9 ("cachefiles: Implement cache registration and withdrawal")
32759f7d7af5 ("cachefiles: Implement a function to get/create a directory in the cache")
1bd9c4e4f049 ("vfs, cachefiles: Mark a backing file in use with an inode flag")
80f94f29f677 ("cachefiles: Provide a function to check how much space there is")
8667d434b2a9 ("cachefiles: Register a miscdev and parse commands over it")
254947d47945 ("cachefiles: Add security derivation")
1493bf74bcf2 ("cachefiles: Add cache error reporting macro")
ecf5a6ce15f9 ("cachefiles: Add a couple of tracepoints for logging errors")
a70f6526267e ("cachefiles: Add some error injection support")
8390fbc46570 ("cachefiles: Define structs")
77443f6171f3 ("cachefiles: Introduce rewritten driver")
850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite")
b6773cdb0e9f ("Merge tag 'for-5.16/ki_complete-2021-10-29' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Sat, 17 Feb 2024 16:14:31 +0800
Subject: [PATCH] cachefiles: fix memory leak in cachefiles_add_cache()
The following memory leak was reported after unbinding /dev/cachefiles:
==================================================================
unreferenced object 0xffff9b674176e3c0 (size 192):
comm "cachefilesd2", pid 680, jiffies 4294881224
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc ea38a44b):
[<ffffffff8eb8a1a5>] kmem_cache_alloc+0x2d5/0x370
[<ffffffff8e917f86>] prepare_creds+0x26/0x2e0
[<ffffffffc002eeef>] cachefiles_determine_cache_security+0x1f/0x120
[<ffffffffc00243ec>] cachefiles_add_cache+0x13c/0x3a0
[<ffffffffc0025216>] cachefiles_daemon_write+0x146/0x1c0
[<ffffffff8ebc4a3b>] vfs_write+0xcb/0x520
[<ffffffff8ebc5069>] ksys_write+0x69/0xf0
[<ffffffff8f6d4662>] do_syscall_64+0x72/0x140
[<ffffffff8f8000aa>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
==================================================================
Put the reference count of cache_cred in cachefiles_daemon_unbind() to
fix the problem. And also put cache_cred in cachefiles_add_cache() error
branch to avoid memory leaks.
Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem")
CC: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com
Acked-by: David Howells <dhowells(a)redhat.com>
Reviewed-by: Jingbo Xu <jefflexu(a)linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x e21a2f17566cbd64926fb8f16323972f7a064444
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022638-jingling-atlas-4340@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
e21a2f17566c ("cachefiles: fix memory leak in cachefiles_add_cache()")
d1065b0a6fd9 ("cachefiles: Implement cache registration and withdrawal")
32759f7d7af5 ("cachefiles: Implement a function to get/create a directory in the cache")
1bd9c4e4f049 ("vfs, cachefiles: Mark a backing file in use with an inode flag")
80f94f29f677 ("cachefiles: Provide a function to check how much space there is")
8667d434b2a9 ("cachefiles: Register a miscdev and parse commands over it")
254947d47945 ("cachefiles: Add security derivation")
1493bf74bcf2 ("cachefiles: Add cache error reporting macro")
ecf5a6ce15f9 ("cachefiles: Add a couple of tracepoints for logging errors")
a70f6526267e ("cachefiles: Add some error injection support")
8390fbc46570 ("cachefiles: Define structs")
77443f6171f3 ("cachefiles: Introduce rewritten driver")
850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite")
b6773cdb0e9f ("Merge tag 'for-5.16/ki_complete-2021-10-29' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Sat, 17 Feb 2024 16:14:31 +0800
Subject: [PATCH] cachefiles: fix memory leak in cachefiles_add_cache()
The following memory leak was reported after unbinding /dev/cachefiles:
==================================================================
unreferenced object 0xffff9b674176e3c0 (size 192):
comm "cachefilesd2", pid 680, jiffies 4294881224
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc ea38a44b):
[<ffffffff8eb8a1a5>] kmem_cache_alloc+0x2d5/0x370
[<ffffffff8e917f86>] prepare_creds+0x26/0x2e0
[<ffffffffc002eeef>] cachefiles_determine_cache_security+0x1f/0x120
[<ffffffffc00243ec>] cachefiles_add_cache+0x13c/0x3a0
[<ffffffffc0025216>] cachefiles_daemon_write+0x146/0x1c0
[<ffffffff8ebc4a3b>] vfs_write+0xcb/0x520
[<ffffffff8ebc5069>] ksys_write+0x69/0xf0
[<ffffffff8f6d4662>] do_syscall_64+0x72/0x140
[<ffffffff8f8000aa>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
==================================================================
Put the reference count of cache_cred in cachefiles_daemon_unbind() to
fix the problem. And also put cache_cred in cachefiles_add_cache() error
branch to avoid memory leaks.
Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem")
CC: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com
Acked-by: David Howells <dhowells(a)redhat.com>
Reviewed-by: Jingbo Xu <jefflexu(a)linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x e21a2f17566cbd64926fb8f16323972f7a064444
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022636-prevail-headway-01c9@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
e21a2f17566c ("cachefiles: fix memory leak in cachefiles_add_cache()")
d1065b0a6fd9 ("cachefiles: Implement cache registration and withdrawal")
32759f7d7af5 ("cachefiles: Implement a function to get/create a directory in the cache")
1bd9c4e4f049 ("vfs, cachefiles: Mark a backing file in use with an inode flag")
80f94f29f677 ("cachefiles: Provide a function to check how much space there is")
8667d434b2a9 ("cachefiles: Register a miscdev and parse commands over it")
254947d47945 ("cachefiles: Add security derivation")
1493bf74bcf2 ("cachefiles: Add cache error reporting macro")
ecf5a6ce15f9 ("cachefiles: Add a couple of tracepoints for logging errors")
a70f6526267e ("cachefiles: Add some error injection support")
8390fbc46570 ("cachefiles: Define structs")
77443f6171f3 ("cachefiles: Introduce rewritten driver")
850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite")
b6773cdb0e9f ("Merge tag 'for-5.16/ki_complete-2021-10-29' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Sat, 17 Feb 2024 16:14:31 +0800
Subject: [PATCH] cachefiles: fix memory leak in cachefiles_add_cache()
The following memory leak was reported after unbinding /dev/cachefiles:
==================================================================
unreferenced object 0xffff9b674176e3c0 (size 192):
comm "cachefilesd2", pid 680, jiffies 4294881224
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc ea38a44b):
[<ffffffff8eb8a1a5>] kmem_cache_alloc+0x2d5/0x370
[<ffffffff8e917f86>] prepare_creds+0x26/0x2e0
[<ffffffffc002eeef>] cachefiles_determine_cache_security+0x1f/0x120
[<ffffffffc00243ec>] cachefiles_add_cache+0x13c/0x3a0
[<ffffffffc0025216>] cachefiles_daemon_write+0x146/0x1c0
[<ffffffff8ebc4a3b>] vfs_write+0xcb/0x520
[<ffffffff8ebc5069>] ksys_write+0x69/0xf0
[<ffffffff8f6d4662>] do_syscall_64+0x72/0x140
[<ffffffff8f8000aa>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
==================================================================
Put the reference count of cache_cred in cachefiles_daemon_unbind() to
fix the problem. And also put cache_cred in cachefiles_add_cache() error
branch to avoid memory leaks.
Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem")
CC: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com
Acked-by: David Howells <dhowells(a)redhat.com>
Reviewed-by: Jingbo Xu <jefflexu(a)linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x e21a2f17566cbd64926fb8f16323972f7a064444
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022635-princess-penniless-bfa6@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
e21a2f17566c ("cachefiles: fix memory leak in cachefiles_add_cache()")
d1065b0a6fd9 ("cachefiles: Implement cache registration and withdrawal")
32759f7d7af5 ("cachefiles: Implement a function to get/create a directory in the cache")
1bd9c4e4f049 ("vfs, cachefiles: Mark a backing file in use with an inode flag")
80f94f29f677 ("cachefiles: Provide a function to check how much space there is")
8667d434b2a9 ("cachefiles: Register a miscdev and parse commands over it")
254947d47945 ("cachefiles: Add security derivation")
1493bf74bcf2 ("cachefiles: Add cache error reporting macro")
ecf5a6ce15f9 ("cachefiles: Add a couple of tracepoints for logging errors")
a70f6526267e ("cachefiles: Add some error injection support")
8390fbc46570 ("cachefiles: Define structs")
77443f6171f3 ("cachefiles: Introduce rewritten driver")
850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite")
b6773cdb0e9f ("Merge tag 'for-5.16/ki_complete-2021-10-29' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e21a2f17566cbd64926fb8f16323972f7a064444 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Sat, 17 Feb 2024 16:14:31 +0800
Subject: [PATCH] cachefiles: fix memory leak in cachefiles_add_cache()
The following memory leak was reported after unbinding /dev/cachefiles:
==================================================================
unreferenced object 0xffff9b674176e3c0 (size 192):
comm "cachefilesd2", pid 680, jiffies 4294881224
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc ea38a44b):
[<ffffffff8eb8a1a5>] kmem_cache_alloc+0x2d5/0x370
[<ffffffff8e917f86>] prepare_creds+0x26/0x2e0
[<ffffffffc002eeef>] cachefiles_determine_cache_security+0x1f/0x120
[<ffffffffc00243ec>] cachefiles_add_cache+0x13c/0x3a0
[<ffffffffc0025216>] cachefiles_daemon_write+0x146/0x1c0
[<ffffffff8ebc4a3b>] vfs_write+0xcb/0x520
[<ffffffff8ebc5069>] ksys_write+0x69/0xf0
[<ffffffff8f6d4662>] do_syscall_64+0x72/0x140
[<ffffffff8f8000aa>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
==================================================================
Put the reference count of cache_cred in cachefiles_daemon_unbind() to
fix the problem. And also put cache_cred in cachefiles_add_cache() error
branch to avoid memory leaks.
Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem")
CC: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Link: https://lore.kernel.org/r/20240217081431.796809-1-libaokun1@huawei.com
Acked-by: David Howells <dhowells(a)redhat.com>
Reviewed-by: Jingbo Xu <jefflexu(a)linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ int cachefiles_add_cache(struct cachefiles_cache *cache)
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022608-trombone-banker-5ed4@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
dbcbfd662a72 ("platform/x86: touchscreen_dmi: Allow partial (prefix) matches for ACPI names")
87eaede45385 ("platform/x86: touchscreen_dmi: Handle device properties with software node API")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Mon, 12 Feb 2024 13:06:07 +0100
Subject: [PATCH] platform/x86: touchscreen_dmi: Allow partial (prefix) matches
for ACPI names
On some devices the ACPI name of the touchscreen is e.g. either
MSSL1680:00 or MSSL1680:01 depending on the BIOS version.
This happens for example on the "Chuwi Hi8 Air" tablet where the initial
commit's ts_data uses "MSSL1680:00" but the tablets from the github issue
and linux-hardware.org probe linked below both use "MSSL1680:01".
Replace the strcmp() match on ts_data->acpi_name with a strstarts()
check to allow using a partial match on just the ACPI HID of "MSSL1680"
and change the ts_data->acpi_name for the "Chuwi Hi8 Air" accordingly
to fix the touchscreen not working on models where it is "MSSL1680:01".
Note this drops the length check for I2C_NAME_SIZE. This never was
necessary since the ACPI names used are never more then 11 chars and
I2C_NAME_SIZE is 20 so the replaced strncmp() would always stop long
before reaching I2C_NAME_SIZE.
Link: https://linux-hardware.org/?computer=AC4301C0542A
Fixes: bbb97d728f77 ("platform/x86: touchscreen_dmi: Add info for the Chuwi Hi8 Air tablet")
Closes: https://github.com/onitake/gsl-firmware/issues/91
Cc: stable(a)vger.kernel.org
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Link: https://lore.kernel.org/r/20240212120608.30469-1-hdegoede@redhat.com
diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c
index 7aee5e9ff2b8..969477c83e56 100644
--- a/drivers/platform/x86/touchscreen_dmi.c
+++ b/drivers/platform/x86/touchscreen_dmi.c
@@ -81,7 +81,7 @@ static const struct property_entry chuwi_hi8_air_props[] = {
};
static const struct ts_dmi_data chuwi_hi8_air_data = {
- .acpi_name = "MSSL1680:00",
+ .acpi_name = "MSSL1680",
.properties = chuwi_hi8_air_props,
};
@@ -1821,7 +1821,7 @@ static void ts_dmi_add_props(struct i2c_client *client)
int error;
if (has_acpi_companion(dev) &&
- !strncmp(ts_data->acpi_name, client->name, I2C_NAME_SIZE)) {
+ strstarts(client->name, ts_data->acpi_name)) {
error = device_create_managed_software_node(dev, ts_data->properties, NULL);
if (error)
dev_err(dev, "failed to add properties: %d\n", error);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022607-tainted-tinderbox-4920@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
dbcbfd662a72 ("platform/x86: touchscreen_dmi: Allow partial (prefix) matches for ACPI names")
87eaede45385 ("platform/x86: touchscreen_dmi: Handle device properties with software node API")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dbcbfd662a725641d118fb3ae5ffb7be4e3d0fb0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Mon, 12 Feb 2024 13:06:07 +0100
Subject: [PATCH] platform/x86: touchscreen_dmi: Allow partial (prefix) matches
for ACPI names
On some devices the ACPI name of the touchscreen is e.g. either
MSSL1680:00 or MSSL1680:01 depending on the BIOS version.
This happens for example on the "Chuwi Hi8 Air" tablet where the initial
commit's ts_data uses "MSSL1680:00" but the tablets from the github issue
and linux-hardware.org probe linked below both use "MSSL1680:01".
Replace the strcmp() match on ts_data->acpi_name with a strstarts()
check to allow using a partial match on just the ACPI HID of "MSSL1680"
and change the ts_data->acpi_name for the "Chuwi Hi8 Air" accordingly
to fix the touchscreen not working on models where it is "MSSL1680:01".
Note this drops the length check for I2C_NAME_SIZE. This never was
necessary since the ACPI names used are never more then 11 chars and
I2C_NAME_SIZE is 20 so the replaced strncmp() would always stop long
before reaching I2C_NAME_SIZE.
Link: https://linux-hardware.org/?computer=AC4301C0542A
Fixes: bbb97d728f77 ("platform/x86: touchscreen_dmi: Add info for the Chuwi Hi8 Air tablet")
Closes: https://github.com/onitake/gsl-firmware/issues/91
Cc: stable(a)vger.kernel.org
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Link: https://lore.kernel.org/r/20240212120608.30469-1-hdegoede@redhat.com
diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c
index 7aee5e9ff2b8..969477c83e56 100644
--- a/drivers/platform/x86/touchscreen_dmi.c
+++ b/drivers/platform/x86/touchscreen_dmi.c
@@ -81,7 +81,7 @@ static const struct property_entry chuwi_hi8_air_props[] = {
};
static const struct ts_dmi_data chuwi_hi8_air_data = {
- .acpi_name = "MSSL1680:00",
+ .acpi_name = "MSSL1680",
.properties = chuwi_hi8_air_props,
};
@@ -1821,7 +1821,7 @@ static void ts_dmi_add_props(struct i2c_client *client)
int error;
if (has_acpi_companion(dev) &&
- !strncmp(ts_data->acpi_name, client->name, I2C_NAME_SIZE)) {
+ strstarts(client->name, ts_data->acpi_name)) {
error = device_create_managed_software_node(dev, ts_data->properties, NULL);
if (error)
dev_err(dev, "failed to add properties: %d\n", error);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 13ddaf26be324a7f951891ecd9ccd04466d27458
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022608-green-engaging-46db@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
c9edc242811d ("swap: add swap_cache_get_folio()")
1baec203b77c ("mm/khugepaged: try to free transhuge swapcache when possible")
442701e7058b ("mm/swap: remove swap_cache_info statistics")
014bb1de4fc1 ("mm: create new mm/swap.h header file")
1493a1913e34 ("mm/swap: remember PG_anon_exclusive via a swp pte bit")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
78fbe906cc90 ("mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
6b1f86f8e9c7 ("Merge tag 'folio-5.18b' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong(a)tencent.com>
Date: Wed, 7 Feb 2024 02:25:59 +0800
Subject: [PATCH] mm/swap: fix race when skipping swapcache
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because PTE value is unchanged,
causing ABA problem. Thread (T0) will install a stalled page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes, so
the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 15f8b10ea17c..0bfc8b007c01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 13ddaf26be324a7f951891ecd9ccd04466d27458
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022604-labrador-edgy-5b56@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
c9edc242811d ("swap: add swap_cache_get_folio()")
1baec203b77c ("mm/khugepaged: try to free transhuge swapcache when possible")
442701e7058b ("mm/swap: remove swap_cache_info statistics")
014bb1de4fc1 ("mm: create new mm/swap.h header file")
1493a1913e34 ("mm/swap: remember PG_anon_exclusive via a swp pte bit")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
78fbe906cc90 ("mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
6b1f86f8e9c7 ("Merge tag 'folio-5.18b' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong(a)tencent.com>
Date: Wed, 7 Feb 2024 02:25:59 +0800
Subject: [PATCH] mm/swap: fix race when skipping swapcache
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because PTE value is unchanged,
causing ABA problem. Thread (T0) will install a stalled page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes, so
the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 15f8b10ea17c..0bfc8b007c01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 13ddaf26be324a7f951891ecd9ccd04466d27458
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022601-skinny-audacious-d173@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
c9edc242811d ("swap: add swap_cache_get_folio()")
1baec203b77c ("mm/khugepaged: try to free transhuge swapcache when possible")
442701e7058b ("mm/swap: remove swap_cache_info statistics")
014bb1de4fc1 ("mm: create new mm/swap.h header file")
1493a1913e34 ("mm/swap: remember PG_anon_exclusive via a swp pte bit")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
78fbe906cc90 ("mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
6b1f86f8e9c7 ("Merge tag 'folio-5.18b' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong(a)tencent.com>
Date: Wed, 7 Feb 2024 02:25:59 +0800
Subject: [PATCH] mm/swap: fix race when skipping swapcache
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because PTE value is unchanged,
causing ABA problem. Thread (T0) will install a stalled page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes, so
the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 15f8b10ea17c..0bfc8b007c01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 13ddaf26be324a7f951891ecd9ccd04466d27458
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022658-expensive-autograph-1b92@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
c9edc242811d ("swap: add swap_cache_get_folio()")
1baec203b77c ("mm/khugepaged: try to free transhuge swapcache when possible")
442701e7058b ("mm/swap: remove swap_cache_info statistics")
014bb1de4fc1 ("mm: create new mm/swap.h header file")
1493a1913e34 ("mm/swap: remember PG_anon_exclusive via a swp pte bit")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
78fbe906cc90 ("mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
6b1f86f8e9c7 ("Merge tag 'folio-5.18b' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong(a)tencent.com>
Date: Wed, 7 Feb 2024 02:25:59 +0800
Subject: [PATCH] mm/swap: fix race when skipping swapcache
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because PTE value is unchanged,
causing ABA problem. Thread (T0) will install a stalled page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes, so
the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 15f8b10ea17c..0bfc8b007c01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 5c6224bfabbf7f3e491c51ab50fd2c6f92ba1141
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022643-rearview-spouse-b8a1@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
5c6224bfabbf ("cxl/acpi: Fix load failures due to single window creation failure")
790815902ec6 ("cxl: Add support for _DSM Function for retrieving QTG ID")
91019b5bc7c2 ("cxl/acpi: Return 'rc' instead of '0' in cxl_parse_cfmws()")
4cf67d3cc999 ("cxl/acpi: Fix a use-after-free in cxl_parse_cfmws()")
d35b495ddf92 ("cxl/port: Fix find_cxl_root() for RCDs and simplify it")
a32320b71f08 ("cxl/region: Add region autodiscovery")
32ce3f185bbb ("cxl/port: Split endpoint and switch port probe")
9995576cef48 ("cxl/region: Move region-position validation to a helper")
86987c766276 ("cxl/region: Cleanup target list on attach error")
1b9b7a6fd618 ("cxl/region: Validate region mode vs decoder mode")
7d505f982f53 ("cxl/region: Add a mode attribute for regions")
02fedf146656 ("Merge branch 'for-6.2/cxl-xor' into for-6.2/cxl")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5c6224bfabbf7f3e491c51ab50fd2c6f92ba1141 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Fri, 16 Feb 2024 19:11:34 -0800
Subject: [PATCH] cxl/acpi: Fix load failures due to single window creation
failure
The expectation is that cxl_parse_cfwms() continues in the face the of
failure as evidenced by code like:
cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb);
if (IS_ERR(cxlrd))
return 0;
There are other error paths in that function which mistakenly follow
idiomatic expectations and return an error when they should not. Most of
those mistakes are innocuous checks that hardly ever fail in practice.
However, a recent change succeed in making the implementation more
fragile by applying an idiomatic, but still wrong "fix" [1]. In this
failure case the kernel reports:
cxl root0: Failed to populate active decoder targets
cxl_acpi ACPI0017:00: Failed to add decode range: [mem 0x00000000-0x7fffffff flags 0x200]
...which is a real issue with that one window (to be fixed separately),
but ends up failing the entirety of cxl_acpi_probe().
Undo that recent breakage while also removing the confusion about
ignoring errors. Update all exits paths to return an error per typical
expectations and let an outer wrapper function handle dropping the
error.
Fixes: 91019b5bc7c2 ("cxl/acpi: Return 'rc' instead of '0' in cxl_parse_cfmws()") [1]
Cc: <stable(a)vger.kernel.org>
Cc: Breno Leitao <leitao(a)debian.org>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index dcf2b39e1048..1a3e6aafbdcc 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -316,31 +316,27 @@ static const struct cxl_root_ops acpi_root_ops = {
.qos_class = cxl_acpi_qos_class,
};
-static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
- const unsigned long end)
+static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
+ struct cxl_cfmws_context *ctx)
{
int target_map[CXL_DECODER_MAX_INTERLEAVE];
- struct cxl_cfmws_context *ctx = arg;
struct cxl_port *root_port = ctx->root_port;
struct resource *cxl_res = ctx->cxl_res;
struct cxl_cxims_context cxims_ctx;
struct cxl_root_decoder *cxlrd;
struct device *dev = ctx->dev;
- struct acpi_cedt_cfmws *cfmws;
cxl_calc_hb_fn cxl_calc_hb;
struct cxl_decoder *cxld;
unsigned int ways, i, ig;
struct resource *res;
int rc;
- cfmws = (struct acpi_cedt_cfmws *) header;
-
rc = cxl_acpi_cfmws_verify(dev, cfmws);
if (rc) {
dev_err(dev, "CFMWS range %#llx-%#llx not registered\n",
cfmws->base_hpa,
cfmws->base_hpa + cfmws->window_size - 1);
- return 0;
+ return rc;
}
rc = eiw_to_ways(cfmws->interleave_ways, &ways);
@@ -376,7 +372,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb);
if (IS_ERR(cxlrd))
- return 0;
+ return PTR_ERR(cxlrd);
cxld = &cxlrd->cxlsd.cxld;
cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
@@ -420,16 +416,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
put_device(&cxld->dev);
else
rc = cxl_decoder_autoremove(dev, cxld);
- if (rc) {
- dev_err(dev, "Failed to add decode range: %pr", res);
- return rc;
- }
- dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n",
- dev_name(&cxld->dev),
- phys_to_target_node(cxld->hpa_range.start),
- cxld->hpa_range.start, cxld->hpa_range.end);
-
- return 0;
+ return rc;
err_insert:
kfree(res->name);
@@ -438,6 +425,29 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
return -ENOMEM;
}
+static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
+{
+ struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header;
+ struct cxl_cfmws_context *ctx = arg;
+ struct device *dev = ctx->dev;
+ int rc;
+
+ rc = __cxl_parse_cfmws(cfmws, ctx);
+ if (rc)
+ dev_err(dev,
+ "Failed to add decode range: [%#llx - %#llx] (%d)\n",
+ cfmws->base_hpa,
+ cfmws->base_hpa + cfmws->window_size - 1, rc);
+ else
+ dev_dbg(dev, "decode range: node: %d range [%#llx - %#llx]\n",
+ phys_to_target_node(cfmws->base_hpa), cfmws->base_hpa,
+ cfmws->base_hpa + cfmws->window_size - 1);
+
+ /* never fail cxl_acpi load for a single window failure */
+ return 0;
+}
+
__mock struct acpi_device *to_cxl_host_bridge(struct device *host,
struct device *dev)
{
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x bd915ae73a2d78559b376ad2caf5e4ef51de2455
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022642-retool-clover-ecd7@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
bd915ae73a2d ("drm/meson: Don't remove bridges which are created by other drivers")
42dcf15f901c ("drm/meson: add DSI encoder")
6a044642988b ("drm/meson: fix unbind path if HDMI fails to bind")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bd915ae73a2d78559b376ad2caf5e4ef51de2455 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Date: Thu, 15 Feb 2024 23:04:42 +0100
Subject: [PATCH] drm/meson: Don't remove bridges which are created by other
drivers
Stop calling drm_bridge_remove() for bridges allocated/managed by other
drivers in the remove paths of meson_encoder_{cvbs,dsi,hdmi}.
drm_bridge_remove() unregisters the bridge so it cannot be used
anymore. Doing so for bridges we don't own can lead to the video
pipeline not being able to come up after -EPROBE_DEFER of the VPU
because we're unregistering a bridge that's managed by another driver.
The other driver doesn't know that we have unregistered it's bridge
and on subsequent .probe() we're not able to find those bridges anymore
(since nobody re-creates them).
This fixes probe errors on Meson8b boards with the CVBS outputs enabled.
Fixes: 09847723c12f ("drm/meson: remove drm bridges at aggregate driver unbind time")
Fixes: 42dcf15f901c ("drm/meson: add DSI encoder")
Cc: <stable(a)vger.kernel.org>
Reported-by: Steve Morvai <stevemorvai(a)hotmail.com>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Reviewed-by: Neil Armstrong <neil.armstrong(a)linaro.org>
Tested-by: Steve Morvai <stevemorvai(a)hotmail.com>
Link: https://lore.kernel.org/r/20240215220442.1343152-1-martin.blumenstingl@goog…
Reviewed-by: Neil Armstrong <neil.armstrong(a)linaro.org>
Signed-off-by: Neil Armstrong <neil.armstrong(a)linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240215220442.1343152-1-mart…
diff --git a/drivers/gpu/drm/meson/meson_encoder_cvbs.c b/drivers/gpu/drm/meson/meson_encoder_cvbs.c
index 3f73b211fa8e..3407450435e2 100644
--- a/drivers/gpu/drm/meson/meson_encoder_cvbs.c
+++ b/drivers/gpu/drm/meson/meson_encoder_cvbs.c
@@ -294,6 +294,5 @@ void meson_encoder_cvbs_remove(struct meson_drm *priv)
if (priv->encoders[MESON_ENC_CVBS]) {
meson_encoder_cvbs = priv->encoders[MESON_ENC_CVBS];
drm_bridge_remove(&meson_encoder_cvbs->bridge);
- drm_bridge_remove(meson_encoder_cvbs->next_bridge);
}
}
diff --git a/drivers/gpu/drm/meson/meson_encoder_dsi.c b/drivers/gpu/drm/meson/meson_encoder_dsi.c
index 3f93c70488ca..311b91630fbe 100644
--- a/drivers/gpu/drm/meson/meson_encoder_dsi.c
+++ b/drivers/gpu/drm/meson/meson_encoder_dsi.c
@@ -168,6 +168,5 @@ void meson_encoder_dsi_remove(struct meson_drm *priv)
if (priv->encoders[MESON_ENC_DSI]) {
meson_encoder_dsi = priv->encoders[MESON_ENC_DSI];
drm_bridge_remove(&meson_encoder_dsi->bridge);
- drm_bridge_remove(meson_encoder_dsi->next_bridge);
}
}
diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c
index 25ea76558690..c4686568c9ca 100644
--- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c
@@ -474,6 +474,5 @@ void meson_encoder_hdmi_remove(struct meson_drm *priv)
if (priv->encoders[MESON_ENC_HDMI]) {
meson_encoder_hdmi = priv->encoders[MESON_ENC_HDMI];
drm_bridge_remove(&meson_encoder_hdmi->bridge);
- drm_bridge_remove(meson_encoder_hdmi->next_bridge);
}
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b0ad381fa7690244802aed119b478b4bdafc31dd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022655-fled-exes-9ef4@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b0ad381fa769 ("btrfs: fix deadlock with fiemap and extent locking")
519b7e13b5ae ("btrfs: lock the inode in shared mode before starting fiemap")
b3e744fe6d28 ("btrfs: use cached state when looking for delalloc ranges with fiemap")
8c6e53a79d16 ("btrfs: allow passing a cached state record to count_range_bits()")
8ddc8274e4be ("btrfs: search for delalloc more efficiently during lseek/fiemap")
af979fd618a4 ("btrfs: skip unnecessary delalloc searches during lseek/fiemap")
40daf3e095db ("btrfs: add an early exit when searching for delalloc range for lseek/fiemap")
af142b6f44d3 ("btrfs: move file prototypes to file.h")
7572dec8f522 ("btrfs: move ioctl prototypes into ioctl.h")
c7a03b524d30 ("btrfs: move uuid tree prototypes to uuid-tree.h")
7c8ede162805 ("btrfs: move file-item prototypes into their own header")
f2b39277b87d ("btrfs: move dir-item prototypes into dir-item.h")
59b818e064ab ("btrfs: move defrag related prototypes to their own header")
a6a01ca61f49 ("btrfs: move the file defrag code into defrag.c")
6e3df18ba7e8 ("btrfs: move the auto defrag code to defrag.c")
2885fd632050 ("btrfs: move inode prototypes to btrfs_inode.h")
911bd75aca73 ("btrfs: remove unused function prototypes")
45c40c8f9541 ("btrfs: move root tree prototypes to their own header")
2839c2c142dd ("btrfs: move delalloc space related prototypes to delalloc-space.h")
a0231804affe ("btrfs: move extent-tree helpers into their own header file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b0ad381fa7690244802aed119b478b4bdafc31dd Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Mon, 12 Feb 2024 11:56:02 -0500
Subject: [PATCH] btrfs: fix deadlock with fiemap and extent locking
While working on the patchset to remove extent locking I got a lockdep
splat with fiemap and pagefaulting with my new extent lock replacement
lock.
This deadlock exists with our normal code, we just don't have lockdep
annotations with the extent locking so we've never noticed it.
Since we're copying the fiemap extent to user space on every iteration
we have the chance of pagefaulting. Because we hold the extent lock for
the entire range we could mkwrite into a range in the file that we have
mmap'ed. This would deadlock with the following stack trace
[<0>] lock_extent+0x28d/0x2f0
[<0>] btrfs_page_mkwrite+0x273/0x8a0
[<0>] do_page_mkwrite+0x50/0xb0
[<0>] do_fault+0xc1/0x7b0
[<0>] __handle_mm_fault+0x2fa/0x460
[<0>] handle_mm_fault+0xa4/0x330
[<0>] do_user_addr_fault+0x1f4/0x800
[<0>] exc_page_fault+0x7c/0x1e0
[<0>] asm_exc_page_fault+0x26/0x30
[<0>] rep_movs_alternative+0x33/0x70
[<0>] _copy_to_user+0x49/0x70
[<0>] fiemap_fill_next_extent+0xc8/0x120
[<0>] emit_fiemap_extent+0x4d/0xa0
[<0>] extent_fiemap+0x7f8/0xad0
[<0>] btrfs_fiemap+0x49/0x80
[<0>] __x64_sys_ioctl+0x3e1/0xb50
[<0>] do_syscall_64+0x94/0x1a0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
I wrote an fstest to reproduce this deadlock without my replacement lock
and verified that the deadlock exists with our existing locking.
To fix this simply don't take the extent lock for the entire duration of
the fiemap. This is safe in general because we keep track of where we
are when we're searching the tree, so if an ordered extent updates in
the middle of our fiemap call we'll still emit the correct extents
because we know what offset we were on before.
The only place we maintain the lock is searching delalloc. Since the
delalloc stuff can change during writeback we want to lock the extent
range so we have a consistent view of delalloc at the time we're
checking to see if we need to set the delalloc flag.
With this patch applied we no longer deadlock with my testcase.
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a0ffd41c5cc1..61d961a30dee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2689,16 +2689,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2866,15 +2884,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2885,12 +2903,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -2898,7 +2915,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2910,7 +2927,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2933,19 +2950,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -3049,29 +3066,41 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
goto out_unlock;
- prev_extent_end = lockend;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3082,7 +3111,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
ret = emit_last_fiemap_cache(fieinfo, &cache);
out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x e42b9d8b9ea2672811285e6a7654887ff64d23f3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024022640-deepness-manatee-1a30@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
e42b9d8b9ea2 ("btrfs: defrag: avoid unnecessary defrag caused by incorrect extent size")
a6a01ca61f49 ("btrfs: move the file defrag code into defrag.c")
6e3df18ba7e8 ("btrfs: move the auto defrag code to defrag.c")
07e81dc94474 ("btrfs: move accessor helpers into accessors.h")
ad1ac5012c2b ("btrfs: move btrfs_map_token to accessors")
55e5cfd36da5 ("btrfs: remove fs_info::pending_changes and related code")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
7a66eda351ba ("btrfs: move the btrfs_verity_descriptor_item defs up in ctree.h")
956504a331a6 ("btrfs: move trans_handle_cachep out of ctree.h")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
ed4c491a3db2 ("btrfs: move BTRFS_MAX_MIRRORS into scrub.c")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e42b9d8b9ea2672811285e6a7654887ff64d23f3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Wed, 7 Feb 2024 10:00:42 +1030
Subject: [PATCH] btrfs: defrag: avoid unnecessary defrag caused by incorrect
extent size
[BUG]
With the following file extent layout, defrag would do unnecessary IO
and result more on-disk space usage.
# mkfs.btrfs -f $dev
# mount $dev $mnt
# xfs_io -f -c "pwrite 0 40m" $mnt/foobar
# sync
# xfs_io -f -c "pwrite 40m 16k" $mnt/foobar
# sync
Above command would lead to the following file extent layout:
item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53
generation 7 type 1 (regular)
extent data disk byte 298844160 nr 41943040
extent data offset 0 nr 41943040 ram 41943040
extent compression 0 (none)
item 7 key (257 EXTENT_DATA 41943040) itemoff 15763 itemsize 53
generation 8 type 1 (regular)
extent data disk byte 13631488 nr 16384
extent data offset 0 nr 16384 ram 16384
extent compression 0 (none)
Which is mostly fine. We can allow the final 16K to be merged with the
previous 40M, but it's upon the end users' preference.
But if we defrag the file using the default parameters, it would result
worse file layout:
# btrfs filesystem defrag $mnt/foobar
# sync
item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53
generation 7 type 1 (regular)
extent data disk byte 298844160 nr 41943040
extent data offset 0 nr 8650752 ram 41943040
extent compression 0 (none)
item 7 key (257 EXTENT_DATA 8650752) itemoff 15763 itemsize 53
generation 9 type 1 (regular)
extent data disk byte 340787200 nr 33292288
extent data offset 0 nr 33292288 ram 33292288
extent compression 0 (none)
item 8 key (257 EXTENT_DATA 41943040) itemoff 15710 itemsize 53
generation 8 type 1 (regular)
extent data disk byte 13631488 nr 16384
extent data offset 0 nr 16384 ram 16384
extent compression 0 (none)
Note the original 40M extent is still there, but a new 32M extent is
created for no benefit at all.
[CAUSE]
There is an existing check to make sure we won't defrag a large enough
extent (the threshold is by default 32M).
But the check is using the length to the end of the extent:
range_len = em->len - (cur - em->start);
/* Skip too large extent */
if (range_len >= extent_thresh)
goto next;
This means, for the first 8MiB of the extent, the range_len is always
smaller than the default threshold, and would not be defragged.
But after the first 8MiB, the remaining part would fit the requirement,
and be defragged.
Such different behavior inside the same extent caused the above problem,
and we should avoid different defrag decision inside the same extent.
[FIX]
Instead of using @range_len, just use @em->len, so that we have a
consistent decision among the same file extent.
Now with this fix, we won't touch the extent, thus not making it any
worse.
Reported-by: Filipe Manana <fdmanana(a)suse.com>
Fixes: 0cb5950f3f3b ("btrfs: fix deadlock when reserving space during defrag")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Boris Burkov <boris(a)bur.io>
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index c276b136ab63..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
From: Rui Qi <qirui.001(a)bytedance.com>
Since kernel version 5.4.250 LTS, there has been an issue with the kernel live patching feature becoming unavailable. When compiling the sample code for kernel live patching, the following message is displayed when enabled:
livepatch: klp_check_stack: kworker/u256:6:23490 has an unreliable stack
After investigation, it was found that this is due to objtool not supporting intra-function calls, resulting in incorrect orc entry generation.
This patchset adds support for intra-function calls, allowing the kernel live patching feature to work correctly.
Alexandre Chartre (2):
objtool: is_fentry_call() crashes if call has no destination
objtool: Add support for intra-function calls
Rui Qi (1):
x86/speculation: Support intra-function call validation
arch/x86/include/asm/nospec-branch.h | 7 ++
include/linux/frame.h | 11 ++++
.../Documentation/stack-validation.txt | 8 +++
tools/objtool/arch/x86/decode.c | 6 ++
tools/objtool/check.c | 64 +++++++++++++++++--
5 files changed, 91 insertions(+), 5 deletions(-)
--
2.39.2 (Apple Git-143)
commit 912680064f94 ("media: atomisp: make sh_css similar to Intel Aero driver")
removes the affected code, but in versions
tags/v5.8-rc1~10^2~220 - tags/v5.17-rc1~114^2~261
there is no check for the return value of the
ia_css_pipeline_create_and_add_stage() function.
ia_css_pipeline_create_and_add_stage() may return an
error code, so check and return it on error.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: 7796e455170e ("media: staging: media: atomisp: Fix alignment and line length issues")
Signed-off-by: Alexandra Diupina <adiupina(a)astralinux.ru>
---
drivers/staging/media/atomisp/pci/sh_css.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/staging/media/atomisp/pci/sh_css.c b/drivers/staging/media/atomisp/pci/sh_css.c
index ba25d0da8b81..8502adb75a5a 100644
--- a/drivers/staging/media/atomisp/pci/sh_css.c
+++ b/drivers/staging/media/atomisp/pci/sh_css.c
@@ -7912,6 +7912,10 @@ create_host_regular_capture_pipeline(struct ia_css_pipe *pipe)
out_frames, in_frame, NULL);
err = ia_css_pipeline_create_and_add_stage(me, &stage_desc,
NULL);
+ if (err) {
+ IA_CSS_LEAVE_ERR_PRIVATE(err);
+ return err;
+ }
} else if (need_pp && current_stage) {
in_frame = current_stage->args.out_frame[0];
err = add_capture_pp_stage(pipe, me, in_frame,
--
2.30.2
From: Rui Qi <qirui.001(a)bytedance.com>
Since kernel version 5.4.250 LTS, there has been an issue with the kernel live patching feature becoming unavailable. When compiling the sample code for kernel live patching, the following message is displayed when enabled:
livepatch: klp_check_stack: kworker/u256:6:23490 has an unreliable stack
After investigation, it was found that this is due to objtool not supporting intra-function calls, resulting in incorrect orc entry generation.
This patchset adds support for intra-function calls, allowing the kernel live patching feature to work correctly.
Alexandre Chartre (2):
objtool: is_fentry_call() crashes if call has no destination
objtool: Add support for intra-function calls
Rui Qi (1):
x86/speculation: Support intra-function call validation
arch/x86/include/asm/nospec-branch.h | 7 ++
include/linux/frame.h | 11 ++++
.../Documentation/stack-validation.txt | 8 +++
tools/objtool/arch/x86/decode.c | 6 ++
tools/objtool/check.c | 64 +++++++++++++++++--
5 files changed, 91 insertions(+), 5 deletions(-)
--
2.39.2 (Apple Git-143)
From: Ondrej Jirman <megi(a)xff.cz>
The reverted commit makes the state machine only ever go from SRC_ATTACH_WAIT
to SNK_TRY in endless loop when toggling. After revert it goes to SRC_ATTACHED
after initially trying SNK_TRY earlier, as it should for toggling to ever detect
the power source mode and the port is again able to provide power to attached
power sinks.
This reverts commit 2d6d80127006ae3da26b1f21a65eccf957f2d1e5.
Cc: stable(a)vger.kernel.org
Fixes: 2d6d80127006 ("usb: typec: tcpm: reset counter when enter into unattached state after try role")
Signed-of-by: Ondrej Jirman <megi(a)xff.cz>
---
drivers/usb/typec/tcpm/tcpm.c | 3 ---
1 file changed, 3 deletions(-)
See https://lore.kernel.org/all/odggrbbgjpardze76qiv57mw6tllisyu5sbrta37iadjzwa…
for more.
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index f7d7daa60c8d..295ae7eb912c 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -3743,9 +3743,6 @@ static void tcpm_detach(struct tcpm_port *port)
if (tcpm_port_is_disconnected(port))
port->hard_reset_count = 0;
- port->try_src_count = 0;
- port->try_snk_count = 0;
-
if (!port->attached)
return;
--
2.43.0
From: Roman Gushchin <guro(a)fb.com>
commit e1a366be5cb4f849ec4de170d50eebc08bb0af20 upstream.
Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge")
introduced css_tryget()/css_put() calls in drain_all_stock(), which are
supposed to protect the target memory cgroup from being released during
the mem_cgroup_is_descendant() call.
However, it's not completely safe. In theory, memcg can go away between
reading stock->cached pointer and calling css_tryget().
This can happen if drain_all_stock() races with drain_local_stock()
performed on the remote cpu as a result of a work, scheduled by the
previous invocation of drain_all_stock().
The race is a bit theoretical and there are few chances to trigger it, but
the current code looks a bit confusing, so it makes sense to fix it
anyway. The code looks like as if css_tryget() and css_put() are used to
protect stocks drainage. It's not necessary because stocked pages are
holding references to the cached cgroup. And it obviously won't work for
works, scheduled on other cpus.
So, let's read the stock->cached pointer and evaluate the memory cgroup
inside a rcu read section, and get rid of css_tryget()/css_put() calls.
Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org # 4.19
Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
---
This patch [1] fixed a UAF problem in drain_all_stock() existed prior to
5.9, and following discussions [2] mentioned that the fix depends on an
RCU read protection to stock->cached (introduced in 5.4), which doesn't
existed in 4.19. So backport this part to 4.19 as well.
[1]: https://lore.kernel.org/all/20240221081801.69764-1-gongruiqi1@huawei.com/
[2]: https://lore.kernel.org/all/ZdXLgjpUfpwEwAe0@tiehlicka/
mm/memcontrol.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c04296df1c7..d187bfb43b1f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2094,21 +2094,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
--
2.25.1