The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x e32b120071ea114efc0b4ddd439547750b85f618
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118948184196(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
e32b120071ea ("KVM: VMX: Do _all_ initialization before exposing /dev/kvm to userspace")
4f8396b96a9f ("KVM: x86: Move guts of kvm_arch_init() to standalone helper")
da66de44b01e ("KVM: VMX: Don't bother disabling eVMCS static key on module exit")
2916b70fc342 ("KVM: VMX: Reset eVMCS controls in VP assist page during hardware disabling")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e32b120071ea114efc0b4ddd439547750b85f618 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:58 +0000
Subject: [PATCH] KVM: VMX: Do _all_ initialization before exposing /dev/kvm to
userspace
Call kvm_init() only after _all_ setup is complete, as kvm_init() exposes
/dev/kvm to userspace and thus allows userspace to create VMs (and call
other ioctls). E.g. KVM will encounter a NULL pointer when attempting to
add a vCPU to the per-CPU loaded_vmcss_on_cpu list if userspace is able to
create a VM before vmx_init() configures said list.
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] SMP
CPU: 6 PID: 1143 Comm: stable Not tainted 6.0.0-rc7+ #988
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:vmx_vcpu_load_vmcs+0x68/0x230 [kvm_intel]
<TASK>
vmx_vcpu_load+0x16/0x60 [kvm_intel]
kvm_arch_vcpu_load+0x32/0x1f0 [kvm]
vcpu_load+0x2f/0x40 [kvm]
kvm_arch_vcpu_create+0x231/0x310 [kvm]
kvm_vm_ioctl+0x79f/0xe10 [kvm]
? handle_mm_fault+0xb1/0x220
__x64_sys_ioctl+0x80/0xb0
do_syscall_64+0x2b/0x50
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f5a6b05743b
</TASK>
Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel(+) kvm irqbypass
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-15-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1e2eab6bda16..e0e3f2c1f681 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8564,19 +8564,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
kvm_x86_vendor_exit();
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8594,11 +8598,6 @@ static int __init vmx_init(void)
if (r)
return r;
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
/*
* Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
@@ -8632,11 +8631,20 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
-err_l1d_flush:
- vmx_exit();
err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
The patch below does not apply to the 6.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.2.y
git checkout FETCH_HEAD
git cherry-pick -x e32b120071ea114efc0b4ddd439547750b85f618
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811894610616(a)kroah.com' --subject-prefix 'PATCH 6.2.y' HEAD^..
Possible dependencies:
e32b120071ea ("KVM: VMX: Do _all_ initialization before exposing /dev/kvm to userspace")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e32b120071ea114efc0b4ddd439547750b85f618 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:58 +0000
Subject: [PATCH] KVM: VMX: Do _all_ initialization before exposing /dev/kvm to
userspace
Call kvm_init() only after _all_ setup is complete, as kvm_init() exposes
/dev/kvm to userspace and thus allows userspace to create VMs (and call
other ioctls). E.g. KVM will encounter a NULL pointer when attempting to
add a vCPU to the per-CPU loaded_vmcss_on_cpu list if userspace is able to
create a VM before vmx_init() configures said list.
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] SMP
CPU: 6 PID: 1143 Comm: stable Not tainted 6.0.0-rc7+ #988
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:vmx_vcpu_load_vmcs+0x68/0x230 [kvm_intel]
<TASK>
vmx_vcpu_load+0x16/0x60 [kvm_intel]
kvm_arch_vcpu_load+0x32/0x1f0 [kvm]
vcpu_load+0x2f/0x40 [kvm]
kvm_arch_vcpu_create+0x231/0x310 [kvm]
kvm_vm_ioctl+0x79f/0xe10 [kvm]
? handle_mm_fault+0xb1/0x220
__x64_sys_ioctl+0x80/0xb0
do_syscall_64+0x2b/0x50
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f5a6b05743b
</TASK>
Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel(+) kvm irqbypass
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-15-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1e2eab6bda16..e0e3f2c1f681 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8564,19 +8564,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
kvm_x86_vendor_exit();
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8594,11 +8598,6 @@ static int __init vmx_init(void)
if (r)
return r;
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
/*
* Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
@@ -8632,11 +8631,20 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
-err_l1d_flush:
- vmx_exit();
err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781189338715(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
dfcd66604c1c ("mm/mmu_notifier: convert user range->blockable to helper function")
a3e0d41c2b1f ("mm/hmm: improve driver API to work and wait over a range")
73231612dc7c ("mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()")
25f23a0c7127 ("mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()")
9f454612f602 ("mm/hmm: do not erase snapshot when a range is invalidated")
704f3f2cf63c ("mm/hmm: use reference counting for HMM struct")
484d9a844d0d ("drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start")
ac46d4f3c432 ("mm/mmu_notifier: use structure for invalidate_range_start/end calls v2")
5d6527a784f7 ("mm/mmu_notifier: use structure for invalidate_range_start/end callback")
ec131b2d7fa6 ("mm/hmm: invalidate device page table at start of invalidation")
44532d4c591c ("mm/hmm: use a structure for update callback parameters")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811893352(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
dfcd66604c1c ("mm/mmu_notifier: convert user range->blockable to helper function")
a3e0d41c2b1f ("mm/hmm: improve driver API to work and wait over a range")
73231612dc7c ("mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()")
25f23a0c7127 ("mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()")
9f454612f602 ("mm/hmm: do not erase snapshot when a range is invalidated")
704f3f2cf63c ("mm/hmm: use reference counting for HMM struct")
484d9a844d0d ("drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start")
ac46d4f3c432 ("mm/mmu_notifier: use structure for invalidate_range_start/end calls v2")
5d6527a784f7 ("mm/mmu_notifier: use structure for invalidate_range_start/end callback")
ec131b2d7fa6 ("mm/hmm: invalidate device page table at start of invalidation")
44532d4c591c ("mm/hmm: use a structure for update callback parameters")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118932196(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811893117738(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118930227127(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 93827a0a36396f2fd6368a54a020f420c8916e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118898253227(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
93827a0a3639 ("KVM: VMX: Fix crash due to uninitialized current_vmcs")
3cd7cd8a62e6 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001
From: Alexandru Matei <alexandru.matei(a)uipath.com>
Date: Tue, 24 Jan 2023 00:12:08 +0200
Subject: [PATCH] KVM: VMX: Fix crash due to uninitialized current_vmcs
KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
that the msr bitmap was changed.
vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
-> vmx_msr_bitmap_l01_changed which in the end calls this function. The
function checks for current_vmcs if it is null but the check is
insufficient because current_vmcs is not initialized. Because of this, the
code might incorrectly write to the structure pointed by current_vmcs value
left by another task. Preemption is not disabled, the current task can be
preempted and moved to another CPU while current_vmcs is accessed multiple
times from evmcs_touch_msr_bitmap() which leads to crash.
The manipulation of MSR bitmaps by callers happens only for vmcs01 so the
solution is to use vmx->vmcs01.vmcs instead of current_vmcs.
BUG: kernel NULL pointer dereference, address: 0000000000000338
PGD 4e1775067 P4D 0
Oops: 0002 [#1] PREEMPT SMP NOPTI
...
RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
...
Call Trace:
vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
vmx_vcpu_create+0xe6/0x540 [kvm_intel]
kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
kvm_vm_ioctl+0x53f/0x790 [kvm]
__x64_sys_ioctl+0x8a/0xc0
do_syscall_64+0x5c/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support")
Cc: stable(a)vger.kernel.org
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Alexandru Matei <alexandru.matei(a)uipath.com>
Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index caf658726169..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8a9911ae1240..33614ee2cd67 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 93827a0a36396f2fd6368a54a020f420c8916e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781188963067(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
93827a0a3639 ("KVM: VMX: Fix crash due to uninitialized current_vmcs")
3cd7cd8a62e6 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001
From: Alexandru Matei <alexandru.matei(a)uipath.com>
Date: Tue, 24 Jan 2023 00:12:08 +0200
Subject: [PATCH] KVM: VMX: Fix crash due to uninitialized current_vmcs
KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
a nested hypervisor on top of Hyper-V. When MSR bitmap is updated,
evmcs_touch_msr_bitmap function uses current_vmcs per-cpu variable to mark
that the msr bitmap was changed.
vmx_vcpu_create() modifies the msr bitmap via vmx_disable_intercept_for_msr
-> vmx_msr_bitmap_l01_changed which in the end calls this function. The
function checks for current_vmcs if it is null but the check is
insufficient because current_vmcs is not initialized. Because of this, the
code might incorrectly write to the structure pointed by current_vmcs value
left by another task. Preemption is not disabled, the current task can be
preempted and moved to another CPU while current_vmcs is accessed multiple
times from evmcs_touch_msr_bitmap() which leads to crash.
The manipulation of MSR bitmaps by callers happens only for vmcs01 so the
solution is to use vmx->vmcs01.vmcs instead of current_vmcs.
BUG: kernel NULL pointer dereference, address: 0000000000000338
PGD 4e1775067 P4D 0
Oops: 0002 [#1] PREEMPT SMP NOPTI
...
RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
...
Call Trace:
vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
vmx_vcpu_create+0xe6/0x540 [kvm_intel]
kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
kvm_vm_ioctl+0x53f/0x790 [kvm]
__x64_sys_ioctl+0x8a/0xc0
do_syscall_64+0x5c/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support")
Cc: stable(a)vger.kernel.org
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Alexandru Matei <alexandru.matei(a)uipath.com>
Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index caf658726169..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8a9911ae1240..33614ee2cd67 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f2d3155e2a6bac44d16f04415a321e8707d895c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781000032425(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f2d3155e2a6b ("KVM: s390: disable migration mode when dirty tracking is disabled")
ec5c86976674 ("KVM: s390: Skip gfn/size sanity checks on memslot DELETE or FLAGS_ONLY")
cf5b486922dc ("KVM: s390: Use "new" memslot instead of userspace memory region")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2d3155e2a6bac44d16f04415a321e8707d895c6 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Fri, 27 Jan 2023 15:05:32 +0100
Subject: [PATCH] KVM: s390: disable migration mode when dirty tracking is
disabled
Migration mode is a VM attribute which enables tracking of changes in
storage attributes (PGSTE). It assumes dirty tracking is enabled on all
memslots to keep a dirty bitmap of pages with changed storage attributes.
When enabling migration mode, we currently check that dirty tracking is
enabled for all memslots. However, userspace can disable dirty tracking
without disabling migration mode.
Since migration mode is pointless with dirty tracking disabled, disable
migration mode whenever userspace disables dirty tracking on any slot.
Also update the documentation to clarify that dirty tracking must be
enabled when enabling migration mode, which is already enforced by the
code in kvm_s390_vm_start_migration().
Also highlight in the documentation for KVM_S390_GET_CMMA_BITS that it
can now fail with -EINVAL when dirty tracking is disabled while
migration mode is on. Move all the error codes to a table so this stays
readable.
To disable migration mode, slots_lock should be held, which is taken
in kvm_set_memory_region() and thus held in
kvm_arch_prepare_memory_region().
Restructure the prepare code a bit so all the sanity checking is done
before disabling migration mode. This ensures migration mode isn't
disabled when some sanity check fails.
Cc: stable(a)vger.kernel.org
Fixes: 190df4a212a7 ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230127140532.230651-2-nrb@linux.ibm.com
Message-Id: <20230127140532.230651-2-nrb(a)linux.ibm.com>
[frankja(a)linux.ibm.com: fixed commit message typo, moved api.rst error table upwards]
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index deb494f759ed..8cd7fd05d53b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4449,6 +4449,18 @@ not holding a previously reported uncorrected error).
:Parameters: struct kvm_s390_cmma_log (in, out)
:Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
@@ -4529,12 +4541,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
----------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 60acc39e0e93..147efec626e5 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -302,6 +302,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
:Parameters: none
:Returns: -ENOMEM if there is not enough free memory to start migration mode;
-EINVAL if the state of the VM is invalid (e.g. no memory defined);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..cb72f9a09fb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5633,23 +5633,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f2d3155e2a6bac44d16f04415a321e8707d895c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781000053227(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f2d3155e2a6b ("KVM: s390: disable migration mode when dirty tracking is disabled")
ec5c86976674 ("KVM: s390: Skip gfn/size sanity checks on memslot DELETE or FLAGS_ONLY")
cf5b486922dc ("KVM: s390: Use "new" memslot instead of userspace memory region")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2d3155e2a6bac44d16f04415a321e8707d895c6 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Fri, 27 Jan 2023 15:05:32 +0100
Subject: [PATCH] KVM: s390: disable migration mode when dirty tracking is
disabled
Migration mode is a VM attribute which enables tracking of changes in
storage attributes (PGSTE). It assumes dirty tracking is enabled on all
memslots to keep a dirty bitmap of pages with changed storage attributes.
When enabling migration mode, we currently check that dirty tracking is
enabled for all memslots. However, userspace can disable dirty tracking
without disabling migration mode.
Since migration mode is pointless with dirty tracking disabled, disable
migration mode whenever userspace disables dirty tracking on any slot.
Also update the documentation to clarify that dirty tracking must be
enabled when enabling migration mode, which is already enforced by the
code in kvm_s390_vm_start_migration().
Also highlight in the documentation for KVM_S390_GET_CMMA_BITS that it
can now fail with -EINVAL when dirty tracking is disabled while
migration mode is on. Move all the error codes to a table so this stays
readable.
To disable migration mode, slots_lock should be held, which is taken
in kvm_set_memory_region() and thus held in
kvm_arch_prepare_memory_region().
Restructure the prepare code a bit so all the sanity checking is done
before disabling migration mode. This ensures migration mode isn't
disabled when some sanity check fails.
Cc: stable(a)vger.kernel.org
Fixes: 190df4a212a7 ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230127140532.230651-2-nrb@linux.ibm.com
Message-Id: <20230127140532.230651-2-nrb(a)linux.ibm.com>
[frankja(a)linux.ibm.com: fixed commit message typo, moved api.rst error table upwards]
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index deb494f759ed..8cd7fd05d53b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4449,6 +4449,18 @@ not holding a previously reported uncorrected error).
:Parameters: struct kvm_s390_cmma_log (in, out)
:Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
@@ -4529,12 +4541,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
----------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 60acc39e0e93..147efec626e5 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -302,6 +302,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
:Parameters: none
:Returns: -ENOMEM if there is not enough free memory to start migration mode;
-EINVAL if the state of the VM is invalid (e.g. no memory defined);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..cb72f9a09fb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5633,23 +5633,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
Since commit d59f6617eef0 ("genirq: Allow fwnode to carry name
information only") an IRQ domain is always given a name during
allocation (e.g. used for the debugfs entry).
Drop the no longer valid name assignment, which would lead to an attempt
to free a string constant when removing the domain on late probe
failures (e.g. probe deferral).
Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only")
Cc: stable(a)vger.kernel.org # 4.13
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/pinctrl/pinctrl-at91-pio4.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c
index 39b233f73e13..d699588677a5 100644
--- a/drivers/pinctrl/pinctrl-at91-pio4.c
+++ b/drivers/pinctrl/pinctrl-at91-pio4.c
@@ -1206,7 +1206,6 @@ static int atmel_pinctrl_probe(struct platform_device *pdev)
dev_err(dev, "can't add the irq domain\n");
return -ENODEV;
}
- atmel_pioctrl->irq_domain->name = "atmel gpio";
for (i = 0; i < atmel_pioctrl->npins; i++) {
int irq = irq_create_mapping(atmel_pioctrl->irq_domain, i);
--
2.39.2
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 85a37983ec69cc9fcd188bc37c4de15ee326355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781103179272(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
85a37983ec69 ("udf: Detect system inodes linked into directory hierarchy")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85a37983ec69cc9fcd188bc37c4de15ee326355a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 3 Jan 2023 10:03:35 +0100
Subject: [PATCH] udf: Detect system inodes linked into directory hierarchy
When UDF filesystem is corrupted, hidden system inodes can be linked
into directory hierarchy which is an avenue for further serious
corruption of the filesystem and kernel confusion as noticed by syzbot
fuzzed images. Refuse to access system inodes linked into directory
hierarchy and vice versa.
CC: stable(a)vger.kernel.org
Reported-by: syzbot+38695a20b8addcbc1084(a)syzkaller.appspotmail.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 9ee269d3d546..96873fa2f683 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1813,8 +1813,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->i_state & I_NEW)) {
+ if (UDF_I(inode)->i_hidden != hidden_inode) {
+ iput(inode);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
return inode;
+ }
memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
err = udf_read_inode(inode, hidden_inode);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 85a37983ec69cc9fcd188bc37c4de15ee326355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678110262133121(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
85a37983ec69 ("udf: Detect system inodes linked into directory hierarchy")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85a37983ec69cc9fcd188bc37c4de15ee326355a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 3 Jan 2023 10:03:35 +0100
Subject: [PATCH] udf: Detect system inodes linked into directory hierarchy
When UDF filesystem is corrupted, hidden system inodes can be linked
into directory hierarchy which is an avenue for further serious
corruption of the filesystem and kernel confusion as noticed by syzbot
fuzzed images. Refuse to access system inodes linked into directory
hierarchy and vice versa.
CC: stable(a)vger.kernel.org
Reported-by: syzbot+38695a20b8addcbc1084(a)syzkaller.appspotmail.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 9ee269d3d546..96873fa2f683 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1813,8 +1813,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->i_state & I_NEW)) {
+ if (UDF_I(inode)->i_hidden != hidden_inode) {
+ iput(inode);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
return inode;
+ }
memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
err = udf_read_inode(inode, hidden_inode);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811000815093(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc")
63c9e47a1642 ("udf: fix silent AED tagLocation corruption")
382a2287bf9c ("udf: Remove pointless union in udf_inode_info")
044e2e26f214 ("udf: Avoid accessing uninitialized data on failed inode read")
c3367a1b47d5 ("udf: augment UDF permissions on new inodes")
ab9a3a737284 ("udf: reduce leakage of blocks related to named streams")
fa33cdbf3ece ("udf: Fix incorrect final NOT_ALLOCATED (hole) extent length")
c3b9cecd89b8 ("udf: convert inode stamps to timespec64")
f2e83347119a ("udf: Provide function for calculating dir entry length")
95582b008388 ("vfs: change inode times to use struct timespec64")
0220eddac66d ("udf: Simplify calls to udf_disk_stamp_to_time")
0a2dfbecb361 ("fs: nfs: get rid of memcpys for inode times")
13442b036a13 ("ceph: make inode time prints to be long long")
8efd6894ff08 ("fs: add timespec64_truncate()")
ffdeec7aa41a ("ceph: always update atime/mtime/ctime for new inode")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678110007243154(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc")
63c9e47a1642 ("udf: fix silent AED tagLocation corruption")
382a2287bf9c ("udf: Remove pointless union in udf_inode_info")
044e2e26f214 ("udf: Avoid accessing uninitialized data on failed inode read")
c3367a1b47d5 ("udf: augment UDF permissions on new inodes")
ab9a3a737284 ("udf: reduce leakage of blocks related to named streams")
fa33cdbf3ece ("udf: Fix incorrect final NOT_ALLOCATED (hole) extent length")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678110007191167(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc")
63c9e47a1642 ("udf: fix silent AED tagLocation corruption")
382a2287bf9c ("udf: Remove pointless union in udf_inode_info")
044e2e26f214 ("udf: Avoid accessing uninitialized data on failed inode read")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811000666204(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811000613644(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc")
63c9e47a1642 ("udf: fix silent AED tagLocation corruption")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678110005174144(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
16d055656814 ("udf: Discard preallocation before extending file with a hole")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 6.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.2.y
git checkout FETCH_HEAD
git cherry-pick -x f54aa97fb7e5329a373f9df4e5e213ced4fc8759
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811000425499(a)kroah.com' --subject-prefix 'PATCH 6.2.y' HEAD^..
Possible dependencies:
f54aa97fb7e5 ("udf: Fix off-by-one error when discarding preallocation")
f3a30be77750 ("udf: Factor out block mapping into udf_map_block()")
a27b2923de7e ("udf: Move udf_expand_dir_adinicb() to its callsite")
57bda9fb169d ("udf: Convert udf_expand_dir_adinicb() to new directory iteration")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f54aa97fb7e5329a373f9df4e5e213ced4fc8759 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Mon, 23 Jan 2023 14:29:15 +0100
Subject: [PATCH] udf: Fix off-by-one error when discarding preallocation
The condition determining whether the preallocation can be used had
an off-by-one error so we didn't discard preallocation when new
allocation was just following it. This can then confuse code in
inode_getblk().
CC: stable(a)vger.kernel.org
Fixes: 16d055656814 ("udf: Discard preallocation before extending file with a hole")
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 51deada8b928..ee440d16411e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -361,7 +361,7 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
* Block beyond EOF and prealloc extents? Just discard preallocation
* as it is not useful and complicates things.
*/
- if (((loff_t)map->lblk) << inode->i_blkbits > iinfo->i_lenExtents)
+ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
err = inode_getblk(inode, map);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x fc8033a34a3ca7d23353e645e6dde5d364ac5f12
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810996089240(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
fc8033a34a3c ("udf: Preserve link count of system files")
382a2287bf9c ("udf: Remove pointless union in udf_inode_info")
044e2e26f214 ("udf: Avoid accessing uninitialized data on failed inode read")
ab9a3a737284 ("udf: reduce leakage of blocks related to named streams")
d288d95842f1 ("udf: Fix BUG on corrupted inode")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fc8033a34a3ca7d23353e645e6dde5d364ac5f12 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 3 Jan 2023 09:56:56 +0100
Subject: [PATCH] udf: Preserve link count of system files
System files in UDF filesystem have link count 0. To not confuse VFS we
fudge the link count to be 1 when reading such inodes however we forget
to restore the link count of 0 when writing such inodes. Fix that.
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 31965c3798f2..9ee269d3d546 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1301,6 +1301,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
ret = -EIO;
goto out;
}
+ iinfo->i_hidden = hidden_inode;
iinfo->i_unique = 0;
iinfo->i_lenEAttr = 0;
iinfo->i_lenExtents = 0;
@@ -1636,8 +1637,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
- else
- fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ else {
+ if (iinfo->i_hidden)
+ fe->fileLinkCount = cpu_to_le16(0);
+ else
+ fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ }
fe->informationLength = cpu_to_le64(inode->i_size);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 06eda8177b5f..241b40e886b3 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -147,6 +147,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
ei->i_next_alloc_goal = 0;
ei->i_strat4096 = 0;
ei->i_streamdir = 0;
+ ei->i_hidden = 0;
init_rwsem(&ei->i_data_sem);
ei->cached_extent.lstart = -1;
spin_lock_init(&ei->i_extent_cache_lock);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 06ff7006b822..312b7c9ef10e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -44,7 +44,8 @@ struct udf_inode_info {
unsigned i_use : 1; /* unallocSpaceEntry */
unsigned i_strat4096 : 1;
unsigned i_streamdir : 1;
- unsigned reserved : 25;
+ unsigned i_hidden : 1; /* hidden system inode */
+ unsigned reserved : 24;
__u8 *i_data;
struct kernel_lb_addr i_locStreamdir;
__u64 i_lenStreams;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x fc8033a34a3ca7d23353e645e6dde5d364ac5f12
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810995913998(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
fc8033a34a3c ("udf: Preserve link count of system files")
382a2287bf9c ("udf: Remove pointless union in udf_inode_info")
044e2e26f214 ("udf: Avoid accessing uninitialized data on failed inode read")
ab9a3a737284 ("udf: reduce leakage of blocks related to named streams")
d288d95842f1 ("udf: Fix BUG on corrupted inode")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fc8033a34a3ca7d23353e645e6dde5d364ac5f12 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 3 Jan 2023 09:56:56 +0100
Subject: [PATCH] udf: Preserve link count of system files
System files in UDF filesystem have link count 0. To not confuse VFS we
fudge the link count to be 1 when reading such inodes however we forget
to restore the link count of 0 when writing such inodes. Fix that.
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 31965c3798f2..9ee269d3d546 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1301,6 +1301,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
ret = -EIO;
goto out;
}
+ iinfo->i_hidden = hidden_inode;
iinfo->i_unique = 0;
iinfo->i_lenEAttr = 0;
iinfo->i_lenExtents = 0;
@@ -1636,8 +1637,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
- else
- fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ else {
+ if (iinfo->i_hidden)
+ fe->fileLinkCount = cpu_to_le16(0);
+ else
+ fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ }
fe->informationLength = cpu_to_le64(inode->i_size);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 06eda8177b5f..241b40e886b3 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -147,6 +147,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
ei->i_next_alloc_goal = 0;
ei->i_strat4096 = 0;
ei->i_streamdir = 0;
+ ei->i_hidden = 0;
init_rwsem(&ei->i_data_sem);
ei->cached_extent.lstart = -1;
spin_lock_init(&ei->i_extent_cache_lock);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 06ff7006b822..312b7c9ef10e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -44,7 +44,8 @@ struct udf_inode_info {
unsigned i_use : 1; /* unallocSpaceEntry */
unsigned i_strat4096 : 1;
unsigned i_streamdir : 1;
- unsigned reserved : 25;
+ unsigned i_hidden : 1; /* hidden system inode */
+ unsigned reserved : 24;
__u8 *i_data;
struct kernel_lb_addr i_locStreamdir;
__u64 i_lenStreams;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 844545c51a5b2a524b22a2fe9d0b353b827d24b4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678109703254217(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
844545c51a5b ("f2fs: fix cgroup writeback accounting with fs-layer encryption")
9637d517347e ("Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 844545c51a5b2a524b22a2fe9d0b353b827d24b4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Thu, 2 Feb 2023 17:02:39 -0800
Subject: [PATCH] f2fs: fix cgroup writeback accounting with fs-layer
encryption
When writing a page from an encrypted file that is using
filesystem-layer encryption (not inline encryption), f2fs encrypts the
pagecache page into a bounce page, then writes the bounce page.
It also passes the bounce page to wbc_account_cgroup_owner(). That's
incorrect, because the bounce page is a newly allocated temporary page
that doesn't have the memory cgroup of the original pagecache page.
This makes wbc_account_cgroup_owner() not account the I/O to the owner
of the pagecache page as it should.
Fix this by always passing the pagecache page to
wbc_account_cgroup_owner().
Fixes: 578c647879f7 ("f2fs: implement cgroup writeback support")
Cc: stable(a)vger.kernel.org
Reported-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Reviewed-by: Chao Yu <chao(a)kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 754841bce389..8a636500db0e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -739,7 +739,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page));
@@ -949,7 +949,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page));
@@ -1023,7 +1023,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 164272113b685927126c938b4a9cbd2075eb15ee
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781096071167(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
164272113b68 ("fs: dlm: fix race setting stop tx flag")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
00e99ccde757 ("dlm: use __le types for dlm messages")
2f9dbeda8dc0 ("dlm: use __le types for rcom messages")
3428785a65da ("dlm: use __le types for dlm header")
a8449f232ee3 ("dlm: add __CHECKER__ for false positives")
6c2e3bf68f3e ("fs: dlm: filter user dlm messages for kernel locks")
9af5b8f0ead7 ("fs: dlm: add debugfs rawmsg send functionality")
92732376fd29 ("fs: dlm: trace socket handling")
f1d3b8f91d96 ("fs: dlm: initial support for tracepoints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 164272113b685927126c938b4a9cbd2075eb15ee Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:34 -0500
Subject: [PATCH] fs: dlm: fix race setting stop tx flag
This patch sets the stop tx flag before we commit the dlm message.
This flag will report about unexpected transmissions after we
send the DLM_FIN message out, which should be the last message sent.
When we commit the dlm fin message, it could be that we already
got an ack back and the CLOSED state change already happened.
We should not set this flag when we are in CLOSED state. To avoid this
race we simply set the tx flag before the state change can be in
progress by moving it before dlm_midcomms_commit_mhandle().
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index a3eb19c8cec5..9d459d5bf800 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -406,6 +406,7 @@ static int dlm_send_fin(struct midcomms_node *node,
if (!mh)
return -ENOMEM;
+ set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
mh->ack_rcv = ack_rcv;
m_header = (struct dlm_header *)ppc;
@@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node,
pr_debug("sending fin msg to node %d\n", node->nodeid);
dlm_midcomms_commit_mhandle(mh, NULL, 0);
- set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 164272113b685927126c938b4a9cbd2075eb15ee
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810960625226(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
164272113b68 ("fs: dlm: fix race setting stop tx flag")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 164272113b685927126c938b4a9cbd2075eb15ee Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:34 -0500
Subject: [PATCH] fs: dlm: fix race setting stop tx flag
This patch sets the stop tx flag before we commit the dlm message.
This flag will report about unexpected transmissions after we
send the DLM_FIN message out, which should be the last message sent.
When we commit the dlm fin message, it could be that we already
got an ack back and the CLOSED state change already happened.
We should not set this flag when we are in CLOSED state. To avoid this
race we simply set the tx flag before the state change can be in
progress by moving it before dlm_midcomms_commit_mhandle().
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index a3eb19c8cec5..9d459d5bf800 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -406,6 +406,7 @@ static int dlm_send_fin(struct midcomms_node *node,
if (!mh)
return -ENOMEM;
+ set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
mh->ack_rcv = ack_rcv;
m_header = (struct dlm_header *)ppc;
@@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node,
pr_debug("sending fin msg to node %d\n", node->nodeid);
dlm_midcomms_commit_mhandle(mh, NULL, 0);
- set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 7354fa4ef697191effedc2ae9a8293427708bbf5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781095928214(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
7354fa4ef697 ("fs: dlm: be sure to call dlm_send_queue_flush()")
775af207464b ("fs: dlm: use WARN_ON_ONCE() instead of WARN_ON()")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
00e99ccde757 ("dlm: use __le types for dlm messages")
2f9dbeda8dc0 ("dlm: use __le types for rcom messages")
3428785a65da ("dlm: use __le types for dlm header")
a8449f232ee3 ("dlm: add __CHECKER__ for false positives")
6c547f264077 ("fs: dlm: memory cache for midcomms hotpath")
6c2e3bf68f3e ("fs: dlm: filter user dlm messages for kernel locks")
9af5b8f0ead7 ("fs: dlm: add debugfs rawmsg send functionality")
92732376fd29 ("fs: dlm: trace socket handling")
f1d3b8f91d96 ("fs: dlm: initial support for tracepoints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7354fa4ef697191effedc2ae9a8293427708bbf5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:33 -0500
Subject: [PATCH] fs: dlm: be sure to call dlm_send_queue_flush()
If we release a midcomms node structure, there should be nothing left
inside the dlm midcomms send queue. However, sometimes this is not true
because I believe some DLM_FIN message was not acked... if we run
into a shutdown timeout, then we should be sure there is no pending send
dlm message inside this queue when releasing midcomms node structure.
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 2e60d9a2c883..a3eb19c8cec5 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -1402,6 +1402,7 @@ static void midcomms_node_release(struct rcu_head *rcu)
struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
+ dlm_send_queue_flush(node);
kfree(node);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 7354fa4ef697191effedc2ae9a8293427708bbf5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781095911850(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
7354fa4ef697 ("fs: dlm: be sure to call dlm_send_queue_flush()")
775af207464b ("fs: dlm: use WARN_ON_ONCE() instead of WARN_ON()")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7354fa4ef697191effedc2ae9a8293427708bbf5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:33 -0500
Subject: [PATCH] fs: dlm: be sure to call dlm_send_queue_flush()
If we release a midcomms node structure, there should be nothing left
inside the dlm midcomms send queue. However, sometimes this is not true
because I believe some DLM_FIN message was not acked... if we run
into a shutdown timeout, then we should be sure there is no pending send
dlm message inside this queue when releasing midcomms node structure.
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 2e60d9a2c883..a3eb19c8cec5 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -1402,6 +1402,7 @@ static void midcomms_node_release(struct rcu_head *rcu)
struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
+ dlm_send_queue_flush(node);
kfree(node);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 724b6bab0d75f1dc01fdfbf7fe8d4217a5cb90ba
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781095834451(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
724b6bab0d75 ("fs: dlm: fix use after free in midcomms commit")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
00e99ccde757 ("dlm: use __le types for dlm messages")
2f9dbeda8dc0 ("dlm: use __le types for rcom messages")
3428785a65da ("dlm: use __le types for dlm header")
a8449f232ee3 ("dlm: add __CHECKER__ for false positives")
6c2e3bf68f3e ("fs: dlm: filter user dlm messages for kernel locks")
9af5b8f0ead7 ("fs: dlm: add debugfs rawmsg send functionality")
92732376fd29 ("fs: dlm: trace socket handling")
f1d3b8f91d96 ("fs: dlm: initial support for tracepoints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 724b6bab0d75f1dc01fdfbf7fe8d4217a5cb90ba Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:32 -0500
Subject: [PATCH] fs: dlm: fix use after free in midcomms commit
While working on processing dlm message in softirq context I experienced
the following KASAN use-after-free warning:
[ 151.760477] ==================================================================
[ 151.761803] BUG: KASAN: use-after-free in dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.763414] Read of size 4 at addr ffff88811a980c60 by task lock_torture/1347
[ 151.765284] CPU: 7 PID: 1347 Comm: lock_torture Not tainted 6.1.0-rc4+ #2828
[ 151.766778] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-3.module+el8.7.0+16134+e5908aa2 04/01/2014
[ 151.768726] Call Trace:
[ 151.769277] <TASK>
[ 151.769748] dump_stack_lvl+0x5b/0x86
[ 151.770556] print_report+0x180/0x4c8
[ 151.771378] ? kasan_complete_mode_report_info+0x7c/0x1e0
[ 151.772241] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.773069] kasan_report+0x93/0x1a0
[ 151.773668] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.774514] __asan_load4+0x7e/0xa0
[ 151.775089] dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.775890] ? create_message.isra.29.constprop.64+0x57/0xc0
[ 151.776770] send_common+0x19f/0x1b0
[ 151.777342] ? remove_from_waiters+0x60/0x60
[ 151.778017] ? lock_downgrade+0x410/0x410
[ 151.778648] ? __this_cpu_preempt_check+0x13/0x20
[ 151.779421] ? rcu_lockdep_current_cpu_online+0x88/0xc0
[ 151.780292] _convert_lock+0x46/0x150
[ 151.780893] convert_lock+0x7b/0xc0
[ 151.781459] dlm_lock+0x3ac/0x580
[ 151.781993] ? 0xffffffffc0540000
[ 151.782522] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.783379] ? dlm_scan_rsbs+0xa70/0xa70
[ 151.784003] ? preempt_count_sub+0xd6/0x130
[ 151.784661] ? is_module_address+0x47/0x70
[ 151.785309] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.786166] ? 0xffffffffc0540000
[ 151.786693] ? lockdep_init_map_type+0xc3/0x360
[ 151.787414] ? 0xffffffffc0540000
[ 151.787947] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture]
[ 151.789004] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.789858] ? 0xffffffffc0540000
[ 151.790392] ? lock_torture_cleanup+0x20/0x20 [dlm_locktorture]
[ 151.791347] ? delay_tsc+0x94/0xc0
[ 151.791898] torture_ex_iter+0xc3/0xea [dlm_locktorture]
[ 151.792735] ? torture_start+0x30/0x30 [dlm_locktorture]
[ 151.793606] lock_torture+0x177/0x270 [dlm_locktorture]
[ 151.794448] ? torture_dlm_lock_sync.isra.3+0x150/0x150 [dlm_locktorture]
[ 151.795539] ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[ 151.796476] ? do_raw_spin_lock+0x11e/0x1e0
[ 151.797152] ? mark_held_locks+0x34/0xb0
[ 151.797784] ? _raw_spin_unlock_irqrestore+0x30/0x70
[ 151.798581] ? __kthread_parkme+0x79/0x110
[ 151.799246] ? trace_preempt_on+0x2a/0xf0
[ 151.799902] ? __kthread_parkme+0x79/0x110
[ 151.800579] ? preempt_count_sub+0xd6/0x130
[ 151.801271] ? __kasan_check_read+0x11/0x20
[ 151.801963] ? __kthread_parkme+0xec/0x110
[ 151.802630] ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[ 151.803569] kthread+0x192/0x1d0
[ 151.804104] ? kthread_complete_and_exit+0x30/0x30
[ 151.804881] ret_from_fork+0x1f/0x30
[ 151.805480] </TASK>
[ 151.806111] Allocated by task 1347:
[ 151.806681] kasan_save_stack+0x26/0x50
[ 151.807308] kasan_set_track+0x25/0x30
[ 151.807920] kasan_save_alloc_info+0x1e/0x30
[ 151.808609] __kasan_slab_alloc+0x63/0x80
[ 151.809263] kmem_cache_alloc+0x1ad/0x830
[ 151.809916] dlm_allocate_mhandle+0x17/0x20
[ 151.810590] dlm_midcomms_get_mhandle+0x96/0x260
[ 151.811344] _create_message+0x95/0x180
[ 151.811994] create_message.isra.29.constprop.64+0x57/0xc0
[ 151.812880] send_common+0x129/0x1b0
[ 151.813467] _convert_lock+0x46/0x150
[ 151.814074] convert_lock+0x7b/0xc0
[ 151.814648] dlm_lock+0x3ac/0x580
[ 151.815199] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture]
[ 151.816258] torture_ex_iter+0xc3/0xea [dlm_locktorture]
[ 151.817129] lock_torture+0x177/0x270 [dlm_locktorture]
[ 151.817986] kthread+0x192/0x1d0
[ 151.818518] ret_from_fork+0x1f/0x30
[ 151.819369] Freed by task 1336:
[ 151.819890] kasan_save_stack+0x26/0x50
[ 151.820514] kasan_set_track+0x25/0x30
[ 151.821128] kasan_save_free_info+0x2e/0x50
[ 151.821812] __kasan_slab_free+0x107/0x1a0
[ 151.822483] kmem_cache_free+0x204/0x5e0
[ 151.823152] dlm_free_mhandle+0x18/0x20
[ 151.823781] dlm_mhandle_release+0x2e/0x40
[ 151.824454] rcu_core+0x583/0x1330
[ 151.825047] rcu_core_si+0xe/0x20
[ 151.825594] __do_softirq+0xf4/0x5c2
[ 151.826450] Last potentially related work creation:
[ 151.827238] kasan_save_stack+0x26/0x50
[ 151.827870] __kasan_record_aux_stack+0xa2/0xc0
[ 151.828609] kasan_record_aux_stack_noalloc+0xb/0x20
[ 151.829415] call_rcu+0x4c/0x760
[ 151.829954] dlm_mhandle_delete+0x97/0xb0
[ 151.830718] dlm_process_incoming_buffer+0x2fc/0xb30
[ 151.831524] process_dlm_messages+0x16e/0x470
[ 151.832245] process_one_work+0x505/0xa10
[ 151.832905] worker_thread+0x67/0x650
[ 151.833507] kthread+0x192/0x1d0
[ 151.834046] ret_from_fork+0x1f/0x30
[ 151.834900] The buggy address belongs to the object at ffff88811a980c30
which belongs to the cache dlm_mhandle of size 88
[ 151.836894] The buggy address is located 48 bytes inside of
88-byte region [ffff88811a980c30, ffff88811a980c88)
[ 151.839007] The buggy address belongs to the physical page:
[ 151.839904] page:0000000076cf5d62 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11a980
[ 151.841378] flags: 0x8000000000000200(slab|zone=2)
[ 151.842141] raw: 8000000000000200 0000000000000000 dead000000000122 ffff8881089b43c0
[ 151.843401] raw: 0000000000000000 0000000000220022 00000001ffffffff 0000000000000000
[ 151.844640] page dumped because: kasan: bad access detected
[ 151.845822] Memory state around the buggy address:
[ 151.846602] ffff88811a980b00: fb fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb
[ 151.847761] ffff88811a980b80: fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb
[ 151.848921] >ffff88811a980c00: fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb
[ 151.850076] ^
[ 151.851085] ffff88811a980c80: fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb
[ 151.852269] ffff88811a980d00: fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb fc
[ 151.853428] ==================================================================
[ 151.855618] Disabling lock debugging due to kernel taint
It is accessing a mhandle in dlm_midcomms_commit_mhandle() and the mhandle
was freed by a call_rcu() call in dlm_process_incoming_buffer(),
dlm_mhandle_delete(). It looks like it was freed because an ack of
this message was received. There is a short race between committing the
dlm message to be transmitted and getting an ack back. If the ack is
faster than returning from dlm_midcomms_commit_msg_3_2(), then we run
into a use-after free because we still need to reference the mhandle when
calling srcu_read_unlock().
To avoid that, we don't allow that mhandle to be freed between
dlm_midcomms_commit_msg_3_2() and srcu_read_unlock() by using rcu read
lock. We can do that because mhandle is protected by rcu handling.
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index fc015a6abe17..2e60d9a2c883 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -1214,8 +1214,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
+ /* held rcu read lock here, because we sending the
+ * dlm message out, when we do that we could receive
+ * an ack back which releases the mhandle and we
+ * get a use after free.
+ */
+ rcu_read_lock();
dlm_midcomms_commit_msg_3_2(mh, name, namelen);
srcu_read_unlock(&nodes_srcu, mh->idx);
+ rcu_read_unlock();
break;
default:
srcu_read_unlock(&nodes_srcu, mh->idx);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 724b6bab0d75f1dc01fdfbf7fe8d4217a5cb90ba
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678109579147225(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
724b6bab0d75 ("fs: dlm: fix use after free in midcomms commit")
e01c4b7bd415 ("fd: dlm: trace send/recv of dlm message and rcom")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 724b6bab0d75f1dc01fdfbf7fe8d4217a5cb90ba Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:32 -0500
Subject: [PATCH] fs: dlm: fix use after free in midcomms commit
While working on processing dlm message in softirq context I experienced
the following KASAN use-after-free warning:
[ 151.760477] ==================================================================
[ 151.761803] BUG: KASAN: use-after-free in dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.763414] Read of size 4 at addr ffff88811a980c60 by task lock_torture/1347
[ 151.765284] CPU: 7 PID: 1347 Comm: lock_torture Not tainted 6.1.0-rc4+ #2828
[ 151.766778] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-3.module+el8.7.0+16134+e5908aa2 04/01/2014
[ 151.768726] Call Trace:
[ 151.769277] <TASK>
[ 151.769748] dump_stack_lvl+0x5b/0x86
[ 151.770556] print_report+0x180/0x4c8
[ 151.771378] ? kasan_complete_mode_report_info+0x7c/0x1e0
[ 151.772241] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.773069] kasan_report+0x93/0x1a0
[ 151.773668] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.774514] __asan_load4+0x7e/0xa0
[ 151.775089] dlm_midcomms_commit_mhandle+0x19d/0x4b0
[ 151.775890] ? create_message.isra.29.constprop.64+0x57/0xc0
[ 151.776770] send_common+0x19f/0x1b0
[ 151.777342] ? remove_from_waiters+0x60/0x60
[ 151.778017] ? lock_downgrade+0x410/0x410
[ 151.778648] ? __this_cpu_preempt_check+0x13/0x20
[ 151.779421] ? rcu_lockdep_current_cpu_online+0x88/0xc0
[ 151.780292] _convert_lock+0x46/0x150
[ 151.780893] convert_lock+0x7b/0xc0
[ 151.781459] dlm_lock+0x3ac/0x580
[ 151.781993] ? 0xffffffffc0540000
[ 151.782522] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.783379] ? dlm_scan_rsbs+0xa70/0xa70
[ 151.784003] ? preempt_count_sub+0xd6/0x130
[ 151.784661] ? is_module_address+0x47/0x70
[ 151.785309] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.786166] ? 0xffffffffc0540000
[ 151.786693] ? lockdep_init_map_type+0xc3/0x360
[ 151.787414] ? 0xffffffffc0540000
[ 151.787947] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture]
[ 151.789004] ? torture_stop+0x120/0x120 [dlm_locktorture]
[ 151.789858] ? 0xffffffffc0540000
[ 151.790392] ? lock_torture_cleanup+0x20/0x20 [dlm_locktorture]
[ 151.791347] ? delay_tsc+0x94/0xc0
[ 151.791898] torture_ex_iter+0xc3/0xea [dlm_locktorture]
[ 151.792735] ? torture_start+0x30/0x30 [dlm_locktorture]
[ 151.793606] lock_torture+0x177/0x270 [dlm_locktorture]
[ 151.794448] ? torture_dlm_lock_sync.isra.3+0x150/0x150 [dlm_locktorture]
[ 151.795539] ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[ 151.796476] ? do_raw_spin_lock+0x11e/0x1e0
[ 151.797152] ? mark_held_locks+0x34/0xb0
[ 151.797784] ? _raw_spin_unlock_irqrestore+0x30/0x70
[ 151.798581] ? __kthread_parkme+0x79/0x110
[ 151.799246] ? trace_preempt_on+0x2a/0xf0
[ 151.799902] ? __kthread_parkme+0x79/0x110
[ 151.800579] ? preempt_count_sub+0xd6/0x130
[ 151.801271] ? __kasan_check_read+0x11/0x20
[ 151.801963] ? __kthread_parkme+0xec/0x110
[ 151.802630] ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[ 151.803569] kthread+0x192/0x1d0
[ 151.804104] ? kthread_complete_and_exit+0x30/0x30
[ 151.804881] ret_from_fork+0x1f/0x30
[ 151.805480] </TASK>
[ 151.806111] Allocated by task 1347:
[ 151.806681] kasan_save_stack+0x26/0x50
[ 151.807308] kasan_set_track+0x25/0x30
[ 151.807920] kasan_save_alloc_info+0x1e/0x30
[ 151.808609] __kasan_slab_alloc+0x63/0x80
[ 151.809263] kmem_cache_alloc+0x1ad/0x830
[ 151.809916] dlm_allocate_mhandle+0x17/0x20
[ 151.810590] dlm_midcomms_get_mhandle+0x96/0x260
[ 151.811344] _create_message+0x95/0x180
[ 151.811994] create_message.isra.29.constprop.64+0x57/0xc0
[ 151.812880] send_common+0x129/0x1b0
[ 151.813467] _convert_lock+0x46/0x150
[ 151.814074] convert_lock+0x7b/0xc0
[ 151.814648] dlm_lock+0x3ac/0x580
[ 151.815199] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture]
[ 151.816258] torture_ex_iter+0xc3/0xea [dlm_locktorture]
[ 151.817129] lock_torture+0x177/0x270 [dlm_locktorture]
[ 151.817986] kthread+0x192/0x1d0
[ 151.818518] ret_from_fork+0x1f/0x30
[ 151.819369] Freed by task 1336:
[ 151.819890] kasan_save_stack+0x26/0x50
[ 151.820514] kasan_set_track+0x25/0x30
[ 151.821128] kasan_save_free_info+0x2e/0x50
[ 151.821812] __kasan_slab_free+0x107/0x1a0
[ 151.822483] kmem_cache_free+0x204/0x5e0
[ 151.823152] dlm_free_mhandle+0x18/0x20
[ 151.823781] dlm_mhandle_release+0x2e/0x40
[ 151.824454] rcu_core+0x583/0x1330
[ 151.825047] rcu_core_si+0xe/0x20
[ 151.825594] __do_softirq+0xf4/0x5c2
[ 151.826450] Last potentially related work creation:
[ 151.827238] kasan_save_stack+0x26/0x50
[ 151.827870] __kasan_record_aux_stack+0xa2/0xc0
[ 151.828609] kasan_record_aux_stack_noalloc+0xb/0x20
[ 151.829415] call_rcu+0x4c/0x760
[ 151.829954] dlm_mhandle_delete+0x97/0xb0
[ 151.830718] dlm_process_incoming_buffer+0x2fc/0xb30
[ 151.831524] process_dlm_messages+0x16e/0x470
[ 151.832245] process_one_work+0x505/0xa10
[ 151.832905] worker_thread+0x67/0x650
[ 151.833507] kthread+0x192/0x1d0
[ 151.834046] ret_from_fork+0x1f/0x30
[ 151.834900] The buggy address belongs to the object at ffff88811a980c30
which belongs to the cache dlm_mhandle of size 88
[ 151.836894] The buggy address is located 48 bytes inside of
88-byte region [ffff88811a980c30, ffff88811a980c88)
[ 151.839007] The buggy address belongs to the physical page:
[ 151.839904] page:0000000076cf5d62 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11a980
[ 151.841378] flags: 0x8000000000000200(slab|zone=2)
[ 151.842141] raw: 8000000000000200 0000000000000000 dead000000000122 ffff8881089b43c0
[ 151.843401] raw: 0000000000000000 0000000000220022 00000001ffffffff 0000000000000000
[ 151.844640] page dumped because: kasan: bad access detected
[ 151.845822] Memory state around the buggy address:
[ 151.846602] ffff88811a980b00: fb fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb
[ 151.847761] ffff88811a980b80: fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb
[ 151.848921] >ffff88811a980c00: fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb
[ 151.850076] ^
[ 151.851085] ffff88811a980c80: fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb
[ 151.852269] ffff88811a980d00: fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb fc
[ 151.853428] ==================================================================
[ 151.855618] Disabling lock debugging due to kernel taint
It is accessing a mhandle in dlm_midcomms_commit_mhandle() and the mhandle
was freed by a call_rcu() call in dlm_process_incoming_buffer(),
dlm_mhandle_delete(). It looks like it was freed because an ack of
this message was received. There is a short race between committing the
dlm message to be transmitted and getting an ack back. If the ack is
faster than returning from dlm_midcomms_commit_msg_3_2(), then we run
into a use-after free because we still need to reference the mhandle when
calling srcu_read_unlock().
To avoid that, we don't allow that mhandle to be freed between
dlm_midcomms_commit_msg_3_2() and srcu_read_unlock() by using rcu read
lock. We can do that because mhandle is protected by rcu handling.
Cc: stable(a)vger.kernel.org
Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index fc015a6abe17..2e60d9a2c883 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -1214,8 +1214,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
+ /* held rcu read lock here, because we sending the
+ * dlm message out, when we do that we could receive
+ * an ack back which releases the mhandle and we
+ * get a use after free.
+ */
+ rcu_read_lock();
dlm_midcomms_commit_msg_3_2(mh, name, namelen);
srcu_read_unlock(&nodes_srcu, mh->idx);
+ rcu_read_unlock();
break;
default:
srcu_read_unlock(&nodes_srcu, mh->idx);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810955618189(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810955521098(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810955527215(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810955486120(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678109553184203(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x aad633dc0cf90093998b1ae0ba9f19b5f1dab644
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678109553226244(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
aad633dc0cf9 ("fs: dlm: start midcomms before scand")
3e54c9e80e68 ("fs: dlm: fix log of lowcomms vs midcomms")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo(a)redhat.com>
Date: Thu, 12 Jan 2023 17:10:31 -0500
Subject: [PATCH] fs: dlm: start midcomms before scand
The scand kthread can send dlm messages out, especially dlm remove
messages to free memory for unused rsb on other nodes. To send out dlm
messages, midcomms must be initialized. This patch moves the midcomms
start before scand is started.
Cc: stable(a)vger.kernel.org
Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM")
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
Signed-off-by: David Teigland <teigland(a)redhat.com>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d0b4e2181a5f..99bc96f90779 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
- error = dlm_scand_start();
+ /* Thread for sending/receiving messages for all lockspace's */
+ error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm_scand thread %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto fail;
}
- /* Thread for sending/receiving messages for all lockspace's */
- error = dlm_midcomms_start();
+ error = dlm_scand_start();
if (error) {
- log_print("cannot start dlm midcomms %d", error);
- goto scand_fail;
+ log_print("cannot start dlm_scand thread %d", error);
+ goto midcomms_fail;
}
return 0;
- scand_fail:
- dlm_scand_stop();
+ midcomms_fail:
+ dlm_midcomms_stop();
fail:
return error;
}
The second to last argument is clk_root (root of the clock), however the
code called q6prm_request_lpass_clock() with clk_attr instead
(copy-paste error). This effectively was passing value of 1 as root
clock which worked on some of the SoCs (e.g. SM8450) but fails on
others, depending on the ADSP. For example on SM8550 this "1" as root
clock is not accepted and results in errors coming from ADSP.
Fixes: 2f20640491ed ("ASoC: qdsp6: qdsp6: q6prm: handle clk disable correctly")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
---
sound/soc/qcom/qdsp6/q6prm.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sound/soc/qcom/qdsp6/q6prm.c b/sound/soc/qcom/qdsp6/q6prm.c
index 8aa1a213bfb7..c1dc5bae715a 100644
--- a/sound/soc/qcom/qdsp6/q6prm.c
+++ b/sound/soc/qcom/qdsp6/q6prm.c
@@ -183,9 +183,9 @@ int q6prm_set_lpass_clock(struct device *dev, int clk_id, int clk_attr, int clk_
unsigned int freq)
{
if (freq)
- return q6prm_request_lpass_clock(dev, clk_id, clk_attr, clk_attr, freq);
+ return q6prm_request_lpass_clock(dev, clk_id, clk_attr, clk_root, freq);
- return q6prm_release_lpass_clock(dev, clk_id, clk_attr, clk_attr, freq);
+ return q6prm_release_lpass_clock(dev, clk_id, clk_attr, clk_root, freq);
}
EXPORT_SYMBOL_GPL(q6prm_set_lpass_clock);
--
2.34.1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x f2d3155e2a6bac44d16f04415a321e8707d895c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810000922119(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
f2d3155e2a6b ("KVM: s390: disable migration mode when dirty tracking is disabled")
ec5c86976674 ("KVM: s390: Skip gfn/size sanity checks on memslot DELETE or FLAGS_ONLY")
cf5b486922dc ("KVM: s390: Use "new" memslot instead of userspace memory region")
106ee47dc633 ("docs: kvm: Convert api.txt to ReST format")
6c972ba685d5 ("docs: kvm: convert devices/vm.txt to ReST")
aff7aeea5483 ("docs: kvm: convert devices/vfio.txt to ReST")
e777a5bd98c6 ("docs: kvm: convert devices/vcpu.txt to ReST")
e94474300361 ("docs: kvm: convert devices/s390_flic.txt to ReST")
05c47036c62e ("docs: kvm: convert devices/mpic.txt to ReST")
c0d1c8a0af59 ("docs: kvm: devices/arm-vgit-v3.txt to ReST")
d371c011fc5e ("docs: kvm: devices/arm-vgic-its.txt to ReST format")
7bd460fc1dfa ("docs: kvm: add arm/pvtime.rst to index.rst")
290a6bb06de9 ("arm64: KVM: Add UAPI notes for swapped registers")
22945688acd4 ("KVM: PPC: Book3S HV: Support reset of secure guest")
008e359c76d8 ("KVM: PPC: Book3S HV: Radix changes for secure guest")
60f0a643aa44 ("KVM: PPC: Book3S HV: Shared pages support for secure guests")
ca9f4942670c ("KVM: PPC: Book3S HV: Support for running secure guests")
a4b28f5c6798 ("Merge remote-tracking branch 'kvmarm/kvm-arm64/stolen-time' into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2d3155e2a6bac44d16f04415a321e8707d895c6 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Fri, 27 Jan 2023 15:05:32 +0100
Subject: [PATCH] KVM: s390: disable migration mode when dirty tracking is
disabled
Migration mode is a VM attribute which enables tracking of changes in
storage attributes (PGSTE). It assumes dirty tracking is enabled on all
memslots to keep a dirty bitmap of pages with changed storage attributes.
When enabling migration mode, we currently check that dirty tracking is
enabled for all memslots. However, userspace can disable dirty tracking
without disabling migration mode.
Since migration mode is pointless with dirty tracking disabled, disable
migration mode whenever userspace disables dirty tracking on any slot.
Also update the documentation to clarify that dirty tracking must be
enabled when enabling migration mode, which is already enforced by the
code in kvm_s390_vm_start_migration().
Also highlight in the documentation for KVM_S390_GET_CMMA_BITS that it
can now fail with -EINVAL when dirty tracking is disabled while
migration mode is on. Move all the error codes to a table so this stays
readable.
To disable migration mode, slots_lock should be held, which is taken
in kvm_set_memory_region() and thus held in
kvm_arch_prepare_memory_region().
Restructure the prepare code a bit so all the sanity checking is done
before disabling migration mode. This ensures migration mode isn't
disabled when some sanity check fails.
Cc: stable(a)vger.kernel.org
Fixes: 190df4a212a7 ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230127140532.230651-2-nrb@linux.ibm.com
Message-Id: <20230127140532.230651-2-nrb(a)linux.ibm.com>
[frankja(a)linux.ibm.com: fixed commit message typo, moved api.rst error table upwards]
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index deb494f759ed..8cd7fd05d53b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4449,6 +4449,18 @@ not holding a previously reported uncorrected error).
:Parameters: struct kvm_s390_cmma_log (in, out)
:Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
@@ -4529,12 +4541,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
----------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 60acc39e0e93..147efec626e5 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -302,6 +302,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
:Parameters: none
:Returns: -ENOMEM if there is not enough free memory to start migration mode;
-EINVAL if the state of the VM is invalid (e.g. no memory defined);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..cb72f9a09fb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5633,23 +5633,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x af1c89ddb74f170eccd5a57001d7317560b638ea
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810226715267(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
af1c89ddb74f ("ARM: dts: exynos: correct HDMI phy compatible in Exynos4")
73a901d09a21 ("ARM: dts: exynos: Add soc node to exynos4")
9097b4bd9fce ("ARM: dts: exynos: Use pmu label in exynos4412")
e030be47ac48 ("ARM: dts: exynos: Use labels instead of full paths in exynos4210")
e58864515240 ("ARM: dts: exynos: Use pinctrl labels in exynos4412-pinctrl")
88c166cec136 ("ARM: dts: exynos: Use pinctrl labels in exynos4210-pinctrl")
3be1ecf291df ("ARM: dts: exynos: Use lower case hex addresses in node unit addresses")
6351fe9375f4 ("ARM: dts: exynos: Add G3D power domain to Exynos5250")
c0d40bb3ac71 ("ARM: dts: exynos: Add audio power domain to Exynos5250")
9fbb4c096323 ("ARM: dts: exynos: Fix power domain node names for Exynos5250")
528832d4c01a ("ARM: dts: exynos: Add audio power domain support to Exynos542x SoCs")
cdd745c8c76b ("ARM: dts: exynos: Remove duplicate definitions of SSS nodes for Exynos5")
78a68acf3d33 ("ARM: dts: exynos: Switch to dedicated Odroid XU3 sound card binding")
061ae5326613 ("Merge tag 'samsung-dt-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux into next/soc")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From af1c89ddb74f170eccd5a57001d7317560b638ea Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Date: Wed, 25 Jan 2023 10:45:05 +0100
Subject: [PATCH] ARM: dts: exynos: correct HDMI phy compatible in Exynos4
The HDMI phy compatible was missing vendor prefix.
Fixes: ed80d4cab772 ("ARM: dts: add hdmi related nodes for exynos4 SoCs")
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20230125094513.155063-1-krzysztof.kozlowski@linar…
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
diff --git a/arch/arm/boot/dts/exynos4.dtsi b/arch/arm/boot/dts/exynos4.dtsi
index 55afe9972460..d1adaee2af58 100644
--- a/arch/arm/boot/dts/exynos4.dtsi
+++ b/arch/arm/boot/dts/exynos4.dtsi
@@ -605,7 +605,7 @@ i2c_8: i2c@138e0000 {
status = "disabled";
hdmi_i2c_phy: hdmiphy@38 {
- compatible = "exynos4210-hdmiphy";
+ compatible = "samsung,exynos4210-hdmiphy";
reg = <0x38>;
};
};
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810211413647(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
8bb808c6ad91 ("btrfs: don't print stack trace when transaction is aborted due to ENOMEM")
0e75f0054a2a ("btrfs: move fs_info forward declarations to the top of ctree.h")
2103da3b0e3a ("btrfs: move btrfs_swapfile_pin into volumes.h")
c2e79e865b87 ("btrfs: move btrfs_pinned_by_swapfile prototype into volumes.h")
43712116f8c8 ("btrfs: move btrfs_init_async_reclaim_work prototype to space-info.h")
83cf709a89fb ("btrfs: move extent state init and alloc functions to their own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810211211536(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
8bb808c6ad91 ("btrfs: don't print stack trace when transaction is aborted due to ENOMEM")
0e75f0054a2a ("btrfs: move fs_info forward declarations to the top of ctree.h")
2103da3b0e3a ("btrfs: move btrfs_swapfile_pin into volumes.h")
c2e79e865b87 ("btrfs: move btrfs_pinned_by_swapfile prototype into volumes.h")
43712116f8c8 ("btrfs: move btrfs_init_async_reclaim_work prototype to space-info.h")
83cf709a89fb ("btrfs: move extent state init and alloc functions to their own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678102110115171(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
8bb808c6ad91 ("btrfs: don't print stack trace when transaction is aborted due to ENOMEM")
0e75f0054a2a ("btrfs: move fs_info forward declarations to the top of ctree.h")
2103da3b0e3a ("btrfs: move btrfs_swapfile_pin into volumes.h")
c2e79e865b87 ("btrfs: move btrfs_pinned_by_swapfile prototype into volumes.h")
43712116f8c8 ("btrfs: move btrfs_init_async_reclaim_work prototype to space-info.h")
83cf709a89fb ("btrfs: move extent state init and alloc functions to their own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810210825354(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
8bb808c6ad91 ("btrfs: don't print stack trace when transaction is aborted due to ENOMEM")
0e75f0054a2a ("btrfs: move fs_info forward declarations to the top of ctree.h")
2103da3b0e3a ("btrfs: move btrfs_swapfile_pin into volumes.h")
c2e79e865b87 ("btrfs: move btrfs_pinned_by_swapfile prototype into volumes.h")
43712116f8c8 ("btrfs: move btrfs_init_async_reclaim_work prototype to space-info.h")
83cf709a89fb ("btrfs: move extent state init and alloc functions to their own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810210618738(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
8bb808c6ad91 ("btrfs: don't print stack trace when transaction is aborted due to ENOMEM")
0e75f0054a2a ("btrfs: move fs_info forward declarations to the top of ctree.h")
2103da3b0e3a ("btrfs: move btrfs_swapfile_pin into volumes.h")
c2e79e865b87 ("btrfs: move btrfs_pinned_by_swapfile prototype into volumes.h")
43712116f8c8 ("btrfs: move btrfs_init_async_reclaim_work prototype to space-info.h")
83cf709a89fb ("btrfs: move extent state init and alloc functions to their own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b7625f461da6734a21c38ba6e7558538a116a2e3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678102104245178(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b7625f461da6 ("btrfs: sysfs: update fs features directory asynchronously")
85e79ec7b78f ("btrfs: zoned: enable metadata over-commit for non-ZNS setup")
c52cc7b7acfb ("btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag")
7966a6b5959b ("btrfs: move fs_info::flags enum to fs.h")
fc97a410bd78 ("btrfs: move mount option definitions to fs.h")
0d3a9cf8c306 ("btrfs: convert incompat and compat flag test helpers to macros")
ec8eb376e271 ("btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h")
9b569ea0be6f ("btrfs: move the printk helpers out of ctree.h")
e118578a8df7 ("btrfs: move assert helpers out of ctree.h")
c7f13d428ea1 ("btrfs: move fs wide helpers out of ctree.h")
63a7cb130718 ("btrfs: auto enable discard=async when possible")
f1e5c6185ca1 ("btrfs: move flush related definitions to space-info.h")
4300c58f8090 ("btrfs: move btrfs on-disk definitions out of ctree.h")
d60d956eb41f ("btrfs: remove unused set/clear_pending_info helpers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b7625f461da6734a21c38ba6e7558538a116a2e3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 13 Jan 2023 19:11:39 +0800
Subject: [PATCH] btrfs: sysfs: update fs features directory asynchronously
[BUG]
Since the introduction of per-fs feature sysfs interface
(/sys/fs/btrfs/<UUID>/features/), the content of that directory is never
updated.
Thus for the following case, that directory will not show the new
features like RAID56:
# mkfs.btrfs -f $dev1 $dev2 $dev3
# mount $dev1 $mnt
# btrfs balance start -f -mconvert=raid5 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes skinny_metadata
While after unmount and mount, we got the correct features:
# umount $mnt
# mount $dev1 $mnt
# ls /sys/fs/btrfs/$uuid/features/
extended_iref free_space_tree no_holes raid56 skinny_metadata
[CAUSE]
Because we never really try to update the content of per-fs features/
directory.
We had an attempt to update the features directory dynamically in commit
14e46e04958d ("btrfs: synchronize incompat feature bits with sysfs
files"), but unfortunately it get reverted in commit e410e34fad91
("Revert "btrfs: synchronize incompat feature bits with sysfs files"").
The problem in the original patch is, in the context of
btrfs_create_chunk(), we can not afford to update the sysfs group.
The exported but never utilized function, btrfs_sysfs_feature_update()
is the leftover of such attempt. As even if we go sysfs_update_group(),
new files will need extra memory allocation, and we have no way to
specify the sysfs update to go GFP_NOFS.
[FIX]
This patch will address the old problem by doing asynchronous sysfs
update in the cleaner thread.
This involves the following changes:
- Make __btrfs_(set|clear)_fs_(incompat|compat_ro) helpers to set
BTRFS_FS_FEATURE_CHANGED flag when needed
- Update btrfs_sysfs_feature_update() to use sysfs_update_group()
And drop unnecessary arguments.
- Call btrfs_sysfs_feature_update() in cleaner_kthread
If we have the BTRFS_FS_FEATURE_CHANGED flag set.
- Wake up cleaner_kthread in btrfs_commit_transaction if we have
BTRFS_FS_FEATURE_CHANGED flag
By this, all the previously dangerous call sites like
btrfs_create_chunk() need no new changes, as above helpers would
have already set the BTRFS_FS_FEATURE_CHANGED flag.
The real work happens at cleaner_kthread, thus we pay the cost of
delaying the update to sysfs directory, but the delayed time should be
small enough that end user can not distinguish though it might get
delayed if the cleaner thread is busy with removing subvolumes or
defrag.
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7586a8e9b718..a6f89ac1c086 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1914,6 +1914,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..3d8156fc8523 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -125,6 +125,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..108aa3876186 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 528efe559866..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If we have features changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f2d3155e2a6bac44d16f04415a321e8707d895c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678100008170144(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f2d3155e2a6b ("KVM: s390: disable migration mode when dirty tracking is disabled")
ec5c86976674 ("KVM: s390: Skip gfn/size sanity checks on memslot DELETE or FLAGS_ONLY")
cf5b486922dc ("KVM: s390: Use "new" memslot instead of userspace memory region")
106ee47dc633 ("docs: kvm: Convert api.txt to ReST format")
6c972ba685d5 ("docs: kvm: convert devices/vm.txt to ReST")
aff7aeea5483 ("docs: kvm: convert devices/vfio.txt to ReST")
e777a5bd98c6 ("docs: kvm: convert devices/vcpu.txt to ReST")
e94474300361 ("docs: kvm: convert devices/s390_flic.txt to ReST")
05c47036c62e ("docs: kvm: convert devices/mpic.txt to ReST")
c0d1c8a0af59 ("docs: kvm: devices/arm-vgit-v3.txt to ReST")
d371c011fc5e ("docs: kvm: devices/arm-vgic-its.txt to ReST format")
7bd460fc1dfa ("docs: kvm: add arm/pvtime.rst to index.rst")
290a6bb06de9 ("arm64: KVM: Add UAPI notes for swapped registers")
22945688acd4 ("KVM: PPC: Book3S HV: Support reset of secure guest")
008e359c76d8 ("KVM: PPC: Book3S HV: Radix changes for secure guest")
60f0a643aa44 ("KVM: PPC: Book3S HV: Shared pages support for secure guests")
ca9f4942670c ("KVM: PPC: Book3S HV: Support for running secure guests")
a4b28f5c6798 ("Merge remote-tracking branch 'kvmarm/kvm-arm64/stolen-time' into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2d3155e2a6bac44d16f04415a321e8707d895c6 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Fri, 27 Jan 2023 15:05:32 +0100
Subject: [PATCH] KVM: s390: disable migration mode when dirty tracking is
disabled
Migration mode is a VM attribute which enables tracking of changes in
storage attributes (PGSTE). It assumes dirty tracking is enabled on all
memslots to keep a dirty bitmap of pages with changed storage attributes.
When enabling migration mode, we currently check that dirty tracking is
enabled for all memslots. However, userspace can disable dirty tracking
without disabling migration mode.
Since migration mode is pointless with dirty tracking disabled, disable
migration mode whenever userspace disables dirty tracking on any slot.
Also update the documentation to clarify that dirty tracking must be
enabled when enabling migration mode, which is already enforced by the
code in kvm_s390_vm_start_migration().
Also highlight in the documentation for KVM_S390_GET_CMMA_BITS that it
can now fail with -EINVAL when dirty tracking is disabled while
migration mode is on. Move all the error codes to a table so this stays
readable.
To disable migration mode, slots_lock should be held, which is taken
in kvm_set_memory_region() and thus held in
kvm_arch_prepare_memory_region().
Restructure the prepare code a bit so all the sanity checking is done
before disabling migration mode. This ensures migration mode isn't
disabled when some sanity check fails.
Cc: stable(a)vger.kernel.org
Fixes: 190df4a212a7 ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230127140532.230651-2-nrb@linux.ibm.com
Message-Id: <20230127140532.230651-2-nrb(a)linux.ibm.com>
[frankja(a)linux.ibm.com: fixed commit message typo, moved api.rst error table upwards]
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index deb494f759ed..8cd7fd05d53b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4449,6 +4449,18 @@ not holding a previously reported uncorrected error).
:Parameters: struct kvm_s390_cmma_log (in, out)
:Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
@@ -4529,12 +4541,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
----------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 60acc39e0e93..147efec626e5 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -302,6 +302,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
:Parameters: none
:Returns: -ENOMEM if there is not enough free memory to start migration mode;
-EINVAL if the state of the VM is invalid (e.g. no memory defined);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..cb72f9a09fb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5633,23 +5633,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c16bda37594f83147b167d381d54c010024efecf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167809982125422(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
c16bda37594f ("io_uring/poll: allow some retries for poll triggering spuriously")
9b797a37c4bd ("io_uring: add abstraction around apoll cache")
9da7471ed10d ("io_uring: move apoll cache to poll.c")
5204aa8c43bd ("io_uring: add a helper for apoll alloc")
d9b57aa3cfc7 ("io_uring: move opcode table to opdef.c")
f3b44f92e59a ("io_uring: move read/write related opcodes to its own file")
c98817e6cd44 ("io_uring: move remaining file table manipulation to filetable.c")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c16bda37594f83147b167d381d54c010024efecf Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Sat, 25 Feb 2023 12:53:53 -0700
Subject: [PATCH] io_uring/poll: allow some retries for poll triggering
spuriously
If we get woken spuriously when polling and fail the operation with
-EAGAIN again, then we generally only allow polling again if data
had been transferred at some point. This is indicated with
REQ_F_PARTIAL_IO. However, if the spurious poll triggers when the socket
was originally empty, then we haven't transferred data yet and we will
fail the poll re-arm. This either punts the socket to io-wq if it's
blocking, or it fails the request with -EAGAIN if not. Neither condition
is desirable, as the former will slow things down, while the latter
will make the application confused.
We want to ensure that a repeated poll trigger doesn't lead to infinite
work making no progress, that's what the REQ_F_PARTIAL_IO check was
for. But it doesn't protect against a loop post the first receive, and
it's unnecessarily strict if we started out with an empty socket.
Add a somewhat random retry count, just to put an upper limit on the
potential number of retries that will be done. This should be high enough
that we won't really hit it in practice, unless something needs to be
aborted anyway.
Cc: stable(a)vger.kernel.org # v5.10+
Link: https://github.com/axboe/liburing/issues/364
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8339a92b4510..a82d6830bdfd 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -650,6 +650,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY 128
+
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
unsigned issue_flags)
{
@@ -665,14 +673,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
if (entry == NULL)
goto alloc_apoll;
apoll = container_of(entry, struct async_poll, cache);
+ apoll->poll.retries = APOLL_MAX_RETRY;
} else {
alloc_apoll:
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return NULL;
+ apoll->poll.retries = APOLL_MAX_RETRY;
}
apoll->double_poll = NULL;
req->apoll = apoll;
+ if (unlikely(!--apoll->poll.retries))
+ return NULL;
return apoll;
}
@@ -694,8 +706,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_ABORTED;
if (!file_can_poll(req->file))
return IO_APOLL_ABORTED;
- if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 5f3bae50fc81..b2393b403a2c 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -12,6 +12,7 @@ struct io_poll {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
+ int retries;
struct wait_queue_entry wait;
};
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c16bda37594f83147b167d381d54c010024efecf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16780998193152(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
c16bda37594f ("io_uring/poll: allow some retries for poll triggering spuriously")
9b797a37c4bd ("io_uring: add abstraction around apoll cache")
9da7471ed10d ("io_uring: move apoll cache to poll.c")
5204aa8c43bd ("io_uring: add a helper for apoll alloc")
d9b57aa3cfc7 ("io_uring: move opcode table to opdef.c")
f3b44f92e59a ("io_uring: move read/write related opcodes to its own file")
c98817e6cd44 ("io_uring: move remaining file table manipulation to filetable.c")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c16bda37594f83147b167d381d54c010024efecf Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Sat, 25 Feb 2023 12:53:53 -0700
Subject: [PATCH] io_uring/poll: allow some retries for poll triggering
spuriously
If we get woken spuriously when polling and fail the operation with
-EAGAIN again, then we generally only allow polling again if data
had been transferred at some point. This is indicated with
REQ_F_PARTIAL_IO. However, if the spurious poll triggers when the socket
was originally empty, then we haven't transferred data yet and we will
fail the poll re-arm. This either punts the socket to io-wq if it's
blocking, or it fails the request with -EAGAIN if not. Neither condition
is desirable, as the former will slow things down, while the latter
will make the application confused.
We want to ensure that a repeated poll trigger doesn't lead to infinite
work making no progress, that's what the REQ_F_PARTIAL_IO check was
for. But it doesn't protect against a loop post the first receive, and
it's unnecessarily strict if we started out with an empty socket.
Add a somewhat random retry count, just to put an upper limit on the
potential number of retries that will be done. This should be high enough
that we won't really hit it in practice, unless something needs to be
aborted anyway.
Cc: stable(a)vger.kernel.org # v5.10+
Link: https://github.com/axboe/liburing/issues/364
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8339a92b4510..a82d6830bdfd 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -650,6 +650,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY 128
+
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
unsigned issue_flags)
{
@@ -665,14 +673,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
if (entry == NULL)
goto alloc_apoll;
apoll = container_of(entry, struct async_poll, cache);
+ apoll->poll.retries = APOLL_MAX_RETRY;
} else {
alloc_apoll:
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return NULL;
+ apoll->poll.retries = APOLL_MAX_RETRY;
}
apoll->double_poll = NULL;
req->apoll = apoll;
+ if (unlikely(!--apoll->poll.retries))
+ return NULL;
return apoll;
}
@@ -694,8 +706,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_ABORTED;
if (!file_can_poll(req->file))
return IO_APOLL_ABORTED;
- if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 5f3bae50fc81..b2393b403a2c 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -12,6 +12,7 @@ struct io_poll {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
+ int retries;
struct wait_queue_entry wait;
};
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 7605c43d67face310b4b87dee1a28bc0c8cd8c0f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678099792191220(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
7605c43d67fa ("io_uring: remove MSG_NOSIGNAL from recvmsg")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
453b329be5ea ("io_uring: separate out file table handling code")
f4c163dd7d4b ("io_uring: split out fadvise/madvise operations")
0d5847274037 ("io_uring: split out fs related sync/fallocate functions")
531113bbd5bf ("io_uring: split out splice related operations")
11aeb71406dd ("io_uring: split out filesystem related operations")
e28683bdfc2f ("io_uring: move nop into its own file")
5e2a18d93fec ("io_uring: move xattr related opcodes to its own file")
97b388d70b53 ("io_uring: handle completions in the core")
de23077eda61 ("io_uring: set completion results upfront")
e27f928ee1cb ("io_uring: add io_uring_types.h")
4d4c9cff4f70 ("io_uring: define a request type cleanup handler")
890968dc0336 ("io_uring: unify struct io_symlink and io_hardlink")
9a3a11f977f9 ("io_uring: convert iouring_cmd to io_cmd_type")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7605c43d67face310b4b87dee1a28bc0c8cd8c0f Mon Sep 17 00:00:00 2001
From: David Lamparter <equinox(a)diac24.net>
Date: Fri, 24 Feb 2023 16:01:24 +0100
Subject: [PATCH] io_uring: remove MSG_NOSIGNAL from recvmsg
MSG_NOSIGNAL is not applicable for the receiving side, SIGPIPE is
generated when trying to write to a "broken pipe". AF_PACKET's
packet_recvmsg() does enforce this, giving back EINVAL when MSG_NOSIGNAL
is set - making it unuseable in io_uring's recvmsg.
Remove MSG_NOSIGNAL from io_recvmsg_prep().
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: David Lamparter <equinox(a)diac24.net>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Link: https://lore.kernel.org/r/20230224150123.128346-1-equinox@diac24.net
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/net.c b/io_uring/net.c
index cbd4b725f58c..b7f190ca528e 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~(RECVMSG_FLAGS))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ sr->msg_flags = READ_ONCE(sqe->msg_flags);
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (sr->msg_flags & MSG_ERRQUEUE)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 7605c43d67face310b4b87dee1a28bc0c8cd8c0f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678099792218125(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
7605c43d67fa ("io_uring: remove MSG_NOSIGNAL from recvmsg")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
453b329be5ea ("io_uring: separate out file table handling code")
f4c163dd7d4b ("io_uring: split out fadvise/madvise operations")
0d5847274037 ("io_uring: split out fs related sync/fallocate functions")
531113bbd5bf ("io_uring: split out splice related operations")
11aeb71406dd ("io_uring: split out filesystem related operations")
e28683bdfc2f ("io_uring: move nop into its own file")
5e2a18d93fec ("io_uring: move xattr related opcodes to its own file")
97b388d70b53 ("io_uring: handle completions in the core")
de23077eda61 ("io_uring: set completion results upfront")
e27f928ee1cb ("io_uring: add io_uring_types.h")
4d4c9cff4f70 ("io_uring: define a request type cleanup handler")
890968dc0336 ("io_uring: unify struct io_symlink and io_hardlink")
9a3a11f977f9 ("io_uring: convert iouring_cmd to io_cmd_type")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7605c43d67face310b4b87dee1a28bc0c8cd8c0f Mon Sep 17 00:00:00 2001
From: David Lamparter <equinox(a)diac24.net>
Date: Fri, 24 Feb 2023 16:01:24 +0100
Subject: [PATCH] io_uring: remove MSG_NOSIGNAL from recvmsg
MSG_NOSIGNAL is not applicable for the receiving side, SIGPIPE is
generated when trying to write to a "broken pipe". AF_PACKET's
packet_recvmsg() does enforce this, giving back EINVAL when MSG_NOSIGNAL
is set - making it unuseable in io_uring's recvmsg.
Remove MSG_NOSIGNAL from io_recvmsg_prep().
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: David Lamparter <equinox(a)diac24.net>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Link: https://lore.kernel.org/r/20230224150123.128346-1-equinox@diac24.net
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/net.c b/io_uring/net.c
index cbd4b725f58c..b7f190ca528e 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~(RECVMSG_FLAGS))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ sr->msg_flags = READ_ONCE(sqe->msg_flags);
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (sr->msg_flags & MSG_ERRQUEUE)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x edd478269640b360c6f301f2baa04abdda563ef3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678099757152211(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
edd478269640 ("io_uring/rsrc: disallow multi-source reg buffers")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From edd478269640b360c6f301f2baa04abdda563ef3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Wed, 22 Feb 2023 14:36:48 +0000
Subject: [PATCH] io_uring/rsrc: disallow multi-source reg buffers
If two or more mappings go back to back to each other they can be passed
into io_uring to be registered as a single registered buffer. That would
even work if mappings came from different sources, e.g. it's possible to
mix in this way anon pages and pages from shmem or hugetlb. That is not
a problem but it'd rather be less prone if we forbid such mixing.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4e6191904c9e..53845e496881 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
if (pret == nr_pages) {
+ struct file *file = vmas[0]->vm_file;
+
/* don't support file backed memory */
for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
+ if (vmas[i]->vm_file != file) {
+ ret = -EINVAL;
+ break;
+ }
+ if (!file)
continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
+ if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
ret = -EOPNOTSUPP;
break;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x edd478269640b360c6f301f2baa04abdda563ef3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167809975610179(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
edd478269640 ("io_uring/rsrc: disallow multi-source reg buffers")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From edd478269640b360c6f301f2baa04abdda563ef3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Wed, 22 Feb 2023 14:36:48 +0000
Subject: [PATCH] io_uring/rsrc: disallow multi-source reg buffers
If two or more mappings go back to back to each other they can be passed
into io_uring to be registered as a single registered buffer. That would
even work if mappings came from different sources, e.g. it's possible to
mix in this way anon pages and pages from shmem or hugetlb. That is not
a problem but it'd rather be less prone if we forbid such mixing.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4e6191904c9e..53845e496881 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
if (pret == nr_pages) {
+ struct file *file = vmas[0]->vm_file;
+
/* don't support file backed memory */
for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
+ if (vmas[i]->vm_file != file) {
+ ret = -EINVAL;
+ break;
+ }
+ if (!file)
continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
+ if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
ret = -EOPNOTSUPP;
break;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f58680085478dd292435727210122960d38e8014
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16780997371220(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f58680085478 ("io_uring: add reschedule point to handle_tw_list()")
c6dd763c2460 ("io_uring: trace task_work_run")
3a0c037b0e16 ("io_uring: batch task_work")
f88262e60bb9 ("io_uring: lockless task list")
ed5ccb3beeba ("io_uring: remove priority tw list optimisation")
4a0fef62788b ("io_uring: optimize io_uring_task layout")
253993210bd8 ("io_uring: introduce locking helpers for CQE posting")
305bef988708 ("io_uring: hide eventfd assumptions in eventfd paths")
d9dee4302a7c ("io_uring: remove ->flush_cqes optimisation")
a830ffd28780 ("io_uring: move io_eventfd_signal()")
9046c6415be6 ("io_uring: reshuffle io_uring/io_uring.h")
d142c3ec8d16 ("io_uring: remove extra io_commit_cqring()")
68494a65d0e2 ("io_uring: introduce io_req_cqe_overflow()")
faf88dde060f ("io_uring: don't inline __io_get_cqe()")
d245bca6375b ("io_uring: don't expose io_fill_cqe_aux()")
6a02e4be8187 ("io_uring: inline ->registered_rings")
aa1e90f64ee5 ("io_uring: move small helpers to headers")
aa1e90f64ee5 ("io_uring: move small helpers to headers")
aa1e90f64ee5 ("io_uring: move small helpers to headers")
aa1e90f64ee5 ("io_uring: move small helpers to headers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f58680085478dd292435727210122960d38e8014 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Fri, 27 Jan 2023 09:50:31 -0700
Subject: [PATCH] io_uring: add reschedule point to handle_tw_list()
If CONFIG_PREEMPT_NONE is set and the task_work chains are long, we
could be running into issues blocking others for too long. Add a
reschedule check in handle_tw_list(), and flush the ctx if we need to
reschedule.
Cc: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fab581a31dc1..acf6d9680d76 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1179,10 +1179,16 @@ static unsigned int handle_tw_list(struct llist_node *node,
/* if not contended, grab and improve batching */
*locked = mutex_trylock(&(*ctx)->uring_lock);
percpu_ref_get(&(*ctx)->refs);
- }
+ } else if (!*locked)
+ *locked = mutex_trylock(&(*ctx)->uring_lock);
req->io_task_work.func(req, locked);
node = next;
count++;
+ if (unlikely(need_resched())) {
+ ctx_flush_and_put(*ctx, locked);
+ *ctx = NULL;
+ cond_resched();
+ }
}
return count;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x fcc926bb857949dbfa51a7d95f3f5ebc657f198c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678099723159112(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
fcc926bb8579 ("io_uring: add a conditional reschedule to the IOPOLL cancelation loop")
affa87db9010 ("io_uring: fix multi ctx cancellation")
9ca9fb24d5fe ("io_uring: mutex locked poll hashing")
e6f89be61410 ("io_uring: introduce a struct for hash table")
a2cdd5193218 ("io_uring: pass hash table into poll_find")
0ec6dca22319 ("io_uring: use state completion infra for poll reqs")
8b1dfd343ae6 ("io_uring: clean up io_ring_ctx_alloc")
4a07723fb4bb ("io_uring: limit the number of cancellation buckets")
1ab1edb0a104 ("io_uring: pass poll_find lock back")
38513c464d3d ("io_uring: switch cancel_hash to use per entry spinlock")
aff5b2df9e8b ("io_uring: better caching for ctx timeout fields")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fcc926bb857949dbfa51a7d95f3f5ebc657f198c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Fri, 27 Jan 2023 09:28:13 -0700
Subject: [PATCH] io_uring: add a conditional reschedule to the IOPOLL
cancelation loop
If the kernel is configured with CONFIG_PREEMPT_NONE, we could be
sitting in a tight loop reaping events but not giving them a chance to
finish. This results in a trace ala:
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: 2-...!: (5249 ticks this GP) idle=935c/1/0x4000000000000000 softirq=4265/4274 fqs=1
(t=5251 jiffies g=465 q=4135 ncpus=4)
rcu: rcu_sched kthread starved for 5249 jiffies! g465 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
rcu: RCU grace-period kthread stack dump:
task:rcu_sched state:R running task stack:0 pid:12 ppid:2 flags:0x00000008
Call trace:
__switch_to+0xb0/0xc8
__schedule+0x43c/0x520
schedule+0x4c/0x98
schedule_timeout+0xbc/0xdc
rcu_gp_fqs_loop+0x308/0x344
rcu_gp_kthread+0xd8/0xf0
kthread+0xb8/0xc8
ret_from_fork+0x10/0x20
rcu: Stack dump where RCU GP kthread last ran:
Task dump for CPU 0:
task:kworker/u8:10 state:R running task stack:0 pid:89 ppid:2 flags:0x0000000a
Workqueue: events_unbound io_ring_exit_work
Call trace:
__switch_to+0xb0/0xc8
0xffff0000c8fefd28
CPU: 2 PID: 95 Comm: kworker/u8:13 Not tainted 6.2.0-rc5-00042-g40316e337c80-dirty #2759
Hardware name: linux,dummy-virt (DT)
Workqueue: events_unbound io_ring_exit_work
pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
pc : io_do_iopoll+0x344/0x360
lr : io_do_iopoll+0xb8/0x360
sp : ffff800009bebc60
x29: ffff800009bebc60 x28: 0000000000000000 x27: 0000000000000000
x26: ffff0000c0f67d48 x25: ffff0000c0f67840 x24: ffff800008950024
x23: 0000000000000001 x22: 0000000000000000 x21: ffff0000c27d3200
x20: ffff0000c0f67840 x19: ffff0000c0f67800 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
x14: 0000000000000001 x13: 0000000000000001 x12: 0000000000000000
x11: 0000000000000179 x10: 0000000000000870 x9 : ffff800009bebd60
x8 : ffff0000c27d3ad0 x7 : fefefefefefefeff x6 : 0000646e756f626e
x5 : ffff0000c0f67840 x4 : 0000000000000000 x3 : ffff0000c2398000
x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000
Call trace:
io_do_iopoll+0x344/0x360
io_uring_try_cancel_requests+0x21c/0x334
io_ring_exit_work+0x90/0x40c
process_one_work+0x1a4/0x254
worker_thread+0x1ec/0x258
kthread+0xb8/0xc8
ret_from_fork+0x10/0x20
Add a cond_resched() in the cancelation IOPOLL loop to fix this.
Cc: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9c92ca081c11..fab581a31dc1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3160,6 +3160,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
+ cond_resched();
}
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x fcc926bb857949dbfa51a7d95f3f5ebc657f198c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167809972215341(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
fcc926bb8579 ("io_uring: add a conditional reschedule to the IOPOLL cancelation loop")
affa87db9010 ("io_uring: fix multi ctx cancellation")
9ca9fb24d5fe ("io_uring: mutex locked poll hashing")
e6f89be61410 ("io_uring: introduce a struct for hash table")
a2cdd5193218 ("io_uring: pass hash table into poll_find")
0ec6dca22319 ("io_uring: use state completion infra for poll reqs")
8b1dfd343ae6 ("io_uring: clean up io_ring_ctx_alloc")
4a07723fb4bb ("io_uring: limit the number of cancellation buckets")
1ab1edb0a104 ("io_uring: pass poll_find lock back")
38513c464d3d ("io_uring: switch cancel_hash to use per entry spinlock")
aff5b2df9e8b ("io_uring: better caching for ctx timeout fields")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fcc926bb857949dbfa51a7d95f3f5ebc657f198c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Fri, 27 Jan 2023 09:28:13 -0700
Subject: [PATCH] io_uring: add a conditional reschedule to the IOPOLL
cancelation loop
If the kernel is configured with CONFIG_PREEMPT_NONE, we could be
sitting in a tight loop reaping events but not giving them a chance to
finish. This results in a trace ala:
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: 2-...!: (5249 ticks this GP) idle=935c/1/0x4000000000000000 softirq=4265/4274 fqs=1
(t=5251 jiffies g=465 q=4135 ncpus=4)
rcu: rcu_sched kthread starved for 5249 jiffies! g465 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
rcu: RCU grace-period kthread stack dump:
task:rcu_sched state:R running task stack:0 pid:12 ppid:2 flags:0x00000008
Call trace:
__switch_to+0xb0/0xc8
__schedule+0x43c/0x520
schedule+0x4c/0x98
schedule_timeout+0xbc/0xdc
rcu_gp_fqs_loop+0x308/0x344
rcu_gp_kthread+0xd8/0xf0
kthread+0xb8/0xc8
ret_from_fork+0x10/0x20
rcu: Stack dump where RCU GP kthread last ran:
Task dump for CPU 0:
task:kworker/u8:10 state:R running task stack:0 pid:89 ppid:2 flags:0x0000000a
Workqueue: events_unbound io_ring_exit_work
Call trace:
__switch_to+0xb0/0xc8
0xffff0000c8fefd28
CPU: 2 PID: 95 Comm: kworker/u8:13 Not tainted 6.2.0-rc5-00042-g40316e337c80-dirty #2759
Hardware name: linux,dummy-virt (DT)
Workqueue: events_unbound io_ring_exit_work
pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
pc : io_do_iopoll+0x344/0x360
lr : io_do_iopoll+0xb8/0x360
sp : ffff800009bebc60
x29: ffff800009bebc60 x28: 0000000000000000 x27: 0000000000000000
x26: ffff0000c0f67d48 x25: ffff0000c0f67840 x24: ffff800008950024
x23: 0000000000000001 x22: 0000000000000000 x21: ffff0000c27d3200
x20: ffff0000c0f67840 x19: ffff0000c0f67800 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
x14: 0000000000000001 x13: 0000000000000001 x12: 0000000000000000
x11: 0000000000000179 x10: 0000000000000870 x9 : ffff800009bebd60
x8 : ffff0000c27d3ad0 x7 : fefefefefefefeff x6 : 0000646e756f626e
x5 : ffff0000c0f67840 x4 : 0000000000000000 x3 : ffff0000c2398000
x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000
Call trace:
io_do_iopoll+0x344/0x360
io_uring_try_cancel_requests+0x21c/0x334
io_ring_exit_work+0x90/0x40c
process_one_work+0x1a4/0x254
worker_thread+0x1ec/0x258
kthread+0xb8/0xc8
ret_from_fork+0x10/0x20
Add a cond_resched() in the cancelation IOPOLL loop to fix this.
Cc: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9c92ca081c11..fab581a31dc1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3160,6 +3160,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
+ cond_resched();
}
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b5d3ae202fbfe055aa2a8ae8524531ee1dcab717
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678099699105122(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
b5d3ae202fbf ("io_uring: handle TIF_NOTIFY_RESUME when checking for task_work")
7cfe7a09489c ("io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available")
46a525e199e4 ("io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL")
c0e0d6ba25f1 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
b4c98d59a787 ("io_uring: introduce io_has_work")
78a861b94959 ("io_uring: add sync cancelation API through io_uring_register()")
c34398a8c018 ("io_uring: remove __io_req_task_work_add")
ed5ccb3beeba ("io_uring: remove priority tw list optimisation")
625d38b3fd34 ("io_uring: improve io_run_task_work()")
4a0fef62788b ("io_uring: optimize io_uring_task layout")
253993210bd8 ("io_uring: introduce locking helpers for CQE posting")
305bef988708 ("io_uring: hide eventfd assumptions in eventfd paths")
affa87db9010 ("io_uring: fix multi ctx cancellation")
d9dee4302a7c ("io_uring: remove ->flush_cqes optimisation")
a830ffd28780 ("io_uring: move io_eventfd_signal()")
9046c6415be6 ("io_uring: reshuffle io_uring/io_uring.h")
d142c3ec8d16 ("io_uring: remove extra io_commit_cqring()")
ab1c84d855cf ("io_uring: make io_uring_types.h public")
68494a65d0e2 ("io_uring: introduce io_req_cqe_overflow()")
faf88dde060f ("io_uring: don't inline __io_get_cqe()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5d3ae202fbfe055aa2a8ae8524531ee1dcab717 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Tue, 24 Jan 2023 08:24:25 -0700
Subject: [PATCH] io_uring: handle TIF_NOTIFY_RESUME when checking for
task_work
If TIF_NOTIFY_RESUME is set, then we need to call resume_user_mode_work()
for PF_IO_WORKER threads. They never return to usermode, hence never get
a chance to process any items that are marked by this flag. Most notably
this includes the final put of files, but also any throttling markers set
by block cgroups.
Cc: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index c68edf9872a5..d58cfe062da9 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -3,6 +3,7 @@
#include <linux/errno.h>
#include <linux/lockdep.h>
+#include <linux/resume_user_mode.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
@@ -274,6 +275,13 @@ static inline int io_run_task_work(void)
*/
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
clear_notify_signal();
+ /*
+ * PF_IO_WORKER never returns to userspace, so check here if we have
+ * notify work that needs processing.
+ */
+ if (current->flags & PF_IO_WORKER &&
+ test_thread_flag(TIF_NOTIFY_RESUME))
+ resume_user_mode_work(NULL);
if (task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
task_work_run();
Hello Good Morning,
This is to bring to your notice that all our efforts to contact you
through this your email ID failed Please Kindly contact Barrister.
Steven Mike { mbarrsteven(a)gmail.com } on his private email for the
claim of your compensation entitlement
Note: You have to pay for the delivery fee.
Yours Sincerely
Mrs EVE LEWIS
Dzień dobry,
jakiś czas temu zgłosiła się do nas firma, której strona internetowa nie pozycjonowała się wysoko w wyszukiwarce Google.
Na podstawie wykonanego przez nas audytu SEO zoptymalizowaliśmy treści na stronie pod kątem wcześniej opracowanych słów kluczowych. Nasz wewnętrzny system codziennie analizuje prawidłowe działanie witryny. Dzięki indywidualnej strategii, firma zdobywa coraz więcej Klientów.
Czy chcieliby Państwo zwiększyć liczbę osób odwiedzających stronę internetową firmy? Mógłbym przedstawić ofertę?
Pozdrawiam serdecznie,
Dominik Perkowski
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to fail.
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: d5ef16ba5fbe ("memory: tegra20: Support interconnect framework")
Cc: stable(a)vger.kernel.org # 5.11
Cc: Dmitry Osipenko <digetx(a)gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/memory/tegra/tegra30-emc.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/memory/tegra/tegra30-emc.c b/drivers/memory/tegra/tegra30-emc.c
index 77706e9bc543..c91e9b7e2e01 100644
--- a/drivers/memory/tegra/tegra30-emc.c
+++ b/drivers/memory/tegra/tegra30-emc.c
@@ -1533,15 +1533,13 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
emc->provider.aggregate = soc->icc_ops->aggregate;
emc->provider.xlate_extended = emc_of_icc_xlate_extended;
- err = icc_provider_add(&emc->provider);
- if (err)
- goto err_msg;
+ icc_provider_init(&emc->provider);
/* create External Memory Controller node */
node = icc_node_create(TEGRA_ICC_EMC);
if (IS_ERR(node)) {
err = PTR_ERR(node);
- goto del_provider;
+ goto err_msg;
}
node->name = "External Memory Controller";
@@ -1562,12 +1560,14 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
node->name = "External Memory (DRAM)";
icc_node_add(node, &emc->provider);
+ err = icc_provider_register(&emc->provider);
+ if (err)
+ goto remove_nodes;
+
return 0;
remove_nodes:
icc_nodes_remove(&emc->provider);
-del_provider:
- icc_provider_del(&emc->provider);
err_msg:
dev_err(emc->dev, "failed to initialize ICC: %d\n", err);
--
2.39.2
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to fail.
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: d5ef16ba5fbe ("memory: tegra20: Support interconnect framework")
Cc: stable(a)vger.kernel.org # 5.11
Cc: Dmitry Osipenko <digetx(a)gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/memory/tegra/tegra20-emc.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/memory/tegra/tegra20-emc.c b/drivers/memory/tegra/tegra20-emc.c
index bd4e37b6552d..fd595c851a27 100644
--- a/drivers/memory/tegra/tegra20-emc.c
+++ b/drivers/memory/tegra/tegra20-emc.c
@@ -1021,15 +1021,13 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
emc->provider.aggregate = soc->icc_ops->aggregate;
emc->provider.xlate_extended = emc_of_icc_xlate_extended;
- err = icc_provider_add(&emc->provider);
- if (err)
- goto err_msg;
+ icc_provider_init(&emc->provider);
/* create External Memory Controller node */
node = icc_node_create(TEGRA_ICC_EMC);
if (IS_ERR(node)) {
err = PTR_ERR(node);
- goto del_provider;
+ goto err_msg;
}
node->name = "External Memory Controller";
@@ -1050,12 +1048,14 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
node->name = "External Memory (DRAM)";
icc_node_add(node, &emc->provider);
+ err = icc_provider_register(&emc->provider);
+ if (err)
+ goto remove_nodes;
+
return 0;
remove_nodes:
icc_nodes_remove(&emc->provider);
-del_provider:
- icc_provider_del(&emc->provider);
err_msg:
dev_err(emc->dev, "failed to initialize ICC: %d\n", err);
--
2.39.2
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to fail.
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: 380def2d4cf2 ("memory: tegra124: Support interconnect framework")
Cc: stable(a)vger.kernel.org # 5.12
Cc: Dmitry Osipenko <digetx(a)gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/memory/tegra/tegra124-emc.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/memory/tegra/tegra124-emc.c b/drivers/memory/tegra/tegra124-emc.c
index 85bc936c02f9..00ed2b6a0d1b 100644
--- a/drivers/memory/tegra/tegra124-emc.c
+++ b/drivers/memory/tegra/tegra124-emc.c
@@ -1351,15 +1351,13 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
emc->provider.aggregate = soc->icc_ops->aggregate;
emc->provider.xlate_extended = emc_of_icc_xlate_extended;
- err = icc_provider_add(&emc->provider);
- if (err)
- goto err_msg;
+ icc_provider_init(&emc->provider);
/* create External Memory Controller node */
node = icc_node_create(TEGRA_ICC_EMC);
if (IS_ERR(node)) {
err = PTR_ERR(node);
- goto del_provider;
+ goto err_msg;
}
node->name = "External Memory Controller";
@@ -1380,12 +1378,14 @@ static int tegra_emc_interconnect_init(struct tegra_emc *emc)
node->name = "External Memory (DRAM)";
icc_node_add(node, &emc->provider);
+ err = icc_provider_register(&emc->provider);
+ if (err)
+ goto remove_nodes;
+
return 0;
remove_nodes:
icc_nodes_remove(&emc->provider);
-del_provider:
- icc_provider_del(&emc->provider);
err_msg:
dev_err(emc->dev, "failed to initialize ICC: %d\n", err);
--
2.39.2
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to fail.
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: 06f079816d4c ("memory: tegra-mc: Add interconnect framework")
Cc: stable(a)vger.kernel.org # 5.11
Cc: Dmitry Osipenko <digetx(a)gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/memory/tegra/mc.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c
index 592907546ee6..5cd28619ea9f 100644
--- a/drivers/memory/tegra/mc.c
+++ b/drivers/memory/tegra/mc.c
@@ -794,16 +794,12 @@ static int tegra_mc_interconnect_setup(struct tegra_mc *mc)
mc->provider.aggregate = mc->soc->icc_ops->aggregate;
mc->provider.xlate_extended = mc->soc->icc_ops->xlate_extended;
- err = icc_provider_add(&mc->provider);
- if (err)
- return err;
+ icc_provider_init(&mc->provider);
/* create Memory Controller node */
node = icc_node_create(TEGRA_ICC_MC);
- if (IS_ERR(node)) {
- err = PTR_ERR(node);
- goto del_provider;
- }
+ if (IS_ERR(node))
+ return PTR_ERR(node);
node->name = "Memory Controller";
icc_node_add(node, &mc->provider);
@@ -830,12 +826,14 @@ static int tegra_mc_interconnect_setup(struct tegra_mc *mc)
goto remove_nodes;
}
+ err = icc_provider_register(&mc->provider);
+ if (err)
+ goto remove_nodes;
+
return 0;
remove_nodes:
icc_nodes_remove(&mc->provider);
-del_provider:
- icc_provider_del(&mc->provider);
return err;
}
--
2.39.2
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to trigger a NULL-pointer
deference when either a NULL pointer or not fully initialised node is
returned from exynos_generic_icc_xlate().
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: 2f95b9d5cf0b ("interconnect: Add generic interconnect driver for Exynos SoCs")
Cc: stable(a)vger.kernel.org # 5.11
Cc: Sylwester Nawrocki <s.nawrocki(a)samsung.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/interconnect/samsung/exynos.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/interconnect/samsung/exynos.c b/drivers/interconnect/samsung/exynos.c
index e70665899482..72e42603823b 100644
--- a/drivers/interconnect/samsung/exynos.c
+++ b/drivers/interconnect/samsung/exynos.c
@@ -98,12 +98,13 @@ static int exynos_generic_icc_remove(struct platform_device *pdev)
struct exynos_icc_priv *priv = platform_get_drvdata(pdev);
struct icc_node *parent_node, *node = priv->node;
+ icc_provider_deregister(&priv->provider);
+
parent_node = exynos_icc_get_parent(priv->dev->parent->of_node);
if (parent_node && !IS_ERR(parent_node))
icc_link_destroy(node, parent_node);
icc_nodes_remove(&priv->provider);
- icc_provider_del(&priv->provider);
return 0;
}
@@ -132,15 +133,11 @@ static int exynos_generic_icc_probe(struct platform_device *pdev)
provider->inter_set = true;
provider->data = priv;
- ret = icc_provider_add(provider);
- if (ret < 0)
- return ret;
+ icc_provider_init(provider);
icc_node = icc_node_create(pdev->id);
- if (IS_ERR(icc_node)) {
- ret = PTR_ERR(icc_node);
- goto err_prov_del;
- }
+ if (IS_ERR(icc_node))
+ return PTR_ERR(icc_node);
priv->node = icc_node;
icc_node->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%pOFn",
@@ -171,14 +168,17 @@ static int exynos_generic_icc_probe(struct platform_device *pdev)
goto err_pmqos_del;
}
+ ret = icc_provider_register(provider);
+ if (ret < 0)
+ goto err_pmqos_del;
+
return 0;
err_pmqos_del:
dev_pm_qos_remove_request(&priv->qos_req);
err_node_del:
icc_nodes_remove(provider);
-err_prov_del:
- icc_provider_del(provider);
+
return ret;
}
--
2.39.2
The current interconnect provider registration interface is inherently
racy as nodes are not added until the after adding the provider. This
can specifically cause racing DT lookups to fail:
of_icc_xlate_onecell: invalid index 0
cpu cpu0: error -EINVAL: error finding src node
cpu cpu0: dev_pm_opp_of_find_icc_paths: Unable to get path0: -22
qcom-cpufreq-hw: probe of 18591000.cpufreq failed with error -22
Switch to using the new API where the provider is not registered until
after it has been fully initialised.
Fixes: 5bc9900addaf ("interconnect: qcom: Add OSM L3 interconnect provider support")
Cc: stable(a)vger.kernel.org # 5.7
Reviewed-by: Konrad Dybcio <konrad.dybcio(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/interconnect/qcom/osm-l3.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/interconnect/qcom/osm-l3.c b/drivers/interconnect/qcom/osm-l3.c
index 5fa171087425..3a1cbfe3e481 100644
--- a/drivers/interconnect/qcom/osm-l3.c
+++ b/drivers/interconnect/qcom/osm-l3.c
@@ -158,8 +158,8 @@ static int qcom_osm_l3_remove(struct platform_device *pdev)
{
struct qcom_osm_l3_icc_provider *qp = platform_get_drvdata(pdev);
+ icc_provider_deregister(&qp->provider);
icc_nodes_remove(&qp->provider);
- icc_provider_del(&qp->provider);
return 0;
}
@@ -245,14 +245,9 @@ static int qcom_osm_l3_probe(struct platform_device *pdev)
provider->set = qcom_osm_l3_set;
provider->aggregate = icc_std_aggregate;
provider->xlate = of_icc_xlate_onecell;
- INIT_LIST_HEAD(&provider->nodes);
provider->data = data;
- ret = icc_provider_add(provider);
- if (ret) {
- dev_err(&pdev->dev, "error adding interconnect provider\n");
- return ret;
- }
+ icc_provider_init(provider);
for (i = 0; i < num_nodes; i++) {
size_t j;
@@ -275,12 +270,15 @@ static int qcom_osm_l3_probe(struct platform_device *pdev)
}
data->num_nodes = num_nodes;
+ ret = icc_provider_register(provider);
+ if (ret)
+ goto err;
+
platform_set_drvdata(pdev, qp);
return 0;
err:
icc_nodes_remove(provider);
- icc_provider_del(provider);
return ret;
}
--
2.39.2
The patch below was submitted to be applied to the 5.15-stable tree.
I fail to see how this patch meets the stable kernel rules as found at
Documentation/process/stable-kernel-rules.rst.
I could be totally wrong, and if so, please respond to
<stable(a)vger.kernel.org> and let me know why this patch should be
applied. Otherwise, it is now dropped from my patch queues, never to be
seen again.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9e8b89926fb87e5625bdde6fd5de2c31fb1d83bf Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard(a)mvista.com>
Date: Wed, 25 Jan 2023 10:41:48 -0600
Subject: [PATCH] ipmi:ssif: Remove rtc_us_timer
It was cruft left over from older handling of run to completion.
Cc: stable(a)vger.kernel.org
Signed-off-by: Corey Minyard <cminyard(a)mvista.com>
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 5a377e86dade..c25c4b1a03ae 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -241,12 +241,6 @@ struct ssif_info {
*/
bool req_flags;
- /*
- * Used to perform timer operations when run-to-completion
- * mode is on. This is a countdown timer.
- */
- int rtc_us_timer;
-
/* Used for sending/receiving data. +1 for the length. */
unsigned char data[IPMI_MAX_MSG_LENGTH + 1];
unsigned int data_len;
@@ -530,7 +524,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
static void start_get(struct ssif_info *ssif_info)
{
- ssif_info->rtc_us_timer = 0;
ssif_info->multi_pos = 0;
ssif_i2c_send(ssif_info, msg_done_handler, I2C_SMBUS_READ,
@@ -622,7 +615,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
ssif_info->waiting_alert = true;
- ssif_info->rtc_us_timer = SSIF_MSG_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_JIFFIES);
@@ -973,7 +965,6 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
/* Wait a jiffie then request the next message */
ssif_info->waiting_alert = true;
ssif_info->retries_left = SSIF_RECV_RETRIES;
- ssif_info->rtc_us_timer = SSIF_MSG_PART_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_PART_JIFFIES);
The patch below was submitted to be applied to the 6.1-stable tree.
I fail to see how this patch meets the stable kernel rules as found at
Documentation/process/stable-kernel-rules.rst.
I could be totally wrong, and if so, please respond to
<stable(a)vger.kernel.org> and let me know why this patch should be
applied. Otherwise, it is now dropped from my patch queues, never to be
seen again.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9e8b89926fb87e5625bdde6fd5de2c31fb1d83bf Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard(a)mvista.com>
Date: Wed, 25 Jan 2023 10:41:48 -0600
Subject: [PATCH] ipmi:ssif: Remove rtc_us_timer
It was cruft left over from older handling of run to completion.
Cc: stable(a)vger.kernel.org
Signed-off-by: Corey Minyard <cminyard(a)mvista.com>
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 5a377e86dade..c25c4b1a03ae 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -241,12 +241,6 @@ struct ssif_info {
*/
bool req_flags;
- /*
- * Used to perform timer operations when run-to-completion
- * mode is on. This is a countdown timer.
- */
- int rtc_us_timer;
-
/* Used for sending/receiving data. +1 for the length. */
unsigned char data[IPMI_MAX_MSG_LENGTH + 1];
unsigned int data_len;
@@ -530,7 +524,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
static void start_get(struct ssif_info *ssif_info)
{
- ssif_info->rtc_us_timer = 0;
ssif_info->multi_pos = 0;
ssif_i2c_send(ssif_info, msg_done_handler, I2C_SMBUS_READ,
@@ -622,7 +615,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
ssif_info->waiting_alert = true;
- ssif_info->rtc_us_timer = SSIF_MSG_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_JIFFIES);
@@ -973,7 +965,6 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
/* Wait a jiffie then request the next message */
ssif_info->waiting_alert = true;
ssif_info->retries_left = SSIF_RECV_RETRIES;
- ssif_info->rtc_us_timer = SSIF_MSG_PART_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_PART_JIFFIES);
The patch below was submitted to be applied to the 6.2-stable tree.
I fail to see how this patch meets the stable kernel rules as found at
Documentation/process/stable-kernel-rules.rst.
I could be totally wrong, and if so, please respond to
<stable(a)vger.kernel.org> and let me know why this patch should be
applied. Otherwise, it is now dropped from my patch queues, never to be
seen again.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9e8b89926fb87e5625bdde6fd5de2c31fb1d83bf Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard(a)mvista.com>
Date: Wed, 25 Jan 2023 10:41:48 -0600
Subject: [PATCH] ipmi:ssif: Remove rtc_us_timer
It was cruft left over from older handling of run to completion.
Cc: stable(a)vger.kernel.org
Signed-off-by: Corey Minyard <cminyard(a)mvista.com>
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 5a377e86dade..c25c4b1a03ae 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -241,12 +241,6 @@ struct ssif_info {
*/
bool req_flags;
- /*
- * Used to perform timer operations when run-to-completion
- * mode is on. This is a countdown timer.
- */
- int rtc_us_timer;
-
/* Used for sending/receiving data. +1 for the length. */
unsigned char data[IPMI_MAX_MSG_LENGTH + 1];
unsigned int data_len;
@@ -530,7 +524,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
static void start_get(struct ssif_info *ssif_info)
{
- ssif_info->rtc_us_timer = 0;
ssif_info->multi_pos = 0;
ssif_i2c_send(ssif_info, msg_done_handler, I2C_SMBUS_READ,
@@ -622,7 +615,6 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result,
flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
ssif_info->waiting_alert = true;
- ssif_info->rtc_us_timer = SSIF_MSG_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_JIFFIES);
@@ -973,7 +965,6 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
/* Wait a jiffie then request the next message */
ssif_info->waiting_alert = true;
ssif_info->retries_left = SSIF_RECV_RETRIES;
- ssif_info->rtc_us_timer = SSIF_MSG_PART_USEC;
if (!ssif_info->stopping)
mod_timer(&ssif_info->retry_timer,
jiffies + SSIF_MSG_PART_JIFFIES);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 310726c33ad76cebdee312dbfafc12c1b44bf977
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16780875786811(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
310726c33ad7 ("block: be a bit more careful in checking for NULL bdev while polling")
69fe0f298920 ("block: add ->poll_bio to block_device_operations")
7f36b7d02a28 ("block: move blk_crypto_bio_prep() out of blk-mq.c")
a650628bde77 ("block: move submit_bio_checks() into submit_bio_noacct")
9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter")
a08ed9aae8a3 ("block: fix double bio queue when merging in cached request path")
5f480b1a6325 ("blk-mq: use bio->bi_opf after bio is checked")
5b13bc8a3fd5 ("blk-mq: cleanup request allocation")
0c5bcc92d94a ("blk-mq: simplify the plug handling in blk_mq_submit_bio")
95febeb61bf8 ("block: fix missing queue put in error path")
b637108a4022 ("blk-mq: fix filesystem I/O request allocation")
b131f2011115 ("blk-mq: rename blk_attempt_bio_merge")
10c47870155b ("block: ensure cached plug request matches the current queue")
900e08075202 ("block: move queue enter logic into blk_mq_submit_bio()")
c98cb5bbdab1 ("block: make bio_queue_enter() fast-path available inline")
71539717c105 ("block: split request allocation components into helpers")
a1cb65377e70 ("blk-mq: only try to run plug merge if request has same queue with incoming bio")
e94f68527a35 ("block: kill extra rcu lock/unlock in queue enter")
179ae84f7ef5 ("block: clean up blk_mq_submit_bio() merging")
1497a51a3287 ("block: don't bloat enter_queue with percpu_ref")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 310726c33ad76cebdee312dbfafc12c1b44bf977 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Fri, 24 Feb 2023 10:01:19 -0700
Subject: [PATCH] block: be a bit more careful in checking for NULL bdev while
polling
Wei reports a crash with an application using polled IO:
PGD 14265e067 P4D 14265e067 PUD 47ec50067 PMD 0
Oops: 0000 [#1] SMP
CPU: 0 PID: 21915 Comm: iocore_0 Kdump: loaded Tainted: G S 5.12.0-0_fbk12_clang_7346_g1bb6f2e7058f #1
Hardware name: Wiwynn Delta Lake MP T8/Delta Lake-Class2, BIOS Y3DLM08 04/10/2022
RIP: 0010:bio_poll+0x25/0x200
Code: 0f 1f 44 00 00 0f 1f 44 00 00 55 41 57 41 56 41 55 41 54 53 48 83 ec 28 65 48 8b 04 25 28 00 00 00 48 89 44 24 20 48 8b 47 08 <48> 8b 80 70 02 00 00 4c 8b 70 50 8b 6f 34 31 db 83 fd ff 75 25 65
RSP: 0018:ffffc90005fafdf8 EFLAGS: 00010292
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 74b43cd65dd66600
RDX: 0000000000000003 RSI: ffffc90005fafe78 RDI: ffff8884b614e140
RBP: ffff88849964df78 R08: 0000000000000000 R09: 0000000000000008
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88849964df00
R13: ffffc90005fafe78 R14: ffff888137d3c378 R15: 0000000000000001
FS: 00007fd195000640(0000) GS:ffff88903f400000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000270 CR3: 0000000466121001 CR4: 00000000007706f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
iocb_bio_iopoll+0x1d/0x30
io_do_iopoll+0xac/0x250
__se_sys_io_uring_enter+0x3c5/0x5a0
? __x64_sys_write+0x89/0xd0
do_syscall_64+0x2d/0x40
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x94f225d
Code: 24 cc 00 00 00 41 8b 84 24 d0 00 00 00 c1 e0 04 83 e0 10 41 09 c2 8b 33 8b 53 04 4c 8b 43 18 4c 63 4b 0c b8 aa 01 00 00 0f 05 <85> c0 0f 88 85 00 00 00 29 03 45 84 f6 0f 84 88 00 00 00 41 f6 c7
RSP: 002b:00007fd194ffcd88 EFLAGS: 00000202 ORIG_RAX: 00000000000001aa
RAX: ffffffffffffffda RBX: 00007fd194ffcdc0 RCX: 00000000094f225d
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000007
RBP: 00007fd194ffcdb0 R08: 0000000000000000 R09: 0000000000000008
R10: 0000000000000001 R11: 0000000000000202 R12: 00007fd269d68030
R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
which is due to bio->bi_bdev being NULL. This can happen if we have two
tasks doing polled IO, and task B ends up completing IO from task A if
they are sharing a poll queue. If task B completes the IO and puts the
bio into our cache, then it can allocate that bio again before task A
is done polling for it. As that would necessitate a preempt between the
two tasks, it's enough to just be a bit more careful in checking for
whether or not bio->bi_bdev is NULL.
Reported-and-tested-by: Wei Zhang <wzhang(a)meta.com>
Cc: stable(a)vger.kernel.org
Fixes: be4d234d7aeb ("bio: add allocation cache abstraction")
Reviewed-by: Keith Busch <kbusch(a)kernel.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-core.c b/block/blk-core.c
index 5fb6856745b4..5d7e2ca75234 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -854,10 +854,16 @@ EXPORT_SYMBOL(submit_bio);
*/
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
+ struct block_device *bdev;
+ struct request_queue *q;
int ret = 0;
+ bdev = READ_ONCE(bio->bi_bdev);
+ if (!bdev)
+ return 0;
+
+ q = bdev_get_queue(bdev);
if (cookie == BLK_QC_T_NONE ||
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
@@ -926,7 +932,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
*/
rcu_read_lock();
bio = READ_ONCE(kiocb->private);
- if (bio && bio->bi_bdev)
+ if (bio)
ret = bio_poll(bio, iob, flags);
rcu_read_unlock();
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 11eb695feb636fa5211067189cad25ac073e7fe5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678087561207107(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
11eb695feb63 ("block: clear bio->bi_bdev when putting a bio back in the cache")
f25cf75a4521 ("bio: split pcpu cache part of bio_put into a helper")
d4347d50407d ("bio: safeguard REQ_ALLOC_CACHE bio put")
0df71650c051 ("block: allow using the per-cpu bio cache from bio_alloc_bioset")
49add4966d79 ("block: pass a block_device and opf to bio_init")
07888c665b40 ("block: pass a block_device and opf to bio_alloc")
b77c88c2100c ("block: pass a block_device and opf to bio_alloc_kiocb")
609be1066731 ("block: pass a block_device and opf to bio_alloc_bioset")
0a3140ea0fae ("block: pass a block_device and opf to blk_next_bio")
3b005bf6acf0 ("block: move blk_next_bio to bio.c")
7d8d0c658d48 ("xen-blkback: bio_alloc can't fail if it is allow to sleep")
d7b78de2b155 ("rnbd-srv: remove struct rnbd_dev_blk_io")
1fe0640ff94f ("rnbd-srv: simplify bio mapping in process_rdma")
4b1dc86d1857 ("drbd: bio_alloc can't fail if it is allow to sleep")
3f868c09ea8f ("dm-crypt: remove clone_init")
53db984e004c ("dm: bio_alloc can't fail if it is allowed to sleep")
39146b6f66ba ("ntfs3: remove ntfs_alloc_bio")
5d2ca2132f88 ("nfs/blocklayout: remove bl_alloc_init_bio")
f0d911927b3c ("nilfs2: remove nilfs_alloc_seg_bio")
d5f68a42da7a ("fs: remove mpage_alloc")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 11eb695feb636fa5211067189cad25ac073e7fe5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Fri, 24 Feb 2023 09:59:44 -0700
Subject: [PATCH] block: clear bio->bi_bdev when putting a bio back in the
cache
This isn't strictly needed in terms of correctness, but it does allow
polling to know if the bio has been put already by a different task
and hence avoid polling something that we don't need to.
Cc: stable(a)vger.kernel.org
Fixes: be4d234d7aeb ("bio: add allocation cache abstraction")
Reviewed-by: Keith Busch <kbusch(a)kernel.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/bio.c b/block/bio.c
index 2693f34afb7e..605c40025068 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -772,6 +772,7 @@ static inline void bio_put_percpu_cache(struct bio *bio)
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
bio->bi_next = cache->free_list;
+ bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
} else {