Hello,
CVE-2019-12378 was fixed in the upstream linux kernel with the following commit.
* 95baa60a0da8 ("ipv6_sockglue: Fix a missing-check bug in ip6_ra_control()")
Could the patch be applied to v4.19.y, v4.14.y, v4.9.y and v4.4.y?
Tests run:
* Chrome OS tryjobs
Thanks,
- Zubin
Currently, ima_appraise_measurement() ignores the EVM status when
evm_verifyxattr() returns INTEGRITY_UNKNOWN. If a file has a valid
security.ima xattr with type IMA_XATTR_DIGEST or IMA_XATTR_DIGEST_NG,
ima_appraise_measurement() returns INTEGRITY_PASS regardless of the EVM
status. The problem is that the EVM status is overwritten with the
appraisal status.
This patch mitigates the issue by selecting signature verification as the
only method allowed for appraisal when EVM is not initialized. Since the
new behavior might break user space, it must be turned on by adding the
'-evm' suffix to the value of the ima_appraise= kernel option.
Fixes: 2fe5d6def1672 ("ima: integrity appraisal extension")
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
Cc: stable(a)vger.kernel.org
---
Documentation/admin-guide/kernel-parameters.txt | 3 ++-
security/integrity/ima/ima_appraise.c | 8 ++++++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..d84a2e612b93 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1585,7 +1585,8 @@
Set number of hash buckets for inode cache.
ima_appraise= [IMA] appraise integrity measurements
- Format: { "off" | "enforce" | "fix" | "log" }
+ Format: { "off" | "enforce" | "fix" | "log" |
+ "enforce-evm" | "log-evm" }
default: "enforce"
ima_appraise_tcb [IMA] Deprecated. Use ima_policy= instead.
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 5fb7127bbe68..afef06e10fb9 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -18,6 +18,7 @@
#include "ima.h"
+static bool ima_appraise_req_evm __ro_after_init;
static int __init default_appraise_setup(char *str)
{
#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
@@ -28,6 +29,9 @@ static int __init default_appraise_setup(char *str)
else if (strncmp(str, "fix", 3) == 0)
ima_appraise = IMA_APPRAISE_FIX;
#endif
+ if (strcmp(str, "enforce-evm") == 0 ||
+ strcmp(str, "log-evm") == 0)
+ ima_appraise_req_evm = true;
return 1;
}
@@ -245,7 +249,11 @@ int ima_appraise_measurement(enum ima_hooks func,
switch (status) {
case INTEGRITY_PASS:
case INTEGRITY_PASS_IMMUTABLE:
+ break;
case INTEGRITY_UNKNOWN:
+ if (ima_appraise_req_evm &&
+ xattr_value->type != EVM_IMA_XATTR_DIGSIG)
+ goto out;
break;
case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */
case INTEGRITY_NOLABEL: /* No security.evm xattr. */
--
2.17.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a86cb413f4bf273a9d341a3ab2c2ca44e12eb317 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth(a)redhat.com>
Date: Thu, 23 May 2019 18:43:08 +0200
Subject: [PATCH] KVM: s390: Do not report unusabled IDs via
KVM_CAP_MAX_VCPU_ID
KVM_CAP_MAX_VCPU_ID is currently always reporting KVM_MAX_VCPU_ID on all
architectures. However, on s390x, the amount of usable CPUs is determined
during runtime - it is depending on the features of the machine the code
is running on. Since we are using the vcpu_id as an index into the SCA
structures that are defined by the hardware (see e.g. the sca_add_vcpu()
function), it is not only the amount of CPUs that is limited by the hard-
ware, but also the range of IDs that we can use.
Thus KVM_CAP_MAX_VCPU_ID must be determined during runtime on s390x, too.
So the handling of KVM_CAP_MAX_VCPU_ID has to be moved from the common
code into the architecture specific code, and on s390x we have to return
the same value here as for KVM_CAP_MAX_VCPUS.
This problem has been discovered with the kvm_create_max_vcpus selftest.
With this change applied, the selftest now passes on s390x, too.
Reviewed-by: Andrew Jones <drjones(a)redhat.com>
Reviewed-by: Cornelia Huck <cohuck(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Thomas Huth <thuth(a)redhat.com>
Message-Id: <20190523164309.13345-9-thuth(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 6d0517ac18e5..0369f26ab96d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1122,6 +1122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
case KVM_CAP_MIPS_FPU:
/* We don't handle systems with inconsistent cpu_has_fpu */
r = !!raw_cpu_has_fpu;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3393b166817a..aa3a678711be 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -657,6 +657,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_PPC_GET_SMMU_INFO:
r = 1;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e5e8eb29e68e..28ebd647784c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -539,6 +539,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
+ case KVM_CAP_MAX_VCPU_ID:
r = KVM_S390_BSCA_CPU_SLOTS;
if (!kvm_s390_use_sca_entries())
r = KVM_MAX_VCPUS;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index acb179f78fdc..83aefd759846 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3122,6 +3122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
break;
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 90cedebaeb94..7eeebe5e9da2 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -224,6 +224,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
case KVM_CAP_MSI_DEVID:
if (!kvm)
r = -EINVAL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 301089a462c4..ca54b09adf5b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3151,8 +3151,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_MULTI_ADDRESS_SPACE:
return KVM_ADDRESS_SPACE_NUM;
#endif
- case KVM_CAP_MAX_VCPU_ID:
- return KVM_MAX_VCPU_ID;
case KVM_CAP_NR_MEMSLOTS:
return KVM_USER_MEM_SLOTS;
default:
I'm announcing the release of the 5.0.21 kernel.
All users of the 5.0 kernel series must upgrade.
Note, this is the LAST 5.0.y kernel to be released. It is now
end-of-life. Please move to the 5.1.y kernel tree at this point in
time.
The updated 5.0.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.0.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
drivers/crypto/vmx/ghash.c | 212 ++++++-----------
drivers/net/bonding/bond_main.c | 15 -
drivers/net/dsa/mv88e6xxx/chip.c | 2
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 -
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2
drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 11
drivers/net/ethernet/freescale/fec_main.c | 2
drivers/net/ethernet/marvell/mvneta.c | 4
drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 6
drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c | 11
drivers/net/ethernet/realtek/r8169.c | 3
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8
drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 3
drivers/net/phy/marvell10g.c | 13 +
drivers/net/usb/usbnet.c | 6
drivers/xen/xen-pciback/pciback_ops.c | 2
include/linux/siphash.h | 5
include/net/netns/ipv4.h | 2
include/uapi/linux/tipc_config.h | 10
net/core/dev.c | 2
net/core/skbuff.c | 6
net/ipv4/igmp.c | 47 ++-
net/ipv4/ip_output.c | 4
net/ipv4/route.c | 12
net/ipv6/ip6_output.c | 4
net/ipv6/output_core.c | 30 +-
net/ipv6/raw.c | 2
net/ipv6/route.c | 6
net/llc/llc_output.c | 2
net/sched/act_api.c | 3
net/tipc/core.c | 32 +-
net/tipc/subscr.h | 5
net/tipc/topsrv.c | 14 -
net/tls/tls_device.c | 9
41 files changed, 312 insertions(+), 244 deletions(-)
Andy Duan (1):
net: fec: fix the clk mismatch in failed_reset path
Antoine Tenart (1):
net: mvpp2: fix bad MVPP2_TXQ_SCHED_TOKEN_CNTR_REG queue value
Chris Packham (1):
tipc: Avoid copying bytes beyond the supplied data
Daniel Axtens (1):
crypto: vmx - ghash: do nosimd fallback manually
David Ahern (1):
ipv6: Fix redirect with VRF
David S. Miller (1):
Revert "tipc: fix modprobe tipc failed after switch order of device registration"
Eric Dumazet (5):
inet: switch IP ID generator to siphash
ipv4/igmp: fix another memory leak in igmpv3_del_delrec()
ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST
llc: fix skb leak in llc_build_and_send_ui_pkt()
net-gro: fix use-after-free read in napi_gro_frags()
Greg Kroah-Hartman (1):
Linux 5.0.21
Heiner Kallweit (1):
r8169: fix MAC address being lost in PCI D3
Jakub Kicinski (2):
net/tls: fix state removal with feature flags off
net/tls: don't ignore netdev notifications if no TLS features
Jarod Wilson (1):
bonding/802.3ad: fix slave link initialization transition states
Jiri Pirko (1):
mlxsw: spectrum_acl: Avoid warning after identical rules insertion
Jisheng Zhang (2):
net: mvneta: Fix err code path of probe
net: stmmac: fix reset gpio free missing
Junwei Hu (1):
tipc: fix modprobe tipc failed after switch order of device registration
Kloetzke Jan (1):
usbnet: fix kernel crash after disconnect
Konrad Rzeszutek Wilk (1):
xen/pciback: Don't disable PCI_COMMAND on PCI device reset.
Michael Chan (3):
bnxt_en: Fix aggregation buffer leak under OOM condition.
bnxt_en: Fix possible BUG() condition when calling pci_disable_msix().
bnxt_en: Reduce memory usage when running in kdump kernel.
Mike Manning (1):
ipv6: Consider sk_bound_dev_if when binding a raw socket to an address
Parav Pandit (2):
net/mlx5: Avoid double free in fs init error unwinding path
net/mlx5: Allocate root ns memory using kzalloc to match kfree
Raju Rangoju (1):
cxgb4: offload VLAN flows regardless of VLAN ethtype
Rasmus Villemoes (1):
net: dsa: mv88e6xxx: fix handling of upper half of STATS_TYPE_PORT
Russell King (1):
net: phy: marvell10g: report if the PHY fails to boot firmware
Saeed Mahameed (1):
net/mlx5e: Disable rxhash when CQE compress is enabled
Tan, Tee Min (1):
net: stmmac: fix ethtool flow control not able to get/set
Vishal Kulkarni (1):
cxgb4: Revert "cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size"
Vlad Buslov (1):
net: sched: don't use tc_action->order during action dump
Weifeng Voon (1):
net: stmmac: dma channel control register need to be init first
Willem de Bruijn (1):
net: correct zerocopy refcnt with udp MSG_MORE
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Fri, 17 May 2019 10:44:25 +0300
Subject: [PATCH] btrfs: Ensure replaced device doesn't have pending chunk
allocation
Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
operations during transaction commit") combined the way certain
operations are recoded in a transaction. As a result an ASSERT was added
in dev_replace_finish to ensure the new code works correctly.
Unfortunately I got reports that it's possible to trigger the assert,
meaning that during a device replace it's possible to have an unfinished
chunk allocation on the source device.
This is supposed to be prevented by the fact that a transaction is
committed before finishing the replace oepration and alter acquiring the
chunk mutex. This is not sufficient since by the time the transaction is
committed and the chunk mutex acquired it's possible to allocate a chunk
depending on the workload being executed on the replaced device. This
bug has been present ever since device replace was introduced but there
was never code which checks for it.
The correct way to fix is to ensure that there is no pending device
modification operation when the chunk mutex is acquire and if there is
repeat transaction commit. Unfortunately it's not possible to just
exclude the source device from btrfs_fs_devices::dev_alloc_list since
this causes ENOSPC to be hit in transaction commit.
Fixing that in another way would need to add special cases to handle the
last writes and forbid new ones. The looped transaction fix is more
obvious, and can be easily backported. The runtime of dev-replace is
long so there's no noticeable delay caused by that.
Reported-by: David Sterba <dsterba(a)suse.com>
Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 55c15f31d00d..ee0989c7e3a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -603,17 +603,33 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+ * chunks shouldn't be allocated on the device.
+ */
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* Prevent write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /* Prevent new chunks being allocated on the source device */
+ mutex_lock(&fs_info->chunk_mutex);
+
+ if (!list_empty(&src_device->post_commit_list)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -662,7 +678,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
- ASSERT(list_empty(&src_device->post_commit_list));
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Fri, 17 May 2019 10:44:25 +0300
Subject: [PATCH] btrfs: Ensure replaced device doesn't have pending chunk
allocation
Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
operations during transaction commit") combined the way certain
operations are recoded in a transaction. As a result an ASSERT was added
in dev_replace_finish to ensure the new code works correctly.
Unfortunately I got reports that it's possible to trigger the assert,
meaning that during a device replace it's possible to have an unfinished
chunk allocation on the source device.
This is supposed to be prevented by the fact that a transaction is
committed before finishing the replace oepration and alter acquiring the
chunk mutex. This is not sufficient since by the time the transaction is
committed and the chunk mutex acquired it's possible to allocate a chunk
depending on the workload being executed on the replaced device. This
bug has been present ever since device replace was introduced but there
was never code which checks for it.
The correct way to fix is to ensure that there is no pending device
modification operation when the chunk mutex is acquire and if there is
repeat transaction commit. Unfortunately it's not possible to just
exclude the source device from btrfs_fs_devices::dev_alloc_list since
this causes ENOSPC to be hit in transaction commit.
Fixing that in another way would need to add special cases to handle the
last writes and forbid new ones. The looped transaction fix is more
obvious, and can be easily backported. The runtime of dev-replace is
long so there's no noticeable delay caused by that.
Reported-by: David Sterba <dsterba(a)suse.com>
Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 55c15f31d00d..ee0989c7e3a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -603,17 +603,33 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+ * chunks shouldn't be allocated on the device.
+ */
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* Prevent write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /* Prevent new chunks being allocated on the source device */
+ mutex_lock(&fs_info->chunk_mutex);
+
+ if (!list_empty(&src_device->post_commit_list)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -662,7 +678,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
- ASSERT(list_empty(&src_device->post_commit_list));
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Fri, 17 May 2019 10:44:25 +0300
Subject: [PATCH] btrfs: Ensure replaced device doesn't have pending chunk
allocation
Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
operations during transaction commit") combined the way certain
operations are recoded in a transaction. As a result an ASSERT was added
in dev_replace_finish to ensure the new code works correctly.
Unfortunately I got reports that it's possible to trigger the assert,
meaning that during a device replace it's possible to have an unfinished
chunk allocation on the source device.
This is supposed to be prevented by the fact that a transaction is
committed before finishing the replace oepration and alter acquiring the
chunk mutex. This is not sufficient since by the time the transaction is
committed and the chunk mutex acquired it's possible to allocate a chunk
depending on the workload being executed on the replaced device. This
bug has been present ever since device replace was introduced but there
was never code which checks for it.
The correct way to fix is to ensure that there is no pending device
modification operation when the chunk mutex is acquire and if there is
repeat transaction commit. Unfortunately it's not possible to just
exclude the source device from btrfs_fs_devices::dev_alloc_list since
this causes ENOSPC to be hit in transaction commit.
Fixing that in another way would need to add special cases to handle the
last writes and forbid new ones. The looped transaction fix is more
obvious, and can be easily backported. The runtime of dev-replace is
long so there's no noticeable delay caused by that.
Reported-by: David Sterba <dsterba(a)suse.com>
Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 55c15f31d00d..ee0989c7e3a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -603,17 +603,33 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+ * chunks shouldn't be allocated on the device.
+ */
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* Prevent write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /* Prevent new chunks being allocated on the source device */
+ mutex_lock(&fs_info->chunk_mutex);
+
+ if (!list_empty(&src_device->post_commit_list)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -662,7 +678,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
- ASSERT(list_empty(&src_device->post_commit_list));
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;