I happened to notice this commit:
9ca415399dae - "net/mlx5: Annotate mutex destroy for root ns"
...was backported to 4.19 and 5.4 and v5.6 in linux-stable.
It patches del_sw_root_ns() - which only exists after v5.7-rc7 from:
6eb7a268a99b - "net/mlx5: Don't maintain a case of del_sw_func being
null"
which creates the one line del_sw_root_ns stub function around
kfree(node) by breaking it out of tree_put_node().
In the absence of del_sw_root_ns - the backport finds an identical one
line kfree stub fcn - named del_sw_prio from this earlier commit:
139ed6c6c46a - "net/mlx5: Fix steering memory leak" [in v4.15-rc5]
and then puts the mutex_destroy() into that (wrong) function, instead of
putting it into tree_put_node where the root ns case used to be handled.
Paul.
Inode's i_io_list list head is used to attach inode to several different
lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When flush worker
prepares a list of inodes to writeback e.g. for sync(2), it moves inodes
to b_io list. Thus it is critical for sync(2) data integrity guarantees
that inode is not requeued to any other writeback list when inode is
queued for processing by flush worker. That's the reason why
writeback_single_inode() does not touch i_io_list (unless the inode is
completely clean) and why __mark_inode_dirty() does not touch i_io_list
if I_SYNC flag is set.
However there are two flaws in the current logic:
1) When inode has only I_DIRTY_TIME set but it is already queued in b_io
list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC)
can still move inode back to b_dirty list resulting in skipping
writeback of inode time stamps during sync(2).
2) When inode is on b_dirty_time list and writeback_single_inode() races
with __mark_inode_dirty() like:
writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES)
inode->i_state |= I_SYNC
__writeback_single_inode()
inode->i_state |= I_DIRTY_PAGES;
if (inode->i_state & I_SYNC)
bail
if (!(inode->i_state & I_DIRTY_ALL))
- not true so nothing done
We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus
standard background writeback will not writeback this inode leading to
possible dirty throttling stalls etc. (thanks to Martijn Coenen for this
analysis).
Fix these problems by tracking whether inode is queued in b_io or
b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we
know flush worker has queued inode and we should not touch i_io_list.
On the other hand we also know that once flush worker is done with the
inode it will requeue the inode to appropriate dirty list. When
I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode
to appropriate dirty list.
Reported-by: Martijn Coenen <maco(a)android.com>
Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option")
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/fs-writeback.c | 39 +++++++++++++++++++++++++++++----------
include/linux/fs.h | 8 ++++++--
2 files changed, 35 insertions(+), 12 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76ac9c7d32ec..855c6611710a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -144,7 +144,9 @@ static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1123,7 +1125,9 @@ void inode_io_list_del(struct inode *inode)
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
inode_io_list_del_locked(inode, wb);
+ spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
@@ -1172,8 +1176,9 @@ void sb_clear_inode_writeback(struct inode *inode)
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+static void __redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
+ assert_spin_locked(&inode->i_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
+ inode->i_state &= ~I_SYNC_QUEUED;
+}
+
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+{
+ spin_lock(&inode->i_lock);
+ __redirty_tail(inode, wb);
+ spin_unlock(&inode->i_lock);
}
/*
@@ -1250,8 +1263,11 @@ static int move_expired_inodes(struct list_head *delaying_queue,
break;
list_move(&inode->i_io_list, &tmp);
moved++;
+ spin_lock(&inode->i_lock);
if (flags & EXPIRE_DIRTY_ATIME)
- set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+ inode->i_state |= I_DIRTY_TIME_EXPIRED;
+ inode->i_state |= I_SYNC_QUEUED;
+ spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -1394,7 +1410,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode, wb);
+ __redirty_tail(inode, wb);
return;
}
@@ -1414,7 +1430,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode, wb);
+ __redirty_tail(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
@@ -1422,10 +1438,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
- redirty_tail(inode, wb);
+ __redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
inode_io_list_del_locked(inode, wb);
@@ -1669,8 +1686,9 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ inode->i_state &= ~I_SYNC_QUEUED;
+ __redirty_tail(inode, wb);
spin_unlock(&inode->i_lock);
- redirty_tail(inode, wb);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
@@ -2289,11 +2307,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
+ * If the inode is queued for writeback by flush worker, just
+ * update its dirty state. Once the flush worker is done with
+ * the inode it will place it on the appropriate superblock
+ * list, based upon its state.
*/
- if (inode->i_state & I_SYNC)
+ if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode;
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 45cc10cdf6dd..b02290d19edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2156,6 +2156,10 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
*
* I_CREATING New object's inode in the middle of setting up.
*
+ * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
+ * Used to detect that mark_inode_dirty() should not move
+ * inode between dirty lists.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
@@ -2173,11 +2177,11 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
#define I_LINKABLE (1 << 10)
#define I_DIRTY_TIME (1 << 11)
-#define __I_DIRTY_TIME_EXPIRED 12
-#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
+#define I_DIRTY_TIME_EXPIRED (1 << 12)
#define I_WB_SWITCH (1 << 13)
#define I_OVL_INUSE (1 << 14)
#define I_CREATING (1 << 15)
+#define I_SYNC_QUEUED (1 << 16)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
--
2.16.4
When userspace configures KVM_GUESTDBG_SINGLESTEP, KVM will manage the
presence of X86_EFLAGS_TF via kvm_set/get_rflags on vcpus. The actual
rflag bit is therefore hidden from callers.
That includes init_emulate_ctxt() which uses the value returned from
kvm_get_rflags() to set ctxt->tf. As a result, x86_emulate_instruction()
will skip a single step, leaving singlestep_rip stale and not returning
to userspace.
This resolves the issue by observing the vcpu guest_debug configuration
alongside ctxt->tf in x86_emulate_instruction(), performing the single
step if set.
Signed-off-by: Felipe Franciosi <felipe(a)nutanix.com>
---
arch/x86/kvm/x86.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..64cb183636da 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6919,7 +6919,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r && ctxt->tf)
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops.update_emulated_instruction)
kvm_x86_ops.update_emulated_instruction(vcpu);
--
2.20.1
Charan Teja reported a 'use-after-free' in dmabuffs_dname [1], which
happens if the dma_buf_release() is called while the userspace is
accessing the dma_buf pseudo fs's dmabuffs_dname() in another process,
and dma_buf_release() releases the dmabuf object when the last reference
to the struct file goes away.
I discussed with Arnd Bergmann, and he suggested that rather than tying
the dma_buf_release() to the file_operations' release(), we can tie it to
the dentry_operations' d_release(), which will be called when the last ref
to the dentry is removed.
The path exercised by __fput() calls f_op->release() first, and then calls
dput, which eventually calls d_op->d_release().
In the 'normal' case, when no userspace access is happening via dma_buf
pseudo fs, there should be exactly one fd, file, dentry and inode, so
closing the fd will kill off everything right away.
In the presented case, the dentry's d_release() will be called only when
the dentry's last ref is released.
Therefore, let's move dma_buf_release() from fops->release() to
d_ops->d_release().
Many thanks to Arnd for his FS insights :)
[1]: https://lore.kernel.org/patchwork/patch/1238278/
Fixes: bb2bb9030425 ("dma-buf: add DMA_BUF_SET_NAME ioctls")
Reported-by: syzbot+3643a18836bce555bff6(a)syzkaller.appspotmail.com
Cc: <stable(a)vger.kernel.org> [5.3+]
Cc: Arnd Bergmann <arnd(a)arndb.de>
Reported-by: Charan Teja Reddy <charante(a)codeaurora.org>
Signed-off-by: Sumit Semwal <sumit.semwal(a)linaro.org>
---
drivers/dma-buf/dma-buf.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 01ce125f8e8d..92ba4b6ef3e7 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -54,8 +54,11 @@ static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
dentry->d_name.name, ret > 0 ? name : "");
}
+static void dma_buf_release(struct dentry *dentry);
+
static const struct dentry_operations dma_buf_dentry_ops = {
.d_dname = dmabuffs_dname,
+ .d_release = dma_buf_release,
};
static struct vfsmount *dma_buf_mnt;
@@ -77,14 +80,14 @@ static struct file_system_type dma_buf_fs_type = {
.kill_sb = kill_anon_super,
};
-static int dma_buf_release(struct inode *inode, struct file *file)
+static void dma_buf_release(struct dentry *dentry)
{
struct dma_buf *dmabuf;
- if (!is_dma_buf_file(file))
- return -EINVAL;
+ if (dentry->d_op != &dma_buf_dentry_ops)
+ return;
- dmabuf = file->private_data;
+ dmabuf = dentry->d_fsdata;
BUG_ON(dmabuf->vmapping_counter);
@@ -110,7 +113,6 @@ static int dma_buf_release(struct inode *inode, struct file *file)
module_put(dmabuf->owner);
kfree(dmabuf->name);
kfree(dmabuf);
- return 0;
}
static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
@@ -412,7 +414,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file)
}
static const struct file_operations dma_buf_fops = {
- .release = dma_buf_release,
.mmap = dma_buf_mmap_internal,
.llseek = dma_buf_llseek,
.poll = dma_buf_poll,
--
2.27.0
This is the start of the stable review cycle for the 4.19.127 release.
There are 28 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 07 Jun 2020 13:54:56 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.127-rc1.gz
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.127-rc1
Dinghao Liu <dinghao.liu(a)zju.edu.cn>
net: smsc911x: Fix runtime PM imbalance on error
Jonathan McDowell <noodles(a)earth.li>
net: ethernet: stmmac: Enable interface clocks on probe for IPQ806x
Valentin Longchamp <valentin(a)longchamp.me>
net/ethernet/freescale: rework quiesce/activate for ucc_geth
Chaitanya Kulkarni <chaitanya.kulkarni(a)wdc.com>
null_blk: return error for invalid zone size
Gerald Schaefer <gerald.schaefer(a)de.ibm.com>
s390/mm: fix set_huge_pte_at() for empty ptes
Jan Schmidt <jan(a)centricular.com>
drm/edid: Add Oculus Rift S to non-desktop list
Jeremy Kerr <jk(a)ozlabs.org>
net: bmac: Fix read of MAC address from ROM
Nathan Chancellor <natechancellor(a)gmail.com>
x86/mmiotrace: Use cpumask_available() for cpumask_var_t variables
Atsushi Nemoto <atsushi.nemoto(a)sord.co.jp>
i2c: altera: Fix race between xfer_msg and isr thread
Madhuparna Bhowmik <madhuparnabhowmik10(a)gmail.com>
evm: Fix RCU list related warnings
Vineet Gupta <vgupta(a)synopsys.com>
ARC: [plat-eznps]: Restrict to CONFIG_ISA_ARCOMPACT
Eugeniy Paltsev <Eugeniy.Paltsev(a)synopsys.com>
ARC: Fix ICCM & DCCM runtime size checks
Vasily Gorbik <gor(a)linux.ibm.com>
s390/ftrace: save traced function caller
Xinwei Kong <kong.kongxinwei(a)hisilicon.com>
spi: dw: use "smp_mb()" to avoid sending spi data error
Anju T Sudhakar <anju(a)linux.vnet.ibm.com>
powerpc/powernv: Avoid re-registration of imc debugfs directory
Xiang Chen <chenxiang66(a)hisilicon.com>
scsi: hisi_sas: Check sas_port before using it
Lucas De Marchi <lucas.demarchi(a)intel.com>
drm/i915: fix port checks for MST support on gen >= 11
Dan Carpenter <dan.carpenter(a)oracle.com>
airo: Fix read overflows sending packets
DENG Qingfang <dqfext(a)gmail.com>
net: dsa: mt7530: set CPU port to fallback mode
Can Guo <cang(a)codeaurora.org>
scsi: ufs: Release clock if DMA map fails
Jérôme Pouiller <jerome.pouiller(a)silabs.com>
mmc: fix compilation of user API
Daniel Axtens <dja(a)axtens.net>
kernel/relay.c: handle alloc_percpu returning NULL in relay_open
Giuseppe Marco Randazzo <gmrandazzo(a)gmail.com>
p54usb: add AirVasT USB stick device-id
Julian Sax <jsbc(a)gmx.de>
HID: i2c-hid: add Schneider SCL142ALM to descriptor override
Scott Shumate <scott.shumate(a)gmail.com>
HID: sony: Fix for broken buttons on DS3 USB dongles
Fan Yang <Fan_Yang(a)sjtu.edu.cn>
mm: Fix mremap not considering huge pmd devmap
Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
libnvdimm: Fix endian conversion issues
Tejun Heo <tj(a)kernel.org>
Revert "cgroup: Add memory barriers to plug cgroup_rstat_updated() race window"
-------------
Diffstat:
Makefile | 4 +--
arch/arc/kernel/setup.c | 5 +--
arch/arc/plat-eznps/Kconfig | 1 +
arch/powerpc/platforms/powernv/opal-imc.c | 39 +++++++++-------------
arch/s390/kernel/mcount.S | 1 +
arch/s390/mm/hugetlbpage.c | 9 +++--
arch/x86/include/asm/pgtable.h | 1 +
arch/x86/mm/mmio-mod.c | 4 +--
drivers/block/null_blk_zoned.c | 4 +++
drivers/gpu/drm/drm_edid.c | 3 +-
drivers/gpu/drm/i915/intel_dp.c | 7 ++--
drivers/gpu/drm/i915/intel_dp_mst.c | 22 ++++++++----
drivers/hid/hid-sony.c | 17 ++++++++++
drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c | 8 +++++
drivers/i2c/busses/i2c-altera.c | 10 +++++-
drivers/net/dsa/mt7530.c | 11 ++++--
drivers/net/dsa/mt7530.h | 6 ++++
drivers/net/ethernet/apple/bmac.c | 2 +-
drivers/net/ethernet/freescale/ucc_geth.c | 13 ++++----
drivers/net/ethernet/smsc/smsc911x.c | 9 ++---
.../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 13 ++++++++
drivers/net/wireless/cisco/airo.c | 12 +++++++
drivers/net/wireless/intersil/p54/p54usb.c | 1 +
drivers/nvdimm/btt.c | 8 ++---
drivers/nvdimm/namespace_devs.c | 7 ++--
drivers/scsi/hisi_sas/hisi_sas_main.c | 3 +-
drivers/scsi/ufs/ufshcd.c | 1 +
drivers/spi/spi-dw.c | 3 ++
include/uapi/linux/mmc/ioctl.h | 1 +
kernel/cgroup/rstat.c | 16 ++-------
kernel/relay.c | 5 +++
mm/mremap.c | 2 +-
security/integrity/evm/evm_crypto.c | 2 +-
security/integrity/evm/evm_main.c | 4 +--
security/integrity/evm/evm_secfs.c | 9 ++++-
35 files changed, 178 insertions(+), 85 deletions(-)
AArch32 CP1x registers are overlaid on their AArch64 counterparts
in the vcpu struct. This leads to an interesting problem as they
are stored in their CPU-local format, and thus a CP1x register
doesn't "hit" the lower 32bit portion of the AArch64 register on
a BE host.
To work around this unfortunate situation, introduce a bias trick
in the vcpu_cp1x() accessors which picks the correct half of the
64bit register.
Cc: stable(a)vger.kernel.org
Reported-by: James Morse <james.morse(a)arm.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
---
arch/arm64/include/asm/kvm_host.h | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 59029e90b557..e80c0e06f235 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -404,8 +404,14 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
* CP14 and CP15 live in the same array, as they are backed by the
* same system registers.
*/
-#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r)])
-#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r)])
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define CPx_OFFSET 1
+#else
+#define CPx_OFFSET 0
+#endif
+
+#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_OFFSET])
+#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_OFFSET])
struct kvm_vm_stat {
ulong remote_tlb_flush;
--
2.26.2