Apparently, there are some applications that use IN_DELETE event as an
invalidation mechanism and expect that if they try to open a file with
the name reported with the delete event, that it should not contain the
content of the deleted file.
Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of
d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify
will have access to a positive dentry.
This allowed a race where opening the deleted file via cached dentry
is now possible after receiving the IN_DELETE event.
To fix the regression, we use two different techniques:
1) For call sites that call d_delete() with elevated refcount, convert
the call to d_drop() and move the fsnotify hook after d_drop().
2) For the vfs helpers that may turn dentry to negative on d_delete(),
use a helper d_delete_notify() to pin the inode, so we can pass it
to an fsnotify hook after d_delete().
Create a new hook fsnotify_delete() that allows to pass a negative
dentry and takes the unlinked inode as an argument.
Add a missing fsnotify_unlink() hook in nfsdfs that was found during
the call sites audit.
Note that the call sites in simple_recursive_removal() follow
d_invalidate(), so they require no change.
Backporting hint: this regression is from v5.3. Although patch will
apply with only trivial conflicts to v5.4 and v5.10, it won't build,
because fsnotify_delete() implementation is different in each of those
versions (see fsnotify_link()).
Reported-by: Ivan Delalande <colona(a)arista.com>
Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/
Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()")
Cc: stable(a)vger.kernel.org # v5.3+
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
---
Jan,
This turned into an audit of fsnotify_unlink/rmdir() call sites, so
besides fixing the regression, I also added one missing hook and replaced
most of the d_delete() calls with d_drop() to simplify things.
I will follow up with backports for v5.4 and v5.10 and will send the
repro to LTP guys.
Thanks,
Amir.
fs/btrfs/ioctl.c | 5 ++---
fs/configfs/dir.c | 6 +++---
fs/devpts/inode.c | 2 +-
fs/namei.c | 27 ++++++++++++++++++++++-----
fs/nfsd/nfsctl.c | 5 +++--
include/linux/fsnotify.h | 20 ++++++++++++++++++++
net/sunrpc/rpc_pipe.c | 4 ++--
7 files changed, 53 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index edfecfe62b4b..121e8f439996 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3060,10 +3060,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
btrfs_inode_unlock(inode, 0);
- if (!err) {
+ d_drop(dentry);
+ if (!err)
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
out_dput:
dput(dentry);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 1466b5d01cbb..d3cd2a94d1e8 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1780,8 +1780,8 @@ void configfs_unregister_group(struct config_group *group)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
+ d_drop(dentry);
fsnotify_rmdir(d_inode(parent), dentry);
- d_delete(dentry);
inode_unlock(d_inode(parent));
dput(dentry);
@@ -1922,10 +1922,10 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(dentry));
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(root));
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e5a766d33c..4f25015aa534 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry)
dentry->d_fsdata = NULL;
drop_nlink(dentry->d_inode);
- fsnotify_unlink(d_inode(dentry->d_parent), dentry);
d_drop(dentry);
+ fsnotify_unlink(d_inode(dentry->d_parent), dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
}
diff --git a/fs/namei.c b/fs/namei.c
index 1f9d2187c765..b11991b57f9b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3929,6 +3929,23 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
+/**
+ * d_delete_notify - delete a dentry and call fsnotify_delete()
+ * @dentry: The dentry to delete
+ *
+ * This helper is used to guaranty that the unlinked inode cannot be found
+ * by lookup of this name after fsnotify_delete() event has been delivered.
+ */
+static void d_delete_notify(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ ihold(inode);
+ d_delete(dentry);
+ fsnotify_delete(dir, inode, dentry);
+ iput(inode);
+}
+
/**
* vfs_rmdir - remove directory
* @mnt_userns: user namespace of the mount the inode was found from
@@ -3973,13 +3990,12 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_rmdir(dir, dentry);
out:
inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_rmdir);
@@ -4101,7 +4117,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_unlink(dir, dentry);
}
}
}
@@ -4109,9 +4124,11 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
- if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ fsnotify_unlink(dir, dentry);
+ } else if (!error) {
fsnotify_link_count(target);
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
}
return error;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 51a49e0cfe37..d0761ca8cb54 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1249,7 +1249,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
clear_ncl(d_inode(dentry));
dget(dentry);
ret = simple_unlink(dir, dentry);
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_unlink(dir, dentry);
dput(dentry);
WARN_ON_ONCE(ret);
}
@@ -1340,8 +1341,8 @@ void nfsd_client_rmdir(struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
WARN_ON_ONCE(ret);
+ d_drop(dentry);
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
inode_unlock(dir);
}
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 3a2d7dc3c607..b4a4085adc89 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -224,6 +224,26 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
dir, &new_dentry->d_name, 0);
}
+/*
+ * fsnotify_delete - @dentry was unlinked and unhashed
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ *
+ * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
+ * as this may be called after d_delete() and old_dentry may be negative.
+ */
+static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
+ struct dentry *dentry)
+{
+ __u32 mask = FS_DELETE;
+
+ if (S_ISDIR(inode->i_mode))
+ mask |= FS_ISDIR;
+
+ fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name,
+ 0);
+}
+
/*
* fsnotify_unlink - 'name' was unlinked
*
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index ee5336d73fdd..35588f0afa86 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -600,9 +600,9 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
@@ -613,9 +613,9 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
dget(dentry);
ret = simple_unlink(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_unlink(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
--
2.34.1
The feature negotiation was designed in a way that
makes it possible for devices to know which config
fields will be accessed by drivers.
This is broken since commit 404123c2db79 ("virtio: allow drivers to
validate features") with fallout in at least block and net. We have a
partial work-around in commit 2f9a174f918e ("virtio: write back
F_VERSION_1 before validate") which at least lets devices find out which
format should config space have, but this is a partial fix: guests
should not access config space without acknowledging features since
otherwise we'll never be able to change the config space format.
To fix, split finalize_features from virtio_finalize_features and
call finalize_features with all feature bits before validation,
and then - if validation changed any bits - once again after.
Since virtio_finalize_features no longer writes out features
rename it to virtio_features_ok - since that is what it does:
checks that features are ok with the device.
As a side effect, this also reduces the amount of hypervisor accesses -
we now only acknowledge features once unless we are clearing any
features when validating (which is uncommon).
Cc: stable(a)vger.kernel.org
Fixes: 404123c2db79 ("virtio: allow drivers to validate features")
Fixes: 2f9a174f918e ("virtio: write back F_VERSION_1 before validate")
Cc: "Halil Pasic" <pasic(a)linux.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
fixup! virtio: acknowledge all features before access
---
drivers/virtio/virtio.c | 39 ++++++++++++++++++++---------------
include/linux/virtio_config.h | 3 ++-
2 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index d891b0a354b0..d6396be0ea83 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -166,14 +166,13 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status)
}
EXPORT_SYMBOL_GPL(virtio_add_status);
-static int virtio_finalize_features(struct virtio_device *dev)
+/* Do some validation, then set FEATURES_OK */
+static int virtio_features_ok(struct virtio_device *dev)
{
- int ret = dev->config->finalize_features(dev);
unsigned status;
+ int ret;
might_sleep();
- if (ret)
- return ret;
ret = arch_has_restricted_virtio_memory_access();
if (ret) {
@@ -244,17 +243,6 @@ static int virtio_dev_probe(struct device *_d)
driver_features_legacy = driver_features;
}
- /*
- * Some devices detect legacy solely via F_VERSION_1. Write
- * F_VERSION_1 to force LE config space accesses before FEATURES_OK for
- * these when needed.
- */
- if (drv->validate && !virtio_legacy_is_little_endian()
- && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) {
- dev->features = BIT_ULL(VIRTIO_F_VERSION_1);
- dev->config->finalize_features(dev);
- }
-
if (device_features & (1ULL << VIRTIO_F_VERSION_1))
dev->features = driver_features & device_features;
else
@@ -265,13 +253,26 @@ static int virtio_dev_probe(struct device *_d)
if (device_features & (1ULL << i))
__virtio_set_bit(dev, i);
+ err = dev->config->finalize_features(dev);
+ if (err)
+ goto err;
+
if (drv->validate) {
+ u64 features = dev->features;
+
err = drv->validate(dev);
if (err)
goto err;
+
+ /* Did validation change any features? Then write them again. */
+ if (features != dev->features) {
+ err = dev->config->finalize_features(dev);
+ if (err)
+ goto err;
+ }
}
- err = virtio_finalize_features(dev);
+ err = virtio_features_ok(dev);
if (err)
goto err;
@@ -495,7 +496,11 @@ int virtio_device_restore(struct virtio_device *dev)
/* We have a driver! */
virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
- ret = virtio_finalize_features(dev);
+ ret = dev->config->finalize_features(dev);
+ if (ret)
+ goto err;
+
+ ret = virtio_features_ok(dev);
if (ret)
goto err;
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 4d107ad31149..dafdc7f48c01 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -64,8 +64,9 @@ struct virtio_shm_region {
* Returns the first 64 feature bits (all we currently need).
* @finalize_features: confirm what device features we'll be using.
* vdev: the virtio_device
- * This gives the final feature bits for the device: it can change
+ * This sends the driver feature bits to the device: it can change
* the dev->feature bits if it wants.
+ * Note: despite the name this can be called any number of times.
* Returns 0 on success or error status
* @bus_name: return the bus name associated with the device (optional)
* vdev: the virtio_device
--
MST