If an application uses direct open or accept, it knows in advance what
direct descriptor value it will get, since it picks the slot itself. This
allows combined requests such as:
sqe = io_uring_get_sqe(ring);
io_uring_prep_openat_direct(sqe, ..., file_slot);
sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
sqe = io_uring_get_sqe(ring);
io_uring_prep_read(sqe, file_slot, buf, buf_size, 0);
sqe->flags |= IOSQE_FIXED_FILE;
io_uring_submit(ring);
where we prepare both a file open and a read, and only get a completion
event for the read once both have completed successfully.
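For reference, a fuller userspace sequence might look like the sketch below
(liburing). It is only illustrative: the sparse file table registration, the
open arguments, the helper name and the CQE handling are assumptions, not
part of this patch.

#include <liburing.h>
#include <fcntl.h>

#define FILE_SLOT	0

static int open_and_read(struct io_uring *ring, const char *path,
			 char *buf, unsigned buf_size)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int slot_fd[1] = { -1 };
	int res;

	/* Reserve one fixed file slot for the open to install into. */
	if (io_uring_register_files(ring, slot_fd, 1) < 0)
		return -1;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_openat_direct(sqe, AT_FDCWD, path, O_RDONLY, 0, FILE_SLOT);
	sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, FILE_SLOT, buf, buf_size, 0);
	sqe->flags |= IOSQE_FIXED_FILE;

	io_uring_submit(ring);

	/* Only the read posts a CQE unless the open fails. */
	if (io_uring_wait_cqe(ring, &cqe) < 0)
		return -1;
	res = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return res;
}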
Currently links are fully prepared before the head is issued, but that
fails if the dependent link needs a file assigned that isn't valid until
the head has completed.
Conversely, if the same chain is performed but the fixed file slot is
already valid, then we would be unexpectedly returning data from the
old file slot rather than the newly opened one. Make sure we're
consistent here.
Allow deferral of file setup, which makes this documented case work.
Cc: stable@vger.kernel.org # v5.15+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/io_uring.c | 43 ++++++++++++++++++++++++++++++++++---------
1 file changed, 34 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 84433dc57914..e73e333d4705 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -915,7 +915,11 @@ struct io_kiocb {
unsigned int flags;
u64 user_data;
- u32 result;
+ /* fd before execution, if valid, result after execution */
+ union {
+ u32 result;
+ s32 fd;
+ };
u32 cflags;
struct io_ring_ctx *ctx;
@@ -7208,6 +7212,23 @@ static void io_clean_op(struct io_kiocb *req)
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
+static bool io_assign_file(struct io_kiocb *req)
+{
+ if (req->file || !io_op_defs[req->opcode].needs_file)
+ return true;
+
+ req->file = io_file_get(req->ctx, req, req->fd,
+ req->flags & REQ_F_FIXED_FILE);
+ if (req->file) {
+ req->result = 0;
+ return true;
+ }
+
+ req_set_fail(req);
+ req->result = -EBADF;
+ return false;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct cred *creds = NULL;
@@ -7218,6 +7239,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (!io_op_defs[req->opcode].audit_skip)
audit_uring_entry(req->opcode);
+ if (unlikely(!io_assign_file(req)))
+ return -EBADF;
switch (req->opcode) {
case IORING_OP_NOP:
@@ -7362,10 +7385,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
unsigned int issue_flags = IO_URING_F_UNLOCKED;
bool needs_poll = false;
struct io_kiocb *timeout;
- int ret = 0;
+ int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
@@ -7377,14 +7401,18 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+ if (!io_assign_file(req)) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ err = -EBADF;
+ }
+
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
- io_req_task_queue_fail(req, -ECANCELED);
+ io_req_task_queue_fail(req, err);
return;
}
if (req->flags & REQ_F_FORCE_ASYNC) {
- const struct io_op_def *def = &io_op_defs[req->opcode];
bool opcode_poll = def->pollin || def->pollout;
if (opcode_poll && file_can_poll(req->file)) {
@@ -7720,6 +7748,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (io_op_defs[opcode].needs_file) {
struct io_submit_state *state = &ctx->submit_state;
+ req->fd = READ_ONCE(sqe->fd);
+
/*
* Plug now if we have more than 2 IO left after this, and the
* target is potentially a read/write to block based storage.
@@ -7729,11 +7759,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
state->need_plug = false;
blk_start_plug_nr_ios(&state->plug, state->submit_nr);
}
-
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- return -EBADF;
}
personality = READ_ONCE(sqe->personality);
--
2.35.1
This is a leftover from the really old days when we couldn't track requests
that need a file but didn't get one assigned, and fail them early. Kill
the check.
Cc: stable@vger.kernel.org # v5.15+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/io_uring.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0b89f35378fa..67244ea0af48 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4511,9 +4511,6 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
--
2.35.1
From: Miklos Szeredi <mszeredi@redhat.com>
commit 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 upstream.
In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls
fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then
imports the write buffer with fuse_get_user_pages(), which uses
iov_iter_get_pages() to grab references to userspace pages instead of
actually copying memory.
On the filesystem device side, these pages can then either be read to
userspace (via fuse_dev_read()), or splice()d over into a pipe using
fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops.
This is wrong because after fuse_dev_do_read() unlocks the FUSE request,
the userspace filesystem can mark the request as completed, causing write()
to return. At that point, the userspace filesystem should no longer have
access to the pipe buffer.
Fix by copying pages coming from the user address space to new pipe
buffers.
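For reference, the device-side path in question is a daemon that drains
requests from /dev/fuse with splice() into a pipe instead of read(). A
minimal sketch, with the helper name, fd arguments and buffer size as
illustrative assumptions:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Move the next request from the fuse device into a pipe without copying. */
static ssize_t read_request_via_splice(int fuse_fd, int pipe_wr, size_t bufsize)
{
	ssize_t n = splice(fuse_fd, NULL, pipe_wr, NULL, bufsize,
			   SPLICE_F_MOVE);

	/*
	 * Before this fix, the pipe buffers produced here could reference
	 * the writer's user pages directly, so the daemon kept seeing them
	 * after the originating write() had already returned.
	 */
	return n;
}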
Reported-by: Jann Horn <jannh@google.com>
Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device")
Cc: <stable@vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
Applies against stable-v4.14 and stable-v4.19
struct fuse_args hasn't been piped through relevant functions yet, so
place user_pages flag in an empty hole in struct fuse_req instead.
fs/fuse/dev.c | 12 +++++++++++-
fs/fuse/file.c | 1 +
fs/fuse/fuse_i.h | 2 ++
3 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index db7d746633cf..1c98b5b7bead 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -991,7 +991,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5f5da2911cea..a32b2ca3de6f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1325,6 +1325,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ req->user_pages = true;
if (write)
req->in.argpages = 1;
else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fac1f08dd32e..30fdede2ea64 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -312,6 +312,8 @@ struct fuse_req {
/** refcount */
refcount_t count;
+ bool user_pages;
+
/** Unique ID for the interrupt request */
u64 intr_unique;
--
2.35.1.1021.g381101b075-goog
Make the two locations where exportfs helpers check permission to lookup
a given inode idmapped mount aware by switching them to the lookup_one()
helpers. This is a bugfix for the open_by_handle_at() system call, which
doesn't currently take idmapped mounts into account. It's not tied to a
specific commit so we'll just Cc stable.
In addition this is required to support idmapped base layers in overlay.
The overlay filesystem uses exportfs to encode and decode file handles
for its index=on mount option and when nfs_export=on.
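For reference, the userspace path affected by the open_by_handle_at() part
of this is the handle encode/decode pair sketched below. It is only
illustrative; the helper name, flags and the privileged mount_fd are
assumptions:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>

static int reopen_via_handle(int mount_fd, const char *path)
{
	struct file_handle *fh;
	int mount_id, fd = -1;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return -1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encode a handle for the file. */
	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) == 0)
		/*
		 * Decode it again: exportfs may need to reconnect the
		 * dentry, and with this change the lookups it does for
		 * that honour the mount's idmapping.
		 */
		fd = open_by_handle_at(mount_fd, fh, O_RDONLY);

	free(fh);
	return fd;
}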
Cc: <stable@vger.kernel.org>
Cc: <linux-fsdevel@vger.kernel.org>
Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
/* v2 */
unchanged
---
fs/exportfs/expfs.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0106eba46d5a..3ef80d000e13 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -525,7 +525,8 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
}
inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ nresult = lookup_one(mnt_user_ns(mnt), nbuf,
+ target_dir, strlen(nbuf));
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
--
2.32.0