This fixes a crash bug caused by commit ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag") by removing a leftover folio_end_private_2() call after all calls to folio_start_private_2() had been removed by the commit.
By calling folio_end_private_2() without folio_start_private_2(), the folio refcounter breaks and causes trouble like RCU stalls and general protection faults.
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag")
Link: https://lore.kernel.org/ceph-devel/CAKPOu+_DA8XiMAA2ApMj7Pyshve_YWknw8Hdt1=z...
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
---
 fs/ceph/addr.c          |  2 +-
 fs/netfs/fscache_io.c   | 29 +----------------------------
 include/linux/fscache.h | 30 ++++--------------------------
 3 files changed, 6 insertions(+), 55 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..485cbd1730d1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -512,7 +512,7 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), - ceph_fscache_write_terminated, inode, true, caching); + ceph_fscache_write_terminated, inode, caching); } #else static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 38637e5c9b57..0d8f3f646598 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -166,30 +166,10 @@ struct fscache_write_request { loff_t start; size_t len; bool set_bits; - bool using_pgpriv2; netfs_io_terminated_t term_func; void *term_func_priv; };
-void __fscache_clear_page_bits(struct address_space *mapping, - loff_t start, size_t len) -{ - pgoff_t first = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE; - struct page *page; - - if (len) { - XA_STATE(xas, &mapping->i_pages, first); - - rcu_read_lock(); - xas_for_each(&xas, page, last) { - folio_end_private_2(page_folio(page)); - } - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(__fscache_clear_page_bits); - /* * Deal with the completion of writing the data to the cache. */ @@ -198,10 +178,6 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv;
- if (wreq->using_pgpriv2) - fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, - wreq->set_bits); - if (wreq->term_func) wreq->term_func(wreq->term_func_priv, transferred_or_error, was_async); @@ -214,7 +190,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, - bool using_pgpriv2, bool cond) + bool cond) { struct fscache_write_request *wreq; struct netfs_cache_resources *cres; @@ -232,7 +208,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, wreq->mapping = mapping; wreq->start = start; wreq->len = len; - wreq->using_pgpriv2 = using_pgpriv2; wreq->set_bits = cond; wreq->term_func = term_func; wreq->term_func_priv = term_func_priv; @@ -260,8 +235,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, abandon_free: kfree(wreq); abandon: - if (using_pgpriv2) - fscache_clear_page_bits(mapping, start, len, cond); if (term_func) term_func(term_func_priv, ret, false); } diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9de27643607f..f8c52bddaa15 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -177,8 +177,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, - bool using_pgpriv2, bool cond); -extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t); + bool cond);
/** * fscache_acquire_volume - Register a volume as desiring caching services @@ -573,24 +572,6 @@ int fscache_write(struct netfs_cache_resources *cres, return ops->write(cres, start_pos, iter, term_func, term_func_priv); }
-/** - * fscache_clear_page_bits - Clear the PG_fscache bits from a set of pages - * @mapping: The netfs inode to use as the source - * @start: The start position in @mapping - * @len: The amount of data to unlock - * @caching: If PG_fscache has been set - * - * Clear the PG_fscache flag from a sequence of pages and wake up anyone who's - * waiting. - */ -static inline void fscache_clear_page_bits(struct address_space *mapping, - loff_t start, size_t len, - bool caching) -{ - if (caching) - __fscache_clear_page_bits(mapping, start, len); -} - /** * fscache_write_to_cache - Save a write to the cache and clear PG_fscache * @cookie: The cookie representing the cache object @@ -600,7 +581,6 @@ static inline void fscache_clear_page_bits(struct address_space *mapping, * @i_size: The new size of the inode * @term_func: The function to call upon completion * @term_func_priv: The private data for @term_func - * @using_pgpriv2: If we're using PG_private_2 to mark in-progress write * @caching: If we actually want to do the caching * * Helper function for a netfs to write dirty data from an inode into the cache @@ -612,21 +592,19 @@ static inline void fscache_clear_page_bits(struct address_space *mapping, * marked with PG_fscache. * * If given, @term_func will be called upon completion and supplied with - * @term_func_priv. Note that if @using_pgpriv2 is set, the PG_private_2 flags - * will have been cleared by this point, so the netfs must retain its own pin - * on the mapping. + * @term_func_priv. */ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, - bool using_pgpriv2, bool caching) + bool caching) { if (caching) __fscache_write_to_cache(cookie, mapping, start, len, i_size, term_func, term_func_priv, - using_pgpriv2, caching); + caching); else if (term_func) term_func(term_func_priv, -ENOBUFS, false);
Hi,
Thanks for your patch.
FYI: kernel test robot notices the stable kernel rule is not satisfied.
The check is based on https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html#opti...
Rule: add the tag "Cc: stable@vger.kernel.org" in the sign-off area to have the patch automatically included in the stable tree. Subject: [PATCH] fs/netfs/fscache_io: remove the obsolete "using_pgpriv2" flag Link: https://lore.kernel.org/stable/20240729091532.855688-1-max.kellermann%40iono...
On Mon, 2024-07-29 at 11:15 +0200, Max Kellermann wrote:
This fixes a crash bug caused by commit ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag") by removing a leftover folio_end_private_2() call after all calls to folio_start_private_2() had been removed by the commit.
By calling folio_end_private_2() without folio_start_private_2(), the folio refcounter breaks and causes trouble like RCU stalls and general protection faults.
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag")
Link: https://lore.kernel.org/ceph-devel/CAKPOu+_DA8XiMAA2ApMj7Pyshve_YWknw8Hdt1=z...
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
fs/ceph/addr.c | 2 +- fs/netfs/fscache_io.c | 29 +---------------------------- include/linux/fscache.h | 30 ++++-------------------------- 3 files changed, 6 insertions(+), 55 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..485cbd1730d1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -512,7 +512,7 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b struct fscache_cookie *cookie = ceph_fscache_cookie(ci); fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
ceph_fscache_write_terminated, inode, true, caching);
ceph_fscache_write_terminated, inode, caching);
} #else static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 38637e5c9b57..0d8f3f646598 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -166,30 +166,10 @@ struct fscache_write_request { loff_t start; size_t len; bool set_bits;
- bool using_pgpriv2;
netfs_io_terminated_t term_func; void *term_func_priv; }; -void __fscache_clear_page_bits(struct address_space *mapping,
loff_t start, size_t len)
-{
- pgoff_t first = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE;
- struct page *page;
- if (len) {
XA_STATE(xas, &mapping->i_pages, first);
rcu_read_lock();
xas_for_each(&xas, page, last) {
folio_end_private_2(page_folio(page));
}
rcu_read_unlock();
- }
-} -EXPORT_SYMBOL(__fscache_clear_page_bits);
/* * Deal with the completion of writing the data to the cache. */ @@ -198,10 +178,6 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv;
- if (wreq->using_pgpriv2)
fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
wreq->set_bits);
if (wreq->term_func) wreq->term_func(wreq->term_func_priv, transferred_or_error, was_async); @@ -214,7 +190,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv,
bool using_pgpriv2, bool cond)
bool cond)
{ struct fscache_write_request *wreq; struct netfs_cache_resources *cres; @@ -232,7 +208,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, wreq->mapping = mapping; wreq->start = start; wreq->len = len;
- wreq->using_pgpriv2 = using_pgpriv2;
wreq->set_bits = cond; wreq->term_func = term_func; wreq->term_func_priv = term_func_priv; @@ -260,8 +235,6 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, abandon_free: kfree(wreq); abandon:
- if (using_pgpriv2)
fscache_clear_page_bits(mapping, start, len, cond);
if (term_func) term_func(term_func_priv, ret, false); } diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9de27643607f..f8c52bddaa15 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -177,8 +177,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv,
bool using_pgpriv2, bool cond);
-extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t);
bool cond);
/** * fscache_acquire_volume - Register a volume as desiring caching services @@ -573,24 +572,6 @@ int fscache_write(struct netfs_cache_resources *cres, return ops->write(cres, start_pos, iter, term_func, term_func_priv); } -/**
- fscache_clear_page_bits - Clear the PG_fscache bits from a set of pages
- @mapping: The netfs inode to use as the source
- @start: The start position in @mapping
- @len: The amount of data to unlock
- @caching: If PG_fscache has been set
- Clear the PG_fscache flag from a sequence of pages and wake up anyone who's
- waiting.
- */
-static inline void fscache_clear_page_bits(struct address_space *mapping,
loff_t start, size_t len,
bool caching)
-{
- if (caching)
__fscache_clear_page_bits(mapping, start, len);
-}
/** * fscache_write_to_cache - Save a write to the cache and clear PG_fscache * @cookie: The cookie representing the cache object @@ -600,7 +581,6 @@ static inline void fscache_clear_page_bits(struct address_space *mapping, * @i_size: The new size of the inode * @term_func: The function to call upon completion * @term_func_priv: The private data for @term_func
- @using_pgpriv2: If we're using PG_private_2 to mark in-progress write
* @caching: If we actually want to do the caching * * Helper function for a netfs to write dirty data from an inode into the cache @@ -612,21 +592,19 @@ static inline void fscache_clear_page_bits(struct address_space *mapping, * marked with PG_fscache. * * If given, @term_func will be called upon completion and supplied with
- @term_func_priv. Note that if @using_pgpriv2 is set, the PG_private_2 flags
- will have been cleared by this point, so the netfs must retain its own pin
- on the mapping.
- @term_func_priv.
*/ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv,
bool using_pgpriv2, bool caching)
bool caching)
{ if (caching) __fscache_write_to_cache(cookie, mapping, start, len, i_size, term_func, term_func_priv,
using_pgpriv2, caching);
caching);
else if (term_func) term_func(term_func_priv, -ENOBUFS, false);
(cc'ing the cephfs maintainers too)
Nice work! I'd prefer this patch over the first one. It looks like the Fixes: commit went into v6.10. Did it go into earlier kernels too?
If so, what might be best is to take both of your patches. Have the simple one first that just flips the flag, and mark that one for stable. Then we can add the second patch on top to remove all of this stuff for mainline.
Either way, you can add this to both patches:
Reviewed-by: Jeff Layton <jlayton@kernel.org>
On Mon, Jul 29, 2024 at 2:56 PM Jeff Layton jlayton@kernel.org wrote:
> Nice work! I'd prefer this patch over the first one. It looks like the Fixes: commit went into v6.10. Did it go into earlier kernels too?
No, it's 6.10 only.
On Mon, Jul 29, 2024 at 2:56 PM Jeff Layton jlayton@kernel.org wrote:
> Either way, you can add this to both patches:
>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
Stop the merge :-)
I just found that my patch introduces another lockup; copy_file_range locks up this way:
[<0>] folio_wait_private_2+0xd9/0x140
[<0>] ceph_write_begin+0x56/0x90
[<0>] generic_perform_write+0xc0/0x210
[<0>] ceph_write_iter+0x4e2/0x650
[<0>] iter_file_splice_write+0x30d/0x550
[<0>] splice_file_range_actor+0x2c/0x40
[<0>] splice_direct_to_actor+0xee/0x270
[<0>] splice_file_range+0x80/0xc0
[<0>] ceph_copy_file_range+0xbb/0x5b0
[<0>] vfs_copy_file_range+0x33e/0x5d0
[<0>] __x64_sys_copy_file_range+0xf7/0x200
[<0>] do_syscall_64+0x64/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
Turns out that there are still private_2 users left in both fs/ceph and fs/netfs. My patches fix one problem, but cause another problem. Too bad!
This leaves me confused again: how shall I fix this? Can all folio_wait_private_2() calls simply be removed? This looks like some refactoring gone wrong, and some parts don't make sense (for example, netfs and ceph both claim ownership of the folio_private pointer). I could try to fix the mess, but I need to know how this is meant to work. David, can you enlighten me?
Max
On Mon, 2024-07-29 at 17:35 +0200, Max Kellermann wrote:
On Mon, Jul 29, 2024 at 2:56 PM Jeff Layton jlayton@kernel.org wrote:
Either way, you can add this to both patches:
Reviewed-by: Jeff Layton jlayton@kernel.org
Stop the merge :-)
I just found that my patch introduces another lockup; copy_file_range locks up this way:
[<0>] folio_wait_private_2+0xd9/0x140 [<0>] ceph_write_begin+0x56/0x90 [<0>] generic_perform_write+0xc0/0x210 [<0>] ceph_write_iter+0x4e2/0x650 [<0>] iter_file_splice_write+0x30d/0x550 [<0>] splice_file_range_actor+0x2c/0x40 [<0>] splice_direct_to_actor+0xee/0x270 [<0>] splice_file_range+0x80/0xc0 [<0>] ceph_copy_file_range+0xbb/0x5b0 [<0>] vfs_copy_file_range+0x33e/0x5d0 [<0>] __x64_sys_copy_file_range+0xf7/0x200 [<0>] do_syscall_64+0x64/0x100 [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
Turns out that there are still private_2 users left in both fs/ceph and fs/netfs. My patches fix one problem, but cause another problem. Too bad!
This leaves me confused again: how shall I fix this? Can all folio_wait_private_2() calls simply be removed? This looks like some refactoring gone wrong, and some parts don't make sense (like netfs and ceph claim ownership of the folio_private pointer). I could try to fix the mess, but I need to know how this is meant to be. David, can you enlighten me?
Max
I suspect the folio_wait_private_2 call in ceph_write_begin should have also been removed in ae678317b95, and it just got missed somehow in the original patch. All of the other callsites that did anything with private_2 were removed in that patch.
David, can you confirm that?
Hi Max,
Can you try this patch instead of either of yours?
David ---
This reverts commit ae678317b95e760607c7b20b97c9cd4ca9ed6e1a.
Revert the patch that removes the deprecated use of PG_private_2 in netfslib for the moment as Ceph is actually still using this to track data copied to the cache.
Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag")
Reported-by: Max Kellermann <max.kellermann@ionos.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/ceph/addr.c | 19 +++++
 fs/netfs/buffered_read.c | 8 ++
 fs/netfs/io.c | 144 +++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/netfs.h | 1
 4 files changed, 170 insertions(+), 2 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..73b5a07bf94d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -498,6 +498,11 @@ const struct netfs_request_ops ceph_netfs_ops = { };
#ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ +} + static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; @@ -515,6 +520,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b ceph_fscache_write_terminated, inode, true, caching); } #else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } @@ -706,6 +715,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = wlen;
set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) { @@ -789,6 +800,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; }
+ folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 @@ -1062,7 +1075,8 @@ static int ceph_writepages_start(struct address_space *mapping, unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1070,6 +1084,7 @@ static int ceph_writepages_start(struct address_space *mapping, } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ }
if (!clear_page_dirty_for_io(page)) { @@ -1254,6 +1269,8 @@ static int ceph_writepages_start(struct address_space *mapping, }
set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a688d4c75d99..424048f9ed1f 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -466,7 +466,7 @@ int netfs_write_begin(struct netfs_inode *ctx, if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio; + goto have_folio_no_wait; }
rreq = netfs_alloc_request(mapping, file, @@ -507,6 +507,12 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio: + if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; + } +have_folio_no_wait: *_folio = folio; _leave(" = 0"); return 0; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b98368..c179a1c73fa7 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -98,6 +98,146 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); }
+/* + * [DEPRECATED] Deal with the completion of writing the data to the cache. We + * have to clear the PG_fscache bits on the folios involved and release the + * caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. + */ + if (have_unlocked && folio->index <= unlocked) + continue; + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) /* [DEPRECATED] */ +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * [DEPRECATED] Perform any outstanding writes to the cache. 
We inherit a ref + * from the caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + subreq->len, rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. 
*/ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + /* * Handle a short read. */ @@ -275,6 +415,10 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+ if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && + test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + netfs_rreq_completed(rreq, was_async); }
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index da23484268df..24ec3434d32e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -145,6 +145,7 @@ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_clear_s, "clear-s") \ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ + EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \
On Tue, Jul 30, 2024 at 6:01 PM David Howells dhowells@redhat.com wrote:
> Can you try this patch instead of either of yours?
I booted it on one of the servers, and there have been no problems so far. All tests complete successfully, even the one with copy_file_range that crashed with my patch. I'll let you know if problems occur later, but until then, I agree with merging your revert instead of my patches.
If I understand this correctly, my other problem (the folio_attach_private conflict between netfs and ceph) I posted in https://lore.kernel.org/ceph-devel/CAKPOu+8q_1rCnQndOj3KAitNY2scPQFuSS-AxeGr... was caused by my (bad) patch after all, wasn't it?
> For the moment, ceph has to continue using PG_private_2. It doesn't use netfs_writepages(). I have mostly complete patches to fix that, but they got popped onto the back burner for a bit.
When you're done with those patches, Cc me on those if you want me to help test them.
Max
On Tue, Jul 30, 2024 at 6:28 PM Max Kellermann max.kellermann@ionos.com wrote:
I'll let you know when problems occur later, but until then, I agree with merging your revert instead of my patches.
Not sure if that's the same bug/cause (looks different), but 6.10.2 with your patch is still unstable:
rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 9-.... 15-.... } 521399 jiffies s: 2085 root: 0x1/. rcu: blocking rcu_node structures (internal RCU debug): l=1:0-15:0x8200/. Sending NMI from CPU 3 to CPUs 9: NMI backtrace for cpu 9 CPU: 9 PID: 2756 Comm: kworker/9:2 Tainted: G D 6.10.2-cm4all2-vm+ #171 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Workqueue: ceph-msgr ceph_con_workfn RIP: 0010:native_queued_spin_lock_slowpath+0x80/0x260 Code: 57 85 c0 74 10 0f b6 03 84 c0 74 09 f3 90 0f b6 03 84 c0 75 f7 b8 01 00 00 00 66 89 03 5b 5d 41 5c 41 5d c3 cc cc cc cc f3 90 <eb> 93 8b 37 b8 00 02 00 00 81 fe 00 01 00 00 74 07 eb a1 83 e8 01 RSP: 0018:ffffaf5880c03bb8 EFLAGS: 00000202 RAX: 0000000000000001 RBX: ffffa02bc37c9e98 RCX: ffffaf5880c03c90 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffffa02bc37c9e98 RBP: ffffa02bc2f94000 R08: ffffaf5880c03c90 R09: 0000000000000010 R10: 0000000000000514 R11: 0000000000000000 R12: ffffaf5880c03c90 R13: ffffffffb4bcb2f0 R14: ffffa036c9e7e8e8 R15: ffffa02bc37c9e98 FS: 0000000000000000(0000) GS:ffffa036cf040000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fecac48568 CR3: 000000030d82c002 CR4: 00000000001706b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <NMI> ? nmi_cpu_backtrace+0x83/0xf0 ? nmi_cpu_backtrace_handler+0xd/0x20 ? nmi_handle+0x56/0x120 ? default_do_nmi+0x40/0x100 ? exc_nmi+0xdc/0x100 ? end_repeat_nmi+0xf/0x53 ? __pfx_ceph_ino_compare+0x10/0x10 ? native_queued_spin_lock_slowpath+0x80/0x260 ? native_queued_spin_lock_slowpath+0x80/0x260 ? native_queued_spin_lock_slowpath+0x80/0x260 </NMI> <TASK> ? __pfx_ceph_ino_compare+0x10/0x10 _raw_spin_lock+0x1e/0x30 find_inode+0x6e/0xc0 ? __pfx_ceph_ino_compare+0x10/0x10 ? __pfx_ceph_set_ino_cb+0x10/0x10 ilookup5_nowait+0x6d/0xa0 ? 
__pfx_ceph_ino_compare+0x10/0x10 iget5_locked+0x33/0xe0 ceph_get_inode+0xb8/0xf0 mds_dispatch+0xfe8/0x1ff0 ? inet_recvmsg+0x4d/0xf0 ceph_con_process_message+0x66/0x80 ceph_con_v1_try_read+0xcfc/0x17c0 ? __switch_to_asm+0x39/0x70 ? finish_task_switch.isra.0+0x78/0x240 ? __schedule+0x32a/0x1440 ceph_con_workfn+0x339/0x4f0 process_one_work+0x138/0x2e0 worker_thread+0x2b9/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0xba/0xe0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 </TASK>
On Tue, Jul 30, 2024 at 6:28 PM Max Kellermann max.kellermann@ionos.com wrote:
If I understand this correctly, my other problem (the folio_attach_private conflict between netfs and ceph) I posted in https://lore.kernel.org/ceph-devel/CAKPOu+8q_1rCnQndOj3KAitNY2scPQFuSS-AxeGr... was caused by my (bad) patch after all, wasn't it?
It was not caused by my bad patch. Without my patch, but with your revert applied instead, I just got a crash (this time, I enabled lots of debugging options in the kernel, including KASAN). It's the same crash as in the post I linked in my previous email:
------------[ cut here ]------------ WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 ceph_put_wrbuffer_cap_refs+0x416/0x500 Modules linked in: CPU: 13 PID: 3621 Comm: rsync Not tainted 6.10.2-cm4all2-vm+ #176 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 RIP: 0010:ceph_put_wrbuffer_cap_refs+0x416/0x500 Code: e8 af 7f 50 01 45 84 ed 75 27 45 8d 74 24 ff e9 cf fd ff ff e8 ab ea 64 ff e9 4c fc ff ff 31 f6 48 89 df e8 3c 86 ff ff eb b5 <0f> 0b e9 7a ff ff ff 31 f6 48 89 df e8 29 86 ff ff eb cd 0f 0b 48 RSP: 0018:ffff88813c57f868 EFLAGS: 00010286 RAX: dffffc0000000000 RBX: ffff88823dc66588 RCX: 0000000000000000 RDX: 1ffff11047b8cda7 RSI: ffff88823dc66df0 RDI: ffff88823dc66d38 RBP: 0000000000000001 R08: 0000000000000000 R09: fffffbfff5f9a8cd R10: ffffffffafcd466f R11: 0000000000000001 R12: 0000000000000000 R13: ffffea000947af00 R14: 00000000ffffffff R15: 0000000000000356 FS: 00007f1e82957b80(0000) GS:ffff888a73400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559037dacea8 CR3: 000000013f1b2002 CR4: 00000000001706b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? __warn+0xc8/0x2c0 ? ceph_put_wrbuffer_cap_refs+0x416/0x500 ? report_bug+0x257/0x2b0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? ceph_put_wrbuffer_cap_refs+0x416/0x500 ? ceph_put_wrbuffer_cap_refs+0x2e/0x500 ceph_invalidate_folio+0x241/0x310 truncate_cleanup_folio+0x277/0x330 truncate_inode_pages_range+0x1b4/0x940 ? __pfx_truncate_inode_pages_range+0x10/0x10 ? __lock_acquire+0x19f3/0x5c10 ? __lock_acquire+0x19f3/0x5c10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? srso_alias_untrain_ret+0x1/0x10 ? lock_acquire+0x186/0x490 ? find_held_lock+0x2d/0x110 ? kvm_sched_clock_read+0xd/0x20 ? local_clock_noinstr+0x9/0xb0 ? __pfx_lock_release+0x10/0x10 ? 
lockdep_hardirqs_on_prepare+0x275/0x3e0 ceph_evict_inode+0xd5/0x530 evict+0x251/0x560 __dentry_kill+0x17b/0x500 dput+0x393/0x690 __fput+0x40e/0xa60 __x64_sys_close+0x78/0xd0 do_syscall_64+0x82/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? syscall_exit_to_user_mode+0x9f/0x190 ? do_syscall_64+0x8e/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? syscall_exit_to_user_mode+0x9f/0x190 ? do_syscall_64+0x8e/0x130 ? do_syscall_64+0x8e/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f1e823178e0 Code: 0d 00 00 00 eb b2 e8 ff f7 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 80 3d 01 1d 0e 00 00 74 17 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 48 c3 0f 1f 80 00 00 00 00 48 83 ec 18 89 7c RSP: 002b:00007ffe16c2e108 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 000000000000001e RCX: 00007f1e823178e0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000001 RBP: 00007f1e8219bc08 R08: 0000000000000000 R09: 0000559037df64b0 R10: fe04b91e88691591 R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007ffe16c2e220 R15: 0000000000000001 </TASK> irq event stamp: 26945 hardirqs last enabled at (26951): [<ffffffffaaac5a99>] console_unlock+0x189/0x1b0 hardirqs last disabled at (26956): [<ffffffffaaac5a7e>] console_unlock+0x16e/0x1b0 softirqs last enabled at (26518): [<ffffffffaa962375>] irq_exit_rcu+0x95/0xc0 softirqs last disabled at (26513): [<ffffffffaa962375>] irq_exit_rcu+0x95/0xc0 ---[ end trace 0000000000000000 ]--- ================================================================== BUG: KASAN: null-ptr-deref in ceph_put_snap_context+0x18/0x50 Write of size 4 at addr 0000000000000356 by task rsync/3621
CPU: 13 PID: 3621 Comm: rsync Tainted: G W 6.10.2-cm4all2-vm+ #176 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x74/0xd0 kasan_report+0xb9/0xf0 ? ceph_put_snap_context+0x18/0x50 kasan_check_range+0xeb/0x1a0 ceph_put_snap_context+0x18/0x50 ceph_invalidate_folio+0x249/0x310 truncate_cleanup_folio+0x277/0x330 truncate_inode_pages_range+0x1b4/0x940 ? __pfx_truncate_inode_pages_range+0x10/0x10 ? __lock_acquire+0x19f3/0x5c10 ? __lock_acquire+0x19f3/0x5c10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? srso_alias_untrain_ret+0x1/0x10 ? lock_acquire+0x186/0x490 ? find_held_lock+0x2d/0x110 ? kvm_sched_clock_read+0xd/0x20 ? local_clock_noinstr+0x9/0xb0 ? __pfx_lock_release+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ceph_evict_inode+0xd5/0x530 evict+0x251/0x560 __dentry_kill+0x17b/0x500 dput+0x393/0x690 __fput+0x40e/0xa60 __x64_sys_close+0x78/0xd0 do_syscall_64+0x82/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? syscall_exit_to_user_mode+0x9f/0x190 ? do_syscall_64+0x8e/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 ? syscall_exit_to_user_mode+0x9f/0x190 ? do_syscall_64+0x8e/0x130 ? do_syscall_64+0x8e/0x130 ? lockdep_hardirqs_on_prepare+0x275/0x3e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f1e823178e0 Code: 0d 00 00 00 eb b2 e8 ff f7 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 80 3d 01 1d 0e 00 00 74 17 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 48 c3 0f 1f 80 00 00 00 00 48 83 ec 18 89 7c RSP: 002b:00007ffe16c2e108 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 000000000000001e RCX: 00007f1e823178e0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000001 RBP: 00007f1e8219bc08 R08: 0000000000000000 R09: 0000559037df64b0 R10: fe04b91e88691591 R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007ffe16c2e220 R15: 0000000000000001 </TASK>
Max Kellermann max.kellermann@ionos.com wrote:
It was not caused by my bad patch. Without my patch, but with your revert instead I just got a crash (this time, I enabled lots of debugging options in the kernel, including KASAN) - it's the same crash as in the post I linked in my previous email:
------------[ cut here ]------------ WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 ceph_put_wrbuffer_cap_refs+0x416/0x500
Is that "WARN_ON_ONCE(ci->i_auth_cap);" for you?
David
On Wed, Jul 31, 2024 at 12:41 PM David Howells dhowells@redhat.com wrote:
------------[ cut here ]------------ WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 ceph_put_wrbuffer_cap_refs+0x416/0x500
Is that "WARN_ON_ONCE(ci->i_auth_cap);" for you?
Yes, and that happens because no "capsnap" was found, because the "snapc" parameter is 0x356 (NETFS_FOLIO_COPY_TO_CACHE); no snap_context with address 0x356 could be found, of course.
Max
The attached patch gets me most of the way there when applied on top of the revert patch. See:
https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h...
There's still an occasional slab-use-after-free that pops up:
BUG: KASAN: slab-use-after-free in xa_head+0xe/0x70 Read of size 8 at addr ffff8881b2cf6df8 by task kworker/0:1/9 ... xa_head+0xe/0x70 xas_start+0xca/0x140 xas_load+0x16/0x110 xas_find+0x84/0x1f0 __fscache_clear_page_bits+0x136/0x340 ...
where the thing being allocated is a ceph inode.
Note that Hristo's patch is not sufficient.
David --- netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags
The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but the cache may still be in the process of setting up, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later.
The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating.
Fix this in the following way:
(1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request().
(2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set.
(3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set.
(4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so.
(5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state.
Just reverting ae678317b95e[1] isn't a sufficient fix, so this needs to be applied on top of that. Without this as well, things like:
rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: {
and:
WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386
may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion.
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann max.kellermann@ionos.com Signed-off-by: David Howells dhowells@redhat.com cc: Ilya Dryomov idryomov@gmail.com cc: Xiubo Li xiubli@redhat.com cc: Hristo Venev hristo@venev.name cc: Jeff Layton jlayton@kernel.org cc: Matthew Wilcox willy@infradead.org cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1]
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94d..cc0a2240de98 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0;
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..71cd70514efa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1f..79d83abb655b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h"
+/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + size_t account = 0; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error 
< 0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + kdebug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } }
+ /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false;
if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock();
+out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, }
/** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..0faea0cee179 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -57,10 +57,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. */ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..d35bb0f25d69 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -102,7 +102,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (fscache_operation_valid(&wreq->cache_resources)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..bf29a65c5027 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..e8adae1bc260 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2..983816608f15 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ };
/* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 24ec3434d32e..606b4a0f92da 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -51,6 +51,7 @@ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ + EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \
Okay, I updated to -rc2 and now the apparent UAF of a ceph inode doesn't seem to happen. I've fixed a couple of minor issues in the patch:
- Switched a kdebug() to a _debug().
- netfs_rreq_unlock_folios_pgpriv2() needed to update the 'account' variable in the caller, not do its own thing.
David --- netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags
The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but the cache may still be in the process of setting up, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later.
The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating.
Fix this in the following way:
(1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request().
(2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set.
(3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set.
(4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so.
(5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state.
Just reverting ae678317b95e[1] isn't a sufficient fix, so this needs to be applied on top of that. Without this as well, things like:
rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: {
and:
WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386
may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion.
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann max.kellermann@ionos.com Signed-off-by: David Howells dhowells@redhat.com cc: Ilya Dryomov idryomov@gmail.com cc: Xiubo Li xiubli@redhat.com cc: Hristo Venev hristo@venev.name cc: Jeff Layton jlayton@kernel.org cc: Matthew Wilcox willy@infradead.org cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] --- fs/ceph/addr.c | 3 + fs/ceph/inode.c | 2 fs/netfs/buffered_read.c | 125 ++++++++++++++++++++++++++++++++++++------- fs/netfs/objects.c | 4 - fs/netfs/write_issue.c | 2 fs/nfs/fscache.c | 2 fs/nfs/fscache.h | 2 include/linux/netfs.h | 3 - include/trace/events/netfs.h | 1 9 files changed, 114 insertions(+), 30 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94d..cc0a2240de98 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0;
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..71cd70514efa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1f..27c750d39476 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h"
+/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, + size_t *account) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + *account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 
0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } }
+ /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq, &account); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false;
if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock();
+out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, }
/** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..0faea0cee179 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -57,10 +57,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. */ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..d35bb0f25d69 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -102,7 +102,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (fscache_operation_valid(&wreq->cache_resources)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..bf29a65c5027 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..e8adae1bc260 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2..983816608f15 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ };
/* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 24ec3434d32e..606b4a0f92da 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -51,6 +51,7 @@ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ + EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \
Hi,
Thanks for your patch.
FYI: kernel test robot notices the stable kernel rule is not satisfied.
The check is based on https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html#opti...
Rule: add the tag "Cc: stable@vger.kernel.org" in the sign-off area to have the patch automatically included in the stable tree. Subject: [PATCH v2] netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags Link: https://lore.kernel.org/stable/1068323.1723117607%40warthog.procyon.org.uk
It turned out that I had accidentally disabled caching for 9p, afs and cifs, so here's a v3 that fixes that.
David --- netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags
The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but the cache may still be in the process of setting up, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later.
The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating.
Fix this in the following way:
(1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request().
(2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used, rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set.
(3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set.
(4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so.
(5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state.
Just reverting ae678317b95e[1] isn't a sufficient fix, so this needs to be applied on top of that. Without this as well, things like:
rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: {
and:
WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386
may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion.
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann max.kellermann@ionos.com Signed-off-by: David Howells dhowells@redhat.com cc: Ilya Dryomov idryomov@gmail.com cc: Xiubo Li xiubli@redhat.com cc: Hristo Venev hristo@venev.name cc: Jeff Layton jlayton@kernel.org cc: Matthew Wilcox willy@infradead.org cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] --- fs/ceph/addr.c | 3 + fs/ceph/inode.c | 2 fs/netfs/buffered_read.c | 125 ++++++++++++++++++++++++++++++++++++------- fs/netfs/objects.c | 10 --- fs/netfs/write_issue.c | 4 + fs/nfs/fscache.c | 2 fs/nfs/fscache.h | 2 include/linux/netfs.h | 3 - include/trace/events/netfs.h | 1 9 files changed, 116 insertions(+), 36 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94d..cc0a2240de98 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0;
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..71cd70514efa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1f..27c750d39476 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h"
+/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, + size_t *account) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + *account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 
0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } }
+ /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq, &account); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false;
if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock();
+out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, }
/** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..0294df70c3ff 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; - bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || - origin == NETFS_DIO_READ || - origin == NETFS_DIO_WRITE); - bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret;
for (;;) { @@ -56,12 +52,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, refcount_set(&rreq->ref, 1);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. */ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); - } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..3f7e37e50c7d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,6 +94,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITETHROUGH);
wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -102,7 +104,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..bf29a65c5027 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..e8adae1bc260 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2..983816608f15 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ };
/* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 24ec3434d32e..606b4a0f92da 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -51,6 +51,7 @@ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ + EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \
linux-stable-mirror@lists.linaro.org