From: Xiubo Li xiubli@redhat.com
When trimming the caps and just after the 'session->s_cap_lock' is
released in ceph_iterate_session_caps() the cap maybe removed by
another thread, and when using the stale cap memory in the callbacks
it will trigger use-after-free crash.
We need to check the existence of the cap just after the 'ci->i_ceph_lock'
being acquired. And do nothing if it's already removed.
Your patch seems to be OK, but I'll be honest: the locking is *so* complex
that I can say for sure it really solves any problem :-(
ceph_put_cap() uses mdsc->caps_list_lock to protect the list, but I can't
be sure that holding ci->i_ceph_lock will protect a race in the case
you're trying to solve.
--
Luís
> Cc: stable@vger.kernel.org
> URL:
https://bugzilla.redhat.com/show_bug.cgi?id=2186264
> URL:
https://tracker.ceph.com/issues/43272
> Signed-off-by: Xiubo Li
xiubli@redhat.com
> ---
>
> V3:
> - Fixed 3 issues, which reported by Luis and kernel test robot
lkp@intel.com
>
> fs/ceph/debugfs.c | 7 ++++-
> fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++++---------------
> fs/ceph/mds_client.h | 2 +-
> 3 files changed, 52 insertions(+), 25 deletions(-)
>
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index bec3c4549c07..5c0f07df5b02 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -248,14 +248,19 @@ static int metrics_caps_show(struct seq_file *s, void *p)
> return 0;
> }
>
> -static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
> +static int caps_show_cb(struct inode *inode, struct rb_node *ci_node, void *p)
> {
> + struct ceph_inode_info *ci = ceph_inode(inode);
> struct seq_file *s = p;
> + struct ceph_cap *cap;
>
> + spin_lock(&ci->i_ceph_lock);
> + cap = rb_entry(ci_node, struct ceph_cap, ci_node);
> seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
> cap->session->s_mds,
> ceph_cap_string(cap->issued),
> ceph_cap_string(cap->implemented));
> + spin_unlock(&ci->i_ceph_lock);
> return 0;
> }
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 294af79c25c9..fb777add2257 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1786,7 +1786,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
> * Caller must hold session s_mutex.
> */
> int ceph_iterate_session_caps(struct ceph_mds_session *session,
> - int (*cb)(struct inode *, struct ceph_cap *,
> + int (*cb)(struct inode *, struct rb_node *ci_node,
> void *), void *arg)
> {
> struct list_head *p;
> @@ -1799,6 +1799,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
> spin_lock(&session->s_cap_lock);
> p = session->s_caps.next;
> while (p != &session->s_caps) {
> + struct rb_node *ci_node;
> +
> cap = list_entry(p, struct ceph_cap, session_caps);
> inode = igrab(&cap->ci->netfs.inode);
> if (!inode) {
> @@ -1806,6 +1808,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
> continue;
> }
> session->s_cap_iterator = cap;
> + ci_node = &cap->ci_node;
> spin_unlock(&session->s_cap_lock);
>
> if (last_inode) {
> @@ -1817,7 +1820,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
> old_cap = NULL;
> }
>
> - ret = cb(inode, cap, arg);
> + ret = cb(inode, ci_node, arg);
> last_inode = inode;
>
> spin_lock(&session->s_cap_lock);
> @@ -1850,20 +1853,26 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
> return ret;
> }
>
> -static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> +static int remove_session_caps_cb(struct inode *inode, struct rb_node *ci_node,
> void *arg)
> {
> struct ceph_inode_info *ci = ceph_inode(inode);
> bool invalidate = false;
> - int iputs;
> + struct ceph_cap *cap;
> + int iputs = 0;
>
> - dout("removing cap %p, ci is %p, inode is %p\n",
> - cap, ci, &ci->netfs.inode);
> spin_lock(&ci->i_ceph_lock);
> - iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
> + cap = rb_entry(ci_node, struct ceph_cap, ci_node);
> + if (cap) {
> + dout(" removing cap %p, ci is %p, inode is %p\n",
> + cap, ci, &ci->netfs.inode);
> +
> + iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
> + }
> spin_unlock(&ci->i_ceph_lock);
>
> - wake_up_all(&ci->i_cap_wq);
> + if (cap)
> + wake_up_all(&ci->i_cap_wq);
> if (invalidate)
> ceph_queue_invalidate(inode);
> while (iputs--)
> @@ -1934,8 +1943,7 @@ enum {
> *
> * caller must hold s_mutex.
> */
> -static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
> - void *arg)
> +static int wake_up_session_cb(struct inode *inode, struct rb_node *ci_node, void *arg)
> {
> struct ceph_inode_info *ci = ceph_inode(inode);
> unsigned long ev = (unsigned long)arg;
> @@ -1946,12 +1954,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
> ci->i_requested_max_size = 0;
> spin_unlock(&ci->i_ceph_lock);
> } else if (ev == RENEWCAPS) {
> - if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
> - /* mds did not re-issue stale cap */
> - spin_lock(&ci->i_ceph_lock);
> + struct ceph_cap *cap;
> +
> + spin_lock(&ci->i_ceph_lock);
> + cap = rb_entry(ci_node, struct ceph_cap, ci_node);
> + /* mds did not re-issue stale cap */
> + if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
> cap->issued = cap->implemented = CEPH_CAP_PIN;
> - spin_unlock(&ci->i_ceph_lock);
> - }
> + spin_unlock(&ci->i_ceph_lock);
> } else if (ev == FORCE_RO) {
> }
> wake_up_all(&ci->i_cap_wq);
> @@ -2113,16 +2123,22 @@ static bool drop_negative_children(struct dentry *dentry)
> * Yes, this is a bit sloppy. Our only real goal here is to respond to
> * memory pressure from the MDS, though, so it needn't be perfect.
> */
> -static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
> +static int trim_caps_cb(struct inode *inode, struct rb_node *ci_node, void *arg)
> {
> int *remaining = arg;
> struct ceph_inode_info *ci = ceph_inode(inode);
> int used, wanted, oissued, mine;
> + struct ceph_cap *cap;
>
> if (*remaining <= 0)
> return -1;
>
> spin_lock(&ci->i_ceph_lock);
> + cap = rb_entry(ci_node, struct ceph_cap, ci_node);
> + if (!cap) {
> + spin_unlock(&ci->i_ceph_lock);
> + return 0;
> + }
> mine = cap->issued | cap->implemented;
> used = __ceph_caps_used(ci);
> wanted = __ceph_caps_file_wanted(ci);
> @@ -4265,26 +4281,23 @@ static struct dentry* d_find_primary(struct inode *inode)
> /*
> * Encode information about a cap for a reconnect with the MDS.
> */
> -static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> +static int reconnect_caps_cb(struct inode *inode, struct rb_node *ci_node,
> void *arg)
> {
> union {
> struct ceph_mds_cap_reconnect v2;
> struct ceph_mds_cap_reconnect_v1 v1;
> } rec;
> - struct ceph_inode_info *ci = cap->ci;
> + struct ceph_inode_info *ci = ceph_inode(inode);
> struct ceph_reconnect_state *recon_state = arg;
> struct ceph_pagelist *pagelist = recon_state->pagelist;
> struct dentry *dentry;
> + struct ceph_cap *cap;
> char *path;
> - int pathlen = 0, err;
> + int pathlen = 0, err = 0;
> u64 pathbase;
> u64 snap_follows;
>
> - dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
> - inode, ceph_vinop(inode), cap, cap->cap_id,
> - ceph_cap_string(cap->issued));
> -
> dentry = d_find_primary(inode);
> if (dentry) {
> /* set pathbase to parent dir when msg_version >= 2 */
> @@ -4301,6 +4314,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> }
>
> spin_lock(&ci->i_ceph_lock);
> + cap = rb_entry(ci_node, struct ceph_cap, ci_node);
> + if (!cap) {
> + spin_unlock(&ci->i_ceph_lock);
> + goto out_err;
> + }
> + dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
> + inode, ceph_vinop(inode), cap, cap->cap_id,
> + ceph_cap_string(cap->issued));
> +
> cap->seq = 0; /* reset cap seq */
> cap->issue_seq = 0; /* and issue_seq */
> cap->mseq = 0; /* and migrate_seq */
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0f70ca3cdb21..001b69d04307 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -569,7 +569,7 @@ extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
> extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
> extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
> int (*cb)(struct inode *,
> - struct ceph_cap *, void *),
> + struct rb_node *ci_node, void *),
> void *arg);
> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>
> --
>
> 2.39.2
>