From: Peter Zijlstra peterz@infradead.org
commit 8019ad13ef7f64be44d4f892af9c840179009254 upstream.
As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier.
This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue.
Reported-by: Jann Horn jannh@google.com Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
--- fs/inode.c | 1 include/linux/fs.h | 1 include/linux/futex.h | 17 +++++---- kernel/futex.c | 89 +++++++++++++++++++++++++++++--------------------- 4 files changed, 65 insertions(+), 43 deletions(-)
--- a/fs/inode.c +++ b/fs/inode.c @@ -135,6 +135,7 @@ int inode_init_always(struct super_block inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -645,6 +645,7 @@ struct inode { struct rcu_head i_rcu; }; u64 i_version; + atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -34,23 +34,26 @@ long do_futex(u32 __user *uaddr, int op,
union futex_key { struct { + u64 i_seq; unsigned long pgoff; - struct inode *inode; - int offset; + unsigned int offset; } shared; struct { + union { + struct mm_struct *mm; + u64 __tmp; + }; unsigned long address; - struct mm_struct *mm; - int offset; + unsigned int offset; } private; struct { + u64 ptr; unsigned long word; - void *ptr; - int offset; + unsigned int offset; } both; };
-#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
#ifdef CONFIG_FUTEX enum { --- a/kernel/futex.c +++ b/kernel/futex.c @@ -445,7 +445,7 @@ static void get_futex_key_refs(union fut
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -479,7 +479,6 @@ static void drop_futex_key_refs(union fu
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -487,6 +486,46 @@ static void drop_futex_key_refs(union fu } }
+/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -499,9 +538,15 @@ static void drop_futex_key_refs(union fu * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -641,8 +686,6 @@ again: key->private.mm = mm; key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode;
@@ -674,40 +717,14 @@ again: goto again; }
- /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); }
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err;