On Mon, 2024-08-19 at 20:24 +0100, Mark Brown wrote:
[snip]
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 059685612362..42b2b18de20d 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -191,44 +191,103 @@ void reset_thread_features(void)
 	current->thread.features_locked = 0;
 }
 
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
-				       unsigned long stack_size)
+int arch_shstk_validate_clone(struct task_struct *t,
+			      struct vm_area_struct *vma,
+			      struct page *page,
+			      struct kernel_clone_args *args)
+{
+	/*
+	 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
+	 * the token 64-bit.
+	 */
+	void *maddr = kmap_local_page(page);
+	int offset;
+	unsigned long addr, ssp;
+	u64 expected;
+	u64 val;
+
+	if (!features_enabled(ARCH_SHSTK_SHSTK))
+		return 0;
+
+	ssp = args->shadow_stack + args->shadow_stack_size;
+	addr = ssp - SS_FRAME_SIZE;
+	expected = ssp | BIT(0);
+	offset = offset_in_page(ssp);
+
+	/* This should really be an atomic cmpxchg. It is not. */
+	copy_from_user_page(vma, page, addr, &val, maddr + offset,
+			    sizeof(val));
We're so close to the real cmpxchg at this point. I took a shot at it with the diff at the end. I'm not sure whether it might also need some of the instrumentation calls.
+	if (val != expected)
+		return false;
Returning false from a function that returns int is 0 (i.e. success), so I think it might be covering up a bug. The gup happens at args->shadow_stack + args->shadow_stack_size - 1 (the size treated as inclusive), but the copy offset is taken from offset_in_page(ssp), i.e. the size-exclusive end.

So with shadow_stack_size = PAGE_SIZE on a page-aligned shadow stack, offset_in_page(ssp) is 0, and it will try to read the token from the start of the shadow stack instead of the end; the comparison then fails, but 'return false' reports that failure as success. I think...
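Just to make the arithmetic concrete, here is a throwaway userspace sketch of where the two offsets land in that case (addresses and constants are made up / hard-coded for illustration, not taken from the kernel headers):

#include <stdio.h>

#define PAGE_SIZE		0x1000UL
#define SS_FRAME_SIZE		8UL
#define offset_in_page(p)	((p) & (PAGE_SIZE - 1))

int main(void)
{
	/* Page-aligned shadow stack of exactly one page (made-up address). */
	unsigned long shadow_stack = 0x7f0000001000UL;
	unsigned long size = PAGE_SIZE;
	unsigned long ssp = shadow_stack + size;
	unsigned long addr = ssp - SS_FRAME_SIZE;

	/* gup pins the page containing shadow_stack + size - 1, i.e. the only page here. */
	printf("offset used for the copy: %#lx\n", offset_in_page(ssp));	/* 0 -> start of the page */
	printf("offset of the token:      %#lx\n", offset_in_page(addr));	/* 0xff8 -> end of the page */

	return 0;
}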
On another note, I think we need to verify that ssp is 8-byte aligned, or the 64-bit access could be made to run a few bytes past the end of the pinned page into the adjacent direct map page. At least I didn't see how it was prevented.
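Untested sketch of the kind of check I mean, on top of arch_shstk_validate_clone() (placement and choice of error code are just guesses):

 	ssp = args->shadow_stack + args->shadow_stack_size;
+	if (!IS_ALIGNED(ssp, 8))
+		return -EINVAL;
+
 	addr = ssp - SS_FRAME_SIZE;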
+	val = 0;
+
+	copy_to_user_page(vma, page, addr, maddr + offset, &val, sizeof(val));
+	set_page_dirty_lock(page);
+
+	return 0;
+}
[snip]
+static int shstk_validate_clone(struct task_struct *p,
+				struct kernel_clone_args *args)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long addr;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK))
+		return 0;
+
+	if (!args->shadow_stack)
+		return 0;
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return -EFAULT;
+
+	mmap_read_lock(mm);
+
+	/*
+	 * All current shadow stack architectures have tokens at the
+	 * top of a downward growing shadow stack.
+	 */
+	addr = args->shadow_stack + args->shadow_stack_size - 1;
+	addr = untagged_addr_remote(mm, addr);
+
+	page = get_user_page_vma_remote(mm, addr, FOLL_FORCE | FOLL_WRITE,
+					&vma);
+	if (IS_ERR(page)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!(vma->vm_flags & VM_SHADOW_STACK)) {
Can we check VM_WRITE here too? At least on x86, shadow stacks can be mprotect()ed as read-only. The reason for allowing that originally I think fell out of the implementation details, but all the same it would be nice to be consistent. Then it should behave identically to a real shadow stack access.
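i.e. something like this (untested):

-	if (!(vma->vm_flags & VM_SHADOW_STACK)) {
+	if (!(vma->vm_flags & VM_SHADOW_STACK) ||
+	    !(vma->vm_flags & VM_WRITE)) {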
+		ret = -EFAULT;
+		goto out_page;
+	}
+
+	ret = arch_shstk_validate_clone(p, vma, page, args);
+
+out_page:
+	put_page(page);
+out:
+	mmap_read_unlock(mm);
+	mmput(mm);
+	return ret;
+}
[snip]
+/**
+ * clone3_shadow_stack_valid - check and prepare shadow stack
+ * @kargs: kernel clone args
+ *
+ * Verify that shadow stacks are only enabled if supported.
+ */
+static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
+{
+	if (kargs->shadow_stack) {
+		if (!kargs->shadow_stack_size)
+			return false;
+
+		if (kargs->shadow_stack_size < SHADOW_STACK_SIZE_MIN)
+			return false;
+
+		if (kargs->shadow_stack_size > rlimit(RLIMIT_STACK))
+			return false;
At the risk of asking a stupid question or one that I should have asked a long time ago...
Why do we need both shadow_stack and shadow_stack_size? We are basically asking the kernel to consume a token at a pointer and to let userspace manage the shadow stack itself. So why does the kernel care what size it is? Couldn't we just have 'shadow_stack', and have that mean 'consume a token here'?
+		/*
+		 * The architecture must check support on the specific
+		 * machine.
+		 */
+		return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
+	} else {
+		return !kargs->shadow_stack_size;
+	}
+}
Fixing some of the mentioned bugs, this on top passed the selftests for me. It doesn't have the 8-byte alignment check I mentioned, because I'm less sure about that one; I might just be missing where it is already handled.
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 42b2b18de20d..2685180b8c5c 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -204,7 +204,6 @@ int arch_shstk_validate_clone(struct task_struct *t,
 	int offset;
 	unsigned long addr, ssp;
 	u64 expected;
-	u64 val;
 
 	if (!features_enabled(ARCH_SHSTK_SHSTK))
 		return 0;
@@ -212,17 +211,12 @@ int arch_shstk_validate_clone(struct task_struct *t,
 	ssp = args->shadow_stack + args->shadow_stack_size;
 	addr = ssp - SS_FRAME_SIZE;
 	expected = ssp | BIT(0);
-	offset = offset_in_page(ssp);
+	offset = offset_in_page(addr);
 
-	/* This should really be an atomic cmpxchg. It is not. */
-	copy_from_user_page(vma, page, addr, &val, maddr + offset,
-			    sizeof(val));
+	if (!cmpxchg_to_user_page(vma, page, addr, (unsigned long *)(maddr + offset),
+				  expected, 0))
+		return -EINVAL;
 
-	if (val != expected)
-		return false;
-	val = 0;
-
-	copy_to_user_page(vma, page, addr, maddr + offset, &val, sizeof(val));
 	set_page_dirty_lock(page);
 
 	return 0;
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 7ee8a179d103..1500d49bc3f7 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -124,4 +124,15 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
 } while (0)
 #endif
 
+#ifndef cmpxchg_to_user_page
+#define cmpxchg_to_user_page(vma, page, vaddr, ptr, old, new)		\
+({									\
+	bool ret;							\
+									\
+	ret = try_cmpxchg(ptr, &old, new);				\
+	flush_icache_user_page(vma, page, vaddr, sizeof(*ptr));	\
+	ret;								\
+})
+#endif
+
 #endif /* _ASM_GENERIC_CACHEFLUSH_H */