From: Ira Weiny ira.weiny@intel.com
While evaluating the possibility of defining a new type for pkeys within the kernel I found a couple of minor bugs.
Because these patches clean up the return codes from system calls I'm sending this out RFC hoping that users will speak up if anything breaks.
I'm not too concerned about pkey_free() because it is unlikely that anyone is checking the return code. Interestingly enough, glibc recommends not calling pkey_free() because it does not change the access rights to the key and may be subsequently allocated again.[1][2]
The pkey_alloc() is more concerning. However, I checked the Chrome source and it does not differentiate among the return codes and maps all errors into kNoMemoryProtectionKey.
glibc says it returns ENOSYS if the system does not support pkeys but I don't see where ENOSYS is returned? AFAICS it just returns what the kernel returns. So it is probably up to user of glibc.
In addition I've enhanced the pkey tests to verify and test the changes.
Thanks to Rick Edgecombe and Sohil Mehta for internal review.
[1] Quote from manual/memory.texi:
Calling this function does not change the access rights of the freed protection key. The calling thread and other threads may retain access to it, even if it is subsequently allocated again. For this reason, it is not recommended to call the @code{pkey_free} function.
[2] PKS had a similar issue and went to statically allocated keys instead.
Ira Weiny (6): testing/pkeys: Add command line options testing/pkeys: Don't use uninitialized variable testing/pkeys: Add additional test for pkey_alloc() pkeys: Lift pkey hardware check for pkey_alloc() pkeys: Up level pkey_free() checks pkeys: Change mm_pkey_free() to void
arch/powerpc/include/asm/pkeys.h | 18 ++--- arch/x86/include/asm/pkeys.h | 7 +- include/linux/pkeys.h | 5 +- mm/mprotect.c | 13 +++- tools/testing/selftests/vm/pkey-helpers.h | 7 +- tools/testing/selftests/vm/protection_keys.c | 75 +++++++++++++++++--- 6 files changed, 86 insertions(+), 39 deletions(-)
base-commit: 874c8ca1e60b2c564a48f7e7acc40d328d5c8733
From: Ira Weiny ira.weiny@intel.com
It is more convenient to use command line options for debug and iterations vs changing the code and recompiling.
Add command line options for debug level and number of iterations.
$ ./protection_keys_64 -h Usage: ./protection_keys_64 [-h,-d,-i <iter>] --help,-h This help --debug,-d Increase debug level for each -d --iterations,-i <iter> repeate test <iter> times default: 22
Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Signed-off-by: Ira Weiny ira.weiny@intel.com --- tools/testing/selftests/vm/pkey-helpers.h | 7 +-- tools/testing/selftests/vm/protection_keys.c | 59 +++++++++++++++++--- 2 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 92f3be3dd8e5..7aaac1c8ebca 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -23,9 +23,8 @@
#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
-#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif +extern int debug_level; + #define DPRINT_IN_SIGNAL_BUF_SIZE 4096 extern int dprint_in_signal; extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; @@ -58,7 +57,7 @@ static inline void sigsafe_printf(const char *format, ...) } } #define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ + if (level <= debug_level) \ sigsafe_printf(args); \ } while (0) #define dprintf0(args...) dprintf_level(0, args) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 291bc1e07842..d0183c381859 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -44,9 +44,13 @@ #include <unistd.h> #include <sys/ptrace.h> #include <setjmp.h> +#include <getopt.h>
#include "pkey-helpers.h"
+#define DEFAULT_ITERATIONS 22 + +int debug_level; int iteration_nr = 1; int test_nr;
@@ -361,7 +365,7 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * here. */ dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); - if (DEBUG_LEVEL > 4) + if (debug_level > 4) dump_mem(pkey_reg_ptr - 128, 256); pkey_assert(*pkey_reg_ptr); #endif /* arch */ @@ -480,7 +484,7 @@ int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf2("SYS_mprotect_key sret: %d\n", sret); dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) + if (debug_level >= 2) perror("SYS_mprotect_pkey"); } return sret; @@ -1116,7 +1120,7 @@ void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) pkey_write_deny(pkey); ret = read(test_fd, ptr, 100); dprintf1("read ret: %d\n", ret); - if (ret < 0 && (DEBUG_LEVEL > 0)) + if (ret < 0 && (debug_level > 0)) perror("verbose read result (OK for this to be bad)"); pkey_assert(ret); } @@ -1155,7 +1159,7 @@ void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) pkey_write_deny(pkey); futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, &ignored, ignored); - if (DEBUG_LEVEL > 0) + if (debug_level > 0) perror("futex"); dprintf1("futex() ret: %d\n", futex_ret); } @@ -1626,11 +1630,52 @@ void pkey_setup_shadow(void) shadow_pkey_reg = __read_pkey_reg(); }
-int main(void) +static void print_help_and_exit(char *argv0) +{ + printf("Usage: %s [-h,-d,-i <iter>]\n", argv0); + printf(" --help,-h This help\n"); + printf(" --debug,-d Increase debug level for each -d\n"); + printf(" --iterations,-i <iter> repeate test <iter> times\n"); + printf(" default: %d\n", DEFAULT_ITERATIONS); + printf("\n"); +} + +int main(int argc, char *argv[]) { - int nr_iterations = 22; - int pkeys_supported = is_pkeys_supported(); + int nr_iterations = DEFAULT_ITERATIONS; + int pkeys_supported; + + while (1) { + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"debug", no_argument, 0, 'd' }, + {"iterations", required_argument, 0, 'i' }, + {0, 0, 0, 0 } + }; + int option_index = 0; + int c; + + c = getopt_long(argc, argv, "hdi:", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + print_help_and_exit(argv[0]); + return 0; + case 'd': + debug_level++; + break; + case 'i': + nr_iterations = strtoul(optarg, NULL, 0); + break; + default: + print_help_and_exit(argv[0]); + exit(-1); + } + }
+ pkeys_supported = is_pkeys_supported(); srand((unsigned int)time(NULL));
setup_handlers();
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
Add command line options for debug level and number of iterations.
$ ./protection_keys_64 -h Usage: ./protection_keys_64 [-h,-d,-i <iter>] --help,-h This help --debug,-d Increase debug level for each -d
Is this mechanism (of counting d's) commonplace in other selftests as well? Looking at the test code for pkeys the debug levels run from 1-5. That feels like quite a few d's to input :)
Would it be easier to input the number in the command line directly?
Either way it would be useful to know the debug range in the help. Maybe something like: --debug,-d Increase debug level for each -d (1-5)
The patch seems fine to me otherwise.
--iterations,-i <iter> repeate test <iter> times default: 22
Thanks, Sohil
On Mon, Jun 13, 2022 at 03:31:02PM -0700, Mehta, Sohil wrote:
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
Add command line options for debug level and number of iterations.
$ ./protection_keys_64 -h Usage: ./protection_keys_64 [-h,-d,-i <iter>] --help,-h This help --debug,-d Increase debug level for each -d
Is this mechanism (of counting d's) commonplace in other selftests as well? Looking at the test code for pkeys the debug levels run from 1-5. That feels like quite a few d's to input :)
I've seen (and used) it before yes. See ibnetdiscover.
... # Debugging flags -d raise the IB debugging level. May be used several times (-ddd or -d -d -d). ... -v increase the application verbosity level. May be used several times (-vv or -v -v -v) ... - https://linux.die.net/man/8/ibnetdiscover
But a much more mainstream example I can think of is verbosity level with lspci.
16:29:12 > lspci -h ... Display options: -v Be verbose (-vv or -vvv for higher verbosity) ...
Would it be easier to input the number in the command line directly?
Either way it would be useful to know the debug range in the help. Maybe something like: --debug,-d Increase debug level for each -d (1-5)
I'm inclined not to do this because it would encode the max debug level. On the other hand I'm not sure why there are 5 levels now. ;-)
Having the multiple options specified was an easy way to maintain the large number of levels.
Ira
The patch seems fine to me otherwise.
--iterations,-i <iter> repeate test <iter> times default: 22
Thanks, Sohil
From: Ira Weiny ira.weiny@intel.com
err was being used in test_pkey_alloc_exhaust() prior to being assigned. errno is useful to know after a failed alloc_pkey() call.
Change err to errno in the debug print.
Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Signed-off-by: Ira Weiny ira.weiny@intel.com --- tools/testing/selftests/vm/protection_keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index d0183c381859..43e47de19c0d 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1225,9 +1225,9 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + dprintf4("%s()::%d, errno: %d pkey_reg: 0x%016llx" " shadow: 0x%016llx\n", - __func__, __LINE__, err, __read_pkey_reg(), + __func__, __LINE__, errno, __read_pkey_reg(), shadow_pkey_reg); read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index d0183c381859..43e47de19c0d 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1225,9 +1225,9 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey();
dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
dprintf4("%s()::%d, errno: %d pkey_reg: 0x%016llx"
What is errno referring to over here? There are a few things happening in alloc_pkey(). I guess it would show the latest error that happened. Does errno need to be set to 0 before the call?
Also, would it be useful to print the return value (new_pkey) from alloc_pkey() here?
" shadow: 0x%016llx\n",
__func__, __LINE__, err, __read_pkey_reg(),
read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);__func__, __LINE__, errno, __read_pkey_reg(), shadow_pkey_reg);
Sohil
On Mon, Jun 13, 2022 at 03:48:56PM -0700, Mehta, Sohil wrote:
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index d0183c381859..43e47de19c0d 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1225,9 +1225,9 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey();
dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
dprintf4("%s()::%d, errno: %d pkey_reg: 0x%016llx"
What is errno referring to over here? There are a few things happening in alloc_pkey().
Good point, but the only system call in alloc_pkey() is pkey_alloc() so it will be the errno from there.
In test_pkey_alloc_exhaust() we are expecting the errno to be from pkey_alloc()
... if ((new_pkey == -1) && (errno == ENOSPC)) { ...
I guess it would show the latest error that happened. Does errno need to be set to 0 before the call?
Maybe. Now that I look again errno is printed just below at level 2.
dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
I missed that.
Also, would it be useful to print the return value (new_pkey) from alloc_pkey() here?
Yea that might be useful. Perhaps change err to new_pkey instead since errno is already printed.
Ira
" shadow: 0x%016llx\n",
__func__, __LINE__, err, __read_pkey_reg(),
read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);__func__, __LINE__, errno, __read_pkey_reg(), shadow_pkey_reg);
Sohil
From: Ira Weiny ira.weiny@intel.com
When pkeys are not available on the hardware pkey_alloc() has specific behavior which was previously untested.
Add test for this.
Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Signed-off-by: Ira Weiny ira.weiny@intel.com --- tools/testing/selftests/vm/protection_keys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 43e47de19c0d..4b733a75606f 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1554,6 +1554,16 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); }
+void test_pkey_alloc_on_unsupported_cpu(void) +{ + int test_pkey = sys_pkey_alloc(0, 0); + + dprintf1("pkey_alloc: %d (%d %s)\n", test_pkey, errno, + strerror(errno)); + pkey_assert(test_pkey < 0); + pkey_assert(errno == ENOSPC); +} + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1688,6 +1698,8 @@ int main(int argc, char *argv[])
printf("running PKEY tests for unsupported CPU/OS\n");
+ test_pkey_alloc_on_unsupported_cpu(); + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); assert(ptr != (void *)-1); test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
+void test_pkey_alloc_on_unsupported_cpu(void) +{
- int test_pkey = sys_pkey_alloc(0, 0);
- dprintf1("pkey_alloc: %d (%d %s)\n", test_pkey, errno,
strerror(errno));
- pkey_assert(test_pkey < 0);
- pkey_assert(errno == ENOSPC);
This assert fails on a kernel with CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS disabled.
Since pkey_alloc() is an architecture dependent syscall, ENOSYS is returned instead of ENOSPC when support is disabled at compile time. See kernel/sys_ni.c
This brings us to an interesting question.
Should we have different return error codes when compile support is disabled vs when runtime support is missing?
Here is the current behavior for pkey_alloc():
No compile time support -> return ENOSYS No runtime support (but compile time support present) -> return ENOSPC
I would think applications would prefer the same error code. But, I am not sure if we can achieve this now due to ABI reasons.
+}
On 6/16/22 12:25, Sohil Mehta wrote:
Should we have different return error codes when compile support is disabled vs when runtime support is missing?
It doesn't *really* matter. Programs have to be able to run on old kernels which will return ENOSYS. So, _when_ new kernels return ENOSYS or ENOSPC is pretty immaterial.
From: Ira Weiny ira.weiny@intel.com
pkey_alloc() is documented to return ENOSPC when the hardware does not support pkeys. On x86, pkey_alloc() incorrectly returns EINVAL.
This is because mm_pkey_alloc() does not check for pkey support before returning a key. Therefore, if the keys are not exhausted pkey_alloc() continues on to call arch_set_user_pkey_access(). Unfortunately, when arch_set_user_pkey_access() detects the failed support it overwrites the ENOSPC return value with EINVAL.
Ensure consistent behavior across architectures by lifting this check to the core mm code.
Remove a couple of 'we' references in code comments as well.
Cc: ahaas@chromium.org Cc: clemensb@chromium.org Cc: gdeepti@chromium.org Cc: jkummerow@chromium.org Cc: manoskouk@chromium.org Cc: thibaudm@chromium.org Cc: Florian Weimer fweimer@redhat.com Cc: Sohil Mehta sohil.mehta@intel.com Cc: Andrew Morton akpm@linux-foundation.org Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Cc: linux-api@vger.kernel.org Fixes: e8c24d3a23a4 ("x86/pkeys: Allocation/free syscalls") Signed-off-by: Ira Weiny ira.weiny@intel.com
--- Thanks to Sohil for pointing out that the commit message could be more clear WRT how EINVAL is returned incorrectly. --- arch/powerpc/include/asm/pkeys.h | 8 +++----- mm/mprotect.c | 3 +++ 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index 59a2c7dbc78f..2c8351248793 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -85,18 +85,16 @@ static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) static inline int mm_pkey_alloc(struct mm_struct *mm) { /* - * Note: this is the one and only place we make sure that the pkey is + * Note: this is the one and only place to make sure that the pkey is * valid as far as the hardware is concerned. The rest of the kernel * trusts that only good, valid pkeys come out of here. */ u32 all_pkeys_mask = (u32)(~(0x0)); int ret;
- if (!mmu_has_feature(MMU_FTR_PKEY)) - return -1; /* - * Are we out of pkeys? We must handle this specially because ffz() - * behavior is undefined if there are no zeros. + * Out of pkeys? Handle this specially because ffz() behavior is + * undefined if there are no zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..56d35de33725 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -773,6 +773,9 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) int pkey; int ret;
+ if (!arch_pkeys_enabled()) + return -ENOSPC; + /* No flags supported yet. */ if (flags) return -EINVAL;
diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..56d35de33725 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -773,6 +773,9 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) int pkey; int ret;
- if (!arch_pkeys_enabled())
return -ENOSPC;
See comments in patch 3/6. Since we are modifying (fixing) old behavior, should we just return ENOSYS to make this consistent?
Sohil
/* No flags supported yet. */ if (flags) return -EINVAL;
From: Ira Weiny ira.weiny@intel.com
x86 is missing a hardware check for pkey support in pkey_free(). While the net result is the same (-EINVAL returned), pkey_free() has well defined behavior which will be easier to maintain in one place.
For powerpc the return code is -1 rather than -EINVAL. This changes that behavior slightly but this is very unlikely to break any user space.
Lift the checks for pkey_free() to the core mm code and ensure consistency with returning -EINVAL.
Cc: ahaas@chromium.org Cc: clemensb@chromium.org Cc: gdeepti@chromium.org Cc: jkummerow@chromium.org Cc: manoskouk@chromium.org Cc: thibaudm@chromium.org Cc: Florian Weimer fweimer@redhat.com Cc: Andrew Morton akpm@linux-foundation.org Cc: linux-api@vger.kernel.org Cc: Sohil Mehta sohil.mehta@intel.com Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Signed-off-by: Ira Weiny ira.weiny@intel.com
--- Thanks to Sohil for suggesting I mention the powerpc return value in the commit message.
Also Sohil suggested changing mm_pkey_free() from int to void. This is added as a separate patch with his suggested by. --- arch/powerpc/include/asm/pkeys.h | 6 ------ arch/x86/include/asm/pkeys.h | 3 --- mm/mprotect.c | 8 ++++++-- 3 files changed, 6 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index 2c8351248793..e96aa91f817b 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -107,12 +107,6 @@ static inline int mm_pkey_alloc(struct mm_struct *mm)
static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - if (!mmu_has_feature(MMU_FTR_PKEY)) - return -1; - - if (!mm_pkey_is_allocated(mm, pkey)) - return -EINVAL; - __mm_pkey_free(mm, pkey);
return 0; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2e6c04d8a45b..da02737cc4d1 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -107,9 +107,6 @@ int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - if (!mm_pkey_is_allocated(mm, pkey)) - return -EINVAL; - mm_set_pkey_free(mm, pkey);
return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 56d35de33725..41458e729c27 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -803,10 +803,14 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
SYSCALL_DEFINE1(pkey_free, int, pkey) { - int ret; + int ret = -EINVAL; + + if (!arch_pkeys_enabled()) + return ret;
mmap_write_lock(current->mm); - ret = mm_pkey_free(current->mm, pkey); + if (mm_pkey_is_allocated(current->mm, pkey)) + ret = mm_pkey_free(current->mm, pkey); mmap_write_unlock(current->mm);
/*
Le 11/06/2022 à 01:35, ira.weiny@intel.com a écrit :
From: Ira Weiny ira.weiny@intel.com
x86 is missing a hardware check for pkey support in pkey_free(). While the net result is the same (-EINVAL returned), pkey_free() has well defined behavior which will be easier to maintain in one place.
For powerpc the return code is -1 rather than -EINVAL. This changes that behavior slightly but this is very unlikely to break any user space.
Lift the checks for pkey_free() to the core mm code and ensure consistency with returning -EINVAL.
Cc: ahaas@chromium.org Cc: clemensb@chromium.org Cc: gdeepti@chromium.org Cc: jkummerow@chromium.org Cc: manoskouk@chromium.org Cc: thibaudm@chromium.org Cc: Florian Weimer fweimer@redhat.com Cc: Andrew Morton akpm@linux-foundation.org Cc: linux-api@vger.kernel.org Cc: Sohil Mehta sohil.mehta@intel.com Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Signed-off-by: Ira Weiny ira.weiny@intel.com
Thanks to Sohil for suggesting I mention the powerpc return value in the commit message.
Also Sohil suggested changing mm_pkey_free() from int to void. This is added as a separate patch with his suggested by.
arch/powerpc/include/asm/pkeys.h | 6 ------ arch/x86/include/asm/pkeys.h | 3 --- mm/mprotect.c | 8 ++++++-- 3 files changed, 6 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index 2c8351248793..e96aa91f817b 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -107,12 +107,6 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) {
- if (!mmu_has_feature(MMU_FTR_PKEY))
return -1;
- if (!mm_pkey_is_allocated(mm, pkey))
return -EINVAL;
- __mm_pkey_free(mm, pkey);
return 0;
If it returns always 0, the return value is pointless and the function mm_pkey_free() should be changed to return void.
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2e6c04d8a45b..da02737cc4d1 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -107,9 +107,6 @@ int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) {
- if (!mm_pkey_is_allocated(mm, pkey))
return -EINVAL;
- mm_set_pkey_free(mm, pkey);
return 0;
Same.
diff --git a/mm/mprotect.c b/mm/mprotect.c index 56d35de33725..41458e729c27 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -803,10 +803,14 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) SYSCALL_DEFINE1(pkey_free, int, pkey) {
- int ret;
- int ret = -EINVAL;
Don't initialise 'ret'
- if (!arch_pkeys_enabled())
return ret;
Make it explicit, do 'return -EINVAL'
Once that is done, is there any point in having a fallback version of mm_pkey_free() which returns -EINVAL ?
mmap_write_lock(current->mm);
- ret = mm_pkey_free(current->mm, pkey);
- if (mm_pkey_is_allocated(current->mm, pkey))
ret = mm_pkey_free(current->mm, pkey);
Add:
else ret = -EINVAL;
mmap_write_unlock(current->mm); /*
From: Ira Weiny ira.weiny@intel.com
Now that the pkey arch support is no longer checked in mm_pkey_free() there is no reason to have it return int.
Change the return value to void.
Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Suggested-by: Sohil Mehta sohil.mehta@intel.com Signed-off-by: Ira Weiny ira.weiny@intel.com --- arch/powerpc/include/asm/pkeys.h | 4 +--- arch/x86/include/asm/pkeys.h | 4 +--- include/linux/pkeys.h | 5 +---- mm/mprotect.c | 6 ++++-- 4 files changed, 7 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index e96aa91f817b..4d01a48ab941 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -105,11 +105,9 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) return ret; }
-static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +static inline void mm_pkey_free(struct mm_struct *mm, int pkey) { __mm_pkey_free(mm, pkey); - - return 0; }
/* diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index da02737cc4d1..1f408f46fa9a 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -105,11 +105,9 @@ int mm_pkey_alloc(struct mm_struct *mm) }
static inline -int mm_pkey_free(struct mm_struct *mm, int pkey) +void mm_pkey_free(struct mm_struct *mm, int pkey) { mm_set_pkey_free(mm, pkey); - - return 0; }
static inline int vma_pkey(struct vm_area_struct *vma) diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 86be8bf27b41..bf98c50a3437 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -30,10 +30,7 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) return -1; }
-static inline int mm_pkey_free(struct mm_struct *mm, int pkey) -{ - return -EINVAL; -} +static inline void mm_pkey_free(struct mm_struct *mm, int pkey) { }
static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) diff --git a/mm/mprotect.c b/mm/mprotect.c index 41458e729c27..e872bdd2e228 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -809,8 +809,10 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) return ret;
mmap_write_lock(current->mm); - if (mm_pkey_is_allocated(current->mm, pkey)) - ret = mm_pkey_free(current->mm, pkey); + if (mm_pkey_is_allocated(current->mm, pkey)) { + mm_pkey_free(current->mm, pkey); + ret = 0; + } mmap_write_unlock(current->mm);
/*
Le 11/06/2022 à 01:35, ira.weiny@intel.com a écrit :
From: Ira Weiny ira.weiny@intel.com
Now that the pkey arch support is no longer checked in mm_pkey_free() there is no reason to have it return int.
Right, I see this is doing what I commented in previous patch.
Change the return value to void.
Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Suggested-by: Sohil Mehta sohil.mehta@intel.com Signed-off-by: Ira Weiny ira.weiny@intel.com
arch/powerpc/include/asm/pkeys.h | 4 +--- arch/x86/include/asm/pkeys.h | 4 +--- include/linux/pkeys.h | 5 +---- mm/mprotect.c | 6 ++++-- 4 files changed, 7 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index e96aa91f817b..4d01a48ab941 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -105,11 +105,9 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) return ret; } -static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +static inline void mm_pkey_free(struct mm_struct *mm, int pkey) { __mm_pkey_free(mm, pkey);
- return 0; }
/* diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index da02737cc4d1..1f408f46fa9a 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -105,11 +105,9 @@ int mm_pkey_alloc(struct mm_struct *mm) } static inline -int mm_pkey_free(struct mm_struct *mm, int pkey) +void mm_pkey_free(struct mm_struct *mm, int pkey) { mm_set_pkey_free(mm, pkey);
- return 0; }
static inline int vma_pkey(struct vm_area_struct *vma) diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 86be8bf27b41..bf98c50a3437 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -30,10 +30,7 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) return -1; } -static inline int mm_pkey_free(struct mm_struct *mm, int pkey) -{
- return -EINVAL;
-} +static inline void mm_pkey_free(struct mm_struct *mm, int pkey) { } static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) diff --git a/mm/mprotect.c b/mm/mprotect.c index 41458e729c27..e872bdd2e228 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -809,8 +809,10 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) return ret; mmap_write_lock(current->mm);
- if (mm_pkey_is_allocated(current->mm, pkey))
ret = mm_pkey_free(current->mm, pkey);
- if (mm_pkey_is_allocated(current->mm, pkey)) {
mm_pkey_free(current->mm, pkey);
ret = 0;
- }
Or you could have ret = 0 by default and do
if (mm_pkey_is_allocated(current->mm, pkey)) mm_pkey_free(current->mm, pkey); else ret = -EINVAL;
mmap_write_unlock(current->mm); /*
On Mon, Jun 13, 2022 at 09:17:06AM +0000, Christophe Leroy wrote:
Le 11/06/2022 à 01:35, ira.weiny@intel.com a écrit :
From: Ira Weiny ira.weiny@intel.com
Now that the pkey arch support is no longer checked in mm_pkey_free() there is no reason to have it return int.
Right, I see this is doing what I commented in previous patch.
Yes because it was suggested by Sohil I decided to make it a separate patch to make the credit easier.
diff --git a/mm/mprotect.c b/mm/mprotect.c index 41458e729c27..e872bdd2e228 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -809,8 +809,10 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) return ret; mmap_write_lock(current->mm);
- if (mm_pkey_is_allocated(current->mm, pkey))
ret = mm_pkey_free(current->mm, pkey);
- if (mm_pkey_is_allocated(current->mm, pkey)) {
mm_pkey_free(current->mm, pkey);
ret = 0;
- }
Or you could have ret = 0 by default and do
if (mm_pkey_is_allocated(current->mm, pkey)) mm_pkey_free(current->mm, pkey); else ret = -EINVAL;
Yes that fits the kernel style better.
Thanks for the review! Ira
mmap_write_unlock(current->mm); /*
On 6/10/2022 4:35 PM, ira.weiny@intel.com wrote:
glibc says it returns ENOSYS if the system does not support pkeys but I don't see where ENOSYS is returned? AFAICS it just returns what the kernel returns. So it is probably up to user of glibc.
Implementation of the pkeys system calls is arch specific and conditional. See kernel/sys_ni.c
glibc is probably talking about ENOSYS being returned when the architecture doesn't have support or the CONFIG option is disabled on supported architectures.
Thanks, Sohil
linux-kselftest-mirror@lists.linaro.org