This series attempts to make the fsgsbase test in the x86 kselftest report a stable result. On some Intel systems there are intermittent failures in this testcase which have been reported and discussed previously with the initial report and last meaningful discussion having been about a year ago:
https://lore.kernel.org/lkml/20180126153631.ha7yc33fj5uhitjo@xps/
with the analysis concluding that this is a hardware issue affecting a subset of systems but no fix has been merged as yet. In order to at least make the test more solid for use in automated testing this series modifies it to execute the test often enough to reproduce the problem reliably, at least for the systems I have access to.
Mark Brown (2): selftests/x86/fsgsbase: Indirect output through a wrapper function selftests/x86/fsgsbase: Default to trying to run the test repeatedly
tools/testing/selftests/x86/fsgsbase.c | 79 +++++++++++++++++++------- 1 file changed, 58 insertions(+), 21 deletions(-)
In preparation for a change to make this test run repeatedly which would generate huge amounts of output as is indirect all the printf() calls in the program through a wrapper and add a quiet flag which can be used to suppress the output. This is fairly quick and dirty, I'm not 100% sure what would be idiomatic here.
Signed-off-by: Mark Brown broonie@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 52 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 20 deletions(-)
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index f249e042b3b5..6cda6daa1f8c 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -7,6 +7,7 @@ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> +#include <stdarg.h> #include <stdbool.h> #include <string.h> #include <sys/syscall.h> @@ -32,6 +33,17 @@ static volatile sig_atomic_t want_segv; static volatile unsigned long segv_addr;
static int nerrs; +static bool quiet; + +static void logger(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + if (!quiet) + vprintf(fmt, args); + va_end(args); +}
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) @@ -118,29 +130,29 @@ static void check_gs_value(unsigned long value) unsigned long base; unsigned short sel;
- printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value); + logger("[RUN]\tARCH_SET_GS to 0x%lx\n", value); if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0) err(1, "ARCH_SET_GS");
asm volatile ("mov %%gs, %0" : "=rm" (sel)); base = read_base(GS); if (base == value) { - printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", + logger("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", sel); } else { nerrs++; - printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", + logger("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", base, sel); }
if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0) err(1, "ARCH_GET_GS"); if (base == value) { - printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", + logger("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", sel); } else { nerrs++; - printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", + logger("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", base, sel); } } @@ -149,7 +161,7 @@ static void mov_0_gs(unsigned long initial_base, bool schedule) { unsigned long base, arch_base;
- printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); + logger("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0) err(1, "ARCH_SET_GS");
@@ -161,10 +173,10 @@ static void mov_0_gs(unsigned long initial_base, bool schedule) if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0) err(1, "ARCH_GET_GS"); if (base == arch_base) { - printf("[OK]\tGSBASE is 0x%lx\n", base); + logger("[OK]\tGSBASE is 0x%lx\n", base); } else { nerrs++; - printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); + logger("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); } }
@@ -195,7 +207,7 @@ static void do_remote_base()
unsigned short sel; asm volatile ("mov %%gs, %0" : "=rm" (sel)); - printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", + logger("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", to_set, hard_zero ? " and clear gs" : "", sel); }
@@ -228,7 +240,7 @@ void do_unexpected_base(void) .useable = 0 }; if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { - printf("\tother thread: using LDT slot 0\n"); + logger("\tother thread: using LDT slot 0\n"); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); } else { /* No modify_ldt for us (configured out, perhaps) */ @@ -250,10 +262,10 @@ void do_unexpected_base(void) munmap(low_desc, sizeof(desc));
if (ret != 0) { - printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); + logger("[NOTE]\tcould not create a segment -- test won't do anything\n"); return; } - printf("\tother thread: using GDT slot %d\n", desc.entry_number); + logger("\tother thread: using GDT slot %d\n", desc.entry_number); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); }
@@ -298,10 +310,10 @@ static void set_gs_and_switch_to(unsigned long local, local = 0; }
- printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n", + logger("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n", local, hard_zero ? " and clear gs" : "", remote); if (force_sel) - printf("\tBefore schedule, set selector to 0x%hx\n", force_sel); + logger("\tBefore schedule, set selector to 0x%hx\n", force_sel); if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0) err(1, "ARCH_SET_GS"); if (hard_zero) @@ -309,7 +321,7 @@ static void set_gs_and_switch_to(unsigned long local,
if (read_base(GS) != local) { nerrs++; - printf("[FAIL]\tGSBASE wasn't set as expected\n"); + logger("[FAIL]\tGSBASE wasn't set as expected\n"); }
if (force_sel) { @@ -335,11 +347,11 @@ static void set_gs_and_switch_to(unsigned long local, asm volatile ("mov %%gs, %0" : "=rm" (sel_post_sched)); base = read_base(GS); if (base == local && sel_pre_sched == sel_post_sched) { - printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n", + logger("[OK]\tGS/BASE remained 0x%hx/0x%lx\n", sel_pre_sched, local); } else { nerrs++; - printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n", + logger("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n", sel_pre_sched, local, sel_post_sched, base); } } @@ -348,7 +360,7 @@ static void test_unexpected_base(void) { unsigned long base;
- printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n"); + logger("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n"); if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0) err(1, "ARCH_SET_GS"); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); @@ -360,10 +372,10 @@ static void test_unexpected_base(void)
base = read_base(GS); if (base == 0) { - printf("[OK]\tGSBASE remained 0\n"); + logger("[OK]\tGSBASE remained 0\n"); } else { nerrs++; - printf("[FAIL]\tGSBASE changed to 0x%lx\n", base); + logger("[FAIL]\tGSBASE changed to 0x%lx\n", base); } }
* Mark Brown broonie@kernel.org wrote:
In preparation for a change to make this test run repeatedly which would generate huge amounts of output as is indirect all the printf() calls in the program through a wrapper and add a quiet flag which can be used to suppress the output. This is fairly quick and dirty, I'm not 100% sure what would be idiomatic here.
Signed-off-by: Mark Brown broonie@kernel.org
tools/testing/selftests/x86/fsgsbase.c | 52 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 20 deletions(-)
So this really loses useful output when there is a failure. After this patch I get:
dagon:~/tip/tools/testing/selftests/x86> ./fsgsbase_64 -vv [FAIL] 24 errors detected in 1 tries
Not very helpful, and no real command-line way to see the failures.
While previously it would tell me what's going on:
dagon:~/tip/tools/testing/selftests/x86> ./fsgsbase_64 | grep FAIL [FAIL] GS/BASE changed from 0x1/0x1 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x1 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x1 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x1 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x1 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x1 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x1 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x1 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x1 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x1 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x1 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x1 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x200000000 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x200000000 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x200000000 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x200000000 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x200000000 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x200000000 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x200000000 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x200000000 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x200000000 to 0x3/0x0 [FAIL] GS/BASE changed from 0x1/0x200000000 to 0x1/0x0 [FAIL] GS/BASE changed from 0x2/0x200000000 to 0x2/0x0 [FAIL] GS/BASE changed from 0x3/0x200000000 to 0x3/0x0
So I believe at least failure messages need to be preserved.
Thanks,
Ingo
On Mon, Feb 11, 2019 at 09:45:43AM +0100, Ingo Molnar wrote:
- Mark Brown broonie@kernel.org wrote:
In preparation for a change to make this test run repeatedly which would generate huge amounts of output as is indirect all the printf() calls in the program through a wrapper and add a quiet flag which can be used to suppress the output. This is fairly quick and dirty, I'm not 100% sure what would be idiomatic here.
So this really loses useful output when there is a failure. After this patch I get:
dagon:~/tip/tools/testing/selftests/x86> ./fsgsbase_64 -vv [FAIL] 24 errors detected in 1 tries
Not very helpful, and no real command-line way to see the failures.
For clarity it's the second patch that breaks this, though you're right this isn't helpful and this patch needs a log level adding to really fix that.
In automated testing it has been found that on many systems the fsgsbase test fails intermittently. This was reported and discussed a while back:
https://lore.kernel.org/lkml/20180126153631.ha7yc33fj5uhitjo@xps/
with the analysis concluding that this is a hardware issue affecting a subset of systems but no fix has been merged as yet. As well as the actual problem found by testing the intermittent test failure is causing issues for the people doing the automated testing due to the noise.
In order to make the testing stable modify the test program to iterate through the test repeatedly, choosing 5000 iterations based on prior reports and local testing. This unfortunately greatly increases the execution time for the selftests when things succeed which isn't great, in my local tests on a range of systems it pushes the execution time up to approximately a minute when no failures are encountered.
Reported-by: Dan Rue dan.rue@linaro.org Signed-off-by: Mark Brown broonie@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 6cda6daa1f8c..83410749ff1f 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -379,7 +379,7 @@ static void test_unexpected_base(void) } }
-int main() +int test() { pthread_t thread;
@@ -437,3 +437,28 @@ int main()
return nerrs == 0 ? 0 : 1; } + +int main() +{ + int tries = 5000; + int i; + + if (tries > 1) + quiet = true; + + for (i = 0; i < tries; i++) { + if (test() != 0) + break; + } + + if (quiet) { + if (nerrs) { + printf("[FAIL] %d errors detected in %d tries\n", + nerrs, i + 1); + } else { + printf("[PASS] %d runs succeeded\n", i); + } + } + + return nerrs == 0 ? 0 : 1; +}
* Mark Brown broonie@kernel.org wrote:
In automated testing it has been found that on many systems the fsgsbase test fails intermittently. This was reported and discussed a while back:
https://lore.kernel.org/lkml/20180126153631.ha7yc33fj5uhitjo@xps/
with the analysis concluding that this is a hardware issue affecting a subset of systems but no fix has been merged as yet. As well as the actual problem found by testing the intermittent test failure is causing issues for the people doing the automated testing due to the noise.
In order to make the testing stable modify the test program to iterate through the test repeatedly, choosing 5000 iterations based on prior reports and local testing. This unfortunately greatly increases the execution time for the selftests when things succeed which isn't great, in my local tests on a range of systems it pushes the execution time up to approximately a minute when no failures are encountered.
Reported-by: Dan Rue dan.rue@linaro.org Signed-off-by: Mark Brown broonie@kernel.org
tools/testing/selftests/x86/fsgsbase.c | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 6cda6daa1f8c..83410749ff1f 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -379,7 +379,7 @@ static void test_unexpected_base(void) } } -int main() +int test() { pthread_t thread; @@ -437,3 +437,28 @@ int main() return nerrs == 0 ? 0 : 1; }
+int main() +{
- int tries = 5000;
- int i;
- if (tries > 1)
quiet = true;
- for (i = 0; i < tries; i++) {
if (test() != 0)
break;
- }
- if (quiet) {
if (nerrs) {
printf("[FAIL] %d errors detected in %d tries\n",
nerrs, i + 1);
} else {
printf("[PASS] %d runs succeeded\n", i);
}
- }
- return nerrs == 0 ? 0 : 1;
+}
So this isn't very user-friendly either, previously it would run a testcase and immediately provide output.
Now it's just starting and 'hanging':
galatea:~/linux/linux/tools/testing/selftests/x86> ./fsgsbase_64
I got bored and Ctrl-C-ed it after ~30 seconds.
How long is this supposed to run, and why isn't the user informed?
Also, testcases should really be short, so I think a better approach would be to thread the test-case and start an instance on every CPU. That should also excercise SMP bugs, if any.
Thanks,
Ingo
On Mon, Feb 11, 2019 at 09:49:16AM +0100, Ingo Molnar wrote:
So this isn't very user-friendly either, previously it would run a testcase and immediately provide output.
Now it's just starting and 'hanging':
galatea:~/linux/linux/tools/testing/selftests/x86> ./fsgsbase_64
I got bored and Ctrl-C-ed it after ~30 seconds.
How long is this supposed to run, and why isn't the user informed?
On Intel systems I've got access to it's tended to only run for less than 10 seconds for me with excursions up to ~30s at most, I'd have projected it to be about a minute if the tests pass. However retesting with Debian's v4.19 kernel it seems to be running a lot more stably so we're now seeing it run to completion reliably when just one copy of the test is running.
AFAICT it's not terribly idiomatic to provide much output, and anything that was per iteration would be *way* too spammy.
Also, testcases should really be short, so I think a better approach would be to thread the test-case and start an instance on every CPU. That should also excercise SMP bugs, if any.
Well, a *better* approach would be for the underlying issue that the test is finding to be fixed.
I didn't look at adding more threads as the test case is already threaded, it does seem that running multiple copies simultaneously makes things reproduce more quickly so it's definitely useful though it's still taking multiple iterations.
* Mark Brown broonie@kernel.org wrote:
On Mon, Feb 11, 2019 at 09:49:16AM +0100, Ingo Molnar wrote:
So this isn't very user-friendly either, previously it would run a testcase and immediately provide output.
Now it's just starting and 'hanging':
galatea:~/linux/linux/tools/testing/selftests/x86> ./fsgsbase_64
I got bored and Ctrl-C-ed it after ~30 seconds.
How long is this supposed to run, and why isn't the user informed?
On Intel systems I've got access to it's tended to only run for less than 10 seconds for me with excursions up to ~30s at most, I'd have projected it to be about a minute if the tests pass. However retesting with Debian's v4.19 kernel it seems to be running a lot more stably so we're now seeing it run to completion reliably when just one copy of the test is running.
AFAICT it's not terribly idiomatic to provide much output, and anything that was per iteration would be *way* too spammy.
Certainly - but a "please wait" and updating the current count via \r once every second isn't spammy.
Also, testcases should really be short, so I think a better approach would be to thread the test-case and start an instance on every CPU. That should also excercise SMP bugs, if any.
Well, a *better* approach would be for the underlying issue that the test is finding to be fixed.
I didn't look at adding more threads as the test case is already threaded, it does seem that running multiple copies simultaneously makes things reproduce more quickly so it's definitely useful though it's still taking multiple iterations.
multiple iterations are fine - waiting a minute with zero output on the console isn't.
Thanks,
Ingo
linux-kselftest-mirror@lists.linaro.org