From: Willy Tarreau <w(a)1wt.eu>
After re-checking in the spec and comparing stack offsets with glibc,
The last pushed argument must be 16-byte aligned (i.e. aligned before the
call) so that in the callee esp+4 is multiple of 16, so the principle is
the 32-bit equivalent to what Ammar fixed for x86_64. It's possible that
32-bit code using SSE2 or MMX could have been affected. In addition the
frame pointer ought to be zero at the deepest level.
Link: https://gitlab.com/x86-psABIs/i386-ABI/-/wikis/Intel386-psABI
Cc: Ammar Faizi <ammar.faizi(a)students.amikom.ac.id>
Cc: stable(a)vger.kernel.org
Signed-off-by: Willy Tarreau <w(a)1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
---
tools/include/nolibc/nolibc.h | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 96b6d56acb572..7f300dc379e70 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -583,13 +583,21 @@ struct sys_stat_struct {
})
/* startup code */
+/*
+ * i386 System V ABI mandates:
+ * 1) last pushed argument must be 16-byte aligned.
+ * 2) The deepest stack frame should be set to zero
+ *
+ */
asm(".section .text\n"
".global _start\n"
"_start:\n"
"pop %eax\n" // argc (first arg, %eax)
"mov %esp, %ebx\n" // argv[] (second arg, %ebx)
"lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
- "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when
+ "xor %ebp, %ebp\n" // zero the stack frame
+ "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before
+ "sub $4, %esp\n" // the call instruction (args are aligned)
"push %ecx\n" // push all registers on the stack so that we
"push %ebx\n" // support both regparm and plain stack modes
"push %eax\n"
--
2.31.1.189.g2e36527f23
From: Ammar Faizi <ammar.faizi(a)students.amikom.ac.id>
Before this patch, the `_start` function looks like this:
```
0000000000001170 <_start>:
1170: pop %rdi
1171: mov %rsp,%rsi
1174: lea 0x8(%rsi,%rdi,8),%rdx
1179: and $0xfffffffffffffff0,%rsp
117d: sub $0x8,%rsp
1181: call 1000 <main>
1186: movzbq %al,%rdi
118a: mov $0x3c,%rax
1191: syscall
1193: hlt
1194: data16 cs nopw 0x0(%rax,%rax,1)
119f: nop
```
Note the "and" to %rsp with $-16, it makes the %rsp be 16-byte aligned,
but then there is a "sub" with $0x8 which makes the %rsp no longer
16-byte aligned, then it calls main. That's the bug!
What actually the x86-64 System V ABI mandates is that right before the
"call", the %rsp must be 16-byte aligned, not after the "call". So the
"sub" with $0x8 here breaks the alignment. Remove it.
An example where this rule matters is when the callee needs to align
its stack at 16-byte for aligned move instruction, like `movdqa` and
`movaps`. If the callee can't align its stack properly, it will result
in segmentation fault.
x86-64 System V ABI also mandates the deepest stack frame should be
zero. Just to be safe, let's zero the %rbp on startup as the content
of %rbp may be unspecified when the program starts. Now it looks like
this:
```
0000000000001170 <_start>:
1170: pop %rdi
1171: mov %rsp,%rsi
1174: lea 0x8(%rsi,%rdi,8),%rdx
1179: xor %ebp,%ebp # zero the %rbp
117b: and $0xfffffffffffffff0,%rsp # align the %rsp
117f: call 1000 <main>
1184: movzbq %al,%rdi
1188: mov $0x3c,%rax
118f: syscall
1191: hlt
1192: data16 cs nopw 0x0(%rax,%rax,1)
119d: nopl (%rax)
```
Cc: Bedirhan KURT <windowz414(a)gnuweeb.org>
Cc: Louvian Lyndal <louvianlyndal(a)gmail.com>
Reported-by: Peter Cordes <peter(a)cordes.ca>
Signed-off-by: Ammar Faizi <ammar.faizi(a)students.amikom.ac.id>
[wt: I did this on purpose due to a misunderstanding of the spec, other
archs will thus have to be rechecked, particularly i386]
Cc: stable(a)vger.kernel.org
Signed-off-by: Willy Tarreau <w(a)1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
---
tools/include/nolibc/nolibc.h | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 3430667b0d241..96b6d56acb572 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -399,14 +399,20 @@ struct stat {
})
/* startup code */
+/*
+ * x86-64 System V ABI mandates:
+ * 1) %rsp must be 16-byte aligned right before the function call.
+ * 2) The deepest stack frame should be zero (the %rbp).
+ *
+ */
asm(".section .text\n"
".global _start\n"
"_start:\n"
"pop %rdi\n" // argc (first arg, %rdi)
"mov %rsp, %rsi\n" // argv[] (second arg, %rsi)
"lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
- "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when
- "sub $8, %rsp\n" // entering the callee
+ "xor %ebp, %ebp\n" // zero the stack frame
+ "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call
"call main\n" // main() returns the status code, we'll exit with it.
"movzb %al, %rdi\n" // retrieve exit code from 8 lower bits
"mov $60, %rax\n" // NR_exit == 60
--
2.31.1.189.g2e36527f23
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: c7719e79347803b8e3b6b50da8c6db410a3012b5
Gitweb: https://git.kernel.org/tip/c7719e79347803b8e3b6b50da8c6db410a3012b5
Author: Feng Tang <feng.tang(a)intel.com>
AuthorDate: Wed, 17 Nov 2021 10:37:50 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Thu, 02 Dec 2021 00:40:35 +01:00
x86/tsc: Add a timer to make sure TSC_adjust is always checked
The TSC_ADJUST register is checked every time a CPU enters idle state, but
Thomas Gleixner mentioned there is still a caveat that a system won't enter
idle [1], either because it's too busy or configured purposely to not enter
idle.
Setup a periodic timer (every 10 minutes) to make sure the check is
happening on a regular base.
[1] https://lore.kernel.org/lkml/875z286xtk.fsf@nanos.tec.linutronix.de/
Fixes: 6e3cd95234dc ("x86/hpet: Use another crystalball to evaluate HPET usability")
Requested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Feng Tang <feng.tang(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20211117023751.24190-1-feng.tang@intel.com
---
arch/x86/kernel/tsc_sync.c | 41 +++++++++++++++++++++++++++++++++++++-
1 file changed, 41 insertions(+)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515..9452dc9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: b50db7095fe002fa3e16605546cba66bf1b68a3e
Gitweb: https://git.kernel.org/tip/b50db7095fe002fa3e16605546cba66bf1b68a3e
Author: Feng Tang <feng.tang(a)intel.com>
AuthorDate: Wed, 17 Nov 2021 10:37:51 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Thu, 02 Dec 2021 00:40:36 +01:00
x86/tsc: Disable clocksource watchdog for TSC on qualified platorms
There are cases that the TSC clocksource is wrongly judged as unstable by
the clocksource watchdog mechanism which tries to validate the TSC against
HPET, PM_TIMER or jiffies. While there is hardly a general reliable way to
check the validity of a watchdog, Thomas Gleixner proposed [1]:
"I'm inclined to lift that requirement when the CPU has:
1) X86_FEATURE_CONSTANT_TSC
2) X86_FEATURE_NONSTOP_TSC
3) X86_FEATURE_NONSTOP_TSC_S3
4) X86_FEATURE_TSC_ADJUST
5) At max. 4 sockets
After two decades of horrors we're finally at a point where TSC seems
to be halfway reliable and less abused by BIOS tinkerers. TSC_ADJUST
was really key as we can now detect even small modifications reliably
and the important point is that we can cure them as well (not pretty
but better than all other options)."
As feature #3 X86_FEATURE_NONSTOP_TSC_S3 only exists on several generations
of Atom processorz, and is always coupled with X86_FEATURE_CONSTANT_TSC
and X86_FEATURE_NONSTOP_TSC, skip checking it, and also be more defensive
to use maximal 2 sockets.
The check is done inside tsc_init() before registering 'tsc-early' and
'tsc' clocksources, as there were cases that both of them had been
wrongly judged as unreliable.
For more background of tsc/watchdog, there is a good summary in [2]
[tglx} Update vs. jiffies:
On systems where the only remaining clocksource aside of TSC is jiffies
there is no way to make this work because that creates a circular
dependency. Jiffies accuracy depends on not missing a periodic timer
interrupt, which is not guaranteed. That could be detected by TSC, but as
TSC is not trusted this cannot be compensated. The consequence is a
circulus vitiosus which results in shutting down TSC and falling back to
the jiffies clocksource which is even more unreliable.
[1]. https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
[2]. https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
[ tglx: Refine comment and amend changelog ]
Fixes: 6e3cd95234dc ("x86/hpet: Use another crystalball to evaluate HPET usability")
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Feng Tang <feng.tang(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20211117023751.24190-2-feng.tang@intel.com
---
arch/x86/kernel/tsc.c | 28 ++++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a4..a698196 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();