Given that the entry_*.S changes for this functionality are somewhat
tricky, make sure the paths are tested every boot, instead of on the
rare occasion when we trip an INT3 while rewriting text.
Getting the INT3 frame setup even slightly wrong will make this come
unstuck in spectacular fashion.
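
For reference, the helpers exercised by the selftest look roughly like the
sketch below (modelled on the int3_emulate_*() helpers in
asm/text-patching.h; a sketch for illustration, not the authoritative
definitions). The virtual PUSH in int3_emulate_push() is exactly what
requires the stack gap:

/* Sketch of the INT3 emulation helpers, cf. asm/text-patching.h. */
static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
{
        regs->ip = ip;
}

static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
        /*
         * Grow the interrupted context's stack; only safe because the
         * entry code leaves a gap below the INT3 IRET frame, which the
         * selftest verifies.
         */
        regs->sp -= sizeof(unsigned long);
        *(unsigned long *)regs->sp = val;
}

static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
        int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
        int3_emulate_jmp(regs, func);
}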
Requested-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
---
arch/x86/kernel/alternative.c | 81 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 77 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4db9c0d29bc1..2b853c2ab894 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,83 @@ extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+static void __init int3_magic(unsigned int *ptr)
+{
+ *ptr = 1;
+}
+
+extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
+
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
+
+ if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
+ return NOTIFY_DONE;
+
+ int3_emulate_call(regs, (unsigned long)&int3_magic);
+ return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_magic(&val); but really complicated :-)
+ *
+ * Stick the address of the INT3 instruction into int3_selftest_ip,
+ * then trigger the INT3, padded with NOPs to match a CALL instruction
+ * length.
+ */
+ asm volatile ("1: int3; nop; nop; nop; nop\n\t"
+ ".pushsection .init.data,\"aw\"\n\t"
+ ".align " __ASM_SEL(4, 8) "\n\t"
+ ".type int3_selftest_ip, @object\n\t"
+ ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
+ "int3_selftest_ip:\n\t"
+ __ASM_SEL(.long, .quad) " 1b\n\t"
+ ".popsection\n\t"
+ : : __ASM_SEL_RAW(a, D) (&val) : "memory");
+
+ BUG_ON(val != 1);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
+
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();
/*
@@ -642,10 +714,11 @@ void __init alternative_instructions(void)
_text, _etext);
}
- if (!uniproc_patched || num_possible_cpus() == 1)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
Introduce in-kernel headers which are made available as an archive
through proc (/proc/kheaders.tar.xz file). This archive makes it
possible to run eBPF and other tracing programs that need to extend the
kernel for tracing purposes without any dependency on the file system
having headers.
A GitHub PR has been sent for the corresponding BCC patch at:
https://github.com/iovisor/bcc/pull/2312
On Android and embedded systems, it is common to switch kernels but not
have kernel headers available on the file system. Further, once a
different kernel is booted, any headers stored on the file system will
no longer be useful. This is an issue well known even to distros.
By storing the headers as a compressed archive within the kernel, we can
avoid these issues that have been a hindrance for a long time.
The best way to use this feature is by building it in. Several users
have a need for this: when they switch debug kernels, they do not want
to update the filesystem or worry about where to store the headers on
it. However, the feature can also be built as a module in case the user
does not want it to be part of the kernel image. This makes it possible
to load and unload the headers from memory on demand. A tracing program
can load the module, do its operations, and then unload the module to
save kernel memory. The total memory needed is 3.3MB.
By having the archive available at a fixed location independent of
filesystem dependencies and conventions, all debugging tools can
directly refer to it, without concerning themselves with where the
headers live on a typical filesystem. This significantly simplifies
tooling that needs kernel headers.
The code to read the headers is based on /proc/config.gz code and uses
the same technique to embed the headers.
Other approaches were discussed, such as having an in-memory mountable
filesystem, but that has drawbacks such as requiring an in-kernel xz
decompressor, which we don't have today, and requiring 42 MB of kernel
memory to host the decompressed headers at any time. This approach is
also simpler than those alternatives.
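
As a usage illustration, a tracing tool could consume the archive along
the lines of this hypothetical sketch (the paths and shelled-out commands
are examples, not part of this patch):

/* Hypothetical consumer: unpack /proc/kheaders.tar.xz for a build. */
#include <stdlib.h>

int main(void)
{
        if (system("mkdir -p /tmp/kheaders") != 0)
                return EXIT_FAILURE;
        if (system("tar -xJf /proc/kheaders.tar.xz -C /tmp/kheaders") != 0)
                return EXIT_FAILURE;
        /* Point the eBPF/tracing toolchain at /tmp/kheaders from here. */
        return EXIT_SUCCESS;
}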
Reviewed-by: Masahiro Yamada <yamada.masahiro(a)socionext.com>
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
(Just a resend with Masahiro's Reviewed-by tag added)
v6 -> v7:
- Minor nits from Masahiro Yamada are addressed.
v5 -> v6: (Masahiro Yamada suggestions mostly)
- Dropped support for module building.
- Rebuild archive if script changes.
- Move archive file list to script.
- Move build script to kernel directory.
v4 -> v5:
(v4 was Tested-by the following folks)
Tested-by: qais.yousef(a)arm.com
Tested-by: dietmar.eggemann(a)arm.com
Tested-by: linux(a)manojrajarao.com
(Thanks to Masahiro Yamada for several excellent suggestions)
- used incbin instead of bin2c (Masahiro had a similar idea)
- added module.lds for ia64, otherwise ia64 may fail to build.
- added clean-files rule to Makefile
- removed strip-comments script, doing it inline instead
- added set -e to the header generation script to die on errors
- fixed a minor issue where the find command was noisy.
- removed unneeded tar.xz rule from kernel/.gitignore
- added Tested-by tags from ARM folks.
Changes since v3:
- A blank tar was being generated because of a one-line change I
forgot to push. It is updated now.
- Added module.lds since arm64 needs it to build modules.
Changes since v2:
(Thanks to Masahiro Yamada for several excellent suggestions)
- Added support for out of tree builds.
- Added incremental build support bringing down build time of
incremental builds from 50 seconds to 5 seconds.
- Fixed various small nits / cleanups.
- clean ups to kheaders.c pointed by Alexey Dobriyan.
- Fixed MODULE_LICENSE in test module and kheaders.c
- Dropped Module.symvers from archive due to circular dependency.
Changes since v1:
- removed IKH_EXTRA variable, not needed (Masahiro Yamada)
- small fix ups to selftest
- added target to main Makefile etc
- added MODULE_LICENSE to test module
- made selftest more quiet
Changes since RFC:
Both changes bring size down to 3.8MB:
- use xz for compression
- strip comments except SPDX lines
- Call out the module name in Kconfig
- Also added selftests in second patch to ensure headers are always
working.
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
init/Kconfig | 10 +++++
kernel/.gitignore | 1 +
kernel/Makefile | 10 +++++
kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++
kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++
5 files changed, 184 insertions(+)
create mode 100755 kernel/gen_ikh_data.sh
create mode 100644 kernel/kheaders.c
diff --git a/init/Kconfig b/init/Kconfig
index 4592bf7997c0..47c0db6e63a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -580,6 +580,16 @@ config IKCONFIG_PROC
This option enables access to the kernel configuration file
through /proc/config.gz.
+config IKHEADERS_PROC
+ tristate "Enable kernel header artifacts through /proc/kheaders.tar.xz"
+ depends on PROC_FS
+ help
+ This option enables access to the kernel headers and other artifacts that
+ are generated during the build process. These can be used to build eBPF
+ tracing programs, or similar programs. If you build the headers as a
+ module, a module called kheaders.ko is built which can be loaded on-demand
+ to get access to the headers.
+
config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
range 12 25
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
#
# Generated files
#
+kheaders.md5
timeconst.h
hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..12399614c350 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
+
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
+
+quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
+cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
+$(obj)/kheaders_data.tar.xz: FORCE
+ $(call cmd,genikh)
+
+clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh
new file mode 100755
index 000000000000..591a94f7b387
--- /dev/null
+++ b/kernel/gen_ikh_data.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This script generates an archive consisting of kernel headers
+# for CONFIG_IKHEADERS_PROC.
+set -e
+spath="$(dirname "$(readlink -f "$0")")"
+kroot="$spath/.."
+outdir="$(pwd)"
+tarfile=$1
+cpio_dir=$outdir/$tarfile.tmp
+
+# Script filename relative to the kernel source root
+# We add it to the archive because it is small and any changes
+# to this script will also cause a rebuild of the archive.
+sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
+
+src_file_list="
+include/
+arch/$SRCARCH/include/
+$sfile
+"
+
+obj_file_list="
+include/
+arch/$SRCARCH/include/
+"
+
+# Support incremental builds by skipping archive generation
+# if timestamps of files being archived are not changed.
+
+# This block is useful for debugging the incremental builds.
+# Uncomment it for debugging.
+# iter=1
+# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter;
+# else iter=$(($(cat /tmp/iter) + 1)); fi
+# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
+# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
+
+# include/generated/compile.h is ignored because it is touched even when none
+# of the source files changed. This causes pointless regeneration, so let us
+# ignore it for the md5 calculation.
+pushd $kroot > /dev/null
+src_files_md5="$(find $src_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+popd > /dev/null
+obj_files_md5="$(find $obj_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+
+if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
+if [ -f kernel/kheaders.md5 ] &&
+ [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
+ exit
+fi
+
+if [ "${quiet}" != "silent_" ]; then
+ echo " GEN $tarfile"
+fi
+
+rm -rf $cpio_dir
+mkdir $cpio_dir
+
+pushd $kroot > /dev/null
+for f in $src_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir
+popd > /dev/null
+
+# The second CPIO can complain if files already exist, which can
+# happen with out-of-tree builds. Just silence CPIO for now.
+for f in $obj_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+
+# Remove comments except SPDX lines
+find $cpio_dir -type f -print0 |
+ xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+
+tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
+
+echo "$src_files_md5" > kernel/kheaders.md5
+echo "$obj_files_md5" >> kernel/kheaders.md5
+echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+
+rm -rf $cpio_dir
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..70ae6052920d
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel headers useful to build tracing programs
+ * such as for running eBPF tracing tools.
+ *
+ * (Borrowed code from kernel/configs.c)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+
+/*
+ * Define kernel_headers_data and kernel_headers_data_end, within which the
+ * compressed kernel headers are stored. The file is first compressed with xz.
+ */
+
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .global kernel_headers_data \n"
+"kernel_headers_data: \n"
+" .incbin \"kernel/kheaders_data.tar.xz\" \n"
+" .global kernel_headers_data_end \n"
+"kernel_headers_data_end: \n"
+" .popsection \n"
+);
+
+extern char kernel_headers_data;
+extern char kernel_headers_data_end;
+
+static ssize_t
+ikheaders_read_current(struct file *file, char __user *buf,
+ size_t len, loff_t *offset)
+{
+ return simple_read_from_buffer(buf, len, offset,
+ &kernel_headers_data,
+ &kernel_headers_data_end -
+ &kernel_headers_data);
+}
+
+static const struct file_operations ikheaders_file_ops = {
+ .read = ikheaders_read_current,
+ .llseek = default_llseek,
+};
+
+static int __init ikheaders_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ /* create the current headers file */
+ entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
+ &ikheaders_file_ops);
+ if (!entry)
+ return -ENOMEM;
+
+ proc_set_size(entry,
+ &kernel_headers_data_end -
+ &kernel_headers_data);
+ return 0;
+}
+
+static void __exit ikheaders_cleanup(void)
+{
+ remove_proc_entry("kheaders.tar.xz", NULL);
+}
+
+module_init(ikheaders_init);
+module_exit(ikheaders_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joel Fernandes");
+MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
--
2.21.0.593.g511ec345e18-goog
In one of my rcutorture tests the TSC clocksource got marked unstable
due to a large difference in the TSC value. I'm not sure if the guest
ran for a long time with interrupts disabled or if the host was very
busy and didn't schedule the guest for some time.
I took a look at the qemu/KVM options and decided to update them:
- Use kvm{32|64} as the CPU. We could probably use `host' (like ARM
  does) for maximum available features, but since we don't run any
  userland, I'm not sure if it makes any difference.
- Drop the "noapic" option and enable the TSC deadline timer. There is
  no history of why the APIC was disabled; I see no reason for it. The
  deadline timer is probably "nicer".
- Additional config options. These ensure that the kernel knows that
  it runs as a KVM guest and can use virt devices like the kvm-clock
  as a clocksource. The kvm-clock was the main motivation here.
- I didn't add a random HW device. It would make the random device
  ready earlier (as it is now, it doesn't complete its initialisation
  at all), but I doubt that there is any need for this.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
---
tools/testing/selftests/rcutorture/bin/functions.sh | 13 ++++++++++++-
.../selftests/rcutorture/configs/rcu/CFcommon | 4 ++++
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 6bcb8b5b2ff22..be3c5c73d7e79 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -172,7 +172,7 @@ identify_qemu_append () {
local console=ttyS0
case "$1" in
qemu-system-x86_64|qemu-system-i386)
- echo noapic selinux=0 initcall_debug debug
+ echo selinux=0 initcall_debug debug
;;
qemu-system-aarch64)
console=ttyAMA0
@@ -191,8 +191,19 @@ identify_qemu_append () {
# Output arguments for qemu arguments based on the TORTURE_QEMU_MAC
# and TORTURE_QEMU_INTERACTIVE environment variables.
identify_qemu_args () {
+ local KVM_CPU=""
+ case "$1" in
+ qemu-system-x86_64)
+ KVM_CPU=kvm64
+ ;;
+ qemu-system-i386)
+ KVM_CPU=kvm32
+ ;;
+ esac
case "$1" in
qemu-system-x86_64|qemu-system-i386)
+ echo -machine q35,accel=kvm
+ echo -cpu ${KVM_CPU},x2apic=on,tsc-deadline=on,hypervisor=on,tsc_adjust=on
;;
qemu-system-aarch64)
echo -machine virt,gic-version=host -cpu host
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index d2d2a86139db1..322d5d40443cd 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,2 +1,6 @@
CONFIG_RCU_TORTURE_TEST=y
CONFIG_PRINTK_TIME=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_PARAVIRT_SPINLOCKS=y
+CONFIG_KVM_GUEST=y
--
2.20.1
From: Peter Zijlstra <peterz(a)infradead.org>
Nicolai Stange discovered[1] that if live kernel patching is enabled, and the
function tracer started tracing the same function that was patched, the
conversion of the fentry call site during the transition from calling the
live kernel patch trampoline to the iterator trampoline would have a slight
window where it didn't call anything.
As live kernel patching depends on ftrace always calling its code (to
prevent the function being traced from being called, as it will redirect
it), this small window would allow the old buggy function to be called, and
this can cause undesirable results.
Nicolai submitted new patches[2], but these were controversial, as this is
similar to the static call emulation issues that came up a while ago[3].
After some debate[4][5], it turned out that adding a gap in the stack when
entering the breakpoint handler allows for pushing the return address onto
the stack to easily emulate a call.
[1] http://lkml.kernel.org/r/20180726104029.7736-1-nstange@suse.de
[2] http://lkml.kernel.org/r/20190427100639.15074-1-nstange@suse.de
[3] http://lkml.kernel.org/r/3cf04e113d71c9f8e4be95fb84a510f085aa4afa.154171145…
[4] http://lkml.kernel.org/r/CAHk-=wh5OpheSU8Em_Q3Hg8qw_JtoijxOdPtHru6d+5K8TWM=…
[5] http://lkml.kernel.org/r/CAHk-=wjvQxY4DvPrJ6haPgAa6b906h=MwZXO6G8OtiTGe=N7_…
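
The window exists because x86 rewrites a call site in stages. Below is a
hedged sketch of that flow; ftrace_write() and run_sync() approximate
helpers in arch/x86/kernel/ftrace.c, and the function itself is
illustrative only:

/*
 * While the breakpoint from step 1 is armed, any CPU hitting the site
 * traps into ftrace_int3_handler(). The old handler simply skipped the
 * instruction -- the window this patch closes by emulating the call.
 */
static void rewrite_call_site(unsigned long ip, const char *new_insn)
{
        const unsigned char int3 = 0xcc;

        ftrace_write(ip, (const char *)&int3, 1);   /* 1) arm breakpoint */
        run_sync();                                 /* sync all CPUs */
        ftrace_write(ip + 1, new_insn + 1,          /* 2) patch the tail */
                     MCOUNT_INSN_SIZE - 1);
        run_sync();
        ftrace_write(ip, new_insn, 1);              /* 3) new first byte */
        run_sync();
}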
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Nicolai Stange <nstange(a)suse.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: the arch/x86 maintainers <x86(a)kernel.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: Jiri Kosina <jikos(a)kernel.org>
Cc: Miroslav Benes <mbenes(a)suse.cz>
Cc: Petr Mladek <pmladek(a)suse.com>
Cc: Joe Lawrence <joe.lawrence(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Cc: Mimi Zohar <zohar(a)linux.ibm.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Nayna Jain <nayna(a)linux.ibm.com>
Cc: Masahiro Yamada <yamada.masahiro(a)socionext.com>
Cc: Joerg Roedel <jroedel(a)suse.de>
Cc: "open list:KERNEL SELFTEST FRAMEWORK" <linux-kselftest(a)vger.kernel.org>
Cc: stable(a)vger.kernel.org
Fixes: b700e7f03df5 ("livepatch: kernel: add support for live patching")
Signed-off-by: *** Need SoB From Peter Zijlstra ***
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
arch/x86/kernel/ftrace.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..fd152f5a937b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
}
static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;
static int update_ftrace_func(unsigned long ip, void *new)
{
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;
+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -294,13 +298,21 @@ int ftrace_int3_handler(struct pt_regs *regs)
if (WARN_ON_ONCE(!regs))
return 0;
- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
- return 0;
+ ip = regs->ip - INT3_INSN_SIZE;
- regs->ip += MCOUNT_INSN_SIZE - 1;
+ if (ftrace_location(ip)) {
+ int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ if (!ftrace_update_func_call) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+ int3_emulate_call(regs, ftrace_update_func_call);
+ return 1;
+ }
- return 1;
+ return 0;
}
NOKPROBE_SYMBOL(ftrace_int3_handler);
@@ -859,6 +871,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
func = ftrace_ops_get_func(ops);
+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +974,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
+ ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
return update_ftrace_func(ip, new);
--
2.20.1
On Mon, Apr 29, 2019 at 12:13 PM Linus Torvalds
<torvalds(a)linux-foundation.org> wrote:
>
>
>
> On Mon, Apr 29, 2019, 12:02 Linus Torvalds <torvalds(a)linux-foundation.org> wrote:
>>
>>
>>
>> If nmi were to break it, it would be a cpu bug.
>
>
> Side note: we *already* depend on sti shadow working in other parts of the kernel, namely sti->iret.
>
Where? STI; IRET would be nuts.
Before:
commit 4214a16b02971c60960afd675d03544e109e0d75
Author: Andy Lutomirski <luto(a)kernel.org>
Date: Thu Apr 2 17:12:12 2015 -0700
x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER
we did sti; sysexit, but, when we discussed this, I don't recall anyone
speaking up in favor of the safety of the old code.
Not to mention that the crash we'll get if we get an NMI and a
rescheduling interrupt in this path will be very, very hard to debug.
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Nicolai Stange discovered[1] that if live kernel patching is enabled, and the
function tracer started tracing the same function that was patched, the
conversion of the fentry call site during the transition from calling the
live kernel patch trampoline to the iterator trampoline would have a slight
window where it didn't call anything.
As live kernel patching depends on ftrace always calling its code (to
prevent the function being traced from being called, as it will redirect
it), this small window would allow the old buggy function to be called, and
this can cause undesirable results.
Nicolai submitted new patches[2], but these were controversial. As this is
similar to the static call emulation issues that came up a while ago[3],
Linus suggested using per CPU data along with special trampolines[4] to emulate
the calls.
Linus's solution was for text poke (which was mostly what the static_call
code did), but as ftrace has its own mechanism, it required doing its own
thing.
Having ftrace use its own per CPU data and its own set of specialized
trampolines solves the issue of missed calls that live kernel patching
suffers from.
[1] http://lkml.kernel.org/r/20180726104029.7736-1-nstange@suse.de
[2] http://lkml.kernel.org/r/20190427100639.15074-1-nstange@suse.de
[3] http://lkml.kernel.org/r/3cf04e113d71c9f8e4be95fb84a510f085aa4afa.154171145…
[4] http://lkml.kernel.org/r/CAHk-=wh5OpheSU8Em_Q3Hg8qw_JtoijxOdPtHru6d+5K8TWM=…
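
The push/push/ret construct at the heart of these trampolines can be
demonstrated in isolation. The user-space toy below (hypothetical,
x86-64 with GCC/Clang only, not kernel code) shows that pushing a return
address, pushing a target, and executing RET behaves like a CALL:

/* User-space toy: push return address, push target, ret == call. */
#include <stdio.h>

static void target(void)
{
        puts("called via push/push/ret");
}

int main(void)
{
        asm volatile("lea 1f(%%rip), %%rax\n\t" /* return address */
                     "push %%rax\n\t"
                     "push %0\n\t"              /* call target */
                     "ret\n\t"                  /* "calls" target */
                     "1:"
                     : : "r"(target) : "rax", "memory");
        return 0;
}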
Inspired-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
Fixes: b700e7f03df5 ("livepatch: kernel: add support for live patching")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
Changes since v2:
- Moved inline asm to ftrace_64.S
- Used PER_CPU_VAR() and TRACE_IRQS_ON macros in assembly
- Renamed trampolines to be a little more coherent
- Created assembly version of STACK_FRAME_NON_STANDARD() in asm/frame.h
- Call ftrace_regs_caller instead of ftrace_caller
- No longer support 32 bit (it crashed badly on tests, and I couldn't figure it out)
arch/x86/include/asm/frame.h | 15 ++++++
arch/x86/kernel/ftrace.c | 102 ++++++++++++++++++++++++++++++++++-
arch/x86/kernel/ftrace_64.S | 56 +++++++++++++++++++
3 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 5cbce6fbb534..04892b374b93 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -42,4 +42,19 @@
#endif /* CONFIG_FRAME_POINTER */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_STACK_VALIDATION
+#define STACK_FRAME_NON_STANDARD(func) \
+.section .discard.func_stack_frame_non_standard,"aw"; \
+.align 8; \
+.type __func_stack_frame_non_standard_##func, @object; \
+.size __func_stack_frame_non_standard_##func, 8; \
+__func_stack_frame_non_standard_##func: \
+.quad func; \
+.previous
+#else /* !CONFIG_STACK_VALIDATION */
+#define STACK_FRAME_NON_STANDARD(func)
+#endif /* CONFIG_STACK_VALIDATION */
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_FRAME_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..634fc0d4fe97 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -232,6 +232,9 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
static unsigned long ftrace_update_func;
+/* Used by the emulation trampolines in ftrace_64.S */
+unsigned long ftrace_update_func_call;
+
static int update_ftrace_func(unsigned long ip, void *new)
{
unsigned char old[MCOUNT_INSN_SIZE];
@@ -259,6 +262,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;
+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -280,6 +285,46 @@ static nokprobe_inline int is_ftrace_caller(unsigned long ip)
return 0;
}
+#ifdef CONFIG_X86_64
+/*
+ * We need to handle the "call func1" -> "call func2" case.
+ * Just skipping the call is not sufficient as it will be like
+ * changing to "nop" first and then updating the call. But some
+ * users of ftrace require calls never to be missed.
+ *
+ * To emulate the call while converting the call site with a breakpoint,
+ * some trampolines are used along with per CPU buffers.
+ * There are three trampolines for the call sites and three trampolines
+ * for the updating of the call in ftrace trampoline. The three
+ * trampolines are:
+ *
+ * 1) Interrupts are enabled when the breakpoint is hit
+ * 2) Interrupts are disabled when the breakpoint is hit
+ * 3) The breakpoint was hit in an NMI
+ *
+ * As per CPU data is used, interrupts must be disabled to prevent them
+ * from corrupting the data. A separate NMI trampoline is used for the
+ * NMI case. If interrupts are already disabled, then the return address
+ * of where the breakpoint was hit (saved in the per CPU data) is pushed
+ * on the stack and then a jump to either the ftrace_caller (which will
+ * loop through all registered ftrace_ops handlers depending on the ip
+ * address), or if it's an ftrace trampoline call update, it will call
+ * ftrace_update_func_call which will hold the call that should be
+ * called.
+ */
+extern asmlinkage void ftrace_emulate_call_sti(void);
+extern asmlinkage void ftrace_emulate_call(void);
+extern asmlinkage void ftrace_emulate_call_nmi(void);
+extern asmlinkage void ftrace_emulate_call_update_sti(void);
+extern asmlinkage void ftrace_emulate_call_update(void);
+extern asmlinkage void ftrace_emulate_call_update_nmi(void);
+
+DEFINE_PER_CPU(void *, ftrace_bp_call_return);
+DEFINE_PER_CPU(void *, ftrace_bp_call_nmi_return);
+
+/* To hold the ftrace_regs_caller address to push on the stack */
+void *ftrace_caller_func = (void *)ftrace_regs_caller;
+
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -291,6 +336,58 @@ int ftrace_int3_handler(struct pt_regs *regs)
{
unsigned long ip;
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ ip = regs->ip - 1;
+ if (ftrace_location(ip)) {
+ /* A breakpoint at the beginning of the function was hit */
+ if (in_nmi()) {
+ /* NMIs have their own trampoline */
+ this_cpu_write(ftrace_bp_call_nmi_return, (void *)ip + MCOUNT_INSN_SIZE);
+ regs->ip = (unsigned long) ftrace_emulate_call_nmi;
+ return 1;
+ }
+ this_cpu_write(ftrace_bp_call_return, (void *)ip + MCOUNT_INSN_SIZE);
+ if (regs->flags & X86_EFLAGS_IF) {
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long) ftrace_emulate_call_sti;
+ /* Tell lockdep here we are enabling interrupts */
+ lockdep_hardirqs_on(_THIS_IP_);
+ } else {
+ regs->ip = (unsigned long) ftrace_emulate_call;
+ }
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ /* An ftrace trampoline is being updated */
+ if (!ftrace_update_func_call) {
+ /* If it's a jump, just need to skip it */
+ regs->ip += MCOUNT_INSN_SIZE - 1;
+ return 1;
+ }
+ if (in_nmi()) {
+ /* NMIs have their own trampoline */
+ this_cpu_write(ftrace_bp_call_nmi_return, (void *)ip + MCOUNT_INSN_SIZE);
+ regs->ip = (unsigned long) ftrace_emulate_call_update_nmi;
+ return 1;
+ }
+ this_cpu_write(ftrace_bp_call_return, (void *)ip + MCOUNT_INSN_SIZE);
+ if (regs->flags & X86_EFLAGS_IF) {
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long) ftrace_emulate_call_update_sti;
+ } else {
+ regs->ip = (unsigned long) ftrace_emulate_call_update;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+#else /* !X86_64 */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ unsigned long ip;
+
if (WARN_ON_ONCE(!regs))
return 0;
@@ -299,9 +396,9 @@ int ftrace_int3_handler(struct pt_regs *regs)
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
-
return 1;
}
+#endif
NOKPROBE_SYMBOL(ftrace_int3_handler);
static int ftrace_write(unsigned long ip, const char *val, int size)
@@ -859,6 +956,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
func = ftrace_ops_get_func(ops);
+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +1059,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
+ ftrace_update_func_call = 0;
new = ftrace_jmp_replace(ip, (unsigned long)func);
return update_ftrace_func(ip, new);
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 75f2b36b41a6..8642e1719370 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -9,6 +9,9 @@
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/irqflags.h>
+#include <asm/percpu.h>
+#include <asm/frame.h>
.code64
.section .entry.text, "ax"
@@ -262,6 +265,59 @@ GLOBAL(ftrace_regs_caller_end)
ENDPROC(ftrace_regs_caller)
+/* Trampoline for function update with interrupts enabled */
+GLOBAL(ftrace_emulate_call_sti)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_caller_func
+ TRACE_IRQS_ON
+ sti
+ ret
+ENDPROC(ftrace_emulate_call_sti)
+
+/* Trampoline for function update with interrupts disabled */
+GLOBAL(ftrace_emulate_call)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_caller_func
+ ret
+ENDPROC(ftrace_emulate_call)
+
+/* Trampoline for function update in an NMI */
+GLOBAL(ftrace_emulate_call_nmi)
+ push PER_CPU_VAR(ftrace_bp_call_nmi_return)
+ push ftrace_caller_func
+ ret
+ENDPROC(ftrace_emulate_call_nmi)
+
+/* Trampoline for ftrace trampoline call update with interrupts enabled */
+GLOBAL(ftrace_emulate_call_update_sti)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_update_func_call
+ TRACE_IRQS_ON
+ sti
+ ret
+ENDPROC(ftrace_emulate_call_update_sti)
+
+/* Trampoline for ftrace trampoline call update with interrupts disabled */
+GLOBAL(ftrace_emulate_call_update)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_update_func_call
+ ret
+ENDPROC(ftrace_emulate_call_update)
+
+/* Trampoline for ftrace trampoline call update in an NMI */
+GLOBAL(ftrace_emulate_call_update_nmi)
+ push PER_CPU_VAR(ftrace_bp_call_nmi_return)
+ push ftrace_update_func_call
+ ret
+ENDPROC(ftrace_emulate_call_update_nmi)
+
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_sti)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_nmi)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update_sti)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update_nmi)
+
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook)
--
2.20.1