The entries within __rseq_table are aligned on 32 bytes due to
linux/rseq.h struct rseq_cs uapi requirements, but the start of the
__rseq_table section is not guaranteed to be 32-byte aligned. It can
cause padding to be added at the start of the section, which makes it
hard to use as an array of items by debuggers.
Considering that __rseq_table does not really consist of a table due to
the presence of padding, rename this section to __rseq_cs.
Create a new __rseq_cs_ptr_array section which contains 64-bit packed
pointers to entries within the __rseq_cs section.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
CC: Thomas Gleixner <tglx(a)linutronix.de>
CC: Joel Fernandes <joelaf(a)google.com>
CC: Peter Zijlstra <peterz(a)infradead.org>
CC: Catalin Marinas <catalin.marinas(a)arm.com>
CC: Dave Watson <davejwatson(a)fb.com>
CC: Will Deacon <will.deacon(a)arm.com>
CC: Shuah Khan <shuah(a)kernel.org>
CC: Andi Kleen <andi(a)firstfloor.org>
CC: linux-kselftest(a)vger.kernel.org
CC: "H . Peter Anvin" <hpa(a)zytor.com>
CC: Chris Lameter <cl(a)linux.com>
CC: Russell King <linux(a)arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages(a)gmail.com>
CC: "Paul E . McKenney" <paulmck(a)linux.vnet.ibm.com>
CC: Paul Turner <pjt(a)google.com>
CC: Boqun Feng <boqun.feng(a)gmail.com>
CC: Josh Triplett <josh(a)joshtriplett.org>
CC: Steven Rostedt <rostedt(a)goodmis.org>
CC: Ben Maurer <bmaurer(a)fb.com>
CC: linux-api(a)vger.kernel.org
CC: Andy Lutomirski <luto(a)amacapital.net>
CC: Andrew Morton <akpm(a)linux-foundation.org>
CC: Linus Torvalds <torvalds(a)linux-foundation.org>
---
tools/testing/selftests/rseq/rseq-arm.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-arm64.h | 9 ++++++---
tools/testing/selftests/rseq/rseq-mips.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-ppc.h | 22 +++++++++++++--------
tools/testing/selftests/rseq/rseq-s390.h | 18 +++++++++++------
tools/testing/selftests/rseq/rseq-x86.h | 19 ++++++++++++------
6 files changed, 81 insertions(+), 51 deletions(-)
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 17e8d231943a..5f262c54364f 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -30,24 +30,28 @@ do { \
#include "rseq-skip.h"
#else /* !RSEQ_SKIP_FASTPATH */
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".word " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -99,7 +103,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -166,7 +170,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -237,7 +241,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -292,7 +296,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -367,7 +371,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -443,7 +447,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -527,7 +531,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -651,7 +655,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 2079f71e0ca2..b41a2a48e965 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -82,13 +82,16 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- " .pushsection __rseq_table, \"aw\"\n" \
+ " .pushsection __rseq_cs, \"aw\"\n" \
" .balign 32\n" \
__rseq_str(label) ":\n" \
" .long " __rseq_str(version) ", " __rseq_str(flags) "\n" \
" .quad " __rseq_str(start_ip) ", " \
__rseq_str(post_commit_offset) ", " \
__rseq_str(abort_ip) "\n" \
+ " .popsection\n\t" \
+ " .pushsection __rseq_cs_ptr_array, \"aw\"\n" \
+ " .quad " __rseq_str(label) "b\n" \
" .popsection\n"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -99,8 +102,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index 25d10ff54769..fe3eabcdcbe5 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -54,26 +54,30 @@ do { \
# error unsupported _MIPS_SZLONG
#endif
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(label) "b") "\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -127,7 +131,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -192,7 +196,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -261,7 +265,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -316,7 +320,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -389,7 +393,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -463,7 +467,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -543,7 +547,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -664,7 +668,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index 24f95649d71e..9df18487fa9f 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -33,8 +33,8 @@ do { \
#else /* !RSEQ_SKIP_FASTPATH */
/*
- * The __rseq_table section can be used by debuggers to better handle
- * single-stepping through the restartable critical sections.
+ * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
+ * better handle single-stepping through the restartable critical sections.
*/
#ifdef __PPC64__
@@ -46,11 +46,14 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
@@ -67,8 +70,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -85,20 +88,23 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
/* 32-bit only supported on BE */ \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index b8b5b6f900af..fbb97815d71c 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -37,19 +37,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -61,19 +64,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index 0668608d3674..03095236f6fa 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -47,13 +47,17 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
+
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
@@ -62,8 +66,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -566,11 +570,14 @@ do { \
*/
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -581,8 +588,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
--
2.11.0
pidfd are /proc/pid directory file descriptors referring to a task group
leader. Android low memory killer (LMK) needs pidfd polling support to
replace code that currently checks for existence of /proc/pid for
knowing a process that is signalled to be killed has died, which is both
racy and slow. The pidfd poll approach is race-free, and also allows the
LMK to do other things (such as by polling on other fds) while awaiting
the process being killed to die.
It prevents a situation where a PID is reused between when LMK sends a
kill signal and checks for existence of the PID, since the wrong PID is
now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of the
task group is to be notified, that is when the tasks waiting on a poll
of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the following
reasons:
1. The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be option in this case because the task can
be reaped and destroyed before the poll returns.
2. By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new
waitqueue/pid even though its task_struct is different.
Appropriate test cases are added in the second patch to provide coverage
of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference
however this patch tries to handle different situations properly related
to thread group existence, and how/where it notifies. And also solves
other bugs (existence of taks_struct). Daniel had a similar patch [2]
recently which this patch supercedes.
[1] https://lore.kernel.org/patchwork/patch/345098/
[2] https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
Cc: luto(a)amacapital.net
Cc: rostedt(a)goodmis.org
Cc: dancol(a)google.com
Cc: christian(a)brauner.io
Cc: jannh(a)google.com
Cc: surenb(a)google.com
Cc: torvalds(a)linux-foundation.org
Co-developed-by: Daniel Colascione <dancol(a)google.com>
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
fs/proc/base.c | 39 +++++++++++++++++++++++++++++++++++++++
include/linux/pid.h | 3 +++
kernel/exit.c | 1 -
kernel/pid.c | 2 ++
kernel/signal.c | 14 ++++++++++++++
5 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..879900082647 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts)
+{
+ int poll_flags = 0;
+ struct task_struct *task;
+ struct pid *pid;
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+
+ WARN_ON_ONCE(task && !thread_group_leader(task));
+
+ /*
+ * tasklist_lock must be held because to avoid racing with
+ * changes in exit_state and wake up. Basically to avoid:
+ *
+ * P0: read exit_state = 0
+ * P1: write exit_state = EXIT_DEAD
+ * P1: Do a wake up - wq is empty, so do nothing
+ * P0: Queue for polling - wait forever.
+ */
+ read_lock(&tasklist_lock);
+ if (!task)
+ poll_flags = POLLIN | POLLRDNORM | POLLERR;
+ else if (task->exit_state == EXIT_DEAD)
+ poll_flags = POLLIN | POLLRDNORM;
+ else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
+ poll_flags = POLLIN | POLLRDNORM;
+
+ if (!poll_flags) {
+ pid = proc_pid(file->f_path.dentry->d_inode);
+ poll_wait(file, &pid->wait_pidfd, pts);
+ }
+ read_unlock(&tasklist_lock);
+
+ if (task)
+ put_task_struct(task);
+ return poll_flags;
+}
+
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
+ .poll = proc_tgid_base_poll,
.iterate_shared = proc_tgid_base_readdir,
.llseek = generic_file_llseek,
};
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b6f4ba16065a..2e0dcbc6d14e 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/wait.h>
enum pid_type
{
@@ -60,6 +61,8 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
+ /* wait queue for pidfd pollers */
+ wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
};
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..c386ec52687d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}
-
void release_task(struct task_struct *p)
{
struct task_struct *leader;
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..5c90c239242f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..e3781703ef7e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret;
}
+static void do_wakeup_pidfd_pollers(struct task_struct *task)
+{
+ struct pid *pid;
+
+ lockdep_assert_held(&tasklist_lock);
+
+ pid = get_task_pid(task, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+ put_pid(pid);
+}
+
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */
+ do_wakeup_pidfd_pollers(tsk);
+
if (sig != SIGCHLD) {
/*
* This is only possible if parent == real_parent.
--
2.21.0.392.gf8f6787159e-goog
This refactors the selftest Makefiles to extract the test running logic
to be reused between "run_tests" and "emit_tests", while also fixing
up the test output to be TAP version 13 compliant:
- added "plan" line
- fixed result line syntax
- moved all test output to be "# "-prefixed as TAP "diagnostic" lines
The prefixing code includes a fallback mode for limited execution
environments.
-Kees
Kees Cook (6):
selftests: Extract single-test shell logic from lib.mk
selftests: Use runner.sh for emit targets
selftests: Extract logic for multiple test runs
selftests/runner: Add plan line and fix result line syntax
selftests/runner: Distinguish between missing and non-executable
selftests: Move test output to diagnostic lines
tools/testing/selftests/.gitignore | 1 -
tools/testing/selftests/Makefile | 18 +++--
tools/testing/selftests/kselftest/prefix.pl | 23 ++++++
tools/testing/selftests/kselftest/runner.sh | 80 +++++++++++++++++++++
tools/testing/selftests/lib.mk | 61 +++-------------
5 files changed, 119 insertions(+), 64 deletions(-)
create mode 100755 tools/testing/selftests/kselftest/prefix.pl
create mode 100644 tools/testing/selftests/kselftest/runner.sh
--
2.17.1
This is version 4 of the MSI interrupts for ntb_transport patchset.
It's mostly a resend of v3 but with spelling and grammar fixes found by Bjorn.
I've addressed the feedback so far and rebased on the latest kernel
and would like this to be considered for merging this cycle.
The only outstanding issue I know of is that it still will not work
with IDT hardware, but ntb_transport doesn't work with IDT hardware
and there is still no sensible common infrastructure to support
ntb_peer_mw_set_trans(). Thus, I decline to consider that complication
in this patchset. However, I'll be happy to review work that adds this
feature in the future.
Also, as the port number and resource index stuff is a bit complicated,
I made a quick out of tree test fixture to ensure it's correct[1]. As
an excerise I also wrote some test code[2] using the upcomming KUnit
feature.
Logan
[1] https://repl.it/repls/ExcitingPresentFile
[2] https://github.com/sbates130272/linux-p2pmem/commits/ntb_kunit
--
Changes in v4:
* Rebased onto v5.1-rc6 (No changes)
* Numerous grammar and spelling mistakes spotted by Bjorn
--
Changes in v3:
* Rebased onto v5.1-rc1 (Dropped the first two patches as they have
been merged, and cleaned up some minor conflicts in the PCI tree)
* Added a new patch (#3) to calculate logical port numbers that
are port numbers from 0 to (number of ports - 1). This is
then used in ntb_peer_resource_idx() to fix the issues brought
up by Serge.
* Fixed missing __iomem and iowrite calls (as noticed by Serge)
* Added patch 10 which describes ntb_msi_test in the documentation
file (as requested by Serge)
* A couple other minor nits and documentation fixes
--
Changes in v2:
* Cleaned up the changes in intel_irq_remapping.c to make them
less confusing and add a comment. (Per discussion with Jacob and
Joerg)
* Fixed a nit from Bjorn and collected his Ack
* Added a Kconfig dependancy on CONFIG_PCI_MSI for CONFIG_NTB_MSI
as the Kbuild robot hit a random config that didn't build
without it.
* Worked in a callback for when the MSI descriptor changes so that
the clients can resend the new address and data values to the peer.
On my test system this was never necessary, but there may be
other platforms where this can occur. I tested this by hacking
in a path to rewrite the MSI descriptor when I change the cpu
affinity of an IRQ. There's a bit of uncertainty over the latency
of the change, but without hardware this can acctually occur on
we can't test this. This was the result of a discussion with Dave.
--
This patch series adds optional support for using MSI interrupts instead
of NTB doorbells in ntb_transport. This is desirable seeing doorbells on
current hardware are quite slow and therefore switching to MSI interrupts
provides a significant performance gain. On switchtec hardware, a simple
apples-to-apples comparison shows ntb_netdev/iperf numbers going from
3.88Gb/s to 14.1Gb/s when switching to MSI interrupts.
To do this, a couple changes are required outside of the NTB tree:
1) The IOMMU must know to accept MSI requests from aliased bused numbers
seeing NTB hardware typically sends proxied request IDs through
additional requester IDs. The first patch in this series adds support
for the Intel IOMMU. A quirk to add these aliases for switchtec hardware
was already accepted. See commit ad281ecf1c7d ("PCI: Add DMA alias quirk
for Microsemi Switchtec NTB") for a description of NTB proxy IDs and why
this is necessary.
2) NTB transport (and other clients) may often need more MSI interrupts
than the NTB hardware actually advertises support for. However, seeing
these interrupts will not be triggered by the hardware but through an
NTB memory window, the hardware does not actually need support or need
to know about them. Therefore we add the concept of Virtual MSI
interrupts which are allocated just like any other MSI interrupt but
are not programmed into the hardware's MSI table. This is done in
Patch 2 and then made use of in Patch 3.
The remaining patches in this series add a library for dealing with MSI
interrupts, a test client and finally support in ntb_transport.
The series is based off of v5.1-rc6 plus the patches in ntb-next.
A git repo is available here:
https://github.com/sbates130272/linux-p2pmem/ ntb_transport_msi_v4
Thanks,
Logan
--
Logan Gunthorpe (10):
PCI/MSI: Support allocating virtual MSI interrupts
PCI/switchtec: Add module parameter to request more interrupts
NTB: Introduce helper functions to calculate logical port number
NTB: Introduce functions to calculate multi-port resource index
NTB: Rename ntb.c to support multiple source files in the module
NTB: Introduce MSI library
NTB: Introduce NTB MSI Test Client
NTB: Add ntb_msi_test support to ntb_test
NTB: Add MSI interrupt support to ntb_transport
NTB: Describe the ntb_msi_test client in the documentation.
Documentation/ntb.txt | 27 ++
drivers/ntb/Kconfig | 11 +
drivers/ntb/Makefile | 3 +
drivers/ntb/{ntb.c => core.c} | 0
drivers/ntb/msi.c | 415 +++++++++++++++++++++++
drivers/ntb/ntb_transport.c | 169 ++++++++-
drivers/ntb/test/Kconfig | 9 +
drivers/ntb/test/Makefile | 1 +
drivers/ntb/test/ntb_msi_test.c | 433 ++++++++++++++++++++++++
drivers/pci/msi.c | 54 ++-
drivers/pci/switch/switchtec.c | 12 +-
include/linux/msi.h | 8 +
include/linux/ntb.h | 196 ++++++++++-
include/linux/pci.h | 9 +
tools/testing/selftests/ntb/ntb_test.sh | 54 ++-
15 files changed, 1386 insertions(+), 15 deletions(-)
rename drivers/ntb/{ntb.c => core.c} (100%)
create mode 100644 drivers/ntb/msi.c
create mode 100644 drivers/ntb/test/ntb_msi_test.c
--
2.20.1
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
[ Upstream commit 0f73bbc851ed32d22bbd86be09e0365c460bcd2e ]
Documentation/virtual/kvm/api.txt states:
NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and
KVM_EXIT_EPR the corresponding operations are complete (and guest
state is consistent) only after userspace has re-entered the
kernel with KVM_RUN. The kernel side will first finish incomplete
operations and then check for pending signals. Userspace can
re-enter the guest with an unmasked signal pending to complete
pending operations.
Because guest state may be inconsistent, starting state migration after
an IO exit without first completing IO may result in test failures, e.g.
a proposed change to KVM's handling of %rip in its fast PIO handling[1]
will cause the new VM, i.e. the post-migration VM, to have its %rip set
to the IN instruction that triggered KVM_EXIT_IO, leading to a test
assertion due to a stage mismatch.
For simplicitly, require KVM_CAP_IMMEDIATE_EXIT to complete IO and skip
the test if it's not available. The addition of KVM_CAP_IMMEDIATE_EXIT
predates the state selftest by more than a year.
[1] https://patchwork.kernel.org/patch/10848545/
Fixes: fa3899add1056 ("kvm: selftests: add basic test for state save and restore")
Reported-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Sasha Levin (Microsoft) <sashal(a)kernel.org>
---
tools/testing/selftests/kvm/include/kvm_util.h | 1 +
tools/testing/selftests/kvm/lib/kvm_util.c | 16 ++++++++++++++++
.../testing/selftests/kvm/x86_64/state_test.c | 18 ++++++++++++++++--
3 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a84785b02557..07b71ad9734a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -102,6 +102,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_mp_state *mp_state);
void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index b52cfdefecbf..efa0aad8b3c6 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1121,6 +1121,22 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
return rc;
}
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ vcpu->state->immediate_exit = 1;
+ ret = ioctl(vcpu->fd, KVM_RUN, NULL);
+ vcpu->state->immediate_exit = 0;
+
+ TEST_ASSERT(ret == -1 && errno == EINTR,
+ "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
+ ret, errno);
+}
+
/*
* VM VCPU Set MP State
*
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 4b3f556265f1..30f75856cf39 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -134,6 +134,11 @@ int main(int argc, char *argv[])
struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+ if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
+ fprintf(stderr, "immediate_exit not available, skipping test\n");
+ exit(KSFT_SKIP);
+ }
+
/* Create VM */
vm = vm_create_default(VCPU_ID, 0, guest_code);
vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
@@ -156,8 +161,6 @@ int main(int argc, char *argv[])
stage, run->exit_reason,
exit_reason_str(run->exit_reason));
- memset(®s1, 0, sizeof(regs1));
- vcpu_regs_get(vm, VCPU_ID, ®s1);
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_ABORT:
TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0],
@@ -176,6 +179,17 @@ int main(int argc, char *argv[])
uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx",
stage, (ulong)uc.args[1]);
+ /*
+ * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
+ * guest state is consistent only after userspace re-enters the
+ * kernel with KVM_RUN. Complete IO prior to migrating state
+ * to a new VM.
+ */
+ vcpu_run_complete_io(vm, VCPU_ID);
+
+ memset(®s1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, ®s1);
+
state = vcpu_save_state(vm, VCPU_ID);
kvm_vm_release(vm);
--
2.19.1