Memory access #VE's are hard for Linux to handle in contexts like the entry code or NMIs. But other OSes need them for functionality. There's a static (pre-guest-boot) way for a VMM to choose one or the other. But VMMs don't always know which OS they are booting, so they choose to deliver those #VE's so the "other" OSes will work. That, unfortunately has left us in the lurch and exposed to these hard-to-handle #VEs.
The TDX module has introduced a new feature. Even if the static configuration is "send nasty #VE's", the kernel can dynamically request that they be disabled.
Check if the feature is available and disable SEPT #VE if possible.
If the TD allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE attribute is no longer reliable. It reflects the initial state of the control for the TD, but it will not be updated if someone (e.g. bootloader) changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to determine if SEPT #VEs are enabled or disabled.
Signed-off-by: Kirill A. Shutemov kirill.shutemov@linux.intel.com Fixes: 373e715e31bf ("x86/tdx: Panic on bad configs that #VE on "private" memory access") Cc: stable@vger.kernel.org --- arch/x86/coco/tdx/tdx.c | 76 ++++++++++++++++++++++++------- arch/x86/include/asm/shared/tdx.h | 10 +++- 2 files changed, 69 insertions(+), 17 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 08ce488b54d0..ba3103877b21 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -78,7 +78,7 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args) }
/* Read TD-scoped metadata */ -static inline u64 __maybe_unused tdg_vm_rd(u64 field, u64 *value) +static inline u64 tdg_vm_rd(u64 field, u64 *value) { struct tdx_module_args args = { .rdx = field, @@ -193,6 +193,62 @@ static void __noreturn tdx_panic(const char *msg) __tdx_hypercall(&args); }
+/* + * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure + * that no #VE will be delivered for accesses to TD-private memory. + * + * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM + * controls if the guest will receive such #VE with TD attribute + * ATTR_SEPT_VE_DISABLE. + * + * Newer TDX module allows the guest to control if it wants to receive SEPT + * violation #VEs. + * + * Check if the feature is available and disable SEPT #VE if possible. + * + * If the TD allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE + * attribute is no longer reliable. It reflects the initial state of the + * control for the TD, but it will not be updated if someone (e.g. bootloader) + * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to + * determine if SEPT #VEs are enabled or disabled. + */ +static void disable_sept_ve(u64 td_attr) +{ + const char *msg = "TD misconfiguration: SEPT #VE has to be disabled"; + bool debug = td_attr & ATTR_DEBUG; + u64 config, controls; + + /* Is this TD allowed to disable SEPT #VE */ + tdg_vm_rd(TDCS_CONFIG_FLAGS, &config); + if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) { + /* No SEPT #VE controls for the guest: check the attribute */ + if (td_attr & ATTR_SEPT_VE_DISABLE) + return; + + /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */ + if (debug) + pr_warn("%s\n", msg); + else + tdx_panic(msg); + return; + } + + /* Check if SEPT #VE has been disabled before us */ + tdg_vm_rd(TDCS_TD_CTLS, &controls); + if (controls & TD_CTLS_PENDING_VE_DISABLE) + return; + + /* Keep #VEs enabled for splats in debugging environments */ + if (debug) + return; + + /* Disable SEPT #VEs */ + tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE, + TD_CTLS_PENDING_VE_DISABLE); + + return; +} + static void tdx_setup(u64 *cc_mask) { struct tdx_module_args args = {}; @@ -218,24 +274,12 @@ static void tdx_setup(u64 *cc_mask) gpa_width = args.rcx & GENMASK(5, 0); *cc_mask = BIT_ULL(gpa_width - 1);
+ td_attr = args.rdx; + /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
- /* - * The kernel can not handle #VE's when accessing normal kernel - * memory. Ensure that no #VE will be delivered for accesses to - * TD-private memory. Only VMM-shared memory (MMIO) will #VE. - */ - td_attr = args.rdx; - if (!(td_attr & ATTR_SEPT_VE_DISABLE)) { - const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set."; - - /* Relax SEPT_VE_DISABLE check for debug TD. */ - if (td_attr & ATTR_DEBUG) - pr_warn("%s\n", msg); - else - tdx_panic(msg); - } + disable_sept_ve(td_attr); }
/* diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7e12cfa28bec..fecb2a6e864b 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -19,9 +19,17 @@ #define TDG_VM_RD 7 #define TDG_VM_WR 8
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016 +#define TDCS_TD_CTLS 0x1110000300000017 #define TDCS_NOTIFY_ENABLES 0x9100000000000010
+/* TDCS_CONFIG_FLAGS bits */ +#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1) + +/* TDCS_TD_CTLS bits */ +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0) + /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002
On 24.06.24 г. 14:41 ч., Kirill A. Shutemov wrote:
Memory access #VE's are hard for Linux to handle in contexts like the entry code or NMIs. But other OSes need them for functionality. There's a static (pre-guest-boot) way for a VMM to choose one or the other. But VMMs don't always know which OS they are booting, so they choose to deliver those #VE's so the "other" OSes will work. That, unfortunately has left us in the lurch and exposed to these hard-to-handle #VEs.
The TDX module has introduced a new feature. Even if the static configuration is "send nasty #VE's", the kernel can dynamically request that they be disabled.
Check if the feature is available and disable SEPT #VE if possible.
If the TD allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE attribute is no longer reliable. It reflects the initial state of the control for the TD, but it will not be updated if someone (e.g. bootloader) changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to determine if SEPT #VEs are enabled or disabled.
Signed-off-by: Kirill A. Shutemov kirill.shutemov@linux.intel.com Fixes: 373e715e31bf ("x86/tdx: Panic on bad configs that #VE on "private" memory access") Cc: stable@vger.kernel.org
arch/x86/coco/tdx/tdx.c | 76 ++++++++++++++++++++++++------- arch/x86/include/asm/shared/tdx.h | 10 +++- 2 files changed, 69 insertions(+), 17 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 08ce488b54d0..ba3103877b21 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -78,7 +78,7 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args) } /* Read TD-scoped metadata */ -static inline u64 __maybe_unused tdg_vm_rd(u64 field, u64 *value) +static inline u64 tdg_vm_rd(u64 field, u64 *value) { struct tdx_module_args args = { .rdx = field, @@ -193,6 +193,62 @@ static void __noreturn tdx_panic(const char *msg) __tdx_hypercall(&args); } +/*
- The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
- that no #VE will be delivered for accesses to TD-private memory.
- TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
- controls if the guest will receive such #VE with TD attribute
- ATTR_SEPT_VE_DISABLE.
- Newer TDX module allows the guest to control if it wants to receive SEPT
- violation #VEs.
- Check if the feature is available and disable SEPT #VE if possible.
- If the TD allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
- attribute is no longer reliable. It reflects the initial state of the
- control for the TD, but it will not be updated if someone (e.g. bootloader)
- changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
- determine if SEPT #VEs are enabled or disabled.
- */
+static void disable_sept_ve(u64 td_attr) +{
- const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
- bool debug = td_attr & ATTR_DEBUG;
- u64 config, controls;
- /* Is this TD allowed to disable SEPT #VE */
- tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
- if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
/* No SEPT #VE controls for the guest: check the attribute */
if (td_attr & ATTR_SEPT_VE_DISABLE)
return;
/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
if (debug)
pr_warn("%s\n", msg);
else
tdx_panic(msg);
return;
- }
- /* Check if SEPT #VE has been disabled before us */
- tdg_vm_rd(TDCS_TD_CTLS, &controls);
- if (controls & TD_CTLS_PENDING_VE_DISABLE)
return;
- /* Keep #VEs enabled for splats in debugging environments */
- if (debug)
return;
- /* Disable SEPT #VEs */
- tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
TD_CTLS_PENDING_VE_DISABLE);
- return;
+}
- static void tdx_setup(u64 *cc_mask) { struct tdx_module_args args = {};
@@ -218,24 +274,12 @@ static void tdx_setup(u64 *cc_mask) gpa_width = args.rcx & GENMASK(5, 0); *cc_mask = BIT_ULL(gpa_width - 1);
- td_attr = args.rdx;
- /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
- /*
* The kernel can not handle #VE's when accessing normal kernel
* memory. Ensure that no #VE will be delivered for accesses to
* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
- td_attr = args.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
/* Relax SEPT_VE_DISABLE check for debug TD. */
if (td_attr & ATTR_DEBUG)
pr_warn("%s\n", msg);
else
tdx_panic(msg);
- }
- disable_sept_ve(td_attr); }
/* diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7e12cfa28bec..fecb2a6e864b 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -19,9 +19,17 @@ #define TDG_VM_RD 7 #define TDG_VM_WR 8 -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016
0x9110000300000016
+#define TDCS_TD_CTLS 0x1110000300000017
0x9110000300000017
#define TDCS_NOTIFY_ENABLES 0x9100000000000010 +/* TDCS_CONFIG_FLAGS bits */ +#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
+/* TDCS_TD_CTLS bits */ +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
- /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002
On Wed, Jul 03, 2024 at 02:39:09PM +0300, Nikolay Borisov wrote:
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7e12cfa28bec..fecb2a6e864b 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -19,9 +19,17 @@ #define TDG_VM_RD 7 #define TDG_VM_WR 8 -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016
0x9110000300000016
+#define TDCS_TD_CTLS 0x1110000300000017
0x9110000300000017
Setting bit 63 in these field id is regression in new TDX spec and TDX module. It is going to be fixed in next version. Both versions of field ids are going to be valid.
On 7/3/24 06:04, Kirill A. Shutemov wrote:
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016
0x9110000300000016
+#define TDCS_TD_CTLS 0x11104800000017
0x9110000300000017
Setting bit 63 in these field id is regression in new TDX spec and TDX module. It is going to be fixed in next version. Both versions of field ids are going to be valid.
I kinda never liked the big ol' magic numbers approach here. But could we please introduce some helpers here?
Then we'll end up with something like this (if the 0x111 can't be decomposed):
#define _TDCS_CMD(c) ((0x1110UL << 48) | (c))
#define TDCS_CONFIG_FLAGS _TDCS_CMD(0x16) #define TDCS_TD_CTLS _TDCS_CMD(0x17)
Then when folks change their mind about what should be in the TDX spec, we have one place to go fix it up in addition to making this all more readable.
On Wed, Jul 03, 2024 at 07:49:48AM -0700, Dave Hansen wrote:
On 7/3/24 06:04, Kirill A. Shutemov wrote:
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016
0x9110000300000016
+#define TDCS_TD_CTLS 0x11104800000017
0x9110000300000017
Setting bit 63 in these field id is regression in new TDX spec and TDX module. It is going to be fixed in next version. Both versions of field ids are going to be valid.
I kinda never liked the big ol' magic numbers approach here. But could we please introduce some helpers here?
Then we'll end up with something like this (if the 0x111 can't be decomposed):
#define _TDCS_CMD(c) ((0x1110UL << 48) | (c))
#define TDCS_CONFIG_FLAGS _TDCS_CMD(0x16) #define TDCS_TD_CTLS _TDCS_CMD(0x17)
Then when folks change their mind about what should be in the TDX spec, we have one place to go fix it up in addition to making this all more readable.
Hm. I am not sure about this. It can be tedious if we want to encode all info this way. Like, size of the field, number of elements in the field, number of elements in the sequence, etc. It is going to be macro on the macro on the macro. I don't think it would improve readability.
On related note, I think the TDX 1.5 definitions of field ids are more useful as you can infer more info from them. I consider to switching all definitions to the new format. It effectively drops TDX 1.0 support as the newly used bits are reserved in 1.0.
TDX module 1.0 is no longer supported, but kernel haven't broken anything from the guest side yet. I don't think anybody actually uses it and I need to jump hoops to get it running for validation.
Any objections for dropping TDX 1.0 support?
On 24.06.24 г. 14:41 ч., Kirill A. Shutemov wrote:
<snip>
--- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -19,9 +19,17 @@ #define TDG_VM_RD 7 #define TDG_VM_WR 8 -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016 +#define TDCS_TD_CTLS 0x1110000300000017 #define TDCS_NOTIFY_ENABLES 0x9100000000000010 +/* TDCS_CONFIG_FLAGS bits */ +#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
So where is this bit documented, because in td_scope_metadata.json CONFIG_FLAGS' individual bits aren't documented. All other TDX docs refer to the ABI .json file.
Landing code for undocumented bits unfortunately precludes any quality review on behalf of independent parties.
+/* TDCS_TD_CTLS bits */ +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
In contrast the TD_CTLS bits are documented in the same .json file.
- /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002
linux-stable-mirror@lists.linaro.org