Re: [PATCH v5 04/15] iommupt: Add the AMD IOMMU v1 page table format

8 Oct 2025

Jason,
On 9/3/2025 11:16 PM, Jason Gunthorpe wrote:
...
AMD IOMMU v1 is unique in supporting contiguous pages with a variable size
and it can decode the full 64 bit VA space. Unlike other x86 page tables
this explicitly does not do sign extension as part of allowing the entire
64 bit VA space to be supported.
I am still catching up with the entire series, but here are a few fixes needed to
boot this series with SME.
...
The general design is quite similar to the x86 PAE format, except with a
6th level and quite different PTE encoding.
This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in
the existing code as the existing AMDv1 code starts out with a 3 level
table and adds levels on the fly if more IOVA is needed.
Comparing the performance of several operations to the existing version:
iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     65,64    ,      62,61      ,  -1.01
     2^13,     70,66    ,      67,62      ,  -8.08
     2^14,     73,69    ,      71,65      ,  -9.09
     2^15,     78,75    ,      75,71      ,  -5.05
     2^16,     89,89    ,      86,84      ,  -2.02
     2^17,    128,121   ,     124,112     , -10.10
     2^18,    175,175   ,     170,163     ,  -4.04
     2^19,    264,306   ,     261,279     ,   6.06
     2^20,    444,525   ,     438,489     ,  10.10
     2^21,     60,62    ,      58,59      ,   1.01
 256*2^12,    381,1833  ,     367,1795    ,  79.79
 256*2^21,    375,1623  ,     356,1555    ,  77.77
 256*2^30,    356,1338  ,     349,1277    ,  72.72
iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     76,89    ,      71,86      ,  17.17
     2^13,     79,89    ,      75,86      ,  12.12
     2^14,     78,90    ,      74,86      ,  13.13
     2^15,     82,89    ,      74,86      ,  13.13
     2^16,     79,89    ,      74,86      ,  13.13
     2^17,     81,89    ,      77,87      ,  11.11
     2^18,     90,92    ,      87,89      ,   2.02
     2^19,     91,93    ,      88,90      ,   2.02
     2^20,     96,95    ,      91,92      ,   1.01
     2^21,     72,88    ,      68,85      ,  20.20
 256*2^12,    372,6583  ,     364,6251    ,  94.94
 256*2^21,    398,6032  ,     392,5758    ,  93.93
 256*2^30,    396,5665  ,     389,5258    ,  92.92
The ~5-17x speedup when working with multi-PTE map/unmaps is because the
AMD implementation rewalks the entire table on every new PTE while this
version retains its position. The same speedup will be seen with dirtys as
well.
The old implementation triggers a compiler optimization that ends up
generating a "rep stos" memset for contiguous PTEs. Since AMD can have
contiguous PTEs that span 2Kbytes of table this is a huge win compared to
a normal movq loop. It is why the unmap side has a fairly flat runtime as
the contiguous PTE size increases. This version makes it explicit with a
memset64() call.
Tested-by: Alejandro Jimenez alejandro.j.jimenez@oracle.com
Signed-off-by: Jason Gunthorpe jgg@nvidia.com

drivers/iommu/Makefile                     |   1 +
 drivers/iommu/generic_pt/Kconfig           |  12 +
 drivers/iommu/generic_pt/fmt/Makefile      |  11 +
 drivers/iommu/generic_pt/fmt/amdv1.h       | 385 +++++++++++++++++++++
 drivers/iommu/generic_pt/fmt/defs_amdv1.h  |  21 ++
 drivers/iommu/generic_pt/fmt/iommu_amdv1.c |  15 +
 include/linux/generic_pt/common.h          |  19 +
 include/linux/generic_pt/iommu.h           |  29 ++
 8 files changed, 493 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
 create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033f3..b17ef9818759be 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
 obj-$(CONFIG_AMD_IOMMU) += amd/
 obj-$(CONFIG_INTEL_IOMMU) += intel/
 obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
 obj-$(CONFIG_IOMMU_API) += iommu-traces.o
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index c35ddc7c827e92..208c8178d5dbd2 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -29,4 +29,16 @@ config IOMMU_PT
     IOMMU_PT provides an implementation of the page table operations
     related struct iommu_domain using GENERIC_PT to abstract the page
     table format.



+if IOMMU_PT
+config IOMMU_PT_AMDV1

tristate "IOMMU page table for 64-bit AMD IOMMU v1"
depends on !GENERIC_ATOMIC64 # for cmpxchg64
help
 iommu_domain implementation for the AMD v1 page table. AMDv1 is the


 "host" page table. It supports granular page sizes of almost every


 power of 2 and decodes a full 64-bit IOVA space.



 Selected automatically by an IOMMU driver that uses this format.



+endif
 endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 00000000000000..a4d83b7e0cf691
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0



+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1



+define create_format
+obj-$(2) += iommu_$(1).o



+endef



+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 00000000000000..901fc4a80e9a83
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*


Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES







AMD IOMMU v1 page table







This is described in Section "2.2.3 I/O Page Tables for Host Translations"



of the "AMD I/O Virtualization Technology (IOMMU) Specification"







Note the level numbering here matches the core code, so level 0 is the same



as mode 1.






*/

+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H



+#include "defs_amdv1.h"
+#include "../pt_defs.h"



+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>



+enum {

PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
PT_MAX_VA_ADDRESS_LG2 = 64,
PT_ITEM_WORD_SIZE = sizeof(u64),
PT_MAX_TOP_LEVEL = 5,
PT_GRANULE_LG2SZ = 12,
PT_TABLEMEM_LG2SZ = 12,

/* The DTE only has these bits for the top physical address */
PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),

+};



+/* PTE bits */
+enum {

AMDV1PT_FMT_PR = BIT(0),
AMDV1PT_FMT_D = BIT(6),
AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
AMDV1PT_FMT_FC = BIT_ULL(60),
AMDV1PT_FMT_IR = BIT_ULL(61),
AMDV1PT_FMT_IW = BIT_ULL(62),

+};



+/*


gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make



these defines to avoid it.


*/

+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7



+#define common_to_amdv1pt(common_ptr) \

container_of_const(common_ptr, struct pt_amdv1, common)

+#define to_amdv1pt(pts) common_to_amdv1pt((pts)->range->common)



+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{

return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, pts->entry),

We need to clear the SME bit here (__sme_clr(pts->entry)).
...

	  PT_GRANULE_LG2SZ);



+}
+#define pt_table_pa amdv1pt_table_pa



+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{

pt_oaddr_t oa = FIELD_GET(AMDV1PT_FMT_OA, pts->entry);

We need to clear the SME bit here (__sme_clr(pts->entry)).
-Vasant

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

Re: [PATCH v5 04/15] iommupt: Add the AMD IOMMU v1 page table format