On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
IOMMU HW now supports updating a dirty bit in an entry when a DMA writes to the entry's VA range. iommufd has a uAPI to read and clear the dirty bits from the tables.
This is a trivial recursive descent algorithm to read and optionally clear the dirty bits. The format needs a function to tell if a contiguous entry is dirty, and a function to clear a contiguous entry back to clean.
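[For orientation, the two per-format hooks referenced above appear in the patch below as pt_entry_is_write_dirty() and pt_entry_make_write_clean(). A minimal sketch of what a format header might supply, assuming a hypothetical PTE layout with the HW-written dirty flag in bit 6; the bit position and the pts->entry/pts->entryp accessors are illustrative, not taken from this patch, and the code would live where struct pt_state is visible:

/*
 * Illustrative only: a format whose PTE stores the HW-written dirty
 * flag in bit 6. PT_ILLUSTRATIVE_DIRTY and the pts->entry/pts->entryp
 * accessors are hypothetical stand-ins for the format's real layout.
 */
#define PT_ILLUSTRATIVE_DIRTY BIT_ULL(6)

static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
{
	/* Report whether HW marked this entry dirty since the last clear */
	return pts->entry & PT_ILLUSTRATIVE_DIRTY;
}

static inline void pt_entry_make_write_clean(struct pt_state *pts)
{
	/* Clear only the dirty flag; the caller gathers the IOTLB flush */
	WRITE_ONCE(*pts->entryp, pts->entry & ~PT_ILLUSTRATIVE_DIRTY);
}
]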
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
 drivers/iommu/generic_pt/iommu_pt.h | 104 ++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |   6 ++
 2 files changed, 110 insertions(+)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index f32e81509f4f09..448c5796d4a861 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -162,6 +162,108 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
 
+struct pt_iommu_dirty_args {
+	struct iommu_dirty_bitmap *dirty;
+	unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+			 struct pt_iommu_dirty_args *dirty,
+			 unsigned int num_contig_lg2)
+{
+	pt_vaddr_t dirty_len;
+
+	if (num_contig_lg2 != ilog2(1)) {
+		unsigned int index = pts->index;
+		unsigned int end_index = log2_set_mod_max_t(
+			unsigned int, pts->index, num_contig_lg2);
+
+		/* Adjust for being contained inside a contiguous page */
+		end_index = min(end_index, pts->end_index);
+		dirty_len = (end_index - index) *
+			    log2_to_int(pt_table_item_lg2sz(pts));
+	} else {
+		dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+	}
+
+	if (dirty->dirty->bitmap)
+		iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+				dirty_len);
+
+	if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+		pt_entry_make_write_clean(pts);
+		iommu_iotlb_gather_add_range(dirty->dirty->gather,
+					     pts->range->va, dirty_len);
+	}
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+					 unsigned int level,
+					 struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_dirty_args *dirty = arg;
+	int ret;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+			if (ret)
+				return ret;
+			continue;
+		}
+		if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+			record_dirty(&pts, dirty,
+				     pt_entry_num_contig_lg2(&pts));
+	}
+	return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write
+ * dirty status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified
+ * then the entries will be left dirty, otherwise they are returned to being
+ * not write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	struct pt_iommu_dirty_args dirty_args = {
+		.dirty = dirty,
+		.flags = flags,
+	};
+	struct pt_range range;
+	int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+	return -EOPNOTSUPP;
+#endif
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+	if (ret)
+		return ret;
+
+	ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+	PT_WARN_ON(ret);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
 
 struct pt_iommu_collect_args {
 	struct iommu_pages_list free_list;
 	/* Fail if any OAs are within the range */
@@ -1015,5 +1117,7 @@ EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
 MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
 
 #endif /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 0d59423024d57f..03a906fbe12a83 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -12,6 +12,7 @@
 struct iommu_iotlb_gather;
 struct pt_iommu_ops;
 struct pt_iommu_driver_ops;
+struct iommu_dirty_bitmap;
 
 /**
  * DOC: IOMMU Radix Page Table
@@ -182,6 +183,9 @@ struct pt_iommu_cfg {
 		struct iommu_domain *domain, unsigned long iova, \
 		size_t pgsize, size_t pgcount, \
 		struct iommu_iotlb_gather *iotlb_gather); \
+	int pt_iommu_##fmt##_read_and_clear_dirty( \
+		struct iommu_domain *domain, unsigned long iova, size_t size, \
+		unsigned long flags, struct iommu_dirty_bitmap *dirty); \
 	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \
 				  const struct pt_iommu_##fmt##_cfg *cfg, \
 				  gfp_t gfp); \
@@ -202,6 +206,8 @@ struct pt_iommu_cfg {
 	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
 	.map_pages = &pt_iommu_##fmt##_map_pages, \
 	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
+#define IOMMU_PT_DIRTY_OPS(fmt) \
+	.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
 
 /*
  * The driver should setup its domain struct like
--
2.43.0
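[Usage note: with the new IOMMU_PT_DIRTY_OPS() macro a driver only has to supply the HW enable/disable toggle itself. A rough sketch, assuming the series' AMDv1 format name and a hypothetical my_set_dirty_tracking() stub, both illustrative rather than from this patch:

#include <linux/iommu.h>
#include <linux/generic_pt/iommu.h>

/* Hypothetical driver-specific toggle for HW dirty logging */
static int my_set_dirty_tracking(struct iommu_domain *domain, bool enabled)
{
	/* Format/driver specific: enable or disable HW dirty logging */
	return 0;
}

static const struct iommu_dirty_ops my_dirty_ops = {
	.set_dirty_tracking = my_set_dirty_tracking,
	/* Expands to .read_and_clear_dirty = &pt_iommu_amdv1_read_and_clear_dirty */
	IOMMU_PT_DIRTY_OPS(amdv1),
};

Pointing domain->dirty_ops at such a struct lets iommufd's dirty tracking uAPI reach the generated helper with no format-specific glue.]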
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>