From: Jason Gunthorpe <jgg@nvidia.com>
Sent: Wednesday, October 8, 2025 12:12 AM
+static inline int pt_iommu_new_table(struct pt_state *pts,
+                                     struct pt_write_attrs *attrs)
+{
+        struct pt_table_p *table_mem;
+        phys_addr_t phys;
+
+        /* Given PA/VA/length can't be represented */
+        if (unlikely(!pt_can_have_table(pts)))
+                return -ENXIO;
Should this be a PT_WARN_ON()?
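For illustration, a minimal sketch of that suggestion, assuming PT_WARN_ON() passes its condition through the way WARN_ON() does:

        /* Given PA/VA/length can't be represented */
        if (PT_WARN_ON(!pt_can_have_table(pts)))
                return -ENXIO;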
+/*
+ * Check that the items in a contiguous block are all empty. This will
+ * recursively check any tables in the block to validate they are empty and
+ * then free them through the gather.
+ */
The first sentence is redundant.
+        /* Descend to a child table */
+        do {
+                pts.type = pt_load_entry_raw(&pts);
+
+                if (pts.type != PT_ENTRY_TABLE) {
+                        if (pts.type != PT_ENTRY_EMPTY)
+                                return -EADDRINUSE;
+                        ret = pt_iommu_new_table(&pts, &map->attrs);
+                        if (ret) {
+                                /*
+                                 * Racing with another thread installing a table
+                                 */
+                                if (ret == -EAGAIN)
+                                        continue;
+                                return ret;
+                        }
+                } else {
+                        pts.table_lower = pt_table_ptr(&pts);
+                }
+
+                /*
+                 * The already present table can possibly be shared with
+                 * another concurrent map.
+                 */
+                if (map->leaf_level == level - 1)
+                        ret = pt_descend(&pts, arg, __map_range_leaf);
+                else
+                        ret = pt_descend(&pts, arg, __map_range);
+                if (ret)
+                        return ret;
+
+                pts.index++;
+                pt_index_to_va(&pts);
+                if (pts.index >= pts.end_index)
+                        break;
+                pts.type = pt_load_entry_raw(&pts);
This last pt_load_entry_raw() is redundant; the entry is reloaded at the top of the loop anyway.
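A sketch of the loop tail without that load, assuming the loop closes with "} while (true)" (the closing line is outside the quoted hunk):

                pts.index++;
                pt_index_to_va(&pts);
                if (pts.index >= pts.end_index)
                        break;
                /* the load at the top of the loop covers the next iteration */
        } while (true);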
+static __always_inline int __do_map_single_page(struct pt_range *range,
+                                                void *arg, unsigned int level,
+                                                struct pt_table_p *table,
+                                                pt_level_fn_t descend_fn)
+{
+        struct pt_state pts = pt_init(range, level, table);
+        struct pt_iommu_map_args *map = arg;
+
+        pts.type = pt_load_single_entry(&pts);
+        if (level == 0) {
+                if (pts.type != PT_ENTRY_EMPTY)
+                        return -EADDRINUSE;
+                pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+                                      &map->attrs);
+                map->oa += PAGE_SIZE;
+                return 0;
+        }
+
+        if (pts.type != PT_ENTRY_TABLE)
+                return -EAGAIN;
Return -EADDRINUSE if the type is PT_ENTRY_OA; there is no need to retry when the entry is not empty.
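A sketch of that, reusing the entry types already present in the quoted code:

        if (pts.type != PT_ENTRY_TABLE) {
                /* an installed OA entry can never become free here, don't retry */
                if (pts.type == PT_ENTRY_OA)
                        return -EADDRINUSE;
                return -EAGAIN;
        }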
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into
+ * compatible blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from
+ * iova up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that
+ * were mapped are added to @mapped, @mapped is not zerod first.
+ */
Hmm, drivers don't seem to be consistent on this, e.g. Intel and several others (virtio, rockchip, etc.) don't use addition, and there is no such guidance in iommu.h.
Existing callers, e.g. iommu_map_nosync(), initialize @mapped to 0, so this difference doesn't lead to an observable problem so far. But it may be good to unify the drivers while at it.
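For illustration, the two conventions at issue in sketch form ("bytes" stands for whatever amount the driver just mapped):

        *mapped += bytes;       /* additive, as documented here */
        *mapped = bytes;        /* overwrite, as several drivers do today */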
+        /* Calculate target page size and level for the leaves */
+        if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+            pgcount == 1) {
+                PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+                if (log2_mod(iova | paddr, PAGE_SHIFT))
+                        return -ENXIO;
+                map.leaf_pgsize_lg2 = PAGE_SHIFT;
+                map.leaf_level = 0;
+                single_page = true;
+        } else {
+                map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+                        pgsize_bitmap, range.va, range.last_va, paddr);
+                if (!map.leaf_pgsize_lg2)
+                        return -ENXIO;
+                map.leaf_level =
+                        pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+        }
+
+        ret = check_map_range(iommu_table, &range, &map);
+        if (ret)
+                return ret;
single_page should always pass the check, so it could be skipped in that case.
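A sketch of skipping it, using the same variables as the quoted hunk:

        if (!single_page) {
                ret = check_map_range(iommu_table, &range, &map);
                if (ret)
                        return ret;
        }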
+        PT_WARN_ON(map.leaf_level > range.top_level);
+
+        do {
+                if (single_page) {
+                        ret = pt_walk_range(&range, __map_single_page, &map);
+                        if (ret != -EAGAIN)
+                                break;
+                }
+                if (map.leaf_level == range.top_level)
+                        ret = pt_walk_range(&range, __map_range_leaf, &map);
+                else
+                        ret = pt_walk_range(&range, __map_range, &map);
+        } while (false);
What is the point of do...while (false) compared to a 'goto'?
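For comparison, a goto-based sketch of the same control flow (the "done" label and whatever follows it are placeholders):

        if (single_page) {
                ret = pt_walk_range(&range, __map_single_page, &map);
                if (ret != -EAGAIN)
                        goto done;
        }
        if (map.leaf_level == range.top_level)
                ret = pt_walk_range(&range, __map_range_leaf, &map);
        else
                ret = pt_walk_range(&range, __map_range, &map);
done:
        /* ... whatever currently follows the do { } while (false) ... */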
+/**
+ * struct pt_iommu_flush_ops - HW IOTLB cache flushing operations
+ *
+ * The IOMMU driver should implement these using container_of(iommu_table) to
+ * get to it's iommu_domain derived structure. All ops can be called in atomic
+ * contexts as they are buried under DMA API calls.
+ */
+struct pt_iommu_flush_ops {
Not sure it's appropriate to call it flush_ops. It's not purely about flushing, e.g. @change_top requires the driver to actually change the top.
And we already have helpers for purely flushing the IOTLB.
+        /**
+         * @change_top: Update the top of table pointer
+         * @iommu_table: Table to operate on
+         * @top_paddr: New CPU physical address of the top pointer
+         * @top_level: IOMMU PT level of the new top
+         *
+         * Called under the get_top_lock() spinlock. The driver must update all
+         * HW references to this domain with a new top address and
+         * configuration. On return mappings placed in the new top must be
+         * reachable by the HW.
+         *
+         * top_level encodes the level in IOMMU PT format, level 0 is the
+         * smallest page size increasing from there. This has to be translated
+         * to any HW specific format. During this call the new top will not be
+         * visible to any other API.
+         *
+         * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+         * enabled.
+         */
+        void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr,
+                           unsigned int top_level);
+
+        /**
+         * @get_top_lock: Return a lock to hold when changing the table top
+         * @iommu_table: Table to operate on
+         *
blank line
+         * page table from being stored in HW. The lock will be held prior
+         * to calling change_top() and released once the top is fully visible.
+         *
+         * Typically this would be a lock that protects the iommu_domain's
+         * attachment list.
+         *
+         * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+         * enabled.
+         */
+        spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table);
+};
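To make the contract concrete, a hypothetical driver-side sketch of these two ops; my_domain, attach_lock, and my_hw_set_top_all_attachments() are invented for illustration, only the op signatures and the container_of() pattern come from the quoted header:

        /* Hypothetical example driver structures, for illustration only */
        struct my_domain {
                struct iommu_domain domain;
                struct pt_iommu iommu_table;
                spinlock_t attach_lock;         /* protects the attachment list */
        };

        static struct my_domain *to_my_domain(struct pt_iommu *iommu_table)
        {
                return container_of(iommu_table, struct my_domain, iommu_table);
        }

        static void my_change_top(struct pt_iommu *iommu_table,
                                  phys_addr_t top_paddr, unsigned int top_level)
        {
                struct my_domain *md = to_my_domain(iommu_table);

                /*
                 * Runs under the get_top_lock() spinlock. Push the new top
                 * address/level to every attached device so mappings placed
                 * in the new top are reachable by HW before returning.
                 * (my_hw_set_top_all_attachments() is an invented helper.)
                 */
                my_hw_set_top_all_attachments(md, top_paddr, top_level);
        }

        static spinlock_t *my_get_top_lock(struct pt_iommu *iommu_table)
        {
                return &to_my_domain(iommu_table)->attach_lock;
        }

        static const struct pt_iommu_flush_ops my_flush_ops = {
                .change_top = my_change_top,
                .get_top_lock = my_get_top_lock,
        };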