Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions. Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands. The manual mmap alignment can be removed once mmap(!MAP_FIXED) on vfio device fds improves to automatically return well-aligned addresses.
Signed-off-by: Alex Mastro amastro@fb.com --- Sanity test run:
$ ./vfio_dma_mapping_mmio_test 0000:05:00.0 TAP version 13 1..4 # Starting 4 tests from 2 test cases. # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ok 1 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ok 2 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ok 3 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ok 4 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar # PASSED: 4 / 4 tests passed. # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0 --- tools/testing/selftests/vfio/Makefile | 1 + tools/testing/selftests/vfio/lib/vfio_pci_device.c | 28 ++++- .../selftests/vfio/vfio_dma_mapping_mmio_test.c | 132 +++++++++++++++++++++ 3 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..ead27892ab65 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,6 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 13fdb4b0b10f..6f29543856a5 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,10 +12,13 @@ #include <sys/mman.h>
#include <uapi/linux/types.h> +#include <linux/align.h> #include <linux/iommufd.h> +#include <linux/kernel.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/overflow.h> +#include <linux/sizes.h> #include <linux/types.h> #include <linux/vfio.h>
@@ -124,20 +127,43 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index]; + size_t align, size; + void *map_base, *map_align; int prot = 0;
VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP); + VFIO_ASSERT_GT(bar->info.size, 0);
if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE;
- bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED, + /* + * Align the mmap for more efficient IOMMU mapping. + * The largest PUD size supporting huge pfnmap is 1GiB. + */ + size = bar->info.size; + align = min_t(u64, 1ULL << __builtin_ctzll(size), SZ_1G); + + map_base = mmap(NULL, size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + VFIO_ASSERT_NE(map_base, MAP_FAILED); + + map_align = (void *)ALIGN((uintptr_t)map_base, align); + + if (map_align > map_base) + munmap(map_base, map_align - map_base); + if (align > (size_t)(map_align - map_base)) + munmap(map_align + size, align - (map_align - map_base)); + + bar->vaddr = mmap(map_align, size, prot, MAP_SHARED | MAP_FIXED, device->fd, bar->info.offset); VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED); + + madvise(bar->vaddr, size, MADV_HUGEPAGE); }
static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..211fa4203b49 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h> + +#include <uapi/linux/types.h> +#include <linux/pci_regs.h> +#include <linux/sizes.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +static int largest_mapped_bar(struct vfio_pci_device *device) +{ + int bar_idx = -1; + u64 bar_size = 0; + + for (int i = 0; i < PCI_STD_NUM_BARS; i++) { + struct vfio_pci_bar *bar = &device->bars[i]; + + if (!bar->vaddr) + continue; + + if (!(bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE)) + continue; + + if (bar->info.size > bar_size) { + bar_size = bar->info.size; + bar_idx = i; + } + } + + return bar_idx; +} + +FIXTURE(vfio_dma_mapping_mmio_test) { + struct iommu *iommu; + struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; + int bar_idx; +}; + +FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu); +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu); + +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + +FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); + self->bar_idx = largest_mapped_bar(self->device); +} + +FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{ + 
iova_allocator_cleanup(self->iova_allocator); + vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); +} + +TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{ + struct vfio_pci_bar *bar; + struct dma_region region; + + if (self->bar_idx < 0) + SKIP(return, "No mappable BAR found on device %s", device_bdf); + + bar = &self->device->bars[self->bar_idx]; + + region = (struct dma_region) { + .vaddr = bar->vaddr, + .size = bar->info.size, + .iova = iova_allocator_alloc(self->iova_allocator, bar->info.size), + }; + + printf("Mapping BAR%d: vaddr=%p size=0x%lx iova=0x%lx\n", + self->bar_idx, region.vaddr, region.size, region.iova); + + iommu_map(self->iommu, ®ion); + iommu_unmap(self->iommu, ®ion); +} + +TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{ + struct vfio_pci_bar *bar; + struct dma_region region; + size_t page_size; + + if (self->bar_idx < 0) + SKIP(return, "No mappable BAR found on device %s", device_bdf); + + bar = &self->device->bars[self->bar_idx]; + page_size = getpagesize(); + + if (bar->info.size < 2 * page_size) + SKIP(return, "BAR%d too small for partial mapping test (size=0x%llx)", + self->bar_idx, bar->info.size); + + region = (struct dma_region) { + .vaddr = bar->vaddr, + .size = page_size, + .iova = iova_allocator_alloc(self->iova_allocator, page_size), + }; + + printf("Mapping BAR%d (partial): vaddr=%p size=0x%lx iova=0x%lx\n", + self->bar_idx, region.vaddr, region.size, region.iova); + + iommu_map(self->iommu, ®ion); + iommu_unmap(self->iommu, ®ion); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +}
--- base-commit: d721f52e31553a848e0e9947ca15a49c5674aef3 change-id: 20260107-scratch-amastro-vfio-dma-mapping-mmio-test-eecf25d9a742
Best regards,
On Wed, Jan 07, 2026 at 02:13:28PM -0800, Alex Mastro wrote:
@@ -124,20 +127,43 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index];
- size_t align, size;
- void *map_base, *map_align; int prot = 0;
VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP);
- VFIO_ASSERT_GT(bar->info.size, 0);
if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE;
- bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED,
- /*
* Align the mmap for more efficient IOMMU mapping.
* The largest PUD size supporting huge pfnmap is 1GiB.
*/
- size = bar->info.size;
- align = min_t(u64, 1ULL << __builtin_ctzll(size), SZ_1G);
- map_base = mmap(NULL, size + align, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- VFIO_ASSERT_NE(map_base, MAP_FAILED);
- map_align = (void *)ALIGN((uintptr_t)map_base, align);
- if (map_align > map_base)
munmap(map_base, map_align - map_base);
- if (align > (size_t)(map_align - map_base))
I realized that this is tautological. Will fix in v2.
munmap(map_align + size, align - (map_align - map_base));
- bar->vaddr = mmap(map_align, size, prot, MAP_SHARED | MAP_FIXED,
device->fd, bar->info.offset);
VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED);
- madvise(bar->vaddr, size, MADV_HUGEPAGE);
}
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Assuming not, should we add negative tests here to make sure iommufd does not allow mapping BARs?
And then we can add dma-buf tests in a future commit.
The manual mmap alignment can be removed once mmap(!MAP_FIXED) on vfio device fds improves to automatically return well-aligned addresses.
Signed-off-by: Alex Mastro amastro@fb.com
Sanity test run:
$ ./vfio_dma_mapping_mmio_test 0000:05:00.0 TAP version 13 1..4 # Starting 4 tests from 2 test cases. # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ok 1 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ok 2 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ok 3 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ok 4 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar # PASSED: 4 / 4 tests passed.
# Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0
tools/testing/selftests/vfio/Makefile | 1 + tools/testing/selftests/vfio/lib/vfio_pci_device.c | 28 ++++- .../selftests/vfio/vfio_dma_mapping_mmio_test.c | 132 +++++++++++++++++++++ 3 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..ead27892ab65 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,6 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 13fdb4b0b10f..6f29543856a5 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,10 +12,13 @@ #include <sys/mman.h> #include <uapi/linux/types.h> +#include <linux/align.h> #include <linux/iommufd.h> +#include <linux/kernel.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/overflow.h> +#include <linux/sizes.h> #include <linux/types.h> #include <linux/vfio.h> @@ -124,20 +127,43 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index];
- size_t align, size;
- void *map_base, *map_align; int prot = 0;
VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP);
- VFIO_ASSERT_GT(bar->info.size, 0);
if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE;
- bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED,
- /*
* Align the mmap for more efficient IOMMU mapping.
* The largest PUD size supporting huge pfnmap is 1GiB.
*/
- size = bar->info.size;
- align = min_t(u64, 1ULL << __builtin_ctzll(size), SZ_1G);
What's the reason to align to 1ULL << __builtin_ctzll(size) and not just size?
- map_base = mmap(NULL, size + align, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- VFIO_ASSERT_NE(map_base, MAP_FAILED);
- map_align = (void *)ALIGN((uintptr_t)map_base, align);
- if (map_align > map_base)
munmap(map_base, map_align - map_base);
- if (align > (size_t)(map_align - map_base))
munmap(map_align + size, align - (map_align - map_base));
- bar->vaddr = mmap(map_align, size, prot, MAP_SHARED | MAP_FIXED,
device->fd, bar->info.offset);
VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED);
- madvise(bar->vaddr, size, MADV_HUGEPAGE);
}
Can you split these changes out into a precursor commit? I think they stand on their own.
static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..211fa4203b49 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h>
+#include <uapi/linux/types.h> +#include <linux/pci_regs.h> +#include <linux/sizes.h> +#include <linux/vfio.h>
+#include <libvfio.h>
+#include "../kselftest_harness.h"
+static const char *device_bdf;
+static int largest_mapped_bar(struct vfio_pci_device *device) +{
- int bar_idx = -1;
- u64 bar_size = 0;
- for (int i = 0; i < PCI_STD_NUM_BARS; i++) {
struct vfio_pci_bar *bar = &device->bars[i];if (!bar->vaddr)continue;if (!(bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE))continue;
nit: Add a comment here. I assume this is because iommu_map() tries to create writable IOMMU mappings?
Speaking of, maybe we can add a test that creating writable IOMMU mappings fails for read-only BARs?
if (bar->info.size > bar_size) {bar_size = bar->info.size;bar_idx = i;}- }
- return bar_idx;
+}
+FIXTURE(vfio_dma_mapping_mmio_test) {
- struct iommu *iommu;
- struct vfio_pci_device *device;
- struct iova_allocator *iova_allocator;
- int bar_idx;
+};
+FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) {
- const char *iommu_mode;
+};
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \
- .iommu_mode = #_iommu_mode, \
+}
nit: Alignment of trailing backslashes is off.
+FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu); +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu);
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{
- self->iommu = iommu_init(variant->iommu_mode);
- self->device = vfio_pci_device_init(device_bdf, self->iommu);
- self->iova_allocator = iova_allocator_init(self->iommu);
- self->bar_idx = largest_mapped_bar(self->device);
+}
+FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{
- iova_allocator_cleanup(self->iova_allocator);
- vfio_pci_device_cleanup(self->device);
- iommu_cleanup(self->iommu);
+}
+TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{
- struct vfio_pci_bar *bar;
- struct dma_region region;
- if (self->bar_idx < 0)
SKIP(return, "No mappable BAR found on device %s", device_bdf);
I think you can do this in the FIXTURE_SETUP() to avoid duplication.
- bar = &self->device->bars[self->bar_idx];
- region = (struct dma_region) {
.vaddr = bar->vaddr,.size = bar->info.size,.iova = iova_allocator_alloc(self->iova_allocator, bar->info.size),- };
- printf("Mapping BAR%d: vaddr=%p size=0x%lx iova=0x%lx\n",
self->bar_idx, region.vaddr, region.size, region.iova);- iommu_map(self->iommu, ®ion);
- iommu_unmap(self->iommu, ®ion);
+}
+TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{
- struct vfio_pci_bar *bar;
- struct dma_region region;
- size_t page_size;
- if (self->bar_idx < 0)
SKIP(return, "No mappable BAR found on device %s", device_bdf);- bar = &self->device->bars[self->bar_idx];
- page_size = getpagesize();
- if (bar->info.size < 2 * page_size)
SKIP(return, "BAR%d too small for partial mapping test (size=0x%llx)",self->bar_idx, bar->info.size);- region = (struct dma_region) {
.vaddr = bar->vaddr,.size = page_size,.iova = iova_allocator_alloc(self->iova_allocator, page_size),- };
- printf("Mapping BAR%d (partial): vaddr=%p size=0x%lx iova=0x%lx\n",
self->bar_idx, region.vaddr, region.size, region.iova);- iommu_map(self->iommu, ®ion);
- iommu_unmap(self->iommu, ®ion);
+}
+int main(int argc, char *argv[]) +{
- device_bdf = vfio_selftests_get_bdf(&argc, argv);
- return test_harness_run(argc, argv);
+}
base-commit: d721f52e31553a848e0e9947ca15a49c5674aef3 change-id: 20260107-scratch-amastro-vfio-dma-mapping-mmio-test-eecf25d9a742
Best regards,
Alex Mastro amastro@fb.com
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Only dmabuf.
Assuming not, should we add negative tests here to make sure iommufd does not allow mapping BARs?
Yes
And then we can add dma-buf tests in a future commit.
Yes
Jason
On Wed, Jan 07, 2026 at 08:54:06PM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Only dmabuf.
Ack. I got confused. I had thought iommufd's vfio container compatibility mode was going to support this, but realized that doesn't make sense given past discussions about the pitfalls of achieving these mappings the legacy way.
Assuming not, should we add negative tests here to make sure iommufd does not allow mapping BARs?
Yes
Will add.
And then we can add dma-buf tests in a future commit.
Yes
Jason
On Wed, Jan 07, 2026 at 06:41:10PM -0800, Alex Mastro wrote:
On Wed, Jan 07, 2026 at 08:54:06PM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Only dmabuf.
Ack. I got confused. I had thought iommufd's vfio container compatibility mode was going to support this, but realized that doesn't make sense given past discussions about the pitfalls of achieving these mappings the legacy way.
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
Jason
On Thu, 8 Jan 2026 10:10:44 -0400 Jason Gunthorpe jgg@ziepe.ca wrote:
On Wed, Jan 07, 2026 at 06:41:10PM -0800, Alex Mastro wrote:
On Wed, Jan 07, 2026 at 08:54:06PM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Only dmabuf.
Ack. I got confused. I had thought iommufd's vfio container compatibility mode was going to support this, but realized that doesn't make sense given past discussions about the pitfalls of achieving these mappings the legacy way.
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Alex
On Thu, Jan 8, 2026 at 7:45 AM Alex Williamson alex@shazbot.org wrote:
On Thu, 8 Jan 2026 10:10:44 -0400 Jason Gunthorpe jgg@ziepe.ca wrote:
On Wed, Jan 07, 2026 at 06:41:10PM -0800, Alex Mastro wrote:
On Wed, Jan 07, 2026 at 08:54:06PM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Only dmabuf.
Ack. I got confused. I had thought iommufd's vfio container compatibility mode was going to support this, but realized that doesn't make sense given past discussions about the pitfalls of achieving these mappings the legacy way.
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
Is the plan to address all the gaps so IOMMUFD_VFIO_CONTAINER can be made the default and the type1 code can be dropped from the upstream kernel?
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
Oh, I was thinking about a compatability only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
Is the plan to address all the gaps so IOMMUFD_VFIO_CONTAINER can be made the default and the type1 code can be dropped from the upstream kernel?
This was a dream for sure, distros can decide if they want to continue to support both or have an option to do just one.
Jason
On 2026-01-08 02:33 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
It's one of the test cases within tools/testing/selftests/vfio/vfio_dma_mapping_test.c.
Here's the output when running with CONFIG_IOMMUFD_VFIO_CONTAINER=y:
# RUN vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap ... Mapped HVA 0x7f0480000000 (size 0x40000000) at IOVA 0x0 Searching for IOVA 0x0 in /sys/kernel/debug/iommu/intel/0000:6a:01.0/domain_translation_struct Found IOMMU mappings for IOVA 0x0: PGD: 0x0000000203475027 P4D: 0x0000000203476027 PUD: 0x0000000203477027 PMD: 0x00000001e7562027 PTE: 0x00000041c0000067 # tools/testing/selftests/vfio/vfio_dma_mapping_test.c:188:dma_map_unmap:Expected 0 (0) == mapping.pte (282394099815) # dma_map_unmap: Test terminated by assertion # FAIL vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap
On Thu, Jan 08, 2026 at 09:29:29PM +0000, David Matlack wrote:
On 2026-01-08 02:33 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
It's one of the test cases within tools/testing/selftests/vfio/vfio_dma_mapping_test.c.
Here's the output when running with CONFIG_IOMMUFD_VFIO_CONTAINER=y:
# RUN vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap ... Mapped HVA 0x7f0480000000 (size 0x40000000) at IOVA 0x0 Searching for IOVA 0x0 in /sys/kernel/debug/iommu/intel/0000:6a:01.0/domain_translation_struct Found IOMMU mappings for IOVA 0x0: PGD: 0x0000000203475027 P4D: 0x0000000203476027 PUD: 0x0000000203477027 PMD: 0x00000001e7562027 PTE: 0x00000041c0000067 # tools/testing/selftests/vfio/vfio_dma_mapping_test.c:188:dma_map_unmap:Expected 0 (0) == mapping.pte (282394099815) # dma_map_unmap: Test terminated by assertion # FAIL vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap
I can't think of any reason this would fail, I think your tests have found a real bug?? Can you check into it, what kernel call fails and where does the kernel code come from?
I don't think I can run these tests with the HW I have??
Jason
On 2026-01-08 08:36 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 09:29:29PM +0000, David Matlack wrote:
On 2026-01-08 02:33 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
Oh, I was thinking about a compatibility-only flow only in the type 1 emulation that internally magically converts a VMA to a dmabuf, but I haven't written anything.. It is a bit tricky and the type 1 emulation has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
It's one of the test cases within tools/testing/selftests/vfio/vfio_dma_mapping_test.c.
Here's the output when running with CONFIG_IOMMUFD_VFIO_CONTAINER=y:
# RUN vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap ... Mapped HVA 0x7f0480000000 (size 0x40000000) at IOVA 0x0 Searching for IOVA 0x0 in /sys/kernel/debug/iommu/intel/0000:6a:01.0/domain_translation_struct Found IOMMU mappings for IOVA 0x0: PGD: 0x0000000203475027 P4D: 0x0000000203476027 PUD: 0x0000000203477027 PMD: 0x00000001e7562027 PTE: 0x00000041c0000067 # tools/testing/selftests/vfio/vfio_dma_mapping_test.c:188:dma_map_unmap:Expected 0 (0) == mapping.pte (282394099815) # dma_map_unmap: Test terminated by assertion # FAIL vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap
I can't think of any reason this would fail, I think your tests have found a real bug?? Can you check into it, what kernel call fails and where does the kernel code come from?
Oh I thought it was by design. This code in iommufd_vfio_set_iommu():
/* * The difference between TYPE1 and TYPE1v2 is the ability to unmap in * the middle of mapped ranges. This is complicated by huge page support * which creates single large IOPTEs that cannot be split by the iommu * driver. TYPE1 is very old at this point and likely nothing uses it, * however it is simple enough to emulate by simply disabling the * problematic large IOPTEs. Then we can safely unmap within any range. */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt);
git-blame says some guy named Jason Gunthorpe wrote it :P
I don't think I can run these tests with the HW I have??
FWIW all you need any PCI device that can be bound to vfio-pci and mapped by VT-d. This test does not rely on any of the VFIO selftests drivers to trigger DMA.
On Fri, Jan 09, 2026 at 12:43:32AM +0000, David Matlack wrote:
On 2026-01-08 08:36 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 09:29:29PM +0000, David Matlack wrote:
On 2026-01-08 02:33 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
> Oh, I was thinking about a compatibility-only flow only in the type 1 > emulation that internally magically converts a VMA to a dmabuf, but I > haven't written anything.. It is a bit tricky and the type 1 emulation > has not been as popular as I expected??
In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
It's one of the test cases within tools/testing/selftests/vfio/vfio_dma_mapping_test.c.
Here's the output when running with CONFIG_IOMMUFD_VFIO_CONTAINER=y:
# RUN vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap ... Mapped HVA 0x7f0480000000 (size 0x40000000) at IOVA 0x0 Searching for IOVA 0x0 in /sys/kernel/debug/iommu/intel/0000:6a:01.0/domain_translation_struct Found IOMMU mappings for IOVA 0x0: PGD: 0x0000000203475027 P4D: 0x0000000203476027 PUD: 0x0000000203477027 PMD: 0x00000001e7562027 PTE: 0x00000041c0000067 # tools/testing/selftests/vfio/vfio_dma_mapping_test.c:188:dma_map_unmap:Expected 0 (0) == mapping.pte (282394099815) # dma_map_unmap: Test terminated by assertion # FAIL vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap
I can't think of any reason this would fail, I think your tests have found a real bug?? Can you check into it, what kernel call fails and where does the kernel code come from?
Oh I thought it was by design. This code in iommufd_vfio_set_iommu():
/* * The difference between TYPE1 and TYPE1v2 is the ability to unmap in * the middle of mapped ranges. This is complicated by huge page support * which creates single large IOPTEs that cannot be split by the iommu * driver. TYPE1 is very old at this point and likely nothing uses it, * however it is simple enough to emulate by simply disabling the * problematic large IOPTEs. Then we can safely unmap within any range. */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt);
git-blame says some guy named Jason Gunthorpe wrote it :P
Er, maybe I misunderstood the output then?
This is not a "failure" though, the map succeeded and gave a small page mapping.
This is not reflecting a bug in iommufd but a bug in the TYPE1 support in VFIO itself because it definitely cannot maintain the required unmap anywhere semantic if it mapped in a 1G huge page like this.
Basically, if you are mapping with TYPE1 mode then this should be triggered:
if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) mapping_size = SZ_4K;
And VFIO should be the one to fail, not iommufd.
If you really want to test TYPE1 you need to test what makes it unique, which is that you can map any VMA and then unmap any slice of it. Including within what should otherwise be a 1G page.
But I doubt anyone cares enough to fix this, so just exclude VFIO_TYPE1_IOMMU from this test?
Jason
On Thu, Jan 8, 2026 at 4:54 PM Jason Gunthorpe jgg@ziepe.ca wrote:
On Fri, Jan 09, 2026 at 12:43:32AM +0000, David Matlack wrote:
On 2026-01-08 08:36 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 09:29:29PM +0000, David Matlack wrote:
On 2026-01-08 02:33 PM, Jason Gunthorpe wrote:
On Thu, Jan 08, 2026 at 10:24:19AM -0800, David Matlack wrote:
> > Oh, I was thinking about a compatibility only flow only in the type 1 > > emulation that internally magically converts a VMA to a dmabuf, but I > > haven't written anything.. It is a bit tricky and the type 1 emulation > > has not been as popular as I expected?? > > In part because of this gap, I'd guess. Thanks,
Lack of huge mappings in the IOMMU when using VFIO_TYPE1_IOMMU is another gap I'm aware of. vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap fails when IOMMUFD_VFIO_CONTAINER is enabled.
What is this? I'm not aware of it..
It's one of the test cases within tools/testing/selftests/vfio/vfio_dma_mapping_test.c.
Here's the output when running with CONFIG_IOMMUFD_VFIO_CONTAINER=y:
# RUN vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap ... Mapped HVA 0x7f0480000000 (size 0x40000000) at IOVA 0x0 Searching for IOVA 0x0 in /sys/kernel/debug/iommu/intel/0000:6a:01.0/domain_translation_struct Found IOMMU mappings for IOVA 0x0: PGD: 0x0000000203475027 P4D: 0x0000000203476027 PUD: 0x0000000203477027 PMD: 0x00000001e7562027 PTE: 0x00000041c0000067 # tools/testing/selftests/vfio/vfio_dma_mapping_test.c:188:dma_map_unmap:Expected 0 (0) == mapping.pte (282394099815) # dma_map_unmap: Test terminated by assertion # FAIL vfio_dma_mapping_test.vfio_type1_iommu_anonymous_hugetlb_1gb.dma_map_unmap
I can't think of any reason this would fail, I think your tests have found a real bug?? Can you check into it, what kernel call fails and where does the kernel code come from?
Oh I thought it was by design. This code in iommufd_vfio_set_iommu():
/* * The difference between TYPE1 and TYPE1v2 is the ability to unmap in * the middle of mapped ranges. This is complicated by huge page support * which creates single large IOPTEs that cannot be split by the iommu * driver. TYPE1 is very old at this point and likely nothing uses it, * however it is simple enough to emulate by simply disabling the * problematic large IOPTEs. Then we can safely unmap within any range. */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt);git-blame says some guy named Jason Gunthorpe wrote it :P
Er, maybe I misunderstood the output then?
This is not a "failure" though, the map succeeded and gave a small page mapping.
This is not reflecting a bug in iommufd but a bug in the TYPE1 support in VFIO itself because it definitely cannot maintain the required unmap anywhere semantic if it mapped in a 1G huge page like this.
Basically, if you are mapping with TYPE1 mode then this should be triggered:
if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) mapping_size = SZ_4K;And VFIO should be the one to fail, not iommufd.
If you really want to test TYPE1 you need to test what makes it unique, which is that you can map any VMA and then unmap any slice of it. Including within what should otherwise be a 1G page.
But I doubt anyone cares enough to fix this, so just exclude VFIO_TYPE1_IOMMU from this test?
Ah, ok, thanks for the explanation. So VFIO_TYPE1_IOMMU should always use 4K mappings regardless of backend (VFIO or iommufd) so that unmap can work as intended.
I think excluding VFIO_TYPE1_IOMMU from this assertion makes sense if we don't care about fixing it.
On Fri, Jan 09, 2026 at 09:04:30AM -0800, David Matlack wrote:
If you really want to test TYPE1 you need to test what makes it unique, which is that you can map any VMA and then unmap any slice of it. Including within what should otherwise be a 1G page.
But I doubt anyone cares enough to fix this, so just exclude VFIO_TYPE1_IOMMU from this test?
Ah, ok, thanks for the explanation. So VFIO_TYPE1_IOMMU should always use 4K mappings regardless of backend (VFIO or iommufd) so that unmap can work as intended.
IDK, I think you should just ignore testing TYPE1v0. The actual real semantics that it had are quite confusing and iommufd provides an emulation that is going to be functionally OK (indeed, functionally more capable) but is not exactly the same.
The old comment here is sort of enlightening:
+ * vfio-iommu-type1 (v1) - User mappings were coalesced together to + * avoid tracking individual mappings. This means that the granularity + * of the original mapping was lost and the user was allowed to attempt + * to unmap any range. Depending on the contiguousness of physical + * memory and page sizes supported by the IOMMU, arbitrary unmaps may + * or may not have worked. We only guaranteed unmap granularity + * matching the original mapping; even though it was untracked here, + * the original mappings are reflected in IOMMU mappings. This + * resulted in a couple unusual behaviors. First, if a range is not + * able to be unmapped, ex. a set of 4k pages that was mapped as a + * 2M hugepage into the IOMMU, the unmap ioctl returns success but with + * a zero sized unmap. Also, if an unmap request overlaps the first + * address of a hugepage, the IOMMU will unmap the entire hugepage. + * This also returns success and the returned unmap size reflects the + * actual size unmapped.
iommufd does not try to do this "returned unmap size reflects the actual size unmapped" part, it always unmaps exactly what was requested, because it disables huge pages.
Jason
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
On 2026-01-07 02:13 PM, Alex Mastro wrote:
Test MMIO-backed DMA mappings by iommu_map()-ing mmap'ed BAR regions.
Thanks for adding this!
Also update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings.
Only vfio_type1 variants are tested; iommufd variants can be added once kernel support lands.
Are there plans to support mapping BARs via virtual address in iommufd? I thought the plan was only to support via dma-bufs. Maybe Jason can confirm.
Assuming not, should we add negative tests here to make sure iommufd does not allow mapping BARs?
And then we can add dma-buf tests in a future commit.
The manual mmap alignment can be removed once mmap(!MAP_FIXED) on vfio device fds improves to automatically return well-aligned addresses.
Signed-off-by: Alex Mastro amastro@fb.com
Sanity test run:
$ ./vfio_dma_mapping_mmio_test 0000:05:00.0 TAP version 13 1..4 # Starting 4 tests from 2 test cases. # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar ok 1 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar ok 2 vfio_dma_mapping_mmio_test.vfio_type1_iommu.map_partial_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ... Mapping BAR4: vaddr=0x7fad40000000 size=0x2000000000 iova=0x2000000000 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar ok 3 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_full_bar # RUN vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ... Mapping BAR4 (partial): vaddr=0x7fad40000000 size=0x1000 iova=0x0 # OK vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar ok 4 vfio_dma_mapping_mmio_test.vfio_type1v2_iommu.map_partial_bar # PASSED: 4 / 4 tests passed.
# Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0
tools/testing/selftests/vfio/Makefile | 1 + tools/testing/selftests/vfio/lib/vfio_pci_device.c | 28 ++++- .../selftests/vfio/vfio_dma_mapping_mmio_test.c | 132 +++++++++++++++++++++ 3 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..ead27892ab65 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,6 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 13fdb4b0b10f..6f29543856a5 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,10 +12,13 @@ #include <sys/mman.h> #include <uapi/linux/types.h> +#include <linux/align.h> #include <linux/iommufd.h> +#include <linux/kernel.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/overflow.h> +#include <linux/sizes.h> #include <linux/types.h> #include <linux/vfio.h> @@ -124,20 +127,43 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index];
- size_t align, size;
- void *map_base, *map_align; int prot = 0;
VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP);
- VFIO_ASSERT_GT(bar->info.size, 0);
if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE;
- bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED,
- /*
* Align the mmap for more efficient IOMMU mapping.* The largest PUD size supporting huge pfnmap is 1GiB.*/- size = bar->info.size;
- align = min_t(u64, 1ULL << __builtin_ctzll(size), SZ_1G);
What's the reason to align to 1ULL << __builtin_ctzll(size) and not just size?
This was inspired by QEMU's hw/vfio/region.c which also does this rounding up of size to the next power of two [1].
I'm now realizing that's only necessary for regions with VFIO_REGION_INFO_CAP_SPARSE_MMAP where there are multiple mmaps per region, and each mmap's size is less than the size of the BAR. Here, since we're mapping the entire BAR which must be pow2, it shouldn't be necessary.
It's also making me realize...
That the existing vfio_pci_device_setup() BAR mapping path isn't accounting for the possibility of SPARSE_MMAP where attempting to mmap the entire region would fail due to non-mmap'able holes.
The intent of QEMU's mmap alignment code is imperfect in the SPARSE_MMAP case? After a hole, the next mmap'able range could be some arbitrary page-aligned offset into the region. It's not helpful to mmap some region offset which is maximally 4K-aligned at a 1G-aligned vaddr.
I think to be optimal, QEMU should be attempting to align the vaddr for bar mmaps such that
vaddr % {2M,1G} == region_offset % {2M,1G}
Would love someone to sanity check me on this. Kind of a diversion.
[1] https://github.com/qemu/qemu/blob/master/hw/vfio/region.c#L255-L286
- map_base = mmap(NULL, size + align, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);- VFIO_ASSERT_NE(map_base, MAP_FAILED);
- map_align = (void *)ALIGN((uintptr_t)map_base, align);
- if (map_align > map_base)
munmap(map_base, map_align - map_base);- if (align > (size_t)(map_align - map_base))
munmap(map_align + size, align - (map_align - map_base));- bar->vaddr = mmap(map_align, size, prot, MAP_SHARED | MAP_FIXED, device->fd, bar->info.offset); VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED);
- madvise(bar->vaddr, size, MADV_HUGEPAGE);
}
Can you split these changes out into a precursor commit? I think they stand on their own.
Ack.
static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..211fa4203b49 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h>
+#include <uapi/linux/types.h> +#include <linux/pci_regs.h> +#include <linux/sizes.h> +#include <linux/vfio.h>
+#include <libvfio.h>
+#include "../kselftest_harness.h"
+static const char *device_bdf;
+static int largest_mapped_bar(struct vfio_pci_device *device) +{
- int bar_idx = -1;
- u64 bar_size = 0;
- for (int i = 0; i < PCI_STD_NUM_BARS; i++) {
struct vfio_pci_bar *bar = &device->bars[i];if (!bar->vaddr)continue;if (!(bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE))continue;nit: Add a comment here. I assume this is because iommu_map() tries to create writable IOMMU mappings?
Yes, and I'll actually make a stronger check for both READ|WRITE here since iommu_map() always maps with VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE.
Speaking of, maybe we can add a test that creating writable IOMMU mappings fails for read-only BARs?
I think I'll have to look into this as a follow-on. I'm not sure how to validate it yet without mocks or similar since I don't have such HW.
if (bar->info.size > bar_size) {bar_size = bar->info.size;bar_idx = i;}- }
- return bar_idx;
+}
+FIXTURE(vfio_dma_mapping_mmio_test) {
- struct iommu *iommu;
- struct vfio_pci_device *device;
- struct iova_allocator *iova_allocator;
- int bar_idx;
+};
+FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) {
- const char *iommu_mode;
+};
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \
- .iommu_mode = #_iommu_mode, \
+}
nit: Alignment of trailing backslashes is off.
Ack
+FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu); +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu);
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{
- self->iommu = iommu_init(variant->iommu_mode);
- self->device = vfio_pci_device_init(device_bdf, self->iommu);
- self->iova_allocator = iova_allocator_init(self->iommu);
- self->bar_idx = largest_mapped_bar(self->device);
+}
+FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{
- iova_allocator_cleanup(self->iova_allocator);
- vfio_pci_device_cleanup(self->device);
- iommu_cleanup(self->iommu);
+}
+TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{
- struct vfio_pci_bar *bar;
- struct dma_region region;
- if (self->bar_idx < 0)
SKIP(return, "No mappable BAR found on device %s", device_bdf);I think you can do this in the FIXTURE_SETUP() to avoid duplication.
Ack
- bar = &self->device->bars[self->bar_idx];
- region = (struct dma_region) {
.vaddr = bar->vaddr,.size = bar->info.size,.iova = iova_allocator_alloc(self->iova_allocator, bar->info.size),- };
- printf("Mapping BAR%d: vaddr=%p size=0x%lx iova=0x%lx\n",
self->bar_idx, region.vaddr, region.size, region.iova);- iommu_map(self->iommu, ®ion);
- iommu_unmap(self->iommu, ®ion);
+}
+TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{
- struct vfio_pci_bar *bar;
- struct dma_region region;
- size_t page_size;
- if (self->bar_idx < 0)
SKIP(return, "No mappable BAR found on device %s", device_bdf);- bar = &self->device->bars[self->bar_idx];
- page_size = getpagesize();
- if (bar->info.size < 2 * page_size)
SKIP(return, "BAR%d too small for partial mapping test (size=0x%llx)",self->bar_idx, bar->info.size);- region = (struct dma_region) {
.vaddr = bar->vaddr,.size = page_size,.iova = iova_allocator_alloc(self->iova_allocator, page_size),- };
- printf("Mapping BAR%d (partial): vaddr=%p size=0x%lx iova=0x%lx\n",
self->bar_idx, region.vaddr, region.size, region.iova);- iommu_map(self->iommu, ®ion);
- iommu_unmap(self->iommu, ®ion);
+}
+int main(int argc, char *argv[]) +{
- device_bdf = vfio_selftests_get_bdf(&argc, argv);
- return test_harness_run(argc, argv);
+}
base-commit: d721f52e31553a848e0e9947ca15a49c5674aef3 change-id: 20260107-scratch-amastro-vfio-dma-mapping-mmio-test-eecf25d9a742
Best regards,
Alex Mastro amastro@fb.com
On Wed, Jan 07, 2026 at 07:36:44PM -0800, Alex Mastro wrote:
This was inspired by QEMU's hw/vfio/region.c which also does this rounding up of size to the next power of two [1].
I'm now realizing that's only necessary for regions with VFIO_REGION_INFO_CAP_SPARSE_MMAP where there are multiple mmaps per region, and each mmap's size is less than the size of the BAR. Here, since we're mapping the entire BAR which must be pow2, it shouldn't be necessary.
You only need to do this dance if you care about having large PTEs under the VMAs, which is probably something worth testing both scenarios.
The intent of QEMU's mmap alignment code is imperfect in the SPARSE_MMAP case? After a hole, the next mmap'able range could be some arbitrary page-aligned offset into the region. It's not helpful to mmap some region offset which is maximally 4K-aligned at a 1G-aligned vaddr.
I think to be optimal, QEMU should be attempting to align the vaddr for bar mmaps such that
vaddr % {2M,1G} == region_offset % {2M,1G}
Would love someone to sanity check me on this. Kind of a diversion.
What you write is correct. Ankit recently discovered this bug in qemu. It happens not just with SPARSE_MMAP but also when mmapping around the MSI-X hole..
I also advocated for what you write here that qemu should ensure:
vaddr % region_size == region_offset % region_size
Until VFIO learns to align its VMAs on its own via Peter's work.
Jason
On Thu, Jan 08, 2026 at 10:38:04AM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 07:36:44PM -0800, Alex Mastro wrote:
This was inspired by QEMU's hw/vfio/region.c which also does this rounding up of size to the next power of two [1].
I'm now realizing that's only necessary for regions with VFIO_REGION_INFO_CAP_SPARSE_MMAP where there are multiple mmaps per region, and each mmap's size is less than the size of the BAR. Here, since we're mapping the entire BAR which must be pow2, it shouldn't be necessary.
You only need to do this dance if you care about having large PTEs under the VMAs, which is probably something worth testing both scenarios.
Yep, makes sense. The test takes a long time to run without this due to potentially faulting a 128G BAR region 4K at a time during VFIO_IOMMU_MAP_DMA.
The intent of QEMU's mmap alignment code is imperfect in the SPARSE_MMAP case? After a hole, the next mmap'able range could be some arbitrary page-aligned offset into the region. It's not helpful to mmap some region offset which is maximally 4K-aligned at a 1G-aligned vaddr.
I think to be optimal, QEMU should be attempting to align the vaddr for bar mmaps such that
vaddr % {2M,1G} == region_offset % {2M,1G}
Would love someone to sanity check me on this. Kind of a diversion.
What you write is correct. Ankit recently discovered this bug in qemu. It happens not just with SPARSE_MMAP but also when mmapping around the MSI-X hole..
Is my mental model broken? I thought MSI-X holes in a VFIO-exposed BAR region implied SPARSE_MMAP? I didn't think there was another way for the uapi to express hole-yness.
I also advocated for what you write here that qemu should ensure:
vaddr % region_size == region_offset % region_size
Why region_size out of curiosity? Assuming perfect knowledge of kernel internals I would have expected something like this:
diff --git a/hw/vfio/region.c b/hw/vfio/region.c index ca75ab1be4..1d8595e808 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -238,6 +238,18 @@ static void vfio_subregion_unmap(VFIORegion *region, int index) region->mmaps[index].mmap = NULL; }
+/* + * Return the next value greater than or equal to `input` such that + * (value % align) == offset. + */ +static size_t align_offset(size_t input, size_t offset, size_t align) +{ + size_t remainder = input % align; + size_t delta = (align + offset - remainder) % align; + + return input + delta; +} + int vfio_region_mmap(VFIORegion *region) { int i, ret, prot = 0; @@ -252,7 +264,11 @@ int vfio_region_mmap(VFIORegion *region) prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
for (i = 0; i < region->nr_mmaps; i++) { - size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); + size_t size = region->mmaps[i].size; + size_t offs = region->mmaps[i].offset; + size_t align = size >= GiB ? GiB : + size >= 2 * MiB ? 2 * MiB : + getpagesize(); void *map_base, *map_align;
/* @@ -275,7 +291,7 @@ int vfio_region_mmap(VFIORegion *region)
fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
- map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); + map_align = (void *)align_offset((size_t)map_base, offs % align, align); munmap(map_base, map_align - map_base); munmap(map_align + region->mmaps[i].size, align - (map_align - map_base));
On Thu, Jan 08, 2026 at 08:25:19AM -0800, Alex Mastro wrote:
On Thu, Jan 08, 2026 at 10:38:04AM -0400, Jason Gunthorpe wrote:
On Wed, Jan 07, 2026 at 07:36:44PM -0800, Alex Mastro wrote:
The intent of QEMU's mmap alignment code is imperfect in the SPARSE_MMAP case? After a hole, the next mmap'able range could be some arbitrary page-aligned offset into the region. It's not helpful to mmap some region offset which is maximally 4K-aligned at a 1G-aligned vaddr.
I think to be optimal, QEMU should be attempting to align the vaddr for bar mmaps such that
vaddr % {2M,1G} == region_offset % {2M,1G}
Would love someone to sanity check me on this. Kind of a diversion.
What you write is correct. Ankit recently discovered this bug in qemu. It happens not just with SPARSE_MMAP but also when mmapping around the MSI-X hole..
Is my mental model broken? I thought MSI-X holes in a VFIO-exposed BAR region implied SPARSE_MMAP? I didn't think there was another way for the uapi to express hole-yness.
Yes, it was broken. Creating MSI-X table holes with SPARSE_MMAP ended back in 2017 and was superseded by VFIO_REGION_INFO_CAP_MSIX_MAPPABLE [1].
[1] https://lore.kernel.org/all/20171213023131.41233-1-aik@ozlabs.ru/
Only nvgrace-gpu and some i915 reference SPARSE_MMAP today.
On Wed, 7 Jan 2026 19:36:44 -0800 Alex Mastro amastro@fb.com wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
Speaking of, maybe we can add a test that creating writable IOMMU mappings fails for read-only BARs?
I think I'll have to look into this as a follow-on. I'm not sure how to validate it yet without mocks or similar since I don't have such HW.
I think the read-only aspect would be in the mmap, not the BAR itself, ie. can we create a read-write DMA mapping to a read-only mmap.
ROM BARs are the only BARs that are read-only, but they can share a decoder with the standard BARs and therefore have a separate enable in the BAR register itself. Due to this, and their general usage, it's never been necessary to allow mmap of the ROM BAR, therefore we cannot actually DMA map the ROM BAR. Thanks,
Alex
On Thu, Jan 8, 2026 at 7:42 AM Alex Williamson alex@shazbot.org wrote:
On Wed, 7 Jan 2026 19:36:44 -0800 Alex Mastro amastro@fb.com wrote:
On Wed, Jan 07, 2026 at 11:54:09PM +0000, David Matlack wrote:
Speaking of, maybe we can add a test that creating writable IOMMU mappings fails for read-only BARs?
I think I'll have to look into this as a follow-on. I'm not sure how to validate it yet without mocks or similar since I don't have such HW.
I think the read-only aspect would be in the mmap, not the BAR itself, ie. can we create a read-write DMA mapping to a read-only mmap.
Good point. So it'd be better to have a test of that in vfio_dma_mapping_test. No need to use a BAR mapping.
ROM BARs are the only BARs that are read-only, but they can share a decoder with the standard BARs and therefore have a separate enable in the BAR register itself. Due to this, and their general usage, it's never been necessary to allow mmap of the ROM BAR, therefore we cannot actually DMA map the ROM BAR. Thanks,
Ahh, good to know, thanks for the context!
linux-kselftest-mirror@lists.linaro.org