mas_skip_node() is used to move the maple state to the node with a
higher limit.  It does this by walking up the tree and increasing the
slot count.  Since the slot count cannot always be increased, it may
need to walk up multiple times before there is room to walk right to a
node with a higher limit.  The slot limit being used was the node's
slot capacity and not the last slot holding data in the node.  This
could shift the maple state outside the actual data and put it into an
error state, thus returning -EBUSY.

Because of this incorrect error state, mas_awalk() would return an
error instead of finding the allocation space.

The fix is to use mas_data_end() in mas_skip_node() to detect the
node's data end and to keep walking up the tree until it is safe to
move to a node with a higher limit.

mas_skip_node() may also be passed a maple state that is already in an
error state from mas_anode_descend() when no allocations are available.
Return immediately on such an error state.
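For illustration only, and not part of the patch: a minimal toy sketch of
the two bounds, using made-up names (SLOT_CAPACITY, toy_node, data_end)
rather than the real maple tree helpers.  SLOT_CAPACITY stands in for
mt_slots[mt] (the node's capacity) and data_end for mas_data_end() (the
last slot that actually holds data).

    /* Toy model, not maple tree code. */
    #include <stdbool.h>

    #define SLOT_CAPACITY 16            /* room for 16 slots in this node type */

    struct toy_node {
            int data_end;               /* e.g. 4: only slots 0..4 hold data */
    };

    static bool may_skip_right(const struct toy_node *node, int offset)
    {
            /* Broken bound: offset <= SLOT_CAPACITY - 1 lets the walk step
             * into empty slots 5..15, past the stored data.
             * Correct bound: stay below the node's data end. */
            return offset < node->data_end;
    }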
Reported-by: Snild Dolkow <snild(a)sony.com>
Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.…
Cc: <Stable(a)vger.kernel.org>
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
---
lib/maple_tree.c | 25 ++++++++++---------------
1 file changed, 10 insertions(+), 15 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 2be86368237d..2efe854946d6 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5188,34 +5188,29 @@ static inline bool mas_rewind_node(struct ma_state *mas)
  */
 static inline bool mas_skip_node(struct ma_state *mas)
 {
-	unsigned char slot, slot_count;
 	unsigned long *pivots;
 	enum maple_type mt;
 
-	mt = mte_node_type(mas->node);
-	slot_count = mt_slots[mt] - 1;
+	if (mas_is_err(mas))
+		return false;
+
 	do {
 		if (mte_is_root(mas->node)) {
-			slot = mas->offset;
-			if (slot > slot_count) {
+			if (mas->offset >= mas_data_end(mas)) {
 				mas_set_err(mas, -EBUSY);
 				return false;
 			}
 		} else {
 			mas_ascend(mas);
-			slot = mas->offset;
-			mt = mte_node_type(mas->node);
-			slot_count = mt_slots[mt] - 1;
 		}
-	} while (slot > slot_count);
+	} while (mas->offset >= mas_data_end(mas));
 
-	mas->offset = ++slot;
+	mt = mte_node_type(mas->node);
 	pivots = ma_pivots(mas_mn(mas), mt);
-	if (slot > 0)
-		mas->min = pivots[slot - 1] + 1;
-
-	if (slot <= slot_count)
-		mas->max = pivots[slot];
+	mas->min = pivots[mas->offset] + 1;
+	mas->offset++;
+	if (mas->offset < mt_slots[mt])
+		mas->max = pivots[mas->offset];
 	return true;
 }
 
--
2.39.2
The patch below does not apply to the 6.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.2.y
git checkout FETCH_HEAD
git cherry-pick -x 1eac28201ac0725192f5ced34192d281a06692e5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167820482516243(a)kroah.com' --subject-prefix 'PATCH 6.2.y' HEAD^..
Possible dependencies:
1eac28201ac0 ("RISC-V: fix ordering of Zbb extension")
80c200b34ee8 ("RISC-V: resort all extensions in consistent orders")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1eac28201ac0725192f5ced34192d281a06692e5 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko.stuebner(a)vrull.eu>
Date: Wed, 8 Feb 2023 23:53:27 +0100
Subject: [PATCH] RISC-V: fix ordering of Zbb extension
As Andrew reported,
Zb* comes after Zi* according to 27.11 "Subset Naming Convention",
so fix the ordering accordingly.
Reported-by: Andrew Jones <ajones(a)ventanamicro.com>
Signed-off-by: Heiko Stuebner <heiko.stuebner(a)vrull.eu>
Reviewed-by: Conor Dooley <conor.dooley(a)microchip.com>
Reviewed-by: Andrew Jones <ajones(a)ventanamicro.com>
Tested-by: Conor Dooley <conor.dooley(a)microchip.com>
Link: https://lore.kernel.org/r/20230208225328.1636017-2-heiko@sntech.de
Cc: stable(a)vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer(a)rivosinc.com>
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 420228e219f7..8400f0cc9704 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -185,9 +185,9 @@ arch_initcall(riscv_cpuinfo_init);
* New entries to this struct should follow the ordering rules described above.
*/
static struct riscv_isa_ext_data isa_ext_arr[] = {
- __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
+ __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 85636167e3206c3fbd52254fc432991cc4e90194
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16782054308810(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
85636167e320 ("drm/i915: Don't use BAR mappings for ring buffers with LLC")
fa85bfd19c26 ("drm/i915: Update the helper to set correct mapping")
e09e903a6e89 ("drm/i915/selftests: Prepare execlists and lrc selftests for obj->mm.lock removal")
17b7ab92bec3 ("drm/i915/selftests: Prepare hangcheck for obj->mm.lock removal")
d3ad29567d4e ("drm/i915/selftests: Prepare context selftest for obj->mm.lock removal")
c858ffa17716 ("drm/i915: Lock ww in ucode objects correctly")
c05258889ed4 ("drm/i915: Add igt_spinner_pin() to allow for ww locking around spinner.")
6895649bf13f ("drm/i915/selftests: Set error returns")
a0d3fdb628b8 ("drm/i915/gt: Split logical ring contexts from execlist submission")
d0d829e56674 ("drm/i915: split gen8+ flush and bb_start emission functions")
70a2b431c364 ("drm/i915/gt: Rename lrc.c to execlists_submission.c")
d33fcd798cb7 ("drm/i915/gt: Ignore dt==0 for reporting underflows")
09212e81e545 ("drm/i915/gt: Flush xcs before tgl breadcrumbs")
c10f6019d0b2 ("drm/i915/gt: Use the local HWSP offset during submission")
89db95377be4 ("drm/i915/gt: Confirm the context survives execution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85636167e3206c3fbd52254fc432991cc4e90194 Mon Sep 17 00:00:00 2001
From: John Harrison <John.C.Harrison(a)Intel.com>
Date: Wed, 15 Feb 2023 17:11:01 -0800
Subject: [PATCH] drm/i915: Don't use BAR mappings for ring buffers with LLC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Direction from hardware is that ring buffers should never be mapped
via the BAR on systems with LLC. There are too many caching pitfalls
due to the way BAR accesses are routed. So it is safest to just not
use it.
Signed-off-by: John Harrison <John.C.Harrison(a)Intel.com>
Fixes: 9d80841ea4c9 ("drm/i915: Allow ringbuffers to be bound anywhere")
Cc: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen(a)linux.intel.com>
Cc: Jani Nikula <jani.nikula(a)linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)linux.intel.com>
Cc: intel-gfx(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v4.9+
Tested-by: Jouni Högander <jouni.hogander(a)intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230216011101.1909009-3-John…
(cherry picked from commit 65c08339db1ada87afd6cfe7db8e60bb4851d919)
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c
index fb1d2595392e..fb99143be98e 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring.c
@@ -53,7 +53,7 @@ int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww)
if (unlikely(ret))
goto err_unpin;
- if (i915_vma_is_map_and_fenceable(vma)) {
+ if (i915_vma_is_map_and_fenceable(vma) && !HAS_LLC(vma->vm->i915)) {
addr = (void __force *)i915_vma_pin_iomap(vma);
} else {
int type = i915_coherent_map_type(vma->vm->i915, vma->obj, false);
@@ -98,7 +98,7 @@ void intel_ring_unpin(struct intel_ring *ring)
return;
i915_vma_unset_ggtt_write(vma);
- if (i915_vma_is_map_and_fenceable(vma))
+ if (i915_vma_is_map_and_fenceable(vma) && !HAS_LLC(vma->vm->i915))
i915_vma_unpin_iomap(vma);
else
i915_gem_object_unpin_map(vma->obj);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167820515930221(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
046eca5018f8 ("vfio/type1: prevent underflow of locked_vm via exec()")
4ab4fcfce5b5 ("vfio/type1: fix vaddr_get_pfns() return in vfio_pin_page_external()")
4b6c33b32296 ("vfio/type1: Prepare for batched pinning with struct vfio_batch")
be16c1fd99f4 ("vfio/type1: Change success value of vaddr_get_pfn()")
aae7a75a821a ("vfio/type1: Add proper error unwind for vfio_iommu_replay()")
64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
bce617edecad ("mm: do page fault accounting in handle_mm_fault")
ed03d924587e ("mm/gup: use a standard migration target allocation callback")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
d92bbc2719bd ("mm/hugetlb: unify migration callbacks")
b4b382238ed2 ("mm/migrate: move migration helper from .h to .c")
c7073bab5772 ("mm/page_isolation: prefer the node of the source page")
3e4e28c5a8f0 ("mmap locking API: convert mmap_sem API comments")
d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites")
ca5999fde0a1 ("mm: introduce include/linux/pgtable.h")
420c2091b65a ("mm/gup: introduce pin_user_pages_locked()")
5a36f0f3f518 ("Merge tag 'vfio-v5.8-rc1' of git://github.com/awilliam/linux-vfio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare(a)oracle.com>
Date: Tue, 31 Jan 2023 08:58:04 -0800
Subject: [PATCH] vfio/type1: prevent underflow of locked_vm via exec()
When a vfio container is preserved across exec, the task does not change,
but it gets a new mm with locked_vm=0, and loses the count from existing
dma mappings. If the user later unmaps a dma mapping, locked_vm underflows
to a large unsigned value, and a subsequent dma map request fails with
ENOMEM in __account_locked_vm.
To avoid underflow, grab and save the mm at the time a dma is mapped.
Use that mm when adjusting locked_vm, rather than re-acquiring the saved
task's mm, which may have changed. If the saved mm is dead, do nothing.
locked_vm is incremented for existing mappings in a subsequent patch.
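For illustration only, and not part of the patch: a minimal sketch of the mm
reference pattern the change relies on, with hypothetical names (dma_sketch,
sketch_*).  Only mmgrab()/mmget_not_zero()/mmput()/mmdrop() are the real
kernel API: mmgrab()/mmdrop() keep the mm_struct itself alive so the saved
pointer stays valid, while mmget_not_zero()/mmput() additionally check that
the address space has not already been torn down before using it.

    #include <linux/errno.h>
    #include <linux/sched.h>
    #include <linux/sched/mm.h>

    struct dma_sketch {
            struct mm_struct *mm;
    };

    static void sketch_save_mm(struct dma_sketch *dma)
    {
            dma->mm = current->mm;
            mmgrab(dma->mm);                /* pin the struct for the mapping's lifetime */
    }

    static int sketch_account(struct dma_sketch *dma)
    {
            if (!mmget_not_zero(dma->mm))   /* mm already dead: do nothing */
                    return -ESRCH;
            /* ... adjust locked_vm under the mmap lock ... */
            mmput(dma->mm);
            return 0;
    }

    static void sketch_release(struct dma_sketch *dma)
    {
            mmdrop(dma->mm);                /* pairs with mmgrab() at map time */
    }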
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Steve Sistare <steven.sistare(a)oracle.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/1675184289-267876-3-git-send-email-steven.sistare…
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 144f5bb20fb8..6b757d035457 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -100,6 +100,7 @@ struct vfio_dma {
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
+ struct mm_struct *mm;
};
struct vfio_batch {
@@ -420,8 +421,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!npage)
return 0;
- mm = async ? get_task_mm(dma->task) : dma->task->mm;
- if (!mm)
+ mm = dma->mm;
+ if (async && !mmget_not_zero(mm))
return -ESRCH; /* process exited */
ret = mmap_write_lock_killable(mm);
@@ -794,8 +795,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
struct mm_struct *mm;
int ret;
- mm = get_task_mm(dma->task);
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -ENODEV;
ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
@@ -805,7 +806,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
ret = 0;
if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
- ret = vfio_lock_acct(dma, 1, true);
+ ret = vfio_lock_acct(dma, 1, false);
if (ret) {
put_pfn(*pfn_base, dma->prot);
if (ret == -ENOMEM)
@@ -1180,6 +1181,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
+ mmdrop(dma->mm);
vfio_dma_bitmap_free(dma);
if (dma->vaddr_invalid) {
iommu->vaddr_invalid_count--;
@@ -1664,29 +1666,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
* against the locked memory limit and we need to be able to do both
* outside of this call path as pinning can be asynchronous via the
* external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
- * task_struct and VM locked pages requires an mm_struct, however
- * holding an indefinite mm reference is not recommended, therefore we
- * only hold a reference to a task. We could hold a reference to
- * current, however QEMU uses this call path through vCPU threads,
- * which can be killed resulting in a NULL mm and failure in the unmap
- * path when called via a different thread. Avoid this problem by
- * using the group_leader as threads within the same group require
- * both CLONE_THREAD and CLONE_VM and will therefore use the same
- * mm_struct.
- *
- * Previously we also used the task for testing CAP_IPC_LOCK at the
- * time of pinning and accounting, however has_capability() makes use
- * of real_cred, a copy-on-write field, so we can't guarantee that it
- * matches group_leader, or in fact that it might not change by the
- * time it's evaluated. If a process were to call MAP_DMA with
- * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
- * possibly see different results for an iommu_mapped vfio_dma vs
- * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
- * time of calling MAP_DMA.
+ * task_struct. Save the group_leader so that all DMA tracking uses
+ * the same task, to make debugging easier. VM locked pages requires
+ * an mm_struct, so grab the mm in case the task dies.
*/
get_task_struct(current->group_leader);
dma->task = current->group_leader;
dma->lock_cap = capable(CAP_IPC_LOCK);
+ dma->mm = current->mm;
+ mmgrab(dma->mm);
dma->pfn_list = RB_ROOT;
@@ -3122,9 +3110,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
!(dma->prot & IOMMU_READ))
return -EPERM;
- mm = get_task_mm(dma->task);
-
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -EPERM;
if (kthread)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167820515320650(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
046eca5018f8 ("vfio/type1: prevent underflow of locked_vm via exec()")
4ab4fcfce5b5 ("vfio/type1: fix vaddr_get_pfns() return in vfio_pin_page_external()")
4b6c33b32296 ("vfio/type1: Prepare for batched pinning with struct vfio_batch")
be16c1fd99f4 ("vfio/type1: Change success value of vaddr_get_pfn()")
aae7a75a821a ("vfio/type1: Add proper error unwind for vfio_iommu_replay()")
64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
bce617edecad ("mm: do page fault accounting in handle_mm_fault")
ed03d924587e ("mm/gup: use a standard migration target allocation callback")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
d92bbc2719bd ("mm/hugetlb: unify migration callbacks")
b4b382238ed2 ("mm/migrate: move migration helper from .h to .c")
c7073bab5772 ("mm/page_isolation: prefer the node of the source page")
3e4e28c5a8f0 ("mmap locking API: convert mmap_sem API comments")
d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites")
ca5999fde0a1 ("mm: introduce include/linux/pgtable.h")
420c2091b65a ("mm/gup: introduce pin_user_pages_locked()")
5a36f0f3f518 ("Merge tag 'vfio-v5.8-rc1' of git://github.com/awilliam/linux-vfio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare(a)oracle.com>
Date: Tue, 31 Jan 2023 08:58:04 -0800
Subject: [PATCH] vfio/type1: prevent underflow of locked_vm via exec()
When a vfio container is preserved across exec, the task does not change,
but it gets a new mm with locked_vm=0, and loses the count from existing
dma mappings. If the user later unmaps a dma mapping, locked_vm underflows
to a large unsigned value, and a subsequent dma map request fails with
ENOMEM in __account_locked_vm.
To avoid underflow, grab and save the mm at the time a dma is mapped.
Use that mm when adjusting locked_vm, rather than re-acquiring the saved
task's mm, which may have changed. If the saved mm is dead, do nothing.
locked_vm is incremented for existing mappings in a subsequent patch.
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Steve Sistare <steven.sistare(a)oracle.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/1675184289-267876-3-git-send-email-steven.sistare…
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 144f5bb20fb8..6b757d035457 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -100,6 +100,7 @@ struct vfio_dma {
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
+ struct mm_struct *mm;
};
struct vfio_batch {
@@ -420,8 +421,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!npage)
return 0;
- mm = async ? get_task_mm(dma->task) : dma->task->mm;
- if (!mm)
+ mm = dma->mm;
+ if (async && !mmget_not_zero(mm))
return -ESRCH; /* process exited */
ret = mmap_write_lock_killable(mm);
@@ -794,8 +795,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
struct mm_struct *mm;
int ret;
- mm = get_task_mm(dma->task);
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -ENODEV;
ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
@@ -805,7 +806,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
ret = 0;
if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
- ret = vfio_lock_acct(dma, 1, true);
+ ret = vfio_lock_acct(dma, 1, false);
if (ret) {
put_pfn(*pfn_base, dma->prot);
if (ret == -ENOMEM)
@@ -1180,6 +1181,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
+ mmdrop(dma->mm);
vfio_dma_bitmap_free(dma);
if (dma->vaddr_invalid) {
iommu->vaddr_invalid_count--;
@@ -1664,29 +1666,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
* against the locked memory limit and we need to be able to do both
* outside of this call path as pinning can be asynchronous via the
* external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
- * task_struct and VM locked pages requires an mm_struct, however
- * holding an indefinite mm reference is not recommended, therefore we
- * only hold a reference to a task. We could hold a reference to
- * current, however QEMU uses this call path through vCPU threads,
- * which can be killed resulting in a NULL mm and failure in the unmap
- * path when called via a different thread. Avoid this problem by
- * using the group_leader as threads within the same group require
- * both CLONE_THREAD and CLONE_VM and will therefore use the same
- * mm_struct.
- *
- * Previously we also used the task for testing CAP_IPC_LOCK at the
- * time of pinning and accounting, however has_capability() makes use
- * of real_cred, a copy-on-write field, so we can't guarantee that it
- * matches group_leader, or in fact that it might not change by the
- * time it's evaluated. If a process were to call MAP_DMA with
- * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
- * possibly see different results for an iommu_mapped vfio_dma vs
- * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
- * time of calling MAP_DMA.
+ * task_struct. Save the group_leader so that all DMA tracking uses
+ * the same task, to make debugging easier. VM locked pages requires
+ * an mm_struct, so grab the mm in case the task dies.
*/
get_task_struct(current->group_leader);
dma->task = current->group_leader;
dma->lock_cap = capable(CAP_IPC_LOCK);
+ dma->mm = current->mm;
+ mmgrab(dma->mm);
dma->pfn_list = RB_ROOT;
@@ -3122,9 +3110,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
!(dma->prot & IOMMU_READ))
return -EPERM;
- mm = get_task_mm(dma->task);
-
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -EPERM;
if (kthread)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167820515210204(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
046eca5018f8 ("vfio/type1: prevent underflow of locked_vm via exec()")
4ab4fcfce5b5 ("vfio/type1: fix vaddr_get_pfns() return in vfio_pin_page_external()")
4b6c33b32296 ("vfio/type1: Prepare for batched pinning with struct vfio_batch")
be16c1fd99f4 ("vfio/type1: Change success value of vaddr_get_pfn()")
aae7a75a821a ("vfio/type1: Add proper error unwind for vfio_iommu_replay()")
64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
bce617edecad ("mm: do page fault accounting in handle_mm_fault")
ed03d924587e ("mm/gup: use a standard migration target allocation callback")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
d92bbc2719bd ("mm/hugetlb: unify migration callbacks")
b4b382238ed2 ("mm/migrate: move migration helper from .h to .c")
c7073bab5772 ("mm/page_isolation: prefer the node of the source page")
3e4e28c5a8f0 ("mmap locking API: convert mmap_sem API comments")
d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites")
ca5999fde0a1 ("mm: introduce include/linux/pgtable.h")
420c2091b65a ("mm/gup: introduce pin_user_pages_locked()")
5a36f0f3f518 ("Merge tag 'vfio-v5.8-rc1' of git://github.com/awilliam/linux-vfio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare(a)oracle.com>
Date: Tue, 31 Jan 2023 08:58:04 -0800
Subject: [PATCH] vfio/type1: prevent underflow of locked_vm via exec()
When a vfio container is preserved across exec, the task does not change,
but it gets a new mm with locked_vm=0, and loses the count from existing
dma mappings. If the user later unmaps a dma mapping, locked_vm underflows
to a large unsigned value, and a subsequent dma map request fails with
ENOMEM in __account_locked_vm.
To avoid underflow, grab and save the mm at the time a dma is mapped.
Use that mm when adjusting locked_vm, rather than re-acquiring the saved
task's mm, which may have changed. If the saved mm is dead, do nothing.
locked_vm is incremented for existing mappings in a subsequent patch.
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Steve Sistare <steven.sistare(a)oracle.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/1675184289-267876-3-git-send-email-steven.sistare…
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 144f5bb20fb8..6b757d035457 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -100,6 +100,7 @@ struct vfio_dma {
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
+ struct mm_struct *mm;
};
struct vfio_batch {
@@ -420,8 +421,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!npage)
return 0;
- mm = async ? get_task_mm(dma->task) : dma->task->mm;
- if (!mm)
+ mm = dma->mm;
+ if (async && !mmget_not_zero(mm))
return -ESRCH; /* process exited */
ret = mmap_write_lock_killable(mm);
@@ -794,8 +795,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
struct mm_struct *mm;
int ret;
- mm = get_task_mm(dma->task);
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -ENODEV;
ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
@@ -805,7 +806,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
ret = 0;
if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
- ret = vfio_lock_acct(dma, 1, true);
+ ret = vfio_lock_acct(dma, 1, false);
if (ret) {
put_pfn(*pfn_base, dma->prot);
if (ret == -ENOMEM)
@@ -1180,6 +1181,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
+ mmdrop(dma->mm);
vfio_dma_bitmap_free(dma);
if (dma->vaddr_invalid) {
iommu->vaddr_invalid_count--;
@@ -1664,29 +1666,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
* against the locked memory limit and we need to be able to do both
* outside of this call path as pinning can be asynchronous via the
* external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
- * task_struct and VM locked pages requires an mm_struct, however
- * holding an indefinite mm reference is not recommended, therefore we
- * only hold a reference to a task. We could hold a reference to
- * current, however QEMU uses this call path through vCPU threads,
- * which can be killed resulting in a NULL mm and failure in the unmap
- * path when called via a different thread. Avoid this problem by
- * using the group_leader as threads within the same group require
- * both CLONE_THREAD and CLONE_VM and will therefore use the same
- * mm_struct.
- *
- * Previously we also used the task for testing CAP_IPC_LOCK at the
- * time of pinning and accounting, however has_capability() makes use
- * of real_cred, a copy-on-write field, so we can't guarantee that it
- * matches group_leader, or in fact that it might not change by the
- * time it's evaluated. If a process were to call MAP_DMA with
- * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
- * possibly see different results for an iommu_mapped vfio_dma vs
- * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
- * time of calling MAP_DMA.
+ * task_struct. Save the group_leader so that all DMA tracking uses
+ * the same task, to make debugging easier. VM locked pages requires
+ * an mm_struct, so grab the mm in case the task dies.
*/
get_task_struct(current->group_leader);
dma->task = current->group_leader;
dma->lock_cap = capable(CAP_IPC_LOCK);
+ dma->mm = current->mm;
+ mmgrab(dma->mm);
dma->pfn_list = RB_ROOT;
@@ -3122,9 +3110,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
!(dma->prot & IOMMU_READ))
return -EPERM;
- mm = get_task_mm(dma->task);
-
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -EPERM;
if (kthread)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ef3a3f6a294ba65fd906a291553935881796f8a5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678205132164221(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
ef3a3f6a294b ("vfio/type1: exclude mdevs from VFIO_UPDATE_VADDR")
296e505baddf ("vfio/iommu_type1: remove the "external" domain")
65cdbf106337 ("vfio/iommu_type1: initialize pgsize_bitmap in ->open")
c3c0fa9d94f7 ("vfio: clean up the check for mediated device in vfio_iommu_type1")
fda49d97f2c4 ("vfio: remove the unused mdev iommu hook")
8cc02d22d7e1 ("vfio: move the vfio_iommu_driver_ops interface out of <linux/vfio.h>")
67462037872d ("vfio: remove unused method from vfio_iommu_driver_ops")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ef3a3f6a294ba65fd906a291553935881796f8a5 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare(a)oracle.com>
Date: Tue, 31 Jan 2023 08:58:03 -0800
Subject: [PATCH] vfio/type1: exclude mdevs from VFIO_UPDATE_VADDR
Disable the VFIO_UPDATE_VADDR capability if mediated devices are present.
Their kernel threads could be blocked indefinitely by a misbehaving
userland while trying to pin/unpin pages while vaddrs are being updated.
Do not allow groups to be added to the container while vaddr's are invalid,
so we never need to block user threads from pinning, and can delete the
vaddr-waiting code in a subsequent patch.
Fixes: c3cbab24db38 ("vfio/type1: implement interfaces to update vaddr")
Cc: stable(a)vger.kernel.org
Signed-off-by: Steve Sistare <steven.sistare(a)oracle.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/1675184289-267876-2-git-send-email-steven.sistare…
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 23c24fe98c00..144f5bb20fb8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -861,6 +861,12 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
mutex_lock(&iommu->lock);
+ if (WARN_ONCE(iommu->vaddr_invalid_count,
+ "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
+ ret = -EBUSY;
+ goto pin_done;
+ }
+
/*
* Wait for all necessary vaddr's to be valid so they can be used in
* the main loop without dropping the lock, to avoid racing vs unmap.
@@ -1343,6 +1349,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
mutex_lock(&iommu->lock);
+ /* Cannot update vaddr if mdev is present. */
+ if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
pgshift = __ffs(iommu->pgsize_bitmap);
pgsize = (size_t)1 << pgshift;
@@ -2185,11 +2197,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_domain_geometry *geo;
LIST_HEAD(iova_copy);
LIST_HEAD(group_resv_regions);
- int ret = -EINVAL;
+ int ret = -EBUSY;
mutex_lock(&iommu->lock);
+ /* Attach could require pinning, so disallow while vaddr is invalid. */
+ if (iommu->vaddr_invalid_count)
+ goto out_unlock;
+
/* Check for duplicates */
+ ret = -EINVAL;
if (vfio_iommu_find_iommu_group(iommu, iommu_group))
goto out_unlock;
@@ -2660,6 +2677,16 @@ static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
return ret;
}
+static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
+{
+ bool ret;
+
+ mutex_lock(&iommu->lock);
+ ret = !list_empty(&iommu->emulated_iommu_groups);
+ mutex_unlock(&iommu->lock);
+ return ret;
+}
+
static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
unsigned long arg)
{
@@ -2668,8 +2695,13 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
case VFIO_TYPE1v2_IOMMU:
case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
- case VFIO_UPDATE_VADDR:
return 1;
+ case VFIO_UPDATE_VADDR:
+ /*
+ * Disable this feature if mdevs are present. They cannot
+ * safely pin/unpin/rw while vaddrs are being updated.
+ */
+ return iommu && !vfio_iommu_has_emulated(iommu);
case VFIO_DMA_CC_IOMMU:
if (!iommu)
return 0;
@@ -3138,6 +3170,13 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
size_t done;
mutex_lock(&iommu->lock);
+
+ if (WARN_ONCE(iommu->vaddr_invalid_count,
+ "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
+ ret = -EBUSY;
+ goto out;
+ }
+
while (count > 0) {
ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
count, write, &done);
@@ -3149,6 +3188,7 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
user_iova += done;
}
+out:
mutex_unlock(&iommu->lock);
return ret;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 23105eb036fa..0552e8dcf0cb 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -49,7 +49,11 @@
/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
#define VFIO_UNMAP_ALL 9
-/* Supports the vaddr flag for DMA map and unmap */
+/*
+ * Supports the vaddr flag for DMA map and unmap. Not supported for mediated
+ * devices, so this capability is subject to change as groups are added or
+ * removed.
+ */
#define VFIO_UPDATE_VADDR 10
/*
@@ -1343,8 +1347,7 @@ struct vfio_iommu_type1_info_dma_avail {
* Map process virtual addresses to IO virtual addresses using the
* provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
*
- * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
- * unblock translation of host virtual addresses in the iova range. The vaddr
+ * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova. The vaddr
* must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To
* maintain memory consistency within the user application, the updated vaddr
* must address the same memory object as originally mapped. Failure to do so
@@ -1395,9 +1398,9 @@ struct vfio_bitmap {
* must be 0. This cannot be combined with the get-dirty-bitmap flag.
*
* If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
- * virtual addresses in the iova range. Tasks that attempt to translate an
- * iova's vaddr will block. DMA to already-mapped pages continues. This
- * cannot be combined with the get-dirty-bitmap flag.
+ * virtual addresses in the iova range. DMA to already-mapped pages continues.
+ * Groups may not be added to the container while any addresses are invalid.
+ * This cannot be combined with the get-dirty-bitmap flag.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;