Extend kvm_s390_shadow_fault to return the pointer to the valid leaf
DAT table entry, or to the invalid entry.
Also return some flags in the lower bits of the address:
PEI_DAT_PROT: indicates that DAT protection applies because of the
protection bit in the segment (or, if EDAT, region) tables.
PEI_NOT_PTE: indicates that the address of the DAT table entry returned
does not refer to a PTE, but to a segment or region table.
Signed-off-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Janosch Frank <frankja(a)de.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
---
arch/s390/kvm/gaccess.c | 30 +++++++++++++++++++++++++-----
arch/s390/kvm/gaccess.h | 6 +++++-
arch/s390/kvm/vsie.c | 8 ++++----
3 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 6d6b57059493..33a03e518640 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@@ -1060,6 +1063,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@@ -1087,6 +1091,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@@ -1123,6 +1128,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@@ -1157,6 +1163,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (*datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 107fdfd2eadd..df4014f09e26 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -378,7 +378,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index bd803e091918..78b604326016 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -620,10 +620,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -914,7 +914,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@@ -936,7 +936,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
--
2.26.2
Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.
Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() with the
following instructions:
1. Compile the following program "repro.c" via "make repro"
> cat repro.c
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
/* open_tree() */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif
#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif
#ifndef __NR_open_tree
#if defined __alpha__
#define __NR_open_tree 538
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_open_tree 4428
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_open_tree 6428
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_open_tree 5428
#endif
#elif defined __ia64__
#define __NR_open_tree (428 + 1024)
#else
#define __NR_open_tree 428
#endif
#endif
/* move_mount() */
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif
#ifndef __NR_move_mount
#if defined __alpha__
#define __NR_move_mount 539
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_move_mount 4429
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_move_mount 6429
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
#define __NR_move_mount (428 + 1024)
#else
#define __NR_move_mount 429
#endif
#endif
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
return syscall(__NR_open_tree, dfd, filename, flags);
}
static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
const char *to_pathname, unsigned int flags)
{
return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
}
static void usage(void)
{
const char *text = "mount-new [--recursive] <source> <target>\n";
fprintf(stderr, "%s", text);
_exit(EXIT_SUCCESS);
}
#define exit_usage(format, ...) \
({ \
fprintf(stderr, format, ##__VA_ARGS__); \
usage(); \
})
#define exit_log(format, ...) \
({ \
fprintf(stderr, format, ##__VA_ARGS__); \
exit(EXIT_FAILURE); \
})
static const struct option longopts[] = {
{"recursive", no_argument, 0, 'a'},
{"help", no_argument, 0, 'b'},
{ NULL, no_argument, 0, 0 },
};
int main(int argc, char *argv[])
{
int index = 0;
bool recursive = false;
int fd_tree, new_argc, ret;
char *source, *target;
char *const *new_argv;
while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
switch (ret) {
case 'a':
recursive = true;
break;
case 'b':
/* fallthrough */
default:
usage();
}
}
new_argv = &argv[optind];
new_argc = argc - optind;
if (new_argc < 2)
exit_usage("Missing source or target mountpoint\n\n");
source = new_argv[0];
target = new_argv[1];
fd_tree = sys_open_tree(-EBADF, source,
OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC |
AT_EMPTY_PATH |
(recursive ? AT_RECURSIVE : 0));
if (fd_tree < 0) {
exit_log("%m - Failed to open %s\n", source);
exit(EXIT_FAILURE);
}
ret = sys_move_mount(fd_tree, "", -EBADF, target, MOVE_MOUNT_F_EMPTY_PATH);
if (ret < 0)
exit_log("%m - Failed to attach mount to %s\n", target);
close(fd_tree);
exit(EXIT_SUCCESS);
}
2. Run a loop like:
while true; do sudo ./repro; done
and wait for the kernel to refuse any new mounts by returning ENOSPC.
Depending on the number of mounts you have on the system this might take
shorter or longer.
The root cause of this is that detached mounts aren't handled correctly
when the destination mount point is MS_SHARED causing a borked mount
tree and ultimately to a miscalculation of the number of mounts in the
mount namespace.
Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calles into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e. is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
For new mounts and bind-mounts propagate_one() knows to skip them by
realizing that there's no mount namespace attached to them.
However, detached mounts have an anonymous mount namespace attached to
them and so they aren't skipped causing the mount table to get wonky and
ultimately preventing any new mounts via the ENOSPC issue.
So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above verifying the counting of
the mounts. With this fix the counts are correct and the ENOSPC issue
can't be reproduced.
Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: David Howells <dhowells(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: linux-fsdevel(a)vger.kernel.org
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Christian Brauner <christian.brauner(a)ubuntu.com>
---
fs/pnode.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/pnode.h b/fs/pnode.h
index 26f74e092bd9..988f1aa9b02a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns)
+#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
base-commit: f69d02e37a85645aa90d18cacfff36dba370f797
--
2.27.0