 
Introduce the mm_insufficient_vma_slots tracepoint, emitted via trace_mm_insufficient_vma_slots(), to improve observability of VMA allocation failures.
This event fires when an operation is about to fail because it requires more VMA slots than are currently available, according to the sysctl_max_map_count limit. This is a preemptive check that occurs in call paths like mmap(), mremap(), and split_vma() before they attempt to create new VMAs.
This tracepoint can be used with event-driven telemetry, such as BPF programs, to collect data from devices in the field with minimal overhead.
The tracepoint captures the mm_struct pointer and the mm's vma_count at the time of failure. Observing the distribution of these events helps determine whether the failures point to genuine bugs or whether an increase to the limit is warranted.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
---
Changes in v4:
  - Update commit description to accurately reflect the trace event's parameters.

Changes in v3:
  - Capture the mm pointer as the unique identifier, and capture the vma_count as well, instead of the current task tgid, per Steve
  - Add include/trace/events/vma.h to the MEMORY MAPPING section in MAINTAINERS, per Lorenzo
  - Rename trace_max_vma_count_exceeded() to trace_mm_insufficient_vma_slots(), since this is a preemptive check, per Lorenzo
  - Fix tools/testing/vma build errors, per Lorenzo
 MAINTAINERS                      |  1 +
 include/trace/events/vma.h       | 32 ++++++++++++++++++++++++++++++++
 mm/mmap.c                        |  5 ++++-
 mm/mremap.c                      | 10 ++++++++--
 mm/vma.c                         |  4 +++-
 mm/vma_internal.h                |  2 ++
 tools/testing/vma/vma_internal.h |  5 +++++
 7 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 include/trace/events/vma.h
diff --git a/MAINTAINERS b/MAINTAINERS index 66f7ca5b01ad..223124cb7d21 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16567,6 +16567,7 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: include/trace/events/vma.h F: mm/interval_tree.c F: mm/mincore.c F: mm/mlock.c diff --git a/include/trace/events/vma.h b/include/trace/events/vma.h new file mode 100644 index 000000000000..4540fa607f66 --- /dev/null +++ b/include/trace/events/vma.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vma + +#if !defined(_TRACE_VMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMA_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(mm_insufficient_vma_slots, + + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(void *, mm) + __field(int, vma_count) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->vma_count = mm->vma_count; + ), + + TP_printk("mm=%p vma_count=%d", __entry->mm, __entry->vma_count) +); + +#endif /* _TRACE_VMA_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/mmap.c b/mm/mmap.c index 647a676c0ab4..3ebe9d5f7dfe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -56,6 +56,7 @@
#define CREATE_TRACE_POINTS #include <trace/events/mmap.h> +#include <trace/events/vma.h>
#include "internal.h"
@@ -383,8 +384,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * sysctl_max_map_count limit by one. This behavior is preserved to * avoid breaking existing applications. */ - if (max_vma_count() - mm->vma_count < 0) + if (max_vma_count() - mm->vma_count < 0) { + trace_mm_insufficient_vma_slots(mm); return -ENOMEM; + }
/* * addr is returned from get_unmapped_area, diff --git a/mm/mremap.c b/mm/mremap.c index 4874729cd65c..dfb481c5bfb1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -30,6 +30,8 @@ #include <asm/cacheflush.h> #include <asm/tlb.h>
+#include <trace/events/vma.h> + #include "internal.h"
/* Classify the kind of remap operation being performed. */ @@ -1040,8 +1042,10 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ - if (max_vma_count() - current->mm->vma_count < 4) + if (max_vma_count() - current->mm->vma_count < 4) { + trace_mm_insufficient_vma_slots(current->mm); return -ENOMEM; + }
if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) @@ -1814,8 +1818,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * the threshold. In other words, is the current map count + 6 at or * below the threshold? Otherwise return -ENOMEM here to be more safe. */ - if (max_vma_count() - current->mm->vma_count < 6) + if (max_vma_count() - current->mm->vma_count < 6) { + trace_mm_insufficient_vma_slots(current->mm); return -ENOMEM; + }
return 0; } diff --git a/mm/vma.c b/mm/vma.c index fbb8d1a0449d..2c35c3d008bc 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -594,8 +594,10 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (max_vma_count() - vma->vm_mm->vma_count < 1) + if (max_vma_count() - vma->vm_mm->vma_count < 1) { + trace_mm_insufficient_vma_slots(vma->vm_mm); return -ENOMEM; + }
return __split_vma(vmi, vma, addr, new_below); } diff --git a/mm/vma_internal.h b/mm/vma_internal.h index 2f05735ff190..86823ca6857b 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -52,4 +52,6 @@
#include "internal.h"
+#include <trace/events/vma.h> + #endif /* __MM_VMA_INTERNAL_H */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index d89b26e81679..0fdde2eb5a57 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1497,4 +1497,9 @@ static int max_vma_count(void) return sysctl_max_map_count; }
+/* Stub for trace_mm_insufficient_vma_slots */ +static inline void trace_mm_insufficient_vma_slots(struct mm_struct *mm) +{ +} + #endif /* __MM_VMA_INTERNAL_H */