[ Replying to show this patch that got dropped from the mailing list ]
On Sat, 30 Sep 2023 16:32:16 -0400
Steven Rostedt <rostedt(a)goodmis.org> wrote:
> From: Beau Belgrave <beaub(a)linux.microsoft.com>
>
> All architectures require that the address passed to set_bit() be long
> aligned. On a 64-bit kernel, user processes can register either a
> 32-bit or a 64-bit value to be updated when tracing is enabled. Both
> cases are guaranteed to be naturally aligned, but that is not enough:
> the address must be long aligned, without affecting the checks the user
> process performs on the value, which requires different bit adjustments
> for little and big endian CPUs.
>
> Add a compat flag to user_event_enabler that indicates when a 32-bit
> value is being used on a 64-bit kernel. Long align addresses and correct
> the bit to be used by set_bit() to account for this alignment. Ensure
> compat flags are copied during forks and used during deletion clears.
>
> Link: https://lore.kernel.org/linux-trace-kernel/20230925230829.341-2-beaub@linux…
> Link: https://lore.kernel.org/linux-trace-kernel/20230914131102.179100-1-cleger@r…
>
> Cc: stable(a)vger.kernel.org
> Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement")
> Reported-by: Clément Léger <cleger(a)rivosinc.com>
> Suggested-by: Clément Léger <cleger(a)rivosinc.com>
> Signed-off-by: Beau Belgrave <beaub(a)linux.microsoft.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
> ---
> kernel/trace/trace_events_user.c | 58 ++++++++++++++++++++++++++++----
> 1 file changed, 51 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 6f046650e527..b87f41187c6a 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -127,8 +127,13 @@ struct user_event_enabler {
> /* Bit 7 is for freeing status of enablement */
> #define ENABLE_VAL_FREEING_BIT 7
>
> -/* Only duplicate the bit value */
> -#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
> +/* Bit 8 is for marking 32-bit on 64-bit */
> +#define ENABLE_VAL_32_ON_64_BIT 8
> +
> +#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
> +
> +/* Only duplicate the bit and compat values */
> +#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
>
> #define ENABLE_BITOPS(e) (&(e)->values)
>
> @@ -174,6 +179,30 @@ struct user_event_validator {
> int flags;
> };
>
> +static inline void align_addr_bit(unsigned long *addr, int *bit,
> + unsigned long *flags)
> +{
> + if (IS_ALIGNED(*addr, sizeof(long))) {
> +#ifdef __BIG_ENDIAN
> + /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
> + if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
> + *bit += 32;
> +#endif
> + return;
> + }
> +
> + *addr = ALIGN_DOWN(*addr, sizeof(long));
> +
> + /*
> + * We only support 32 and 64 bit values. The only time we need
> + * to align is a 32 bit value on a 64 bit kernel, which on LE
> + * is always 32 bits, and on BE requires no change when unaligned.
> + */
> +#ifdef __LITTLE_ENDIAN
> + *bit += 32;
> +#endif
> +}
> +
> typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
> void *tpdata, bool *faulted);
>
> @@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
> unsigned long *ptr;
> struct page *page;
> void *kaddr;
> + int bit = ENABLE_BIT(enabler);
> int ret;
>
> lockdep_assert_held(&event_mutex);
> @@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
> test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
> return -EBUSY;
>
> + align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
> +
> ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
> &page, NULL);
>
> @@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
>
> /* Update bit atomically, user tracers must be atomic as well */
> if (enabler->event && enabler->event->status)
> - set_bit(ENABLE_BIT(enabler), ptr);
> + set_bit(bit, ptr);
> else
> - clear_bit(ENABLE_BIT(enabler), ptr);
> + clear_bit(bit, ptr);
>
> kunmap_local(kaddr);
> unpin_user_pages_dirty_lock(&page, 1, true);
> @@ -849,6 +881,12 @@ static struct user_event_enabler
> enabler->event = user;
> enabler->addr = uaddr;
> enabler->values = reg->enable_bit;
> +
> +#if BITS_PER_LONG >= 64
> + if (reg->enable_size == 4)
> + set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
> +#endif
> +
> retry:
> /* Prevents state changes from racing with new enablers */
> mutex_lock(&event_mutex);
> @@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
> }
>
> static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
> - unsigned long uaddr, unsigned char bit)
> + unsigned long uaddr, unsigned char bit,
> + unsigned long flags)
> {
> struct user_event_enabler enabler;
> int result;
> @@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
>
> memset(&enabler, 0, sizeof(enabler));
> enabler.addr = uaddr;
> - enabler.values = bit;
> + enabler.values = bit | flags;
> retry:
> /* Prevents state changes from racing with new enablers */
> mutex_lock(&event_mutex);
> @@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> struct user_event_mm *mm = current->user_event_mm;
> struct user_event_enabler *enabler, *next;
> struct user_unreg reg;
> + unsigned long flags;
> long ret;
>
> ret = user_unreg_get(ureg, &reg);
> @@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> if (!mm)
> return -ENOENT;
>
> + flags = 0;
> ret = -ENOENT;
>
> /*
> @@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> ENABLE_BIT(enabler) == reg.disable_bit) {
> set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
>
> + /* We must keep compat flags for the clear */
> + flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
> +
> if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
> user_event_enabler_destroy(enabler, true);
>
> @@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> /* Ensure bit is now cleared for user, regardless of event status */
> if (!ret)
> ret = user_event_mm_clear_bit(mm, reg.disable_addr,
> - reg.disable_bit);
> + reg.disable_bit, flags);
>
> return ret;
> }
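To make the align_addr_bit() adjustment in the patch above concrete, here is a
minimal user-space sketch (an illustration only, not part of the patch; the
printed results assume a little-endian 64-bit host) of which bit of the
containing long a 32-bit enable value occupies, depending on its offset:

#include <stdint.h>
#include <stdio.h>

/* Two naturally aligned 32-bit slots inside one long-aligned 64-bit word,
 * mirroring a u32 enable value at (addr % 8) == 0 and (addr % 8) == 4. */
union split {
	uint64_t word;		/* what a long-aligned set_bit() operates on */
	uint32_t half[2];
};

int main(void)
{
	union split s = { .word = 0 };

	s.half[1] = 1;	/* bit 0 of a u32 at offset 4 (unaligned case) */
	/* On LE this lands in bit 32 of the long, hence "*bit += 32" after
	 * the ALIGN_DOWN() in align_addr_bit(). */
	printf("u32 at offset 4 -> long bit %d\n", __builtin_ctzll(s.word));

	s.word = 0;
	s.half[0] = 1;	/* bit 0 of a u32 at offset 0 (aligned case) */
	/* On LE this stays bit 0; on BE it would be bit 32, matching the
	 * ENABLE_VAL_32_ON_64_BIT adjustment in the aligned branch. */
	printf("u32 at offset 0 -> long bit %d\n", __builtin_ctzll(s.word));
	return 0;
}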
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
It was discovered that the ring buffer polling was incorrectly reporting
that a read would not block, because the polling code did not take into
account that reads block when "buffer_percent" is set. Instead, polling
reported that reads would not block whenever there was any data in the
ring buffer, which was incorrect behavior from a user space point of
view. This was fixed by commit 42fb0a1e84ff by having the polling code
check whether the ring buffer holds more data than the user-specified
"buffer_percent" requires.
The problem now is that the polling code did not register itself to the
writer that it wanted to wait for a specific "full" value of the ring
buffer. The result was that the writer would wake the polling waiter
whenever there was a new event. The polling waiter would then wake up, see
that there's not enough data in the ring buffer to notify user space and
then go back to sleep. The next event would wake it up again.
Before the polling fix was added, the code would wake up around 100 times
for a hackbench 30 benchmark. After the "fix", because the writer kept
waking the polling waiter on every event, it would wake up over 11,000
times! It never left the kernel, so the user space behavior was still
"correct", but this is definitely not the desired effect.
To fix this, have the polling code add what it's waiting for to the
"shortest_full" variable, to tell the writer not to wake it up if the
buffer is not as full as it expects to be.
Note, after this fix, it appears that the waiter is now woken up around
twice as often as it was before (~200 times). This is a tremendous
improvement over the 11,000 times, but I will need to spend some time to
see why polling is more aggressive in its wakeups than the read blocking
code.
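As a rough user-space sketch of the behavior described above (the tracefs
path and the 50% value are assumptions, and error handling is minimal):
raise the wakeup watermark via the buffer_percent file, then poll() a
per-CPU raw buffer; with this fix the poll() sleeps until the watermark is
met instead of being woken on every event.

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
	/* Wake readers/pollers only once the buffer is at least 50% full. */
	int bp = open("/sys/kernel/tracing/buffer_percent", O_WRONLY);
	if (bp >= 0) {
		write(bp, "50", 2);
		close(bp);
	}

	int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
		      O_RDONLY);
	if (fd < 0)
		return 1;

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	poll(&pfd, 1, -1);	/* blocks until the watermark is reached */
	close(fd);
	return 0;
}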
Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschac…
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
Reported-by: Julia Lawall <julia.lawall(a)inria.fr>
Tested-by: Julia Lawall <julia.lawall(a)inria.fr>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28daf0ce95c5..515cafdb18d9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1137,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;
--
2.40.1
For an unknown TX CQE error type (probably from newer hardware), still
free the SKB, update the queue tail, etc.; otherwise the accounting will
be wrong.
Also, TX errors can be triggered by injecting corrupted packets, so
replace the WARN_ONCE with ratelimited error logging, because we don't
need a stack trace here.
Cc: stable(a)vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Haiyang Zhang <haiyangz(a)microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 4a16ebff3d1d..5cdcf7561b38 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1317,19 +1317,23 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
case CQE_TX_VPORT_IDX_OUT_OF_RANGE:
case CQE_TX_VPORT_DISABLED:
case CQE_TX_VLAN_TAGGING_VIOLATION:
- WARN_ONCE(1, "TX: CQE error %d: ignored.\n",
- cqe_oob->cqe_hdr.cqe_type);
+ if (net_ratelimit())
+ netdev_err(ndev, "TX: CQE error %d\n",
+ cqe_oob->cqe_hdr.cqe_type);
+
apc->eth_stats.tx_cqe_err++;
break;
default:
- /* If the CQE type is unexpected, log an error, assert,
- * and go through the error path.
+ /* If the CQE type is unknown, log an error,
+ * and still free the SKB, update tail, etc.
*/
- WARN_ONCE(1, "TX: Unexpected CQE type %d: HW BUG?\n",
- cqe_oob->cqe_hdr.cqe_type);
+ if (net_ratelimit())
+ netdev_err(ndev, "TX: unknown CQE type %d\n",
+ cqe_oob->cqe_hdr.cqe_type);
+
apc->eth_stats.tx_cqe_unknown_type++;
- return;
+ break;
}
if (WARN_ON_ONCE(txq->gdma_txq_id != completions[i].wq_num))
--
2.25.1
It is possible for multiple vCPUs to fault on the same IPA and attempt
to resolve the fault. One of the page table walks will actually update
the PTE and the rest will return -EAGAIN per our race detection scheme.
KVM elides the TLB invalidation on the racing threads as the return
value is nonzero.
Before commit a12ab1378a88 ("KVM: arm64: Use local TLBI on permission
relaxation") KVM always used broadcast TLB invalidations when handling
permission faults, which had the convenient property of making the
stage-2 updates visible to all CPUs in the system. However now we do a
local invalidation, and TLBI elision leads to vCPUs getting stuck in a
permission fault loop. Remember that the architecture permits the TLB to
cache translations that precipitate a permission fault.
Invalidate the TLB entry responsible for the permission fault if the
stage-2 descriptor has been relaxed, regardless of which thread actually
did the job.
Cc: stable(a)vger.kernel.org
Fixes: a12ab1378a88 ("KVM: arm64: Use local TLBI on permission relaxation")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/hyp/pgtable.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f155b8c9e98c..286888751793 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1314,7 +1314,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
- if (!ret)
+ if (!ret || ret == -EAGAIN)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
base-commit: ce9ecca0238b140b88f43859b211c9fdfd8e5b70
--
2.42.0.515.g380fc7ccd1-goog
The quilt patch titled
Subject: Crash: add lock to serialize crash hotplug handling
has been removed from the -mm tree. Its filename was
crash-add-lock-to-serialize-crash-hotplug-handling.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Baoquan He <bhe(a)redhat.com>
Subject: Crash: add lock to serialize crash hotplug handling
Date: Tue, 26 Sep 2023 20:09:05 +0800
Eric reported that handling the corresponding crash hotplug events can
easily fail when many memory hotplug events are notified in a short
period. They fail because kexec_trylock() fails to take __kexec_lock.
=======
[ 78.714569] Fallback order for Node 0: 0
[ 78.714575] Built 1 zonelists, mobility grouping on. Total pages: 1817886
[ 78.717133] Policy zone: Normal
[ 78.724423] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 78.727207] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 80.056643] PEFILE: Unsigned PE binary
=======
The memory hotplug events are notified very quickly and in large
numbers, while the crash hotplug handling is comparatively slow, so the
atomic variable __kexec_lock and kexec_trylock() cannot guarantee
serialization of the crash hotplug handling.
Here, add a new mutex lock __crash_hotplug_lock to serialize crash hotplug
handling specifically. This doesn't impact the usage of __kexec_lock.
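For illustration only (plain pthreads, not the kernel code being patched):
under a burst of events a try-lock handler drops most of them, while a
blocking mutex serializes them all, which is the property the new
__crash_hotplug_lock provides on top of kexec_trylock(). A hedged sketch,
compile with -pthread:

#include <pthread.h>
#include <stdio.h>

#define THREADS	4
#define EVENTS	1000

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int handled;
static int dropped;

static void *notifier(void *arg)
{
	(void)arg;
	for (int i = 0; i < EVENTS; i++) {
		if (pthread_mutex_trylock(&lock)) {
			/* Like kexec_trylock() failing: the event is skipped. */
			__atomic_fetch_add(&dropped, 1, __ATOMIC_RELAXED);
			continue;
		}
		handled++;	/* "update the elfcorehdr" */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[THREADS];

	for (int i = 0; i < THREADS; i++)
		pthread_create(&t[i], NULL, notifier, NULL);
	for (int i = 0; i < THREADS; i++)
		pthread_join(t[i], NULL);

	/* With pthread_mutex_lock() in place of the trylock, dropped stays 0. */
	printf("handled=%d dropped=%d\n", handled, dropped);
	return 0;
}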
Link: https://lkml.kernel.org/r/20230926120905.392903-1-bhe@redhat.com
Fixes: 247262756121 ("crash: add generic infrastructure for crash hotplug support")
Signed-off-by: Baoquan He <bhe(a)redhat.com>
Tested-by: Eric DeVolder <eric.devolder(a)oracle.com>
Reviewed-by: Eric DeVolder <eric.devolder(a)oracle.com>
Reviewed-by: Valentin Schneider <vschneid(a)redhat.com>
Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/crash_core.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
--- a/kernel/crash_core.c~crash-add-lock-to-serialize-crash-hotplug-handling
+++ a/kernel/crash_core.c
@@ -740,6 +740,17 @@ subsys_initcall(crash_notes_memory_init)
#define pr_fmt(fmt) "crash hp: " fmt
/*
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
+/*
* This routine utilized when the crash_hotplug sysfs node is read.
* It reflects the kernel's ability/permission to update the crash
* elfcorehdr directly.
@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void)
{
int rc = 0;
+ crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return 0;
}
if (kexec_crash_image) {
@@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void)
}
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
return rc;
}
@@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(u
{
struct kimage *image;
+ crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return;
}
@@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(u
out:
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
}
static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
_
Patches currently in -mm which might be from bhe(a)redhat.com are
crash_corec-remove-unnecessary-parameter-of-function.patch
crash_core-change-the-prototype-of-function-parse_crashkernel.patch
crash_core-change-parse_crashkernel-to-support-crashkernel=highlow-parsing.patch
crash_core-add-generic-function-to-do-reservation.patch
crash_core-move-crashk_res-definition-into-crash_corec.patch
x86-kdump-use-generic-interface-to-simplify-crashkernel-reservation-code.patch
x86-kdump-use-generic-interface-to-simplify-crashkernel-reservation-code-fix.patch
arm64-kdump-use-generic-interface-to-simplify-crashkernel-reservation.patch
riscv-kdump-use-generic-interface-to-simplify-crashkernel-reservation.patch
crash_corec-remove-unneeded-functions.patch
The quilt patch titled
Subject: selftests/mm: fix awk usage in charge_reserved_hugetlb.sh and hugetlb_reparenting_test.sh that may cause error
has been removed from the -mm tree. Its filename was
selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Juntong Deng <juntong.deng(a)outlook.com>
Subject: selftests/mm: fix awk usage in charge_reserved_hugetlb.sh and hugetlb_reparenting_test.sh that may cause error
Date: Wed, 27 Sep 2023 02:19:44 +0800
According to the awk manual, the -e option does not need to be specified
in front of 'program' (unless you need to mix program-file).
The redundant -e option can cause an error when users use awk tools other
than gawk (for example, mawk does not support the -e option).
Error Example:
awk: not an option: -e
Link: https://lkml.kernel.org/r/VI1P193MB075228810591AF2FDD7D42C599C3A@VI1P193MB0…
Signed-off-by: Juntong Deng <juntong.deng(a)outlook.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++--
tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh~selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error
+++ a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -25,7 +25,7 @@ if [[ "$1" == "-cgroup-v2" ]]; then
fi
if [[ $cgroup2 ]]; then
- cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+ cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}')
if [[ -z "$cgroup_path" ]]; then
cgroup_path=/dev/cgroup/memory
mount -t cgroup2 none $cgroup_path
@@ -33,7 +33,7 @@ if [[ $cgroup2 ]]; then
fi
echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
else
- cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+ cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
if [[ -z "$cgroup_path" ]]; then
cgroup_path=/dev/cgroup/memory
mount -t cgroup memory,hugetlb $cgroup_path
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh~selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error
+++ a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -20,7 +20,7 @@ fi
if [[ $cgroup2 ]]; then
- CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+ CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}')
if [[ -z "$CGROUP_ROOT" ]]; then
CGROUP_ROOT=/dev/cgroup/memory
mount -t cgroup2 none $CGROUP_ROOT
@@ -28,7 +28,7 @@ if [[ $cgroup2 ]]; then
fi
echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
else
- CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+ CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
if [[ -z "$CGROUP_ROOT" ]]; then
CGROUP_ROOT=/dev/cgroup/memory
mount -t cgroup memory,hugetlb $CGROUP_ROOT
_
Patches currently in -mm which might be from juntong.deng(a)outlook.com are
The quilt patch titled
Subject: mm: mempolicy: keep VMA walk if both MPOL_MF_STRICT and MPOL_MF_MOVE are specified
has been removed from the -mm tree. Its filename was
mm-mempolicy-keep-vma-walk-if-both-mpol_mf_strict-and-mpol_mf_move-are-specified.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: mempolicy: keep VMA walk if both MPOL_MF_STRICT and MPOL_MF_MOVE are specified
Date: Wed, 20 Sep 2023 15:32:42 -0700
When calling mbind() with MPOL_MF_{MOVE|MOVEALL} | MPOL_MF_STRICT, the
kernel should attempt to migrate all existing pages and return -EIO if
there is a misplaced or unmovable page. Then commit 6f4576e3687b
("mempolicy: apply page table walker on queue_pages_range()") messed up
the return value and no longer broke out of the VMA scan early when
MPOL_MF_STRICT was specified alone. The return value problem was fixed by
commit a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when
MPOL_MF_STRICT is specified"), but that broke out of the VMA walk early
when an unmovable page is met, which may cause some pages not to be
migrated as expected.
The code should conceptually do:
    if (MPOL_MF_MOVE|MOVEALL)
        scan all vmas
        try to migrate the existing pages
        return success
    else if (MPOL_MF_MOVE* | MPOL_MF_STRICT)
        scan all vmas
        try to migrate the existing pages
        return -EIO if unmovable or migration failed
    else /* MPOL_MF_STRICT alone */
        break early if meets unmovable and don't call mbind_range() at all
    else /* none of those flags */
        check the ranges in test_walk, EFAULT without mbind_range() if discontig.
Fixed the behavior.
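A hedged user-space sketch of the semantics described above (the node mask
and mapping size are arbitrary choices; the mbind() wrapper comes from
libnuma, so link with -lnuma): with MPOL_MF_MOVE | MPOL_MF_STRICT the call
tries to migrate every existing page of the range to node 0 and reports
EIO if a misplaced or unmovable page remains.

#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0, len);		/* fault the pages in first */

	unsigned long nodemask = 1UL;	/* allow node 0 only */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");	/* EIO: some page could not be migrated */
	return 0;
}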
Link: https://lkml.kernel.org/r/20230920223242.3425775-1-yang@os.amperecomputing.…
Fixes: a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Kirill A. Shutemov <kirill(a)shutemov.name>
Cc: David Rientjes <rientjes(a)google.com>
Cc: <stable(a)vger.kernel.org> [4.9+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 39 +++++++++++++++++++--------------------
1 file changed, 19 insertions(+), 20 deletions(-)
--- a/mm/mempolicy.c~mm-mempolicy-keep-vma-walk-if-both-mpol_mf_strict-and-mpol_mf_move-are-specified
+++ a/mm/mempolicy.c
@@ -426,6 +426,7 @@ struct queue_pages {
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
+ bool has_unmovable;
};
/*
@@ -446,9 +447,8 @@ static inline bool queue_folio_required(
/*
* queue_folios_pmd() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. huge zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing folio was already on a node that does not follow the
* policy.
@@ -479,7 +479,7 @@ static int queue_folios_pmd(pmd_t *pmd,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_folio_add(folio, qp->pagelist, flags)) {
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
} else
@@ -495,9 +495,8 @@ unlock:
*
* queue_folios_pte_range() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
* on a node that does not follow the policy.
*/
@@ -508,7 +507,6 @@ static int queue_folios_pte_range(pmd_t
struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- bool has_unmovable = false;
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
@@ -538,11 +536,12 @@ static int queue_folios_pte_range(pmd_t
if (!queue_folio_required(folio, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- /* MPOL_MF_STRICT must be specified if we get here */
- if (!vma_migratable(vma)) {
- has_unmovable = true;
- break;
- }
+ /*
+ * MPOL_MF_STRICT must be specified if we get here.
+ * Continue walking vmas due to MPOL_MF_MOVE* flags.
+ */
+ if (!vma_migratable(vma))
+ qp->has_unmovable = true;
/*
* Do not abort immediately since there may be
@@ -550,16 +549,13 @@ static int queue_folios_pte_range(pmd_t
* need migrate other LRU pages.
*/
if (migrate_folio_add(folio, qp->pagelist, flags))
- has_unmovable = true;
+ qp->has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
- if (has_unmovable)
- return 1;
-
return addr != end ? -EIO : 0;
}
@@ -599,7 +595,7 @@ static int queue_folios_hugetlb(pte_t *p
* Detecting misplaced folio but allow migrating folios which
* have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
@@ -620,7 +616,7 @@ static int queue_folios_hugetlb(pte_t *p
* Failed to isolate folio but allow migrating pages
* which have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
}
unlock:
spin_unlock(ptl);
@@ -756,12 +752,15 @@ queue_pages_range(struct mm_struct *mm,
.start = start,
.end = end,
.first = NULL,
+ .has_unmovable = false,
};
const struct mm_walk_ops *ops = lock_vma ?
&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
err = walk_page_range(mm, start, end, ops, &qp);
+ if (qp.has_unmovable)
+ err = 1;
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
@@ -1358,7 +1357,7 @@ static long do_mbind(unsigned long start
putback_movable_pages(&pagelist);
}
- if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
+ if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
err = -EIO;
} else {
up_out:
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are
The quilt patch titled
Subject: mm, memcg: reconsider kmem.limit_in_bytes deprecation
has been removed from the -mm tree. Its filename was
mm-memcg-reconsider-kmemlimit_in_bytes-deprecation.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Michal Hocko <mhocko(a)suse.com>
Subject: mm, memcg: reconsider kmem.limit_in_bytes deprecation
Date: Thu, 21 Sep 2023 09:38:29 +0200
This reverts commits 86327e8eb94c ("memcg: drop kmem.limit_in_bytes") and
partially reverts 58056f77502f ("memcg, kmem: further deprecate
kmem.limit_in_bytes") which have incrementally removed support for the
kernel memory accounting hard limit. Unfortunately it has turned out that
there is still userspace depending on the existence of
memory.kmem.limit_in_bytes [1]. The underlying functionality is not
really required but the non-existent file just confuses the userspace
which fails in the result. The patch to fix this on the userspace side
has been submitted but it is hard to predict how it will propagate through
the maze of 3rd party consumers of the software.
Now, reverting 86327e8eb94c alone is not an option because there is
another set of userspace which cannot cope with ENOTSUPP returned when
writing to the file. Therefore we have to go and revisit 58056f77502f as
well. There are two ways to go ahead. Either we give up on the
deprecation and fully revert 58056f77502f as well or we can keep
kmem.limit_in_bytes but make the write a noop and warn about the fact.
This should work for both known breaking workloads which depend on the
existence but do not depend on the hard limit enforcement.
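A short sketch of what the surviving userspace does (the cgroup-v1 mount
point and the "test" group name are assumptions): after this change the
write below succeeds again, is a no-op, and triggers the pr_warn_once()
added in the patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");	/* kernels with 86327e8eb94c lack the file */
		return 1;
	}
	/* Accepted but not enforced; kernel memory is still only accounted. */
	if (write(fd, "104857600\n", 10) < 0)
		perror("write");
	close(fd);
	return 0;
}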
Note to backporters to stable trees. a8c49af3be5f ("memcg: add per-memcg
total kernel memory stat") introduced in 4.18 has added memcg_account_kmem
so the accounting is not done by obj_cgroup_charge_pages directly for v1
anymore. Prior kernels need to add it explicitly (thanks to Johannes for
pointing this out).
[akpm(a)linux-foundation.org: fix build - remove unused local]
Link: http://lkml.kernel.org/r/20230920081101.GA12096@linuxonhyperv3.guj3yctzbm1e… [1]
Link: https://lkml.kernel.org/r/ZRE5VJozPZt9bRPy@dhcp22.suse.cz
Fixes: 86327e8eb94c ("memcg: drop kmem.limit_in_bytes")
Fixes: 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Shakeel Butt <shakeelb(a)google.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Jeremi Piotrowski <jpiotrowski(a)linux.microsoft.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: Tejun heo <tj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Documentation/admin-guide/cgroup-v1/memory.rst | 7 +++++++
mm/memcontrol.c | 13 +++++++++++++
2 files changed, 20 insertions(+)
--- a/Documentation/admin-guide/cgroup-v1/memory.rst~mm-memcg-reconsider-kmemlimit_in_bytes-deprecation
+++ a/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -92,6 +92,13 @@ Brief summary of control files.
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
+ memory.kmem.limit_in_bytes Deprecated knob to set and read the kernel
+ memory hard limit. Kernel hard limit is not
+ supported since 5.16. Writing any value to
+ the file will not have any effect, same as if the
+ nokmem kernel parameter was specified.
+ Kernel memory is still charged and reported
+ by memory.kmem.usage_in_bytes.
memory.kmem.usage_in_bytes show current kernel memory allocation
memory.kmem.failcnt show the number of kernel memory usage
hits limits
--- a/mm/memcontrol.c~mm-memcg-reconsider-kmemlimit_in_bytes-deprecation
+++ a/mm/memcontrol.c
@@ -3867,6 +3867,13 @@ static ssize_t mem_cgroup_write(struct k
case _MEMSWAP:
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
+ case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Writing any value to this file has no effect. "
+ "Please report your usecase to linux-mm(a)kvack.org if you "
+ "depend on this functionality.\n");
+ ret = 0;
+ break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
@@ -5078,6 +5085,12 @@ static struct cftype mem_cgroup_legacy_f
},
#endif
{
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
_
Patches currently in -mm which might be from mhocko(a)suse.com are