The patch titled
Subject: mm/hugetlb: fix a addressing exception caused by huge_pte_offset
has been removed from the -mm tree. Its filename was
mm-hugetlb-fix-a-addressing-exception-caused-by-huge_pte_offset.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Longpeng <longpeng2(a)huawei.com>
Subject: mm/hugetlb: fix a addressing exception caused by huge_pte_offset
Our machine encountered a panic(addressing exception) after run
for a long time and the calltrace is:
RIP: 0010:[<ffffffff9dff0587>] [<ffffffff9dff0587>] hugetlb_fault+0x307/0xbe0
RSP: 0018:ffff9567fc27f808 EFLAGS: 00010286
RAX: e800c03ff1258d48 RBX: ffffd3bb003b69c0 RCX: e800c03ff1258d48
RDX: 17ff3fc00eda72b7 RSI: 00003ffffffff000 RDI: e800c03ff1258d48
RBP: ffff9567fc27f8c8 R08: e800c03ff1258d48 R09: 0000000000000080
R10: ffffaba0704c22a8 R11: 0000000000000001 R12: ffff95c87b4b60d8
R13: 00005fff00000000 R14: 0000000000000000 R15: ffff9567face8074
FS: 00007fe2d9ffb700(0000) GS:ffff956900e40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffd3bb003b69c0 CR3: 000000be67374000 CR4: 00000000003627e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
[<ffffffff9df9b71b>] ? unlock_page+0x2b/0x30
[<ffffffff9dff04a2>] ? hugetlb_fault+0x222/0xbe0
[<ffffffff9dff1405>] follow_hugetlb_page+0x175/0x540
[<ffffffff9e15b825>] ? cpumask_next_and+0x35/0x50
[<ffffffff9dfc7230>] __get_user_pages+0x2a0/0x7e0
[<ffffffff9dfc648d>] __get_user_pages_unlocked+0x15d/0x210
[<ffffffffc068cfc5>] __gfn_to_pfn_memslot+0x3c5/0x460 [kvm]
[<ffffffffc06b28be>] try_async_pf+0x6e/0x2a0 [kvm]
[<ffffffffc06b4b41>] tdp_page_fault+0x151/0x2d0 [kvm]
...
[<ffffffffc06a6f90>] kvm_arch_vcpu_ioctl_run+0x330/0x490 [kvm]
[<ffffffffc068d919>] kvm_vcpu_ioctl+0x309/0x6d0 [kvm]
[<ffffffff9deaa8c2>] ? dequeue_signal+0x32/0x180
[<ffffffff9deae34d>] ? do_sigtimedwait+0xcd/0x230
[<ffffffff9e03aed0>] do_vfs_ioctl+0x3f0/0x540
[<ffffffff9e03b0c1>] SyS_ioctl+0xa1/0xc0
[<ffffffff9e53879b>] system_call_fastpath+0x22/0x27
For 1G hugepages, huge_pte_offset() wants to return NULL or pudp, but it
may return a wrong 'pmdp' if there is a race. Please look at the following
code snippet:
...
pud = pud_offset(p4d, addr);
if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
/* hugepage or swap? */
if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
if (sz != PMD_SIZE && pmd_none(*pmd))
return NULL;
/* hugepage or swap? */
if (pmd_huge(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
...
The following sequence would trigger this bug:
1. CPU0: sz = PUD_SIZE and *pud = 0 , continue
1. CPU0: "pud_huge(*pud)" is false
2. CPU1: calling hugetlb_no_page and set *pud to xxxx8e7(PRESENT)
3. CPU0: "!pud_present(*pud)" is false, continue
4. CPU0: pmd = pmd_offset(pud, addr) and maybe return a wrong pmdp
However, we want CPU0 to return NULL or pudp in this case.
Also, according to the section 'COMPILER BARRIER' of memory-barriers.txt:
'''
(*) The compiler is within its rights to reorder loads and stores
to the same variable, and in some cases, the CPU is within its
rights to reorder loads to the same variable. This means that
the following code:
a[0] = x;
a[1] = x;
Might result in an older value of x stored in a[1] than in a[0].
'''
there're several other data races in huge_pte_offset, for example:
'''
p4d = p4d_offset(pgd, addr)
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr) <-- will be unwinded as:
pud = (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
'''
which is free for the compiler/CPU to execute as:
'''
p4d = p4d_offset(pgd, addr)
p4d_for_vaddr = *p4d;
if (!p4d_present(*p4d))
return NULL;
pud = (pud_t *)p4d_page_vaddr(p4d_for_vaddr) + pud_index(address);
'''
so in the case where *p4d goes from '!present' to 'present':
p4d_present(*p4d) == true and p4d_for_vaddr == none, meaning the
p4d_page_vaddr() will crash.
For these reasons, we must make sure there is exactly one dereference of
p4d, pud and pmd.
Link: http://lkml.kernel.org/r/20200327235748.2048-1-longpeng2@huawei.com
Signed-off-by: Longpeng <longpeng2(a)huawei.com>
Suggested-by: Jason Gunthorpe <jgg(a)ziepe.ca>
Reviewed-by: Jason Gunthorpe <jgg(a)mellanox.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Sean Christopherson <sean.j.christopherson(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-a-addressing-exception-caused-by-huge_pte_offset
+++ a/mm/hugetlb.c
@@ -4909,29 +4909,33 @@ pte_t *huge_pte_offset(struct mm_struct
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
+ p4d_t *p4d, p4d_entry;
+ pud_t *pud, pud_entry;
+ pmd_t *pmd, pmd_entry;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
+
p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
+ p4d_entry = READ_ONCE(*p4d);
+ if (!p4d_present(p4d_entry))
return NULL;
- pud = pud_offset(p4d, addr);
- if (sz != PUD_SIZE && pud_none(*pud))
+ pud = pud_offset(&p4d_entry, addr);
+ pud_entry = READ_ONCE(*pud);
+ if (sz != PUD_SIZE && pud_none(pud_entry))
return NULL;
/* hugepage or swap? */
- if (pud_huge(*pud) || !pud_present(*pud))
+ if (pud_huge(pud_entry) || !pud_present(pud_entry))
return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- if (sz != PMD_SIZE && pmd_none(*pmd))
+ pmd = pmd_offset(&pud_entry, addr);
+ pmd_entry = READ_ONCE(*pmd);
+ if (sz != PMD_SIZE && pmd_none(pmd_entry))
return NULL;
/* hugepage or swap? */
- if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
return (pte_t *)pmd;
return NULL;
_
Patches currently in -mm which might be from longpeng2(a)huawei.com are
The patch titled
Subject: mm, memcg: Do not high throttle allocators based on wraparound
has been added to the -mm tree. Its filename is
mm-memcg-do-not-high-throttle-allocators-based-on-wraparound.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-memcg-do-not-high-throttle-allo…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-memcg-do-not-high-throttle-allo…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Jakub Kicinski <kuba(a)kernel.org>
Subject: mm, memcg: Do not high throttle allocators based on wraparound
If a cgroup violates its memory.high constraints, we may end up unduly
penalising it. For example, for the following hierarchy:
A: max high, 20 usage
A/B: 9 high, 10 usage
A/C: max high, 10 usage
We would end up doing the following calculation below when calculating
high delay for A/B:
A/B: 10 - 9 = 1...
A: 20 - PAGE_COUNTER_MAX = 21, so set max_overage to 21.
This gets worse with higher disparities in usage in the parent.
I have no idea how this disappeared from the final version of the patch,
but it is certainly Not Good(tm). This wasn't obvious in testing because,
for a simple cgroup hierarchy with only one child, the result is usually
roughly the same. It's only in more complex hierarchies that things go
really awry (although still, the effects are limited to a maximum of 2
seconds in schedule_timeout_killable at a maximum).
[chris(a)chrisdown.name: changelog]
Link: http://lkml.kernel.org/r/20200331152424.GA1019937@chrisdown.name
Fixes: e26733e0d0ec ("mm, memcg: throttle allocators based on ancestral memory.high")
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Chris Down <chris(a)chrisdown.name>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org> [5.4.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 3 +++
1 file changed, 3 insertions(+)
--- a/mm/memcontrol.c~mm-memcg-do-not-high-throttle-allocators-based-on-wraparound
+++ a/mm/memcontrol.c
@@ -2324,6 +2324,9 @@ static unsigned long calculate_high_dela
usage = page_counter_read(&memcg->memory);
high = READ_ONCE(memcg->high);
+ if (usage <= high)
+ continue;
+
/*
* Prevent division by 0 in overage calculation by acting as if
* it was a threshold of 1 page
_
Patches currently in -mm which might be from kuba(a)kernel.org are
mm-memcg-do-not-high-throttle-allocators-based-on-wraparound.patch
The patch titled
Subject: ipc/mqueue.c: change __do_notify() to bypass check_kill_permission()
has been added to the -mm tree. Its filename is
ipc-mqueuec-change-__do_notify-to-bypass-check_kill_permission-v2.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/ipc-mqueuec-change-__do_notify-to-…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/ipc-mqueuec-change-__do_notify-to-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Oleg Nesterov <oleg(a)redhat.com>
Subject: ipc/mqueue.c: change __do_notify() to bypass check_kill_permission()
Commit cc731525f26a ("signal: Remove kernel interal si_code magic")
changed the value of SI_FROMUSER(SI_MESGQ), this means that mq_notify() no
longer works if the sender doesn't have rights to send a signal.
Change __do_notify() to use do_send_sig_info() instead of kill_pid_info()
to avoid check_kill_permission().
This needs the additional notify.sigev_signo != 0 check, shouldn't we
change do_mq_notify() to deny sigev_signo == 0 ?
Test-case:
#include <signal.h>
#include <mqueue.h>
#include <unistd.h>
#include <sys/wait.h>
#include <assert.h>
static int notified;
static void sigh(int sig)
{
notified = 1;
}
int main(void)
{
signal(SIGIO, sigh);
int fd = mq_open("/mq", O_RDWR|O_CREAT, 0666, NULL);
assert(fd >= 0);
struct sigevent se = {
.sigev_notify = SIGEV_SIGNAL,
.sigev_signo = SIGIO,
};
assert(mq_notify(fd, &se) == 0);
if (!fork()) {
assert(setuid(1) == 0);
mq_send(fd, "",1,0);
return 0;
}
wait(NULL);
mq_unlink("/mq");
assert(notified);
return 0;
}
[manfred(a)colorfullife.com: 1) Add self_exec_id evaluation so that the implementation matches do_notify_parent 2) use PIDTYPE_TGID everywhere]
Link: http://lkml.kernel.org/r/e2a782e4-eab9-4f5c-c749-c07a8f7a4e66@colorfullife.…
Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic")
Reported-by: Yoji <yoji.fujihar.min(a)gmail.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Manfred Spraul <manfred(a)colorfullife.com>
Acked-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Markus Elfring <elfring(a)users.sourceforge.net>
Cc: <1vier1(a)web.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
ipc/mqueue.c | 34 ++++++++++++++++++++++++++--------
1 file changed, 26 insertions(+), 8 deletions(-)
--- a/ipc/mqueue.c~ipc-mqueuec-change-__do_notify-to-bypass-check_kill_permission-v2
+++ a/ipc/mqueue.c
@@ -142,6 +142,7 @@ struct mqueue_inode_info {
struct sigevent notify;
struct pid *notify_owner;
+ u32 notify_self_exec_id;
struct user_namespace *notify_user_ns;
struct user_struct *user; /* user who created, for accounting */
struct sock *notify_sock;
@@ -774,28 +775,44 @@ static void __do_notify(struct mqueue_in
* synchronously. */
if (info->notify_owner &&
info->attr.mq_curmsgs == 1) {
- struct kernel_siginfo sig_i;
switch (info->notify.sigev_notify) {
case SIGEV_NONE:
break;
- case SIGEV_SIGNAL:
- /* sends signal */
+ case SIGEV_SIGNAL: {
+ struct kernel_siginfo sig_i;
+ struct task_struct *task;
+
+ /* do_mq_notify() accepts sigev_signo == 0, why?? */
+ if (!info->notify.sigev_signo)
+ break;
clear_siginfo(&sig_i);
sig_i.si_signo = info->notify.sigev_signo;
sig_i.si_errno = 0;
sig_i.si_code = SI_MESGQ;
sig_i.si_value = info->notify.sigev_value;
- /* map current pid/uid into info->owner's namespaces */
rcu_read_lock();
+ /* map current pid/uid into info->owner's namespaces */
sig_i.si_pid = task_tgid_nr_ns(current,
ns_of_pid(info->notify_owner));
- sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
+ sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
+ current_uid());
+ /*
+ * We can't use kill_pid_info(), this signal should
+ * bypass check_kill_permission(). It is from kernel
+ * but si_fromuser() can't know this.
+ * We do check the self_exec_id, to avoid sending
+ * signals to programs that don't expect them.
+ */
+ task = pid_task(info->notify_owner, PIDTYPE_TGID);
+ if (task && task->self_exec_id ==
+ info->notify_self_exec_id) {
+ do_send_sig_info(info->notify.sigev_signo,
+ &sig_i, task, PIDTYPE_TGID);
+ }
rcu_read_unlock();
-
- kill_pid_info(info->notify.sigev_signo,
- &sig_i, info->notify_owner);
break;
+ }
case SIGEV_THREAD:
set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
netlink_sendskb(info->notify_sock, info->notify_cookie);
@@ -1384,6 +1401,7 @@ retry:
info->notify.sigev_signo = notification->sigev_signo;
info->notify.sigev_value = notification->sigev_value;
info->notify.sigev_notify = SIGEV_SIGNAL;
+ info->notify_self_exec_id = current->self_exec_id;
break;
}
_
Patches currently in -mm which might be from oleg(a)redhat.com are
ipc-mqueuec-change-__do_notify-to-bypass-check_kill_permission-v2.patch
aio-simplify-read_events.patch