From: Johannes Weiner <hannes(a)cmpxchg.org>
commit 739f79fc9db1b38f96b5a5109b247a650fbebf6d upstream
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but it was less
likely to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
Reported-by: Bradley Bolen <bradleybolen(a)gmail.com>
Tested-by: Brad Bolen <bradleybolen(a)gmail.com>
Cc: Vladimir Davydov <vdavydov(a)virtuozzo.com>
Cc: Michal Hocko <mhocko(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
[guptap(a)codeaurora.org: Resolved merge conflicts]
Signed-off-by: Prakash Gupta <guptap(a)codeaurora.org>
Signed-off-by: Florian Fainelli <f.fainelli(a)gmail.com>
---
This patch is present in a downstream Android tree:
https://source.mcwhirter.io/craige/bluecross/commit/d4a742865c6b69ef9316947…
and I happened to stumble across the same problem too.
Johannes, can you review it for correctness with respect to the 4.9
kernel? Thanks!
include/linux/memcontrol.h | 33 ++++++++++++++++++++++++-----
mm/memcontrol.c | 43 +++++++++++++++++++++++++++-----------
mm/page-writeback.c | 14 ++++++++++---
3 files changed, 70 insertions(+), 20 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8b35bdbdc214..fd77f8303ab9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -490,9 +490,21 @@ bool mem_cgroup_oom_synchronize(bool wait);
extern int do_swap_account;
#endif
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
void unlock_page_memcg(struct page *page);
+static inline void __mem_cgroup_update_page_stat(struct page *page,
+ struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx,
+ int val)
+{
+ VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
+
+ if (memcg && memcg->stat)
+ this_cpu_add(memcg->stat->count[idx], val);
+}
+
/**
* mem_cgroup_update_page_stat - update page state statistics
* @page: the page
@@ -508,13 +520,12 @@ void unlock_page_memcg(struct page *page);
* mem_cgroup_update_page_stat(page, state, -1);
* unlock_page(page) or unlock_page_memcg(page)
*/
+
static inline void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_stat_index idx, int val)
{
- VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
- if (page->mem_cgroup)
- this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+ __mem_cgroup_update_page_stat(page, page->mem_cgroup, idx, val);
}
static inline void mem_cgroup_inc_page_stat(struct page *page,
@@ -709,7 +720,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline void lock_page_memcg(struct page *page)
+static inline struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+ return NULL;
+}
+
+static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
{
}
@@ -745,6 +761,13 @@ static inline void mem_cgroup_update_page_stat(struct page *page,
{
}
+static inline void __mem_cgroup_update_page_stat(struct page *page,
+ struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx,
+ int nr)
+{
+}
+
static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d4232744c59f..27b0b4f03fcd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1638,9 +1638,13 @@ bool mem_cgroup_oom_synchronize(bool handle)
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
*/
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1649,18 +1653,24 @@ void lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- */
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page itself from being freed. E.g. writeback
+ * doesn't hold a page reference and relies on PG_writeback to
+ * keep off truncation, migration and so forth.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
- return;
+ return NULL;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return;
+ return NULL;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1676,18 +1686,18 @@ void lock_page_memcg(struct page *page)
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return;
+ return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
*/
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
-
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1699,6 +1709,15 @@ void unlock_page_memcg(struct page *page)
rcu_read_unlock();
}
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+ __unlock_page_memcg(page->mem_cgroup);
+}
EXPORT_SYMBOL(unlock_page_memcg);
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 462c778b9fb5..498c924f2fcd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2717,9 +2717,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ struct mem_cgroup *memcg;
int ret;
- lock_page_memcg(page);
+ memcg = lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2747,13 +2748,20 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
+ /*
+ * NOTE: Page might be free now! Writeback doesn't hold a page
+ * reference on its own, it relies on truncation to wait for
+ * the clearing of PG_writeback. The below can only access
+ * page state that is static across allocation cycles.
+ */
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ __mem_cgroup_update_page_stat(page, memcg,
+ MEM_CGROUP_STAT_WRITEBACK, -1);
dec_node_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- unlock_page_memcg(page);
+ __unlock_page_memcg(memcg);
return ret;
}
--
2.25.1
commit 97c753e62e6c31a404183898d950d8c08d752dbd upstream.
Fix kprobe_on_func_entry() to return an error code instead of false so that
register_kretprobe() can return an appropriate error code.
append_trace_kprobe() expects the kprobe registration to return -ENOENT
when the target symbol is not found, and it then checks whether the target
module is unloaded or not. If the target module doesn't exist, it
defers probing the target symbol until the module is loaded.
However, since register_kretprobe() returns -EINVAL instead of -ENOENT
in that case, it always fails to put the kretprobe event on unloaded
modules. e.g.
Kprobe event:
/sys/kernel/debug/tracing # echo p xfs:xfs_end_io >> kprobe_events
[ 16.515574] trace_kprobe: This probe might be able to register after target module is loaded. Continue.
Kretprobe event: (p -> r)
/sys/kernel/debug/tracing # echo r xfs:xfs_end_io >> kprobe_events
sh: write error: Invalid argument
/sys/kernel/debug/tracing # cat error_log
[ 41.122514] trace_kprobe: error: Failed to register probe event
Command: r xfs:xfs_end_io
^
To fix this bug, change kprobe_on_func_entry() to detect symbol lookup
failure and return -ENOENT in that case. Otherwise it returns -EINVAL
or 0 (succeeded, given address is on the entry).
Link: https://lkml.kernel.org/r/161176187132.1067016.8118042342894378981.stgit@de…
Cc: stable(a)vger.kernel.org
Fixes: 59158ec4aef7 ("tracing/kprobes: Check the probe on unloaded module correctly")
Reported-by: Jianlin Lv <Jianlin.Lv(a)arm.com>
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
include/linux/kprobes.h | 2 +-
kernel/kprobes.c | 34 +++++++++++++++++++++++++---------
kernel/trace/trace_kprobe.c | 4 ++--
3 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9f22652d69bb..c28204e22b54 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -245,7 +245,7 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern bool arch_kprobe_on_func_entry(unsigned long offset);
-extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
+extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2161f519d481..ebbd4320143d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1921,29 +1921,45 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
return !offset;
}
-bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym: Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
if (IS_ERR(kp_addr))
- return false;
+ return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_kprobe_on_func_entry(offset))
- return false;
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
+ return -ENOENT;
- return true;
+ if (!arch_kprobe_on_func_entry(offset))
+ return -EINVAL;
+
+ return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
- int ret = 0;
+ int ret;
struct kretprobe_instance *inst;
int i;
void *addr;
- if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
- return -EINVAL;
+ ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+ if (ret)
+ return ret;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5c17f70c7f2d..61eff45653f5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -112,9 +112,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
- return kprobe_on_func_entry(tk->rp.kp.addr,
+ return (kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0);
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
This is the start of the stable review cycle for the 5.10.15 release.
There are 120 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 10 Feb 2021 14:57:55 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.15-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.10.15-rc1
Alexander Ovechkin <ovov(a)yandex-team.ru>
net: sched: replaced invalid qdisc tree flush helper in qdisc_replace
DENG Qingfang <dqfext(a)gmail.com>
net: dsa: mv88e6xxx: override existent unicast portvec in port_fdb_add
Dongseok Yi <dseok.yi(a)samsung.com>
udp: ipv4: manipulate network header of NATed UDP GRO fraglist
Vadim Fedorenko <vfedorenko(a)novek.ru>
net: ip_tunnel: fix mtu calculation
Chinmay Agarwal <chinagar(a)codeaurora.org>
neighbour: Prevent a dead entry from updating gc_list
Kai-Heng Feng <kai.heng.feng(a)canonical.com>
igc: Report speed and duplex as unknown when device is runtime suspended
Xiao Ni <xni(a)redhat.com>
md: Set prev_flush_start and flush_bio in an atomic way
Marek Vasut <marex(a)denx.de>
Input: ili210x - implement pressure reporting for ILI251x
Benjamin Valentin <benpicco(a)googlemail.com>
Input: xpad - sync supported devices with fork on GitHub
AngeloGioacchino Del Regno <angelogioacchino.delregno(a)somainline.org>
Input: goodix - add support for Goodix GT9286 chip
Dave Hansen <dave.hansen(a)linux.intel.com>
x86/apic: Add extra serialization for non-serializing MSRs
Lai Jiangshan <laijs(a)linux.alibaba.com>
x86/debug: Prevent data breakpoints on cpu_dr7
Lai Jiangshan <laijs(a)linux.alibaba.com>
x86/debug: Prevent data breakpoints on __per_cpu_offset
Peter Zijlstra <peterz(a)infradead.org>
x86/debug: Fix DR6 handling
Josh Poimboeuf <jpoimboe(a)redhat.com>
x86/build: Disable CET instrumentation in the kernel
Waiman Long <longman(a)redhat.com>
mm/filemap: add missing mem_cgroup_uncharge() to __add_to_page_cache_locked()
Hugh Dickins <hughd(a)google.com>
mm: thp: fix MADV_REMOVE deadlock on shmem THP
Rick Edgecombe <rick.p.edgecombe(a)intel.com>
mm/vmalloc: separate put pages and flush VM flags
Rokudo Yan <wu-yan(a)tcl.com>
mm, compaction: move high_pfn to the for loop scope
Muchun Song <songmuchun(a)bytedance.com>
mm: hugetlb: remove VM_BUG_ON_PAGE from page_huge_active
Muchun Song <songmuchun(a)bytedance.com>
mm: hugetlb: fix a race between isolating and freeing page
Muchun Song <songmuchun(a)bytedance.com>
mm: hugetlb: fix a race between freeing and dissolving the page
Muchun Song <songmuchun(a)bytedance.com>
mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page
Dmitry Osipenko <digetx(a)gmail.com>
ARM: 9043/1: tegra: Fix misplaced tegra_uart_config in decompressor
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: footbridge: fix dc21285 PCI configuration accessors
H. Nikolaus Schaller <hns(a)goldelico.com>
ARM: dts; gta04: SPI panel chip select is active low
H. Nikolaus Schaller <hns(a)goldelico.com>
DTS: ARM: gta04: remove legacy spi-cs-high to make display work again
Sean Christopherson <seanjc(a)google.com>
KVM: x86: Set so called 'reserved CR3 bits in LM mask' at vCPU reset
Sean Christopherson <seanjc(a)google.com>
KVM: x86: Update emulator context mode if SYSENTER xfers to 64-bit mode
Michael Roth <michael.roth(a)amd.com>
KVM: x86: fix CPUID entries returned by KVM_GET_CPUID2 ioctl
Paolo Bonzini <pbonzini(a)redhat.com>
KVM: x86: Allow guests to see MSR_IA32_TSX_CTRL even if tsx=off
Ben Gardon <bgardon(a)google.com>
KVM: x86/mmu: Fix TDP MMU zap collapsible SPTEs
Sean Christopherson <seanjc(a)google.com>
KVM: SVM: Treat SVM as unsupported when running as an SEV guest
Thorsten Leemhuis <linux(a)leemhuis.info>
nvme-pci: avoid the deepest sleep state on Kingston A2000 SSDs
Xiaoguang Wang <xiaoguang.wang(a)linux.alibaba.com>
io_uring: don't modify identity's files uncess identity is cowed
Stylon Wang <stylon.wang(a)amd.com>
drm/amd/display: Revert "Fix EDID parsing after resume from suspend"
Ville Syrjälä <ville.syrjala(a)linux.intel.com>
drm/i915: Power up combo PHY lanes for for HDMI as well
Ville Syrjälä <ville.syrjala(a)linux.intel.com>
drm/i915: Extract intel_ddi_power_up_lanes()
Andres Calderon Jaramillo <andrescj(a)chromium.org>
drm/i915/display: Prevent double YUV range correction on HDR planes
Chris Wilson <chris(a)chris-wilson.co.uk>
drm/i915/gt: Close race between enable_breadcrumbs and cancel_breadcrumbs
Chris Wilson <chris(a)chris-wilson.co.uk>
drm/i915/gem: Drop lru bumping on display unpinning
Imre Deak <imre.deak(a)intel.com>
drm/i915: Fix the MST PBN divider calculation
Imre Deak <imre.deak(a)intel.com>
drm/dp/mst: Export drm_dp_get_vc_payload_bw()
Peter Gonda <pgonda(a)google.com>
Fix unsynchronized access to sev members through svm_register_enc_region
Fengnan Chang <fengnanchang(a)gmail.com>
mmc: core: Limit retries when analyse of SDIO tuples fails
Ulf Hansson <ulf.hansson(a)linaro.org>
mmc: sdhci-pltfm: Fix linking err for sdhci-brcmstb
Pavel Shilovsky <pshilov(a)microsoft.com>
smb3: fix crediting for compounding when only one request in flight
Gustavo A. R. Silva <gustavoars(a)kernel.org>
smb3: Fix out-of-bounds bug in SMB2_negotiate()
Joerg Roedel <jroedel(a)suse.de>
iommu: Check dev->iommu in dev_iommu_priv_get() before dereferencing it
Aurelien Aptel <aaptel(a)suse.com>
cifs: report error instead of invalid when revalidating a dentry fails
Atish Patra <atish.patra(a)wdc.com>
RISC-V: Define MAXPHYSMEM_1GB only for RV32
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: fix bounce buffer usage for non-sg list case
Rolf Eike Beer <eb(a)emlix.com>
scripts: use pkg-config to locate libcrypto
Marc Zyngier <maz(a)kernel.org>
genirq/msi: Activate Multi-MSI early when MSI_FLAG_ACTIVATE_EARLY is set
Hans de Goede <hdegoede(a)redhat.com>
genirq: Prevent [devm_]irq_alloc_desc from returning irq 0
Dan Williams <dan.j.williams(a)intel.com>
libnvdimm/dimm: Avoid race between probe and available_slots_show()
Dan Williams <dan.j.williams(a)intel.com>
libnvdimm/namespace: Fix visibility of namespace resource attribute
Alexey Kardashevskiy <aik(a)ozlabs.ru>
tracepoint: Fix race between tracing and removing tracepoint
Viktor Rosendahl <Viktor.Rosendahl(a)bmw.de>
tracing: Use pause-on-trace with the latency tracers
Wang ShaoBo <bobo.shaobowang(a)huawei.com>
kretprobe: Avoid re-registration of the same kretprobe earlier
Masami Hiramatsu <mhiramat(a)kernel.org>
tracing/kprobe: Fix to support kretprobe events on unloaded modules
Steven Rostedt (VMware) <rostedt(a)goodmis.org>
fgraph: Initialize tracing_graph_pause at task creation
Quanyang Wang <quanyang.wang(a)windriver.com>
gpiolib: free device name on error path to fix kmemleak
Felix Fietkau <nbd(a)nbd.name>
mac80211: fix station rate table updates on assoc
Sargun Dhillon <sargun(a)sargun.me>
ovl: implement volatile-specific fsync error behaviour
Miklos Szeredi <mszeredi(a)redhat.com>
ovl: avoid deadlock on directory ioctl
Liangyan <liangyan.peng(a)linux.alibaba.com>
ovl: fix dentry leak in ovl_get_redirect
Mario Limonciello <mario.limonciello(a)dell.com>
thunderbolt: Fix possible NULL pointer dereference in tb_acpi_add_link()
Masahiro Yamada <masahiroy(a)kernel.org>
kbuild: fix duplicated flags in DEBUG_CFLAGS
Roman Gushchin <guro(a)fb.com>
memblock: do not start bottom-up allocations with kernel_end
Eli Cohen <elic(a)nvidia.com>
vdpa/mlx5: Restore the hardware used index after change map
Sagi Grimberg <sagi(a)grimberg.me>
nvmet-tcp: fix out-of-bounds access when receiving multiple h2cdata PDUs
Hermann Lauer <Hermann.Lauer(a)uni-heidelberg.de>
ARM: dts: sun7i: a20: bananapro: Fix ethernet phy-mode
Dan Carpenter <dan.carpenter(a)oracle.com>
net: ipa: pass correct dma_handle to dma_free_coherent()
Heiner Kallweit <hkallweit1(a)gmail.com>
r8169: fix WoL on shutdown if CONFIG_DEBUG_SHIRQ is set
Stefan Chulski <stefanc(a)marvell.com>
net: mvpp2: TCAM entry enable should be written after SRAM data
Xie He <xie.he.0141(a)gmail.com>
net: lapb: Copy the skb before sending a packet
Maor Dickman <maord(a)nvidia.com>
net/mlx5e: Release skb in case of failure in tc update skb
Maxim Mikityanskiy <maximmi(a)mellanox.com>
net/mlx5e: Update max_opened_tc also when channels are closed
Maor Gottlieb <maorg(a)nvidia.com>
net/mlx5: Fix leak upon failure of rule creation
Daniel Jurgens <danielj(a)nvidia.com>
net/mlx5: Fix function calculation for page trees
Lijun Pan <ljp(a)linux.ibm.com>
ibmvnic: device remove has higher precedence over reset
Aleksandr Loktionov <aleksandr.loktionov(a)intel.com>
i40e: Revert "i40e: don't report link up for a VF who hasn't enabled queues"
Kevin Lo <kevlo(a)kevlo.org>
igc: check return value of ret_val in igc_config_fc_after_link_up
Kevin Lo <kevlo(a)kevlo.org>
igc: set the default return value to -IGC_ERR_NVM in igc_write_nvm_srwr
Chuck Lever <chuck.lever(a)oracle.com>
SUNRPC: Fix NFS READs that start at non-page-aligned offsets
Zyta Szpak <zr(a)semihalf.com>
arm64: dts: ls1046a: fix dcfg address range
David Howells <dhowells(a)redhat.com>
rxrpc: Fix deadlock around release of dst cached on udp tunnel
Heiner Kallweit <hkallweit1(a)gmail.com>
r8169: work around RTL8125 UDP hw bug
Marek Szyprowski <m.szyprowski(a)samsung.com>
arm64: dts: meson: switch TFLASH_VDD_EN pin to open drain on Odroid-C4
Quentin Monnet <quentin(a)isovalent.com>
bpf, preload: Fix build when $(O) points to a relative path
Johannes Berg <johannes.berg(a)intel.com>
um: virtio: free vu_dev only with the contained struct device
Pan Bian <bianpan2016(a)163.com>
bpf, inode_storage: Put file handler if no storage was found
Loris Reiff <loris.reiff(a)liblor.ch>
bpf, cgroup: Fix problematic bounds check
Loris Reiff <loris.reiff(a)liblor.ch>
bpf, cgroup: Fix optlen WARN_ON_ONCE toctou
Eli Cohen <elic(a)nvidia.com>
vdpa/mlx5: Fix memory key MTT population
Marek Vasut <marex(a)denx.de>
ARM: dts: stm32: Fix GPIO hog flags on DHCOM DRC02
Marek Vasut <marex(a)denx.de>
ARM: dts: stm32: Disable optional TSC2004 on DRC02 board
Marek Vasut <marex(a)denx.de>
ARM: dts: stm32: Disable WP on DHCOM uSD slot
Marek Vasut <marex(a)denx.de>
ARM: dts: stm32: Connect card-detect signal on DHCOM
Marek Vasut <marex(a)denx.de>
ARM: dts: stm32: Fix polarity of the DH DRC02 uSD card detect
Simon South <simon(a)simonsouth.net>
arm64: dts: rockchip: Use only supported PCIe link speed on Pinebook Pro
Sandy Huang <hjc(a)rock-chips.com>
arm64: dts: rockchip: fix vopl iommu irq on px30
Serge Semin <Sergey.Semin(a)baikalelectronics.ru>
arm64: dts: amlogic: meson-g12: Set FL-adj property value
Alexey Dobriyan <adobriyan(a)gmail.com>
Input: i8042 - unbreak Pegatron C15B
Shawn Guo <shawn.guo(a)linaro.org>
arm64: dts: qcom: c630: keep both touchpad devices enabled
Linus Walleij <linus.walleij(a)linaro.org>
ARM: OMAP1: OSK: fix ohci-omap breakage
Chunfeng Yun <chunfeng.yun(a)mediatek.com>
usb: xhci-mtk: break loop when find the endpoint to drop
Chunfeng Yun <chunfeng.yun(a)mediatek.com>
usb: xhci-mtk: skip dropping bandwidth of unchecked endpoints
Ikjoon Jang <ikjn(a)chromium.org>
usb: xhci-mtk: fix unreleased bandwidth data
Gary Bisson <gary.bisson(a)boundarydevices.com>
usb: dwc3: fix clock issue during resume in OTG mode
Heiko Stuebner <heiko.stuebner(a)theobroma-systems.com>
usb: dwc2: Fix endpoint direction check in ep_from_windex
Yoshihiro Shimoda <yoshihiro.shimoda.uh(a)renesas.com>
usb: renesas_usbhs: Clear pipe running flag in usbhs_pkt_pop()
Jeremy Figgins <kernel(a)jeremyfiggins.com>
USB: usblp: don't call usb_set_interface if there's a single alt
kernel test robot <lkp(a)intel.com>
usb: gadget: aspeed: add missing of_node_put
Dan Carpenter <dan.carpenter(a)oracle.com>
USB: gadget: legacy: fix an error code in eth_bind()
Pali Rohár <pali(a)kernel.org>
usb: host: xhci: mvebu: make USB 3.0 PHY optional for Armada 3720
Christoph Schemmel <christoph.schemmel(a)gmail.com>
USB: serial: option: Adding support for Cinterion MV31
Chenxin Jin <bg4akv(a)hotmail.com>
USB: serial: cp210x: add new VID/PID for supporting Teraoka AD2000
Pho Tran <Pho.Tran(a)silabs.com>
USB: serial: cp210x: add pid/vid for WSDA-200-USB
-------------
Diffstat:
Documentation/filesystems/overlayfs.rst | 8 ++
Makefile | 14 +--
arch/arm/boot/dts/omap3-gta04.dtsi | 3 +-
arch/arm/boot/dts/stm32mp15xx-dhcom-drc02.dtsi | 12 +-
arch/arm/boot/dts/stm32mp15xx-dhcom-som.dtsi | 3 +-
arch/arm/boot/dts/sun7i-a20-bananapro.dts | 2 +-
arch/arm/include/debug/tegra.S | 54 ++++-----
arch/arm/mach-footbridge/dc21285.c | 12 +-
arch/arm/mach-omap1/board-osk.c | 2 +
arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 2 +-
.../arm64/boot/dts/amlogic/meson-sm1-odroid-c4.dts | 2 +-
arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 2 +-
.../boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 10 +-
arch/arm64/boot/dts/rockchip/px30.dtsi | 2 +-
.../boot/dts/rockchip/rk3399-pinebook-pro.dts | 1 -
arch/riscv/Kconfig | 2 +
arch/um/drivers/virtio_uml.c | 3 +-
arch/x86/Makefile | 3 +
arch/x86/include/asm/apic.h | 10 --
arch/x86/include/asm/barrier.h | 18 +++
arch/x86/kernel/apic/apic.c | 4 +
arch/x86/kernel/apic/x2apic_cluster.c | 6 +-
arch/x86/kernel/apic/x2apic_phys.c | 9 +-
arch/x86/kernel/hw_breakpoint.c | 61 ++++++----
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/emulate.c | 2 +
arch/x86/kvm/mmu/tdp_mmu.c | 6 +-
arch/x86/kvm/svm/sev.c | 17 +--
arch/x86/kvm/svm/svm.c | 5 +
arch/x86/kvm/vmx/vmx.c | 17 ++-
arch/x86/kvm/x86.c | 27 +++--
arch/x86/mm/mem_encrypt.c | 1 +
drivers/gpio/gpiolib.c | 10 +-
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 -
drivers/gpu/drm/drm_dp_mst_topology.c | 24 +++-
drivers/gpu/drm/i915/display/intel_ddi.c | 37 +++---
drivers/gpu/drm/i915/display/intel_display.c | 9 +-
drivers/gpu/drm/i915/display/intel_dp_mst.c | 4 +-
drivers/gpu/drm/i915/display/intel_overlay.c | 4 +-
drivers/gpu/drm/i915/display/intel_sprite.c | 65 ++---------
drivers/gpu/drm/i915/gem/i915_gem_domain.c | 45 -------
drivers/gpu/drm/i915/gem/i915_gem_object.h | 1 -
drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 6 +-
drivers/input/joystick/xpad.c | 17 ++-
drivers/input/serio/i8042-x86ia64io.h | 2 +
drivers/input/touchscreen/goodix.c | 2 +
drivers/input/touchscreen/ili210x.c | 26 +++--
drivers/md/md.c | 2 +
drivers/mmc/core/sdio_cis.c | 6 +
drivers/mmc/host/sdhci-pltfm.h | 7 +-
drivers/net/dsa/mv88e6xxx/chip.c | 6 +-
drivers/net/ethernet/ibm/ibmvnic.c | 5 -
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 13 +--
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 -
drivers/net/ethernet/intel/igc/igc_ethtool.c | 3 +-
drivers/net/ethernet/intel/igc/igc_i225.c | 3 +-
drivers/net/ethernet/intel/igc/igc_mac.c | 2 +-
drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c | 10 +-
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +-
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 16 ++-
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 5 +
.../net/ethernet/mellanox/mlx5/core/pagealloc.c | 2 +-
drivers/net/ethernet/realtek/r8169_main.c | 75 ++++++++++--
drivers/net/ipa/gsi.c | 2 +-
drivers/nvdimm/dimm_devs.c | 18 ++-
drivers/nvdimm/namespace_devs.c | 10 +-
drivers/nvme/host/pci.c | 2 +
drivers/nvme/target/tcp.c | 3 +-
drivers/thunderbolt/acpi.c | 2 +-
drivers/usb/class/usblp.c | 19 +--
drivers/usb/dwc2/gadget.c | 8 +-
drivers/usb/dwc3/core.c | 2 +-
drivers/usb/gadget/legacy/ether.c | 4 +-
drivers/usb/gadget/udc/aspeed-vhub/hub.c | 4 +-
drivers/usb/host/xhci-mtk-sch.c | 130 +++++++++++++++------
drivers/usb/host/xhci-mtk.c | 2 +
drivers/usb/host/xhci-mtk.h | 15 +++
drivers/usb/host/xhci-mvebu.c | 42 +++++++
drivers/usb/host/xhci-mvebu.h | 6 +
drivers/usb/host/xhci-plat.c | 20 +++-
drivers/usb/host/xhci-plat.h | 1 +
drivers/usb/host/xhci-ring.c | 31 +++--
drivers/usb/host/xhci.c | 8 +-
drivers/usb/host/xhci.h | 4 +
drivers/usb/renesas_usbhs/fifo.c | 1 +
drivers/usb/serial/cp210x.c | 2 +
drivers/usb/serial/option.c | 6 +
drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
drivers/vdpa/mlx5/core/mr.c | 28 ++---
drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 +++
fs/afs/main.c | 6 +-
fs/cifs/dir.c | 22 +++-
fs/cifs/smb2pdu.h | 2 +-
fs/cifs/transport.c | 18 ++-
fs/hugetlbfs/inode.c | 3 +-
fs/io_uring.c | 6 -
fs/overlayfs/dir.c | 2 +-
fs/overlayfs/file.c | 5 +-
fs/overlayfs/overlayfs.h | 1 +
fs/overlayfs/ovl_entry.h | 2 +
fs/overlayfs/readdir.c | 28 ++---
fs/overlayfs/super.c | 34 ++++--
fs/overlayfs/util.c | 27 +++++
include/drm/drm_dp_mst_helper.h | 1 +
include/linux/hugetlb.h | 2 +
include/linux/iommu.h | 5 +-
include/linux/irq.h | 4 +-
include/linux/kprobes.h | 2 +-
include/linux/msi.h | 6 +
include/linux/tracepoint.h | 12 +-
include/linux/vmalloc.h | 9 +-
include/net/sch_generic.h | 2 +-
include/net/udp.h | 2 +-
init/init_task.c | 3 +-
kernel/bpf/bpf_inode_storage.c | 6 +-
kernel/bpf/cgroup.c | 7 +-
kernel/bpf/preload/Makefile | 5 +-
kernel/irq/msi.c | 44 ++++---
kernel/kprobes.c | 36 ++++--
kernel/trace/fgraph.c | 2 -
kernel/trace/trace_irqsoff.c | 4 +
kernel/trace/trace_kprobe.c | 10 +-
mm/compaction.c | 3 +-
mm/filemap.c | 4 +
mm/huge_memory.c | 37 +++---
mm/hugetlb.c | 48 +++++++-
mm/memblock.c | 49 +-------
net/core/neighbour.c | 7 +-
net/ipv4/ip_tunnel.c | 16 ++-
net/ipv4/udp_offload.c | 69 ++++++++++-
net/ipv6/udp_offload.c | 2 +-
net/lapb/lapb_out.c | 3 +-
net/mac80211/driver-ops.c | 5 +-
net/mac80211/rate.c | 3 +-
net/rxrpc/af_rxrpc.c | 6 +-
net/sunrpc/svcsock.c | 7 +-
scripts/Makefile | 8 +-
137 files changed, 1106 insertions(+), 606 deletions(-)
We park the SQPOLL task before going into io_uring_cancel_files(), so the
task won't run task_works, including those that might be important for
the cancellation passes. In this case it's io_poll_remove_one(), which
frees requests via io_put_req_deferred().
Unpark it while waiting; that's OK as we disable submissions
beforehand, so no new requests will be generated.
INFO: task syz-executor893:8493 blocked for more than 143 seconds.
Call Trace:
context_switch kernel/sched/core.c:4327 [inline]
__schedule+0x90c/0x21a0 kernel/sched/core.c:5078
schedule+0xcf/0x270 kernel/sched/core.c:5157
io_uring_cancel_files fs/io_uring.c:8912 [inline]
io_uring_cancel_task_requests+0xe70/0x11a0 fs/io_uring.c:8979
__io_uring_files_cancel+0x110/0x1b0 fs/io_uring.c:9067
io_uring_files_cancel include/linux/io_uring.h:51 [inline]
do_exit+0x2fe/0x2ae0 kernel/exit.c:780
do_group_exit+0x125/0x310 kernel/exit.c:922
__do_sys_exit_group kernel/exit.c:933 [inline]
__se_sys_exit_group kernel/exit.c:931 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Cc: stable(a)vger.kernel.org # 5.5+
Reported-by: syzbot+695b03d82fa8e4901b06(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
---
fs/io_uring.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6b73e38aa1a9..1e803a9afc8e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9056,11 +9056,16 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
break;
io_uring_try_cancel_requests(ctx, task, files);
+
+ if (ctx->sq_data)
+ io_sq_thread_unpark(ctx->sq_data);
prepare_to_wait(&task->io_uring->wait, &wait,
TASK_UNINTERRUPTIBLE);
if (inflight == io_uring_count_inflight(ctx, task, files))
schedule();
finish_wait(&task->io_uring->wait, &wait);
+ if (ctx->sq_data)
+ io_sq_thread_park(ctx->sq_data);
}
}
--
2.24.0
Fix kprobe_on_func_entry() to return an error code instead of false so that
register_kretprobe() can return an appropriate error code.
append_trace_kprobe() expects the kprobe registration to return -ENOENT
when the target symbol is not found, and it checks whether the target
module is unloaded or not. If the target module doesn't exist, it
defers to probe the target symbol until the module is loaded.
However, since register_kretprobe() returns -EINVAL instead of -ENOENT
in that case, it always fails to put the kretprobe event on unloaded
modules. e.g.
Kprobe event:
/sys/kernel/debug/tracing # echo p xfs:xfs_end_io >> kprobe_events
[ 16.515574] trace_kprobe: This probe might be able to register after target module is loaded. Continue.
Kretprobe event: (p -> r)
/sys/kernel/debug/tracing # echo r xfs:xfs_end_io >> kprobe_events
sh: write error: Invalid argument
/sys/kernel/debug/tracing # cat error_log
[ 41.122514] trace_kprobe: error: Failed to register probe event
Command: r xfs:xfs_end_io
^
To fix this bug, change kprobe_on_func_entry() to detect symbol lookup
failure and return -ENOENT in that case. Otherwise it returns -EINVAL
or 0 (succeeded, given address is on the entry).
Link: https://lkml.kernel.org/r/161176187132.1067016.8118042342894378981.stgit@de…
Cc: stable(a)vger.kernel.org
Fixes: 59158ec4aef7 ("tracing/kprobes: Check the probe on unloaded module correctly")
Reported-by: Jianlin Lv <Jianlin.Lv(a)arm.com>
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
include/linux/kprobes.h | 2 +-
kernel/kprobes.c | 34 +++++++++++++++++++++++++---------
kernel/trace/trace_kprobe.c | 4 ++--
3 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9f22652d69bb..c28204e22b54 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -245,7 +245,7 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern bool arch_kprobe_on_func_entry(unsigned long offset);
-extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
+extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2161f519d481..ebbd4320143d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1921,29 +1921,45 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
return !offset;
}
-bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym: Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
if (IS_ERR(kp_addr))
- return false;
+ return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_kprobe_on_func_entry(offset))
- return false;
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
+ return -ENOENT;
- return true;
+ if (!arch_kprobe_on_func_entry(offset))
+ return -EINVAL;
+
+ return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
- int ret = 0;
+ int ret;
struct kretprobe_instance *inst;
int i;
void *addr;
- if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
- return -EINVAL;
+ ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+ if (ret)
+ return ret;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5c17f70c7f2d..61eff45653f5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -112,9 +112,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
- return kprobe_on_func_entry(tk->rp.kp.addr,
+ return (kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0);
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
commit 97c753e62e6c31a404183898d950d8c08d752dbd upstream.
Fix kprobe_on_func_entry() to return an error code instead of false so that
register_kretprobe() can return an appropriate error code.
append_trace_kprobe() expects the kprobe registration to return -ENOENT
when the target symbol is not found, and it checks whether the target
module is unloaded or not. If the target module doesn't exist, it
defers to probe the target symbol until the module is loaded.
However, since register_kretprobe() returns -EINVAL instead of -ENOENT
in that case, it always fails to put the kretprobe event on unloaded
modules. e.g.
Kprobe event:
/sys/kernel/debug/tracing # echo p xfs:xfs_end_io >> kprobe_events
[ 16.515574] trace_kprobe: This probe might be able to register after target module is loaded. Continue.
Kretprobe event: (p -> r)
/sys/kernel/debug/tracing # echo r xfs:xfs_end_io >> kprobe_events
sh: write error: Invalid argument
/sys/kernel/debug/tracing # cat error_log
[ 41.122514] trace_kprobe: error: Failed to register probe event
Command: r xfs:xfs_end_io
^
To fix this bug, change kprobe_on_func_entry() to detect symbol lookup
failure and return -ENOENT in that case. Otherwise it returns -EINVAL
or 0 (succeeded, given address is on the entry).
Link: https://lkml.kernel.org/r/161176187132.1067016.8118042342894378981.stgit@de…
Cc: stable(a)vger.kernel.org
Fixes: 59158ec4aef7 ("tracing/kprobes: Check the probe on unloaded module correctly")
Reported-by: Jianlin Lv <Jianlin.Lv(a)arm.com>
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
include/linux/kprobes.h | 2 +-
kernel/kprobes.c | 34 +++++++++++++++++++++++++---------
kernel/trace/trace_kprobe.c | 10 ++++++----
3 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index a60488867dd0..a121fd8e7c3a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -232,7 +232,7 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern bool arch_kprobe_on_func_entry(unsigned long offset);
-extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
+extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 283c8b01ce78..8f9fbc74021d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1948,29 +1948,45 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
return !offset;
}
-bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym: Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
if (IS_ERR(kp_addr))
- return false;
+ return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_kprobe_on_func_entry(offset))
- return false;
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
+ return -ENOENT;
- return true;
+ if (!arch_kprobe_on_func_entry(offset))
+ return -EINVAL;
+
+ return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
- int ret = 0;
+ int ret;
struct kretprobe_instance *inst;
int i;
void *addr;
- if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
- return -EINVAL;
+ ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+ if (ret)
+ return ret;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1074a69beff3..233322c77b76 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -220,9 +220,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return tk ? kprobe_on_func_entry(tk->rp.kp.addr,
+ return tk ? (kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false;
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false;
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
@@ -811,9 +811,11 @@ static int trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
goto parse_error;
}
- if (kprobe_on_func_entry(NULL, symbol, offset))
+ ret = kprobe_on_func_entry(NULL, symbol, offset);
+ if (ret == 0)
flags |= TPARG_FL_FENTRY;
- if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
+ /* Defer the ENOENT case until register kprobe */
+ if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
goto parse_error;
}
From: Dave Hansen <dave.hansen(a)linux.intel.com>
I went to go add a new RECLAIM_* mode for the zone_reclaim_mode
sysctl. Like a good kernel developer, I also went to go update the
documentation. I noticed that the bits in the documentation didn't
match the bits in the #defines.
The VM never explicitly checks the RECLAIM_ZONE bit. The bit is,
however, implicitly checked when checking 'node_reclaim_mode==0'.
The RECLAIM_ZONE #define was removed in a cleanup. That, by itself
is fine.
But, when the bit was removed (bit 0) the _other_ bit locations also
got changed. That's not OK because the bit values are documented to
mean one specific thing and users surely rely on them meaning that one
thing and not changing from kernel to kernel. The end result is that
if someone had a script that did:
sysctl vm.zone_reclaim_mode=1
This script would have gone from enabling node reclaim for clean
unmapped pages to writing out pages during node reclaim after the
commit in question. That's not great.
Put the bits back the way they were and add a comment so something
like this is a bit harder to do again. Update the documentation to
make it clear that the first bit is ignored.
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Fixes: 648b5cf368e0 ("mm/vmscan: remove unused RECLAIM_OFF/RECLAIM_ZONE")
Reviewed-by: Ben Widawsky <ben.widawsky(a)intel.com>
Acked-by: David Rientjes <rientjes(a)google.com>
Acked-by: Christoph Lameter <cl(a)linux.com>
Cc: Alex Shi <alex.shi(a)linux.alibaba.com>
Cc: Daniel Wagner <dwagner(a)suse.de>
Cc: "Tobin C. Harding" <tobin(a)kernel.org>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Huang Ying <ying.huang(a)intel.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Qian Cai <cai(a)lca.pw>
Cc: Daniel Wagner <dwagner(a)suse.de>
Cc: osalvador <osalvador(a)suse.de>
Cc: stable(a)vger.kernel.org
--
Changes from v2:
* Update description to indicate that bit0 was used for clean
unmapped page node reclaim.
---
b/Documentation/admin-guide/sysctl/vm.rst | 10 +++++-----
b/mm/vmscan.c | 9 +++++++--
2 files changed, 12 insertions(+), 7 deletions(-)
diff -puN Documentation/admin-guide/sysctl/vm.rst~mm-vmscan-restore-old-zone_reclaim_mode-abi Documentation/admin-guide/sysctl/vm.rst
--- a/Documentation/admin-guide/sysctl/vm.rst~mm-vmscan-restore-old-zone_reclaim_mode-abi 2021-01-25 16:23:06.048866718 -0800
+++ b/Documentation/admin-guide/sysctl/vm.rst 2021-01-25 16:23:06.056866718 -0800
@@ -978,11 +978,11 @@ that benefit from having their data cach
left disabled as the caching effect is likely to be more important than
data locality.
-zone_reclaim may be enabled if it's known that the workload is partitioned
-such that each partition fits within a NUMA node and that accessing remote
-memory would cause a measurable performance reduction. The page allocator
-will then reclaim easily reusable pages (those page cache pages that are
-currently not used) before allocating off node pages.
+Consider enabling one or more zone_reclaim mode bits if it's known that the
+workload is partitioned such that each partition fits within a NUMA node
+and that accessing remote memory would cause a measurable performance
+reduction. The page allocator will take additional actions before
+allocating off node pages.
Allowing zone reclaim to write out pages stops processes that are
writing large amounts of data from dirtying pages on other nodes. Zone
diff -puN mm/vmscan.c~mm-vmscan-restore-old-zone_reclaim_mode-abi mm/vmscan.c
--- a/mm/vmscan.c~mm-vmscan-restore-old-zone_reclaim_mode-abi 2021-01-25 16:23:06.052866718 -0800
+++ b/mm/vmscan.c 2021-01-25 16:23:06.057866718 -0800
@@ -4086,8 +4086,13 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI. New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
_
This is a note to let you know that I've just added the patch titled
phy: lantiq: rcu-usb2: wait after clock enable
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the char-misc-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
>From 36acd5e24e3000691fb8d1ee31cf959cb1582d35 Mon Sep 17 00:00:00 2001
From: Mathias Kresin <dev(a)kresin.me>
Date: Thu, 7 Jan 2021 23:49:01 +0100
Subject: phy: lantiq: rcu-usb2: wait after clock enable
Commit 65dc2e725286 ("usb: dwc2: Update Core Reset programming flow.")
revealed that the phy isn't ready immediately after enabling its
clocks. The dwc2_check_core_version() fails and the dwc2 usb driver
errors out.
Add a short delay to let the phy get up and running. There isn't any
documentation on how much time is required; the value was chosen based on
tests.
Signed-off-by: Mathias Kresin <dev(a)kresin.me>
Acked-by: Hauke Mehrtens <hauke(a)hauke-m.de>
Acked-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Cc: <stable(a)vger.kernel.org> # v5.7+
Link: https://lore.kernel.org/r/20210107224901.2102479-1-dev@kresin.me
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
---
drivers/phy/lantiq/phy-lantiq-rcu-usb2.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/phy/lantiq/phy-lantiq-rcu-usb2.c b/drivers/phy/lantiq/phy-lantiq-rcu-usb2.c
index a7d126192cf1..29d246ea24b4 100644
--- a/drivers/phy/lantiq/phy-lantiq-rcu-usb2.c
+++ b/drivers/phy/lantiq/phy-lantiq-rcu-usb2.c
@@ -124,8 +124,16 @@ static int ltq_rcu_usb2_phy_power_on(struct phy *phy)
reset_control_deassert(priv->phy_reset);
ret = clk_prepare_enable(priv->phy_gate_clk);
- if (ret)
+ if (ret) {
dev_err(dev, "failed to enable PHY gate\n");
+ return ret;
+ }
+
+ /*
+ * at least the xrx200 usb2 phy requires some extra time to be
+ * operational after enabling the clock
+ */
+ usleep_range(100, 200);
return ret;
}
--
2.30.1
ftw() has been obsolete for about 12 years now.
Fixes: bb1c15b60b98 ("perf stat: Support regex pattern in --for-each-cgroup")
CC: stable(a)vger.kernel.org
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
---
Notes:
NOTE: Not runtime-tested, I have no idea what I need to do in perf
to test this. But at least it compiles now with my uClibc-based
toolchain.
tools/perf/util/cgroup.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 5dff7e489921..f24ab4585553 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -161,7 +161,7 @@ void evlist__set_default_cgroup(struct evlist *evlist, struct cgroup *cgroup)
/* helper function for ftw() in match_cgroups and list_cgroups */
static int add_cgroup_name(const char *fpath, const struct stat *sb __maybe_unused,
- int typeflag)
+ int typeflag, struct FTW *ftwbuf __maybe_unused)
{
struct cgroup_name *cn;
@@ -209,12 +209,12 @@ static int list_cgroups(const char *str)
if (!s)
return -1;
/* pretend if it's added by ftw() */
- ret = add_cgroup_name(s, NULL, FTW_D);
+ ret = add_cgroup_name(s, NULL, FTW_D, NULL);
free(s);
if (ret)
return -1;
} else {
- if (add_cgroup_name("", NULL, FTW_D) < 0)
+ if (add_cgroup_name("", NULL, FTW_D, NULL) < 0)
return -1;
}
@@ -247,7 +247,7 @@ static int match_cgroups(const char *str)
prefix_len = strlen(mnt);
/* collect all cgroups in the cgroup_list */
- if (ftw(mnt, add_cgroup_name, 20) < 0)
+ if (nftw(mnt, add_cgroup_name, 20, 0) < 0)
return -1;
for (;;) {
--
2.30.0