From: Loic Poulain <loic.poulain(a)linaro.org>
For whatever reason, some devices like QCA6390, WCN6855 using ath11k
are not in M3 state during PM resume, but still functional. The
mhi_pm_resume should then not fail in those cases, and let the higher
level device specific stack continue resuming process.
Add an API mhi_pm_resume_force(), to force resuming irrespective of the
current MHI state. This fixes a regression with non functional ath11k WiFi
after suspend/resume cycle on some machines.
Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179
Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state")
Cc: stable(a)vger.kernel.org #5.13
Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/
Reported-by: Kalle Valo <kvalo(a)codeaurora.org>
Reported-by: Pengyu Ma <mapengyu(a)gmail.com>
Signed-off-by: Loic Poulain <loic.poulain(a)linaro.org>
[mani: Switched to API, added bug report, reported-by tags and CCed stable]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
---
Changes in v2:
* Switched to a new API "mhi_pm_resume_force()" instead of the "force" flag as
suggested by Greg. The "force" flag is now used inside the API.
Greg: I'm sending this patch directly to you so that you can apply it to
char-misc once we get an ACK from Kalle.
drivers/bus/mhi/core/pm.c | 21 ++++++++++++++++++---
drivers/net/wireless/ath/ath11k/mhi.c | 6 +++++-
include/linux/mhi.h | 13 +++++++++++++
3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c
index fb99e3727155..547e6e769546 100644
--- a/drivers/bus/mhi/core/pm.c
+++ b/drivers/bus/mhi/core/pm.c
@@ -881,7 +881,7 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl)
}
EXPORT_SYMBOL_GPL(mhi_pm_suspend);
-int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
+static int __mhi_pm_resume(struct mhi_controller *mhi_cntrl, bool force)
{
struct mhi_chan *itr, *tmp;
struct device *dev = &mhi_cntrl->mhi_dev->dev;
@@ -898,8 +898,12 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state))
return -EIO;
- if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3)
- return -EINVAL;
+ if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3) {
+ dev_warn(dev, "Resuming from non M3 state (%s)\n",
+ TO_MHI_STATE_STR(mhi_get_mhi_state(mhi_cntrl)));
+ if (!force)
+ return -EINVAL;
+ }
/* Notify clients about exiting LPM */
list_for_each_entry_safe(itr, tmp, &mhi_cntrl->lpm_chans, node) {
@@ -940,8 +944,19 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
return 0;
}
+
+int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
+{
+ return __mhi_pm_resume(mhi_cntrl, false);
+}
EXPORT_SYMBOL_GPL(mhi_pm_resume);
+int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl)
+{
+ return __mhi_pm_resume(mhi_cntrl, true);
+}
+EXPORT_SYMBOL_GPL(mhi_pm_resume_force);
+
int __mhi_device_get_sync(struct mhi_controller *mhi_cntrl)
{
int ret;
diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c
index 26c7ae242db6..49c0b1ad40a0 100644
--- a/drivers/net/wireless/ath/ath11k/mhi.c
+++ b/drivers/net/wireless/ath/ath11k/mhi.c
@@ -533,7 +533,11 @@ static int ath11k_mhi_set_state(struct ath11k_pci *ab_pci,
ret = mhi_pm_suspend(ab_pci->mhi_ctrl);
break;
case ATH11K_MHI_RESUME:
- ret = mhi_pm_resume(ab_pci->mhi_ctrl);
+ /* Do force MHI resume as some devices like QCA6390, WCN6855
+ * are not in M3 state but they are functional. So just ignore
+ * the MHI state while resuming.
+ */
+ ret = mhi_pm_resume_force(ab_pci->mhi_ctrl);
break;
case ATH11K_MHI_TRIGGER_RDDM:
ret = mhi_force_rddm_mode(ab_pci->mhi_ctrl);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 723985879035..a5cc4cdf9cc8 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl);
*/
int mhi_pm_resume(struct mhi_controller *mhi_cntrl);
+/**
+ * mhi_pm_resume_force - Force resume MHI from suspended state
+ * @mhi_cntrl: MHI controller
+ *
+ * Resume the device irrespective of its MHI state. As per the MHI spec, devices
+ * have to be in M3 state during resume. But some devices seem to be in a
+ * different MHI state other than M3 but they continue working fine if allowed.
+ * This API is intended to be used for such devices.
+ *
+ * Return: 0 if the resume succeeds, a negative error code otherwise
+ */
+int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl);
+
/**
* mhi_download_rddm_image - Download ramdump image from device for
* debugging purpose.
--
2.25.1
Without bounds checks for scpi_pd->name, it could result in a buffer
overflow when copying the SCPI device name from the corresponding device
tree node, as the name string is limited to a maximum size of 30.
Let us fix it by using devm_kasprintf so that the string buffer is
allocated dynamically.
Cc: stable(a)vger.kernel.org
Fixes: 8bec4337ad40 ("firmware: scpi: add device power domain support using genpd")
Reported-by: Pedro Batista <pedbap.g(a)gmail.com>
Cc: Cristian Marussi <cristian.marussi(a)arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla(a)arm.com>
---
drivers/firmware/scpi_pm_domain.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
Hi ARM SoC team,
Can you apply this directly as I don't have any other fix at the moment.
Regards,
Sudeep
v1->v2:
- Fixed accidentally dropped '.' in the name
- Used devm_kasprintf instead of combination of kasprintf and
devm_kstrdup
v1: https://lore.kernel.org/r/20211206153150.565685-1-sudeep.holla@arm.com/
diff --git a/drivers/firmware/scpi_pm_domain.c b/drivers/firmware/scpi_pm_domain.c
index 51201600d789..800673910b51 100644
--- a/drivers/firmware/scpi_pm_domain.c
+++ b/drivers/firmware/scpi_pm_domain.c
@@ -16,7 +16,6 @@ struct scpi_pm_domain {
struct generic_pm_domain genpd;
struct scpi_ops *ops;
u32 domain;
- char name[30];
};
/*
@@ -110,8 +109,13 @@ static int scpi_pm_domain_probe(struct platform_device *pdev)
scpi_pd->domain = i;
scpi_pd->ops = scpi_ops;
- sprintf(scpi_pd->name, "%pOFn.%d", np, i);
- scpi_pd->genpd.name = scpi_pd->name;
+ scpi_pd->genpd.name = devm_kasprintf(dev, GFP_KERNEL,
+ "%pOFn.%d", np, i);
+ if (!scpi_pd->genpd.name) {
+ dev_err(dev, "Failed to allocate genpd name:%pOFn.%d\n",
+ np, i);
+ continue;
+ }
scpi_pd->genpd.power_off = scpi_pd_power_off;
scpi_pd->genpd.power_on = scpi_pd_power_on;
--
2.25.1
If cfg80211 is providing extraie's for a scanning process then ath11k will
copy that over to the firmware. The extraie.len is a 32 bit value in struct
element_info and describes the amount of bytes for the vendor information
elements.
The WMI_TLV packet has a special WMI_TAG_ARRAY_BYTE section. This
section can have a (payload) length up to 65535 bytes because the
WMI_TLV_LEN can store up to 16 bits. The code was missing such a check and
could have created a scan request which cannot be parsed correctly by the
firmware.
But the bigger problem was the allocation of the buffer. It has to align
the TLV sections by 4 bytes. But the code was using an u8 to store the
newly calculated length of this section (with alignment). And the new
calculated length was then used to allocate the skbuff. But the actual code
to copy in the data is using the extraie.len and not the calculated
"aligned" length.
The length of extraie with IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS enabled
was 264 bytes during tests with a QCA Milan card. But it only allocated 8
bytes (264 bytes % 256) for it. As a consequence, the code to memcpy the
extraie into the skb was then just overwriting data after skb->end. Things
like shinfo were therefore corrupted. This could usually be seen by a crash
in skb_zcopy_clear which tried to call a ubuf_info callback (using a bogus
address).
Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-02892.1-QCAHSPSWPL_V1_V2_SILICONZ_LITE-1
Cc: stable(a)vger.kernel.org
Fixes: d5c65159f289 ("ath11k: driver for Qualcomm IEEE 802.11ax devices")
Signed-off-by: Sven Eckelmann <sven(a)narfation.org>
---
drivers/net/wireless/ath/ath11k/wmi.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 6da16d095e37..d817ea468ab5 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -2115,7 +2115,7 @@ int ath11k_wmi_send_scan_start_cmd(struct ath11k *ar,
void *ptr;
int i, ret, len;
u32 *tmp_ptr;
- u8 extraie_len_with_pad = 0;
+ u16 extraie_len_with_pad = 0;
struct hint_short_ssid *s_ssid = NULL;
struct hint_bssid *hint_bssid = NULL;
@@ -2134,7 +2134,7 @@ int ath11k_wmi_send_scan_start_cmd(struct ath11k *ar,
len += sizeof(*bssid) * params->num_bssid;
len += TLV_HDR_SIZE;
- if (params->extraie.len)
+ if (params->extraie.len && params->extraie.len <= 0xFFFF)
extraie_len_with_pad =
roundup(params->extraie.len, sizeof(u32));
len += extraie_len_with_pad;
@@ -2241,7 +2241,7 @@ int ath11k_wmi_send_scan_start_cmd(struct ath11k *ar,
FIELD_PREP(WMI_TLV_LEN, len);
ptr += TLV_HDR_SIZE;
- if (params->extraie.len)
+ if (extraie_len_with_pad)
memcpy(ptr, params->extraie.ptr,
params->extraie.len);
--
2.30.2
Christophe Leroy <christophe.leroy(a)csgroup.eu> writes:
> Le 07/12/2021 à 11:34, Maxime Bizon a écrit :
>>
>> On Tue, 2021-12-07 at 06:10 +0000, Christophe Leroy wrote:
>>
>> Hello,
>>
>> With the patch applied and
>>
>> CONFIG_DEBUG_PAGEALLOC=y
>> CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=y
>> CONFIG_DEBUG_VM=y
>>
>> I get tons of this during boot:
>>
>> [ 0.000000] Dentry cache hash table entries: 262144 (order: 8, 1048576 bytes, linear)
>> [ 0.000000] Inode-cache hash table entries: 131072 (order: 7, 524288 bytes, linear)
>> [ 0.000000] mem auto-init: stack:off, heap alloc:off, heap free:off
>> [ 0.000000] ------------[ cut here ]------------
>> [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/powerpc/mm/pgtable.c:194 set_pte_at+0x18/0x160
>> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.15.0+ #442
>> [ 0.000000] NIP: 80015ebc LR: 80016728 CTR: 800166e4
>> [ 0.000000] REGS: 80751dd0 TRAP: 0700 Not tainted (5.15.0+)
>> [ 0.000000] MSR: 00021032 <ME,IR,DR,RI> CR: 42228882 XER: 20000000
>> [ 0.000000]
>> [ 0.000000] GPR00: 800b8dc8 80751e80 806c6300 807311d8 807a1000 8ffffe84 80751ea8 00000000
>> [ 0.000000] GPR08: 007a1591 00000001 007a1180 00000000 42224882 00000000 3ff9c608 3fffd79c
>> [ 0.000000] GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 800166e4 807a2000
>> [ 0.000000] GPR24: 807a1fff 807311d8 807311d8 807a2000 80768804 00000000 807a1000 007a1180
>> [ 0.000000] NIP [80015ebc] set_pte_at+0x18/0x160
>> [ 0.000000] LR [80016728] set_page_attr+0x44/0xc0
>> [ 0.000000] Call Trace:
>> [ 0.000000] [80751e80] [80058570] console_unlock+0x340/0x428 (unreliable)
>> [ 0.000000] [80751ea0] [00000000] 0x0
>> [ 0.000000] [80751ec0] [800b8dc8] __apply_to_page_range+0x144/0x2a8
>> [ 0.000000] [80751f00] [80016918] __kernel_map_pages+0x54/0x64
>> [ 0.000000] [80751f10] [800cfeb0] __free_pages_ok+0x1b0/0x440
>> [ 0.000000] [80751f50] [805cfc8c] memblock_free_all+0x1d8/0x274
>> [ 0.000000] [80751f90] [805c5e0c] mem_init+0x3c/0xd0
>> [ 0.000000] [80751fb0] [805c0bdc] start_kernel+0x404/0x5c4
>> [ 0.000000] [80751ff0] [000033f0] 0x33f0
>> [ 0.000000] Instruction dump:
>> [ 0.000000] 7c630034 83e1000c 5463d97e 7c0803a6 38210010 4e800020 9421ffe0 93e1001c
>> [ 0.000000] 83e60000 81250000 71290001 41820014 <0fe00000> 7c0802a6 93c10018 90010024
>>
>>
>
> That's unrelated to this patch.
>
> The problem is linked to patch c988cfd38e48 ("powerpc/32: use
> set_memory_attr()"), which changed from using __set_pte_at() to using
> set_memory_attr() which uses set_pte_at().
>
> set_pte_at() has additional checks and shall not be used to updating an
> existing PTE.
>
> Wondering if I should just use __set_pte_at() instead like in the past,
> or do like commit 9f7853d7609d ("powerpc/mm: Fix set_memory_*() against
> concurrent accesses") and use pte_update()
>
> Michael, Aneesh, any suggestion ?
The motivation for using pte_update() in that commit is that it does the
update atomically and also handles flushing the HPTE for 64-bit Hash.
But the book3s/32 version of pte_update() doesn't do that. In fact
there's some HPTE handling in __set_pte_at(), but then also a comment
saying it's handled in a subsequent flush_tlb_xxx().
So that doesn't really help make a decision :)
On the other hand, could you convert those set_memory_attr() calls to
change_memory_attr() and then eventually drop the former?
cheers
This is the start of the stable review cycle for the 4.19.220 release.
There are 48 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 08 Dec 2021 14:55:37 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.220-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.220-rc1
Wei Yongjun <weiyongjun1(a)huawei.com>
ipmi: msghandler: Make symbol 'remove_work_wq' static
Helge Deller <deller(a)gmx.de>
parisc: Mark cr16 CPU clocksource unstable on all SMP machines
Johan Hovold <johan(a)kernel.org>
serial: core: fix transmit-buffer reset and memleak
Pierre Gondois <Pierre.Gondois(a)arm.com>
serial: pl011: Add ACPI SBSA UART match id
Sven Eckelmann <sven(a)narfation.org>
tty: serial: msm_serial: Deactivate RX DMA for polling support
Joerg Roedel <jroedel(a)suse.de>
x86/64/mm: Map all kernel memory into trampoline_pgd
Badhri Jagan Sridharan <badhri(a)google.com>
usb: typec: tcpm: Wait in SNK_DEBOUNCED until disconnect
Ole Ernst <olebowle(a)gmx.com>
USB: NO_LPM quirk Lenovo Powered USB-C Travel Hub
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: Fix commad ring abort, write all 64 bits to CRCR register.
Maciej W. Rozycki <macro(a)orcam.me.uk>
vgacon: Propagate console boot parameters before calling `vc_resize'
Helge Deller <deller(a)gmx.de>
parisc: Fix "make install" on newer debian releases
Helge Deller <deller(a)gmx.de>
parisc: Fix KBUILD_IMAGE for self-extracting kernel
Rob Clark <robdclark(a)chromium.org>
drm/msm: Do hw_init() before capturing GPU state
Tony Lu <tonylu(a)linux.alibaba.com>
net/smc: Keep smc_close_final rc during active close
William Kucharski <william.kucharski(a)oracle.com>
net/rds: correct socket tunable error in rds_tcp_tune()
Eric Dumazet <edumazet(a)google.com>
net: annotate data-races on txq->xmit_lock_owner
Sven Schuchmann <schuchmann(a)schleissheimer.de>
net: usb: lan78xx: lan78xx_phy_init(): use PHY_POLL instead of "0" if no IRQ is available
Eiichi Tsukata <eiichi.tsukata(a)nutanix.com>
rxrpc: Fix rxrpc_local leak in rxrpc_lookup_peer()
Zhou Qingyang <zhou1615(a)umn.edu>
net/mlx4_en: Fix an use-after-free bug in mlx4_en_try_alloc_resources()
Arnd Bergmann <arnd(a)arndb.de>
siphash: use _unaligned version by default
Benjamin Poirier <bpoirier(a)nvidia.com>
net: mpls: Fix notifications when deleting a device
Zhou Qingyang <zhou1615(a)umn.edu>
net: qlogic: qlcnic: Fix a NULL pointer dereference in qlcnic_83xx_add_rings()
Randy Dunlap <rdunlap(a)infradead.org>
natsemi: xtensa: fix section mismatch warnings
Alain Volmat <alain.volmat(a)foss.st.com>
i2c: stm32f7: stop dma transfer in case of NACK
Alain Volmat <alain.volmat(a)foss.st.com>
i2c: stm32f7: recover the bus on access timeout
Linus Torvalds <torvalds(a)linux-foundation.org>
fget: check that the fd still exists after getting a ref to it
Jens Axboe <axboe(a)kernel.dk>
fs: add fget_many() and fput_many()
Baokun Li <libaokun1(a)huawei.com>
sata_fsl: fix warning in remove_proc_entry when rmmod sata_fsl
Baokun Li <libaokun1(a)huawei.com>
sata_fsl: fix UAF in sata_fsl_port_stop when rmmod sata_fsl
Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
ipmi: Move remove_work to dedicated workqueue
Masami Hiramatsu <mhiramat(a)kernel.org>
kprobes: Limit max data_size of the kretprobe instances
Stephen Suryaputra <ssuryaextr(a)gmail.com>
vrf: Reset IPCB/IP6CB when processing outbound pkts in vrf dev xmit
Ian Rogers <irogers(a)google.com>
perf hist: Fix memory leak of a perf_hpp_fmt
Teng Qi <starmiku1207184332(a)gmail.com>
net: ethernet: dec: tulip: de4x5: fix possible array overflows in type3_infoblock()
zhangyue <zhangyue1(a)kylinos.cn>
net: tulip: de4x5: fix the problem that the array 'lp->phy[8]' may be out of bound
Teng Qi <starmiku1207184332(a)gmail.com>
ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port()
Mario Limonciello <mario.limonciello(a)amd.com>
ata: ahci: Add Green Sardine vendor ID as board_ahci_mobile
Mike Christie <michael.christie(a)oracle.com>
scsi: iscsi: Unblock session then wake up error handler
Manaf Meethalavalappu Pallikunhi <manafm(a)codeaurora.org>
thermal: core: Reset previous low and high trip during thermal zone init
Wang Yugui <wangyugui(a)e16-tech.com>
btrfs: check-integrity: fix a warning on write caching disabled disk
Vasily Gorbik <gor(a)linux.ibm.com>
s390/setup: avoid using memblock_enforce_memory_limit
Slark Xiao <slark_xiao(a)163.com>
platform/x86: thinkpad_acpi: Fix WWAN device disabled issue after S3 deep
liuguoqiang <liuguoqiang(a)uniontech.com>
net: return correct error code
Zekun Shen <bruceshenzk(a)gmail.com>
atlantic: Fix OOB read and write in hw_atl_utils_fw_rpc_wait
Andreas Gruenbacher <agruenba(a)redhat.com>
gfs2: Fix length of holes reported at end-of-file
Geert Uytterhoeven <geert+renesas(a)glider.be>
of: clk: Make <linux/of_clk.h> self-contained
Benjamin Coddington <bcodding(a)redhat.com>
NFSv42: Fix pagecache invalidation after COPY/CLONE
Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
shm: extend forced shm destroy to support objects from several IPC nses
-------------
Diffstat:
Makefile | 4 +-
arch/parisc/Makefile | 5 +
arch/parisc/install.sh | 1 +
arch/parisc/kernel/time.c | 24 +--
arch/s390/kernel/setup.c | 3 -
arch/x86/realmode/init.c | 12 +-
drivers/ata/ahci.c | 1 +
drivers/ata/sata_fsl.c | 20 ++-
drivers/char/ipmi/ipmi_msghandler.c | 13 +-
drivers/gpu/drm/msm/msm_debugfs.c | 1 +
drivers/i2c/busses/i2c-stm32f7.c | 11 +-
.../aquantia/atlantic/hw_atl/hw_atl_utils.c | 10 ++
drivers/net/ethernet/dec/tulip/de4x5.c | 34 ++--
drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 4 +
drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 9 +-
drivers/net/ethernet/natsemi/xtsonic.c | 2 +-
.../net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 10 +-
drivers/net/usb/lan78xx.c | 2 +-
drivers/net/vrf.c | 2 +
drivers/platform/x86/thinkpad_acpi.c | 12 --
drivers/scsi/scsi_transport_iscsi.c | 6 +-
drivers/thermal/thermal_core.c | 2 +
drivers/tty/serial/amba-pl011.c | 1 +
drivers/tty/serial/msm_serial.c | 3 +
drivers/tty/serial/serial_core.c | 13 +-
drivers/usb/core/quirks.c | 3 +
drivers/usb/host/xhci-ring.c | 21 ++-
drivers/usb/typec/tcpm.c | 4 -
drivers/video/console/vgacon.c | 14 +-
fs/btrfs/disk-io.c | 14 +-
fs/file.c | 19 ++-
fs/file_table.c | 9 +-
fs/gfs2/bmap.c | 2 +-
fs/nfs/nfs42proc.c | 5 +-
include/linux/file.h | 2 +
include/linux/fs.h | 4 +-
include/linux/ipc_namespace.h | 15 ++
include/linux/kprobes.h | 2 +
include/linux/netdevice.h | 19 ++-
include/linux/of_clk.h | 3 +
include/linux/sched/task.h | 2 +-
include/linux/siphash.h | 14 +-
ipc/shm.c | 189 ++++++++++++++++-----
kernel/kprobes.c | 3 +
lib/siphash.c | 12 +-
net/core/dev.c | 5 +-
net/ipv4/devinet.c | 2 +-
net/mpls/af_mpls.c | 68 ++++++--
net/rds/tcp.c | 2 +-
net/rxrpc/peer_object.c | 14 +-
net/smc/smc_close.c | 8 +-
tools/perf/ui/hist.c | 28 +--
tools/perf/util/hist.h | 1 -
53 files changed, 481 insertions(+), 208 deletions(-)
The patch titled
Subject: mm: mempolicy: fix THP allocations escaping mempolicy restrictions
has been added to the -mm tree. Its filename is
mm-mempolicy-fix-thp-allocations-escaping-mempolicy-restrictions.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-mempolicy-fix-thp-allocations-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-mempolicy-fix-thp-allocations-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Andrey Ryabinin <arbn(a)yandex-team.com>
Subject: mm: mempolicy: fix THP allocations escaping mempolicy restrictions
alloc_pages_vma() may try to allocate THP page on the local NUMA node
first:
page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE | __GFP_NORETRY, order);
And if the allocation fails it retries allowing remote memory:
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
page = __alloc_pages_node(hpage_node,
gfp, order);
However, this retry allocation completely ignores memory policy nodemask
allowing allocation to escape restrictions.
The first appearance of this bug seems to be the commit ac5b2c18911f
("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings")
The bug disappeared later in the commit 89c83fb539f9
("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask")
and reappeared again in slightly different form in the commit 76e654cc91bb
("mm, page_alloc: allow hugepage fallback to remote nodes when madvised")
Fix this by passing correct nodemask to the __alloc_pages() call.
The demonstration/reproducer of the problem:
$ mount -oremount,size=4G,huge=always /dev/shm/
$ echo always > /sys/kernel/mm/transparent_hugepage/defrag
$ cat mbind_thp.c
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <numaif.h>
#define SIZE 2ULL << 30
int main(int argc, char **argv)
{
int fd;
unsigned long long i;
char *addr;
pid_t pid;
char buf[100];
unsigned long nodemask = 1;
fd = open("/dev/shm/test", O_RDWR|O_CREAT);
assert(fd > 0);
assert(ftruncate(fd, SIZE) == 0);
addr = mmap(NULL, SIZE, PROT_READ|PROT_WRITE,
MAP_SHARED, fd, 0);
assert(mbind(addr, SIZE, MPOL_BIND, &nodemask, 2, MPOL_MF_STRICT|MPOL_MF_MOVE)==0);
for (i = 0; i < SIZE; i+=4096) {
addr[i] = 1;
}
pid = getpid();
snprintf(buf, sizeof(buf), "grep shm /proc/%d/numa_maps", pid);
system(buf);
sleep(10000);
return 0;
}
$ gcc mbind_thp.c -o mbind_thp -lnuma
$ numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 2
node 0 size: 1918 MB
node 0 free: 1595 MB
node 1 cpus: 1 3
node 1 size: 2014 MB
node 1 free: 1731 MB
node distances:
node 0 1
0: 10 20
1: 20 10
$ rm -f /dev/shm/test; taskset -c 0 ./mbind_thp
7fd970a00000 bind:0 file=/dev/shm/test dirty=524288 active=0 N0=396800 N1=127488 kernelpagesize_kB=4
Link: https://lkml.kernel.org/r/20211208165343.22349-1-arbn@yandex-team.com
Fixes: ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings")
Signed-off-by: Andrey Ryabinin <arbn(a)yandex-team.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: David Rientjes <rientjes(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/mm/mempolicy.c~mm-mempolicy-fix-thp-allocations-escaping-mempolicy-restrictions
+++ a/mm/mempolicy.c
@@ -2140,8 +2140,7 @@ struct page *alloc_pages_vma(gfp_t gfp,
* memory with both reclaim and compact as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
- page = __alloc_pages_node(hpage_node,
- gfp, order);
+ page = __alloc_pages(gfp, order, hpage_node, nmask);
goto out;
}
_
Patches currently in -mm which might be from arbn(a)yandex-team.com are
mm-mempolicy-fix-thp-allocations-escaping-mempolicy-restrictions.patch