The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a9d2e9ae953f0ddd0327479c81a085adaa76d903 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)nvidia.com>
Date: Fri, 6 Nov 2020 10:00:49 -0400
Subject: [PATCH] RDMA/siw,rxe: Make emulated devices virtual in the device
tree
This moves siw and rxe to be virtual devices in the device tree:
lrwxrwxrwx 1 root root 0 Nov 6 13:55 /sys/class/infiniband/rxe0 -> ../../devices/virtual/infiniband/rxe0/
Previously they were trying to parent themselves to the physical device of
their attached netdev, which doesn't make alot of sense.
My hope is this will solve some weird syzkaller hits related to sysfs as
it could be possible that the parent of a netdev is another netdev, eg
under bonding or some other syzkaller found netdev configuration.
Nesting a ib_device under anything but a physical device is going to cause
inconsistencies in sysfs during destructions.
Link: https://lore.kernel.org/r/0-v1-dcbfc68c4b4a+d6-virtual_dev_jgg@nvidia.com
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index ee3cf3af7cab..c4b06ced30a7 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -19,15 +19,6 @@
static struct rxe_recv_sockets recv_sockets;
-struct device *rxe_dma_device(struct rxe_dev *rxe)
-{
- struct net_device *ndev;
-
- ndev = rxe->ndev;
-
- return ndev->dev.parent;
-}
-
int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
{
int err;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index a2bd91aaa5de..2fbea2b2d72a 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1134,7 +1134,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
dev->node_type = RDMA_NODE_IB_CA;
dev->phys_port_cnt = 1;
dev->num_comp_vectors = num_possible_cpus();
- dev->dev.parent = rxe_dma_device(rxe);
dev->local_dma_lkey = 0;
addrconf_addr_eui48((unsigned char *)&dev->node_guid,
rxe->ndev->dev_addr);
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index d9de062852c4..ee95cf29179d 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -305,24 +305,8 @@ static struct siw_device *siw_device_create(struct net_device *netdev)
{
struct siw_device *sdev = NULL;
struct ib_device *base_dev;
- struct device *parent = netdev->dev.parent;
int rv;
- if (!parent) {
- /*
- * The loopback device has no parent device,
- * so it appears as a top-level device. To support
- * loopback device connectivity, take this device
- * as the parent device. Skip all other devices
- * w/o parent device.
- */
- if (netdev->type != ARPHRD_LOOPBACK) {
- pr_warn("siw: device %s error: no parent device\n",
- netdev->name);
- return NULL;
- }
- parent = &netdev->dev;
- }
sdev = ib_alloc_device(siw_device, base_dev);
if (!sdev)
return NULL;
@@ -359,7 +343,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev)
* per physical port.
*/
base_dev->phys_port_cnt = 1;
- base_dev->dev.parent = parent;
base_dev->num_comp_vectors = num_possible_cpus();
xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
@@ -401,7 +384,7 @@ static struct siw_device *siw_device_create(struct net_device *netdev)
atomic_set(&sdev->num_mr, 0);
atomic_set(&sdev->num_pd, 0);
- sdev->numa_node = dev_to_node(parent);
+ sdev->numa_node = dev_to_node(&netdev->dev);
spin_lock_init(&sdev->lock);
return sdev;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bd14bf0e4a084514aa62d24d2109e0f09a93822f Mon Sep 17 00:00:00 2001
From: Stanley Chu <stanley.chu(a)mediatek.com>
Date: Tue, 8 Dec 2020 21:56:34 +0800
Subject: [PATCH] scsi: ufs: Re-enable WriteBooster after device reset
UFS 3.1 specification mentions that the WriteBooster flags listed below
will be set to their default values, i.e. disabled, after power cycle or
any type of reset event. Thus we need to reset the flag variables kept in
struct hba to align with the device status and ensure that
WriteBooster-related functions are configured properly after device reset.
Without this fix, WriteBooster will not be enabled successfully after by
ufshcd_wb_ctrl() after device reset because hba->wb_enabled remains true.
Flags required to be reset to default values:
- fWriteBoosterEn: hba->wb_enabled
- fWriteBoosterBufferFlushEn: hba->wb_buf_flush_enabled
- fWriteBoosterBufferFlushDuringHibernate: No variable mapped
Link: https://lore.kernel.org/r/20201208135635.15326-2-stanley.chu@mediatek.com
Fixes: 3d17b9b5ab11 ("scsi: ufs: Add write booster feature support")
Reviewed-by: Bean Huo <beanhuo(a)micron.com>
Signed-off-by: Stanley Chu <stanley.chu(a)mediatek.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 08c8a591e6b0..36d367eb8139 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -1221,8 +1221,13 @@ static inline void ufshcd_vops_device_reset(struct ufs_hba *hba)
if (hba->vops && hba->vops->device_reset) {
int err = hba->vops->device_reset(hba);
- if (!err)
+ if (!err) {
ufshcd_set_ufs_dev_active(hba);
+ if (ufshcd_is_wb_allowed(hba)) {
+ hba->wb_enabled = false;
+ hba->wb_buf_flush_enabled = false;
+ }
+ }
if (err != -EOPNOTSUPP)
ufshcd_update_evt_hist(hba, UFS_EVT_DEV_RESET, err);
}
The dentries such as /proc/<pid>/ns/ have the DCACHE_OP_DELETE flag, they
should be deleted when the process exits.
Suppose the following race appears:
release_task dput
-> proc_flush_task
-> dentry->d_op->d_delete(dentry)
-> __exit_signal
-> dentry->d_lockref.count-- and return.
In the proc_flush_task(), if another process is using this dentry, it will
not be deleted. At the same time, in dput(), d_op->d_delete() can be executed
before __exit_signal(pid has not been hashed), d_delete returns false, so
this dentry still cannot be deleted.
This dentry will always be cached (although its count is 0 and the
DCACHE_OP_DELETE flag is set), its parent denry will also be cached too, and
these dentries can only be deleted when drop_caches is manually triggered.
This will result in wasted memory. What's more troublesome is that these
dentries reference pid, according to the commit f333c700c610 ("pidns: Add a
limit on the number of pid namespaces"), if the pid cannot be released, it
may result in the inability to create a new pid_ns.
This problem occurred in our cluster environment (Linux 4.9 LTS).
We could reproduce it by manually constructing a test program + adding some
debugging switches in the kernel:
* A test program to open the directory (/proc/<pid>/ns) [1]
* Adding some debugging switches to the kernel, adding a delay between
proc_flush_task and __exit_signal in release_task() [2]
The test process is as follows:
A, terminal #1
Turn on the debug switch:
echo 1> /proc/sys/vm/dentry_debug_trace
Execute the following unshare command:
sudo unshare --pid --fork --mount-proc bash
B, terminal #2
Find the pid of the unshare process:
# pstree -p | grep unshare
| `-sshd(716)---bash(718)--sudo(816)---unshare(817)---bash(818)
Find the corresponding dentry:
# dmesg | grep pid=818
[70.424722] XXX proc_pid_instantiate:3119 pid=818 tid=818 entry=818/ffff8802c7b670e8
C, terminal #3
Execute the opendir program, it will always open the /proc/818/ns/ directory:
# ./a.out /proc/818/ns/
pid: 876
.
..
net
uts
ipc
pid
user
mnt
cgroup
D, go back to terminal #2
Turn on the debugging switches to construct the race:
# echo 818> /proc/sys/vm/dentry_debug_pid
# echo 1> /proc/sys/vm/dentry_debug_delay
Kill the unshare process (pid 818). Since the debugging switches have been
turned on, it will get stuck in release_task():
# kill -9 818
Then kill the process that opened the /proc/818/ns/ directory:
# kill -9 876
Then turn off these debugging switches to allow the 818 process to exit:
# echo 0> /proc/sys/vm/dentry_debug_delay
# echo 0> /proc/sys/vm/dentry_debug_pid
Checking the dmesg, we will find that the dentry(/proc/818/ns) ’s count is 0,
and the flag is 2800cc (#define DCACHE_OP_DELETE 0x00000008), but it is still
cached:
# dmesg | grep ffff8802a3999548
…
[565.559156] XXX dput:853 dentry=ns/ffff8802bea7b528, flag=2800cc, cnt=0, inode=ffff8802b38c2010, pdentry=818/ffff8802c7b670e8, pflag=20008c, pcnt=1, pinode=ffff8802c7812010, keywords: be cached
It could also be verified via the crash tool:
crash> dentry.d_flags,d_iname,d_inode,d_lockref -x ffff8802bea7b528
d_flags = 0x2800cc
d_iname = "ns\000kkkkkkkkkkkkkkkkkkkkkkkkkkkk"
d_inode = 0xffff8802b38c2010
d_lockref = {
{
lock_count = 0x0,
{
lock = {
{
rlock = {
raw_lock = {
{
val = {
counter = 0x0
},
{
locked = 0x0,
pending = 0x0
},
{
locked_pending = 0x0,
tail = 0x0
}
}
}
}
}
},
count = 0x0
}
}
}
crash> kmem ffff8802bea7b528
CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME
ffff8802dd5f5900 192 23663 26130 871 16k dentry
SLAB MEMORY NODE TOTAL ALLOCATED FREE
ffffea000afa9e00 ffff8802bea78000 0 30 25 5
FREE / [ALLOCATED]
[ffff8802bea7b520]
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea000afa9ec0 2bea7b000 dead000000000400 0 0 2fffff80000000
crash>
This series of patches is to fix this issue.
Regards,
Wen
Alexey Dobriyan (1):
proc: use %u for pid printing and slightly less stack
Andreas Gruenbacher (1):
proc: Pass file mode to proc_pid_make_inode
Christian Brauner (1):
clone: add CLONE_PIDFD
Eric W. Biederman (6):
proc: Better ownership of files for non-dumpable tasks in user
namespaces
proc: Rename in proc_inode rename sysctl_inodes sibling_inodes
proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache
proc: Clear the pieces of proc_inode that proc_evict_inode cares about
proc: Use d_invalidate in proc_prune_siblings_dcache
proc: Use a list of inodes to flush from proc
Joel Fernandes (Google) (1):
pidfd: add polling support
fs/proc/base.c | 242 ++++++++++++++++++++-------------------------
fs/proc/fd.c | 20 +---
fs/proc/inode.c | 67 ++++++++++++-
fs/proc/internal.h | 22 ++---
fs/proc/namespaces.c | 3 +-
fs/proc/proc_sysctl.c | 45 ++-------
fs/proc/self.c | 6 +-
fs/proc/thread_self.c | 5 +-
include/linux/pid.h | 5 +
include/linux/proc_fs.h | 4 +-
include/uapi/linux/sched.h | 1 +
kernel/exit.c | 5 +-
kernel/fork.c | 145 ++++++++++++++++++++++++++-
kernel/pid.c | 3 +
kernel/signal.c | 11 +++
security/selinux/hooks.c | 1 +
16 files changed, 357 insertions(+), 228 deletions(-)
[1] A test program to open the directory (/proc/<pid>/ns)
#include <stdio.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
int main(int argc, char *argv[])
{
DIR *dip;
struct dirent *dit;
if (argc < 2) {
printf("Usage :%s <directory>\n", argv[0]);
return -1;
}
if ((dip = opendir(argv[1])) == NULL) {
perror("opendir");
return -1;
}
printf("pid: %d\n", getpid());
while((dit = readdir (dip)) != NULL) {
printf("%s\n", dit->d_name);
}
while (1)
sleep (1);
return 0;
}
[2] Adding some debugging switches to the kernel, also adding a delay between
proc_flush_task and __exit_signal in release_task():
diff --git a/fs/dcache.c b/fs/dcache.c
index 05bad55..fafad37 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -84,6 +84,9 @@
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+int sysctl_dentry_debug_trace __read_mostly = 0;
+EXPORT_SYMBOL_GPL(sysctl_dentry_debug_trace);
+
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
@@ -758,6 +761,26 @@ static inline bool fast_dput(struct dentry *dentry)
return 0;
}
+#define DENTRY_DEBUG_TRACE(dentry, keywords) \
+do { \
+ if (sysctl_dentry_debug_trace) \
+ printk("XXX %s:%d " \
+ "dentry=%s/%p, flag=%x, cnt=%d, inode=%p, " \
+ "pdentry=%s/%p, pflag=%x, pcnt=%d, pinode=%p, " \
+ "keywords: %s\n", \
+ __func__, __LINE__, \
+ dentry->d_name.name, \
+ dentry, \
+ dentry->d_flags, \
+ dentry->d_lockref.count, \
+ dentry->d_inode, \
+ dentry->d_parent->d_name.name, \
+ dentry->d_parent, \
+ dentry->d_parent->d_flags, \
+ dentry->d_parent->d_lockref.count, \
+ dentry->d_parent->d_inode, \
+ keywords); \
+} while (0)
/*
* This is dput
@@ -804,6 +827,8 @@ void dput(struct dentry *dentry)
WARN_ON(d_in_lookup(dentry));
+ DENTRY_DEBUG_TRACE(dentry, "be checked");
+
/* Unreachable? Get rid of it */
if (unlikely(d_unhashed(dentry)))
goto kill_it;
@@ -812,8 +837,10 @@ void dput(struct dentry *dentry)
goto kill_it;
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
- if (dentry->d_op->d_delete(dentry))
+ if (dentry->d_op->d_delete(dentry)) {
+ DENTRY_DEBUG_TRACE(dentry, "be killed");
goto kill_it;
+ }
}
if (!(dentry->d_flags & DCACHE_REFERENCED))
@@ -822,6 +849,9 @@ void dput(struct dentry *dentry)
dentry->d_lockref.count--;
spin_unlock(&dentry->d_lock);
+
+ DENTRY_DEBUG_TRACE(dentry, "be cached");
+
return;
kill_it:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9e4183..419a409 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3090,6 +3090,8 @@ void proc_flush_task(struct task_struct *task)
}
}
+extern int sysctl_dentry_debug_trace;
+
static int proc_pid_instantiate(struct inode *dir,
struct dentry * dentry,
struct task_struct *task, const void *ptr)
@@ -3111,6 +3113,12 @@ static int proc_pid_instantiate(struct inode *dir,
d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
+
+ if (sysctl_dentry_debug_trace)
+ printk("XXX %s:%d pid=%d tid=%d entry=%s/%p\n",
+ __func__, __LINE__, task->pid, task->tgid,
+ dentry->d_name.name, dentry);
+
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, 0))
return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 27f4168..2b3e1b6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,8 @@
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/delay.h>
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
@@ -164,6 +166,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}
+int sysctl_dentry_debug_delay __read_mostly = 0;
+int sysctl_dentry_debug_pid __read_mostly = 0;
void release_task(struct task_struct *p)
{
@@ -178,6 +182,11 @@ void release_task(struct task_struct *p)
proc_flush_task(p);
+ if (sysctl_dentry_debug_delay && p->pid == sysctl_dentry_debug_pid) {
+ while (sysctl_dentry_debug_delay)
+ mdelay(1);
+ }
+
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
__exit_signal(p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 513e6da..27f1395 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -282,6 +282,10 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
static int max_extfrag_threshold = 1000;
#endif
+extern int sysctl_dentry_debug_trace;
+extern int sysctl_dentry_debug_delay;
+extern int sysctl_dentry_debug_pid;
+
static struct ctl_table kern_table[] = {
{
.procname = "sched_child_runs_first",
@@ -1498,6 +1502,30 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
.proc_handler = proc_dointvec,
.extra1 = &zero,
},
+ {
+ .procname = "dentry_debug_trace",
+ .data = &sysctl_dentry_debug_trace,
+ .maxlen = sizeof(sysctl_dentry_debug_trace),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "dentry_debug_delay",
+ .data = &sysctl_dentry_debug_delay,
+ .maxlen = sizeof(sysctl_dentry_debug_delay),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "dentry_debug_pid",
+ .data = &sysctl_dentry_debug_pid,
+ .maxlen = sizeof(sysctl_dentry_debug_pid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
{
.procname = "legacy_va_layout",
Signed-off-by: Wen Yang <wenyang(a)linux.alibaba.com>
Cc: Pavel Emelyanov <xemul(a)openvz.org>
Cc: Oleg Nesterov <oleg(a)tv-sign.ru>
Cc: Sukadev Bhattiprolu <sukadev(a)us.ibm.com>
Cc: Paul Menage <menage(a)google.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: <stable(a)vger.kernel.org>
--
1.8.3.1
From: Alexander Duyck <alexander.h.duyck(a)linux.intel.com>
From: Alexander Duyck <alexander.h.duyck(a)linux.intel.com>
commit 56ec43d8b02719402c9fcf984feb52ec2300f8a5 upstream.
As best as I can tell the meminit_pfn_in_nid call is completely redundant.
The deferred memory initialization is already making use of
for_each_free_mem_range which in turn will call into __next_mem_range
which will only return a memory range if it matches the node ID provided
assuming it is not NUMA_NO_NODE.
I am operating on the assumption that there are no zones or pgdata_t
structures that have a NUMA node of NUMA_NO_NODE associated with them. If
that is the case then __next_mem_range will never return a memory range
that doesn't match the zone's node ID and as such the check is redundant.
So one piece I would like to verify on this is if this works for ia64.
Technically it was using a different approach to get the node ID, but it
seems to have the node ID also encoded into the memblock. So I am
assuming this is okay, but would like to get confirmation on that.
On my x86_64 test system with 384GB of memory per node I saw a reduction
in initialization time from 2.80s to 1.85s as a result of this patch.
Link: http://lkml.kernel.org/r/20190405221219.12227.93957.stgit@localhost.localdo…
Signed-off-by: Alexander Duyck <alexander.h.duyck(a)linux.intel.com>
Reviewed-by: Pavel Tatashin <pavel.tatashin(a)microsoft.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Mike Rapoport <rppt(a)linux.ibm.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Khalid Aziz <khalid.aziz(a)oracle.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Laurent Dufour <ldufour(a)linux.vnet.ibm.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Mike Rapoport <rppt(a)linux.vnet.ibm.com>
Cc: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <yi.z.zhang(a)linux.intel.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Pavel Tatashin <pasha.tatashin(a)soleen.com>
---
mm/page_alloc.c | 51 ++++++++++++++-----------------------------------
1 file changed, 14 insertions(+), 37 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8c3051387d1..c86a117acb5b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1321,36 +1321,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1480,21 +1466,13 @@ static inline void __init pgdat_init_report_one_done(void)
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1502,15 +1480,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1530,17 +1507,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1603,12 +1581,12 @@ static int __init deferred_init_memmap(void *data)
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
+ nr_pages += deferred_init_pages(zone, spfn, epfn);
}
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
+ deferred_free_pages(spfn, epfn);
}
pgdat_resize_unlock(pgdat, &flags);
@@ -1640,7 +1618,6 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
int nid = zone_to_nid(zone);
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
@@ -1690,7 +1667,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
while (spfn < epfn && nr_pages < nr_pages_needed) {
t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
+ nr_pages += deferred_init_pages(zone, spfn,
first_deferred_pfn);
spfn = first_deferred_pfn;
}
@@ -1702,7 +1679,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
+ deferred_free_pages(spfn, epfn);
if (first_deferred_pfn == epfn)
break;
--
2.25.1