The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b321c31c9b7b309dcde5e8854b741c8e6a9a05f0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023072324-aviation-delirious-b27d@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
b321c31c9b7b ("KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption")
0c2f9acf6ae7 ("KVM: arm64: PMU: Don't overwrite PMUSERENR with vcpu loaded")
8681f7175901 ("KVM: arm64: PMU: Restore the host's PMUSERENR_EL0")
009d6dc87a56 ("ARM: perf: Allow the use of the PMUv3 driver on 32bit ARM")
711432770f78 ("perf: pmuv3: Abstract PMU version checks")
df29ddf4f04b ("arm64: perf: Abstract system register accesses away")
7755cec63ade ("arm64: perf: Move PMUv3 driver to drivers/perf")
cc91b9481605 ("arm64/perf: Replace PMU version number '0' with ID_AA64DFR0_EL1_PMUVer_NI")
4151bb636acf ("KVM: arm64: Fix SMPRI_EL1/TPIDR2_EL0 trapping on VHE")
bb0cca240a16 ("Merge branch kvm-arm64/single-step-async-exception into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b321c31c9b7b309dcde5e8854b741c8e6a9a05f0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 13 Jul 2023 08:06:57 +0100
Subject: [PATCH] KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t
preemption
Xiang reports that VMs occasionally fail to boot on GICv4.1 systems when
running a preemptible kernel, as it is possible that a vCPU is blocked
without requesting a doorbell interrupt.
The issue is that any preemption that occurs between vgic_v4_put() and
schedule() on the block path will mark the vPE as nonresident and *not*
request a doorbell irq. This occurs because when the vcpu thread is
resumed on its way to block, vcpu_load() will make the vPE resident
again. Once the vcpu actually blocks, we don't request a doorbell
anymore, and the vcpu won't be woken up on interrupt delivery.
Fix it by tracking that we're entering WFI, and key the doorbell
request on that flag. This allows us not to make the vPE resident
when going through a preempt/schedule cycle, meaning we don't lose
any state.
Cc: stable(a)vger.kernel.org
Fixes: 8e01d9a396e6 ("KVM: arm64: vgic-v4: Move the GICv4 residency flow to be driven by vcpu_load/put")
Reported-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Suggested-by: Zenghui Yu <yuzenghui(a)huawei.com>
Tested-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Co-developed-by: Oliver Upton <oliver.upton(a)linux.dev>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Zenghui Yu <yuzenghui(a)huawei.com>
Link: https://lore.kernel.org/r/20230713070657.3873244-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8b6096753740..d3dd05bbfe23 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -727,6 +727,8 @@ struct kvm_vcpu_arch {
#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5))
/* PMUSERENR for the guest EL0 is on physical CPU */
#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6))
+/* WFI instruction trapped */
+#define IN_WFI __vcpu_single_flag(sflags, BIT(7))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a402ea5511f3..72dc53a75d1c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -718,13 +718,15 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
*/
preempt_disable();
kvm_vgic_vmcr_sync(vcpu);
- vgic_v4_put(vcpu, true);
+ vcpu_set_flag(vcpu, IN_WFI);
+ vgic_v4_put(vcpu);
preempt_enable();
kvm_vcpu_halt(vcpu);
vcpu_clear_flag(vcpu, IN_WFIT);
preempt_disable();
+ vcpu_clear_flag(vcpu, IN_WFI);
vgic_v4_load(vcpu);
preempt_enable();
}
@@ -792,7 +794,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
/* The distributor enable bits were changed */
preempt_disable();
- vgic_v4_put(vcpu, false);
+ vgic_v4_put(vcpu);
vgic_v4_load(vcpu);
preempt_enable();
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index c3b8e132d599..3dfc8b84e03e 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -749,7 +749,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- WARN_ON(vgic_v4_put(vcpu, false));
+ WARN_ON(vgic_v4_put(vcpu));
vgic_v3_vmcr_sync(vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index c1c28fe680ba..339a55194b2c 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -336,14 +336,14 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_make_vpe_non_resident(vpe, need_db);
+ return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI));
}
int vgic_v4_load(struct kvm_vcpu *vcpu)
@@ -354,6 +354,9 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
return 0;
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ return 0;
+
/*
* Before making the VPE resident, make sure the redistributor
* corresponding to our current CPU expects us here. See the
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 402b545959af..5b27f94d4fad 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -431,7 +431,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq,
int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db);
+int vgic_v4_put(struct kvm_vcpu *vcpu);
/* CPU HP callbacks */
void kvm_vgic_cpu_up(void);
在 2024/6/30 10:23, Sasha Levin 写道:
> This is a note to let you know that I've just added the patch titled
>
> md: Fix overflow in is_mddev_idle
>
> to the 6.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> md-fix-overflow-in-is_mddev_idle.patch
> and it can be found in the queue-6.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
This patch is reverted by 504fbcffea649cad69111e7597081dd8adc3b395. It
should not be added to the stable tree.
>
> commit 4a45a13010724614f33c9096cee4a1ee19955aa0
> Author: Li Nan <linan122(a)huawei.com>
> Date: Wed Jan 17 11:19:45 2024 +0800
>
> md: Fix overflow in is_mddev_idle
>
> [ Upstream commit 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e ]
>
> UBSAN reports this problem:
>
> UBSAN: Undefined behaviour in drivers/md/md.c:8175:15
> signed integer overflow:
> -2147483291 - 2072033152 cannot be represented in type 'int'
> Call trace:
> dump_backtrace+0x0/0x310
> show_stack+0x28/0x38
> dump_stack+0xec/0x15c
> ubsan_epilogue+0x18/0x84
> handle_overflow+0x14c/0x19c
> __ubsan_handle_sub_overflow+0x34/0x44
> is_mddev_idle+0x338/0x3d8
> md_do_sync+0x1bb8/0x1cf8
> md_thread+0x220/0x288
> kthread+0x1d8/0x1e0
> ret_from_fork+0x10/0x18
>
> 'curr_events' will overflow when stat accum or 'sync_io' is greater than
> INT_MAX.
>
> Fix it by changing sync_io, last_events and curr_events to 64bit.
>
> Signed-off-by: Li Nan <linan122(a)huawei.com>
> Reviewed-by: Yu Kuai <yukuai3(a)huawei.com>
> Link: https://lore.kernel.org/r/20240117031946.2324519-2-linan666@huaweicloud.com
> Signed-off-by: Song Liu <song(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index e575e74aabf5e..c88b50a4be82f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8576,14 +8576,15 @@ static int is_mddev_idle(struct mddev *mddev, int init)
> {
> struct md_rdev *rdev;
> int idle;
> - int curr_events;
> + long long curr_events;
>
> idle = 1;
> rcu_read_lock();
> rdev_for_each_rcu(rdev, mddev) {
> struct gendisk *disk = rdev->bdev->bd_disk;
> - curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
> - atomic_read(&disk->sync_io);
> + curr_events =
> + (long long)part_stat_read_accum(disk->part0, sectors) -
> + atomic64_read(&disk->sync_io);
> /* sync IO will cause sync_io to increase before the disk_stats
> * as sync_io is counted when a request starts, and
> * disk_stats is counted when it completes.
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 097d9dbd69b83..d0db98c0d33be 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -51,7 +51,7 @@ struct md_rdev {
>
> sector_t sectors; /* Device size (in 512bytes sectors) */
> struct mddev *mddev; /* RAID array if running */
> - int last_events; /* IO event timestamp */
> + long long last_events; /* IO event timestamp */
>
> /*
> * If meta_bdev is non-NULL, it means that a separate device is
> @@ -621,7 +621,7 @@ extern void mddev_unlock(struct mddev *mddev);
>
> static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
> {
> - atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
> + atomic64_add(nr_sectors, &bdev->bd_disk->sync_io);
> }
>
> static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 00e62b81a7363..a28cccd15f753 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -174,7 +174,7 @@ struct gendisk {
> struct list_head slave_bdevs;
> #endif
> struct timer_rand_state *random;
> - atomic_t sync_io; /* RAID */
> + atomic64_t sync_io; /* RAID */
> struct disk_events *ev;
>
> #ifdef CONFIG_BLK_DEV_ZONED
> .
--
Thanks,
Nan
Hi Greg, couple bcachefs fixes; this gets the downgrade path working so
6.9 works with the latest tools release.
CI testing... https://evilpiepirate.org/~testdashboard/ci?branch=bcachefs-for-v69
The following changes since commit 12c740d50d4e74e6b97d879363b85437dc895dde:
Linux 6.9.7 (2024-06-27 13:52:32 +0200)
are available in the Git repository at:
https://evilpiepirate.org/git/bcachefs.git tags/bcachefs-2024-06-20-stable-v6.9
for you to fetch changes up to 3d86d0704d4d03f76e5098ddf16152ee53f000f8:
bcachefs: btree_gc can now handle unknown btrees (2024-06-29 16:57:24 -0400)
----------------------------------------------------------------
bcachefs fixes for 6.9 stable
Downgrade path fixes; this fixes downgrade recovery passes not running
when downgrading from on disk format version 1.9; this results in
missing allocation information and writes immediately hanging due to the
allocator thinking there are no free buckets.
----------------------------------------------------------------
Kent Overstreet (5):
bcachefs: Fix sb_field_downgrade validation
bcachefs: Fix sb-downgrade validation
bcachefs: Fix bch2_sb_downgrade_update()
bcachefs: Fix setting of downgrade recovery passes/errors
bcachefs: btree_gc can now handle unknown btrees
fs/bcachefs/bcachefs.h | 44 +---------------------------------------
fs/bcachefs/btree_gc.c | 15 +++++++-------
fs/bcachefs/btree_gc.h | 48 ++++++++++++++++++++------------------------
fs/bcachefs/btree_gc_types.h | 29 ++++++++++++++++++++++++++
fs/bcachefs/ec.c | 2 +-
fs/bcachefs/sb-downgrade.c | 17 +++++++++++++---
fs/bcachefs/super-io.c | 12 +++--------
7 files changed, 77 insertions(+), 90 deletions(-)
create mode 100644 fs/bcachefs/btree_gc_types.h
Introduce a version of the fence ops that on release doesn't remove
the fence from the pending list, and thus doesn't require a lock to
fix poll->fence wait->fence unref deadlocks.
vmwgfx overwrites the wait callback to iterate over the list of all
fences and update their status, to do that it holds a lock to prevent
the list modifcations from other threads. The fence destroy callback
both deletes the fence and removes it from the list of pending
fences, for which it holds a lock.
dma buf polling cb unrefs a fence after it's been signaled: so the poll
calls the wait, which signals the fences, which are being destroyed.
The destruction tries to acquire the lock on the pending fences list
which it can never get because it's held by the wait from which it
was called.
Old bug, but not a lot of userspace apps were using dma-buf polling
interfaces. Fix those, in particular this fixes KDE stalls/deadlock.
Signed-off-by: Zack Rusin <zack.rusin(a)broadcom.com>
Fixes: 2298e804e96e ("drm/vmwgfx: rework to new fence interface, v2")
Cc: Broadcom internal kernel review list <bcm-kernel-feedback-list(a)broadcom.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.2+
---
drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
index 5efc6a766f64..588d50ababf6 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
@@ -32,7 +32,6 @@
#define VMW_FENCE_WRAP (1 << 31)
struct vmw_fence_manager {
- int num_fence_objects;
struct vmw_private *dev_priv;
spinlock_t lock;
struct list_head fence_list;
@@ -124,13 +123,13 @@ static void vmw_fence_obj_destroy(struct dma_fence *f)
{
struct vmw_fence_obj *fence =
container_of(f, struct vmw_fence_obj, base);
-
struct vmw_fence_manager *fman = fman_from_fence(fence);
- spin_lock(&fman->lock);
- list_del_init(&fence->head);
- --fman->num_fence_objects;
- spin_unlock(&fman->lock);
+ if (!list_empty(&fence->head)) {
+ spin_lock(&fman->lock);
+ list_del_init(&fence->head);
+ spin_unlock(&fman->lock);
+ }
fence->destroy(fence);
}
@@ -257,7 +256,6 @@ static const struct dma_fence_ops vmw_fence_ops = {
.release = vmw_fence_obj_destroy,
};
-
/*
* Execute signal actions on fences recently signaled.
* This is done from a workqueue so we don't have to execute
@@ -355,7 +353,6 @@ static int vmw_fence_obj_init(struct vmw_fence_manager *fman,
goto out_unlock;
}
list_add_tail(&fence->head, &fman->fence_list);
- ++fman->num_fence_objects;
out_unlock:
spin_unlock(&fman->lock);
@@ -403,7 +400,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman,
u32 passed_seqno)
{
u32 goal_seqno;
- struct vmw_fence_obj *fence;
+ struct vmw_fence_obj *fence, *next_fence;
if (likely(!fman->seqno_valid))
return false;
@@ -413,7 +410,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman,
return false;
fman->seqno_valid = false;
- list_for_each_entry(fence, &fman->fence_list, head) {
+ list_for_each_entry_safe(fence, next_fence, &fman->fence_list, head) {
if (!list_empty(&fence->seq_passed_actions)) {
fman->seqno_valid = true;
vmw_fence_goal_write(fman->dev_priv,
--
2.43.0
SAT-5 revision 8 specification removed the text about the ANSI INCITS
431-2007 compliance which was requiring SCSI/ATA Translation (SAT) to
return descriptor format sense data for the ATA PASS-THROUGH commands
regardless of the setting of the D_SENSE bit.
Let's honor the D_SENSE bit for ATA PASS-THROUGH commands while
generating the "ATA PASS-THROUGH INFORMATION AVAILABLE" sense data.
SAT-5 revision 7
================
12.2.2.8 Fixed format sense data
Table 212 shows the fields returned in the fixed format sense data
(see SPC-5) for ATA PASS-THROUGH commands. SATLs compliant with ANSI
INCITS 431-2007, SCSI/ATA Translation (SAT) return descriptor format
sense data for the ATA PASS-THROUGH commands regardless of the setting
of the D_SENSE bit.
SAT-5 revision 8
================
12.2.2.8 Fixed format sense data
Table 211 shows the fields returned in the fixed format sense data
(see SPC-5) for ATA PASS-THROUGH commands.
Cc: stable(a)vger.kernel.org # 4.19+
Reported-by: Niklas Cassel <cassel(a)kernel.org>
Closes: https://lore.kernel.org/linux-ide/Zn1WUhmLglM4iais@ryzen.lan
Reviewed-by: Niklas Cassel <cassel(a)kernel.org>
Signed-off-by: Igor Pylypiv <ipylypiv(a)google.com>
---
drivers/ata/libata-scsi.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b59cbb5ce5a6..076fbeadce01 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -941,11 +941,8 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc)
&sense_key, &asc, &ascq);
ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq);
} else {
- /*
- * ATA PASS-THROUGH INFORMATION AVAILABLE
- * Always in descriptor format sense.
- */
- scsi_build_sense(cmd, 1, RECOVERED_ERROR, 0, 0x1D);
+ /* ATA PASS-THROUGH INFORMATION AVAILABLE */
+ ata_scsi_set_sense(qc->dev, cmd, RECOVERED_ERROR, 0, 0x1D);
}
}
--
2.45.2.803.g4e1b14247a-goog
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x bf14ed81f571f8dba31cd72ab2e50fbcc877cc31
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024070127-escapade-brutishly-2851@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
bf14ed81f571 ("mm/page_alloc: Separate THP PCP into movable and non-movable categories")
6303d1c553c8 ("mm: page_alloc: use the correct THP order for THP PCP")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bf14ed81f571f8dba31cd72ab2e50fbcc877cc31 Mon Sep 17 00:00:00 2001
From: yangge <yangge1116(a)126.com>
Date: Thu, 20 Jun 2024 08:59:50 +0800
Subject: [PATCH] mm/page_alloc: Separate THP PCP into movable and non-movable
categories
Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for
THP-sized allocations") no longer differentiates the migration type of
pages in THP-sized PCP list, it's possible that non-movable allocation
requests may get a CMA page from the list, in some cases, it's not
acceptable.
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
machine with device passthrough will get stuck. During starting the
virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM,
...) to pin memory. Normally if a page is present and in CMA area,
pin_user_pages_remote() will migrate the page from CMA area to non-CMA
area because of FOLL_LONGTERM flag. But if non-movable allocation
requests return CMA memory, migrate_longterm_unpinnable_pages() will
migrate a CMA page to another CMA page, which will fail to pass the check
in check_and_migrate_movable_pages() and cause migration endless.
Call trace:
pin_user_pages_remote
--__gup_longterm_locked // endless loops in this function
----_get_user_pages_locked
----check_and_migrate_movable_pages
------migrate_longterm_unpinnable_pages
--------alloc_migration_target
This problem will also have a negative impact on CMA itself. For example,
when CMA is borrowed by THP, and we need to reclaim it through cma_alloc()
or dma_alloc_coherent(), we must move those pages out to ensure CMA's
users can retrieve that contigous memory. Currently, CMA's memory is
occupied by non-movable pages, meaning we can't relocate them. As a
result, cma_alloc() is more likely to fail.
To fix the problem above, we add one PCP list for THP, which will not
introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP
lists, one PCP list is used by MOVABLE allocation, and the other PCP list
is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE,
and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE.
Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.c…
Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations")
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8f9c9590a42c..586a8f0104d7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -654,13 +654,12 @@ enum zone_watermarks {
};
/*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
- * for THP which will usually be GFP_MOVABLE. Even if it is another type,
- * it should not contribute to serious fragmentation causing THP allocation
- * failures.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
+ * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list
+ * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_PCP_THP 1
+#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7300aa9f14b0..9ecf99190ea2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -504,10 +504,15 @@ static void bad_page(struct page *page, const char *reason)
static inline unsigned int order_to_pindex(int migratetype, int order)
{
+ bool __maybe_unused movable;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
- return NR_LOWORDER_PCP_LISTS;
+
+ movable = migratetype == MIGRATE_MOVABLE;
+
+ return NR_LOWORDER_PCP_LISTS + movable;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -521,7 +526,7 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pindex == NR_LOWORDER_PCP_LISTS)
+ if (pindex >= NR_LOWORDER_PCP_LISTS)
order = HPAGE_PMD_ORDER;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);