From: Steven Rostedt <rostedt(a)goodmis.org>
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable(a)vger.kernel.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events.c | 82 ++++++++++++++++++++++++-------------
1 file changed, 53 insertions(+), 29 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 77e68efbd43e..14e160a5b905 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -265,8 +265,7 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
len = p - fmt;
for (; field->type; field++) {
- if (strncmp(field->name, fmt, len) ||
- field->name[len])
+ if (strncmp(field->name, fmt, len) || field->name[len])
continue;
array_descriptor = strchr(field->type, '[');
/* This is an array and is OK to dereference. */
@@ -275,6 +274,32 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
return false;
}
+/* Return true if the argument pointer is safe */
+static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+{
+ const char *r, *e, *a;
+
+ e = fmt + len;
+
+ /* Find the REC-> in the argument */
+ r = strstr(fmt, "REC->");
+ if (r && r < e) {
+ /*
+ * Addresses of events on the buffer, or an array on the buffer is
+ * OK to dereference. There's ways to fool this, but
+ * this is to catch common mistakes, not malicious code.
+ */
+ a = strchr(fmt, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ return true;
+ } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) {
+ return true;
+ } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) {
+ return true;
+ }
+ return false;
+}
+
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -285,12 +310,12 @@ static void test_event_printk(struct trace_event_call *call)
{
u64 dereference_flags = 0;
bool first = true;
- const char *fmt, *c, *r, *a;
+ const char *fmt;
int parens = 0;
char in_quote = 0;
int start_arg = 0;
int arg = 0;
- int i;
+ int i, e;
fmt = call->print_fmt;
@@ -403,42 +428,41 @@ static void test_event_printk(struct trace_event_call *call)
case ',':
if (in_quote || parens)
continue;
+ e = i;
i++;
while (isspace(fmt[i]))
i++;
- start_arg = i;
- if (!(dereference_flags & (1ULL << arg)))
- goto next_arg;
- /* Find the REC-> in the argument */
- c = strchr(fmt + i, ',');
- r = strstr(fmt + i, "REC->");
- if (r && (!c || r < c)) {
- /*
- * Addresses of events on the buffer,
- * or an array on the buffer is
- * OK to dereference.
- * There's ways to fool this, but
- * this is to catch common mistakes,
- * not malicious code.
- */
- a = strchr(fmt + i, '&');
- if ((a && (a < r)) || test_field(r, call))
+ /*
+ * If start_arg is zero, then this is the start of the
+ * first argument. The processing of the argument happens
+ * when the end of the argument is found, as it needs to
+ * handle paranthesis and such.
+ */
+ if (!start_arg) {
+ start_arg = i;
+ /* Balance out the i++ in the for loop */
+ i--;
+ continue;
+ }
+
+ if (dereference_flags & (1ULL << arg)) {
+ if (process_pointer(fmt + start_arg, e - start_arg, call))
dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
}
- next_arg:
- i--;
+ start_arg = i;
arg++;
+ /* Balance out the i++ in the for loop */
+ i--;
}
}
+ if (dereference_flags & (1ULL << arg)) {
+ if (process_pointer(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+
/*
* If you triggered the below warning, the trace event reported
* uses an unsafe dereference pointer %p*. As the data stored
--
2.45.2
commit 54bbee190d42166209185d89070c58a343bf514b upstream.
DDI0487K.a D13.3.1 describes the PMU overflow condition, which evaluates
to true if any counter's global enable (PMCR_EL0.E), overflow flag
(PMOVSSET_EL0[n]), and interrupt enable (PMINTENSET_EL1[n]) are all 1.
Of note, this does not require a counter to be enabled
(i.e. PMCNTENSET_EL0[n] = 1) to generate an overflow.
Align kvm_pmu_overflow_status() with the reality of the architecture
and stop using PMCNTENSET_EL0 as part of the overflow condition. The
bug was discovered while running an SBSA PMU test [*], which only sets
PMCR.E, PMOVSSET<0>, PMINTENSET<0>, and expects an overflow interrupt.
Cc: stable(a)vger.kernel.org
Fixes: 76d883c4e640 ("arm64: KVM: Add access handler for PMOVSSET and PMOVSCLR register")
Link: https://github.com/ARM-software/sbsa-acs/blob/master/test_pool/pmu/operatin…
Signed-off-by: Raghavendra Rao Ananta <rananta(a)google.com>
[ oliver: massaged changelog ]
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Link: https://lore.kernel.org/r/20241120005230.2335682-2-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
virt/kvm/arm/pmu.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 4c08fd009768..9732c5e0ed22 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -357,7 +357,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
reg &= kvm_pmu_valid_counter_mask(vcpu);
}
base-commit: 6708005a36cfdb32aa991ebd9ea172e1183231ef
--
2.47.1.613.gc27f4b7a9f-goog
Hi,
This is to report that after jumping from generic kernel 5.15.167 to
5.15.170 I apparently observe ext4 damage.
After some few days of regular daily use of 5.15.170, one morning my
ext4 partition refused to mount complaining about corrupted system
area (-117).
There were no unusual events preceding this. The device in question is
a laptop with healthy battery, also connected to AC permanently.
The laptop is privately owned by me, in daily use at home, so I am
100% aware of everything happening with it.
The filesystem in question lives on md raid1 with very assymmetric
members (ssd+hdd) so one would not possibly expect that in the event
of emergency cpu halt or some other abnormal stop while filesystem was
actively writing data, raid members could stay in perfect sync.
After the incident, I've run raid1 check multiple times and run
memtest multiple times from different boot media and certainly
consulted startctl.
Nothing. No issues whatsoever except for this spontaneous ext4 damage.
Looking at git log for ext4 changes between 5.15.167 and 5.15.170
shows a few commits. All landed in 5.15.168.
Interestingly, one of them is a comeback of the (in)famous
91562895f803 "properly sync file size update after O_SYNC ..." which
caused some blowup 1 year ago due to "subtle interaction".
I've no idea if 91562895f803 is related to damage this time or not,
but most definitely it looks like some problem was introduced between
5.15.167 and 5.15.170 anyway.
And because there are apparently 0 commits to ext4 in 5.15 since
5.15.168 at the moment, I thought I'd report.
Please CC me if you want me to see your reply and/or need more info
(I'm not subscribed to the normal flow).
Take care,
Nick
Hi Greg,
Due to backporting 2320c9e of to 6.1.120, 6.6.66 and 6.12.5 kernel
series we have the following bug report:
https://gitlab.freedesktop.org/drm/amd/-/issues/3831
The commit 47f402a3e08113e0f5d8e1e6fcc197667a16022f should be backported
to 6.1, 6.6 and 6.12 stable branches.
--
Best, Philip
The patch titled
Subject: mm: hugetlb: independent PMD page table shared count
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-hugetlb-independent-pmd-page-table-shared-count.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Liu Shixin <liushixin2(a)huawei.com>
Subject: mm: hugetlb: independent PMD page table shared count
Date: Mon, 16 Dec 2024 15:11:47 +0800
The folio refcount may be increased unexpectly through try_get_folio() by
caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount
to check whether a pmd page table is shared. The check is incorrect if
the refcount is increased by the above caller, and this can cause the page
table leaked:
BUG: Bad page state in process sh pfn:109324
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
page_type: f2(table)
raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
page dumped because: nonzero mapcount
...
CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
Tainted: [B]=BAD_PAGE
Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
Call trace:
show_stack+0x20/0x38 (C)
dump_stack_lvl+0x80/0xf8
dump_stack+0x18/0x28
bad_page+0x8c/0x130
free_page_is_bad_report+0xa4/0xb0
free_unref_page+0x3cc/0x620
__folio_put+0xf4/0x158
split_huge_pages_all+0x1e0/0x3e8
split_huge_pages_write+0x25c/0x2d8
full_proxy_write+0x64/0xd8
vfs_write+0xcc/0x280
ksys_write+0x70/0x110
__arm64_sys_write+0x24/0x38
invoke_syscall+0x50/0x120
el0_svc_common.constprop.0+0xc8/0xf0
do_el0_svc+0x24/0x38
el0_svc+0x34/0x128
el0t_64_sync_handler+0xc8/0xd0
el0t_64_sync+0x190/0x198
The issue may be triggered by damon, offline_page, page_idle, etc, which
will increase the refcount of page table.
Fix it by introducing independent PMD page table shared count. As
described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390
gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv
pmds, so we can reuse the field as pt_share_count.
Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Ken Chen <kenneth.w.chen(a)intel.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 1 +
include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++
mm/hugetlb.c | 16 +++++++---------
3 files changed, 38 insertions(+), 9 deletions(-)
--- a/include/linux/mm.h~mm-hugetlb-independent-pmd-page-table-shared-count
+++ a/include/linux/mm.h
@@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(st
if (!pmd_ptlock_init(ptdesc))
return false;
__folio_set_pgtable(folio);
+ ptdesc_pmd_pts_init(ptdesc);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
--- a/include/linux/mm_types.h~mm-hugetlb-independent-pmd-page-table-shared-count
+++ a/include/linux/mm_types.h
@@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a);
* @pt_index: Used for s390 gmap.
* @pt_mm: Used for x86 pgds.
* @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
+ * @pt_share_count: Used for HugeTLB PMD page table share count.
* @_pt_pad_2: Padding to ensure proper alignment.
* @ptl: Lock for the page table.
* @__page_type: Same as page->page_type. Unused for page tables.
@@ -471,6 +472,9 @@ struct ptdesc {
pgoff_t pt_index;
struct mm_struct *pt_mm;
atomic_t pt_frag_refcount;
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ atomic_t pt_share_count;
+#endif
};
union {
@@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= s
const struct page *: (const struct ptdesc *)(p), \
struct page *: (struct ptdesc *)(p)))
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+ atomic_set(&ptdesc->pt_share_count, 0);
+}
+
+static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
+{
+ atomic_inc(&ptdesc->pt_share_count);
+}
+
+static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
+{
+ atomic_dec(&ptdesc->pt_share_count);
+}
+
+static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+{
+ return atomic_read(&ptdesc->pt_share_count);
+}
+#else
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+}
+#endif
+
/*
* Used for sizing the vmemmap region on some architectures
*/
--- a/mm/hugetlb.c~mm-hugetlb-independent-pmd-page-table-shared-count
+++ a/mm/hugetlb.c
@@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
- get_page(virt_to_page(spte));
+ ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
break;
}
}
@@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
- put_page(virt_to_page(spte));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
}
spin_unlock(&mm->page_table_lock);
out:
@@ -7238,10 +7238,6 @@ out:
/*
* unmap huge page backed by shared pte.
*
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7250,18 +7246,20 @@ out:
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
+ if (sz != PMD_SIZE)
+ return 0;
+ if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
- put_page(virt_to_page(ptep));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
}
_
Patches currently in -mm which might be from liushixin2(a)huawei.com are
mm-hugetlb-independent-pmd-page-table-shared-count.patch
The patch titled
Subject: mm, compaction: don't use ALLOC_CMA in long term GUP flow
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm, compaction: don't use ALLOC_CMA in long term GUP flow
Date: Mon, 16 Dec 2024 19:54:04 +0800
Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags in
__compaction_suitable()") allow compaction to proceed when free pages
required for compaction reside in the CMA pageblocks, it's possible that
__compaction_suitable() always returns true, and in some cases, it's not
acceptable.
There are 4 NUMA nodes on my machine, and each NUMA node has 32GB of
memory. I have configured 16GB of CMA memory on each NUMA node, and
starting a 32GB virtual machine with device passthrough is extremely slow,
taking almost an hour.
During the start-up of the virtual machine, it will call
pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory. Long
term GUP cannot allocate memory from CMA area, so a maximum of 16 GB of
no-CMA memory on a NUMA node can be used as virtual machine memory. Since
there is 16G of free CMA memory on the NUMA node, watermark for order-0
always be met for compaction, so __compaction_suitable() always returns
true, even if the node is unable to allocate non-CMA memory for the
virtual machine.
For costly allocations, because __compaction_suitable() always
returns true, __alloc_pages_slowpath() can't exit at the appropriate
place, resulting in excessively long virtual machine startup times.
Call trace:
__alloc_pages_slowpath
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage; // should exit __alloc_pages_slowpath() from here
In order to quickly fall back to remote node, we should remove ALLOC_CMA
both in __compaction_suitable() and __isolate_free_page() in long term GUP
flow. After this fix, starting a 32GB virtual machine with device
passthrough takes only a few seconds.
Link: https://lkml.kernel.org/r/1734350044-12928-1-git-send-email-yangge1116@126.…
Fixes: 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()")
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/compaction.h | 6 ++++--
mm/compaction.c | 20 +++++++++++---------
mm/internal.h | 3 ++-
mm/page_alloc.c | 6 ++++--
mm/page_isolation.c | 3 ++-
mm/page_reporting.c | 2 +-
mm/vmscan.c | 4 ++--
7 files changed, 26 insertions(+), 18 deletions(-)
--- a/include/linux/compaction.h~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/include/linux/compaction.h
@@ -90,7 +90,8 @@ extern enum compact_result try_to_compac
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ int highest_zoneidx,
+ unsigned int alloc_flags);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
@@ -108,7 +109,8 @@ static inline void reset_isolation_suita
}
static inline bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx)
+ int highest_zoneidx,
+ unsigned int alloc_flags)
{
return false;
}
--- a/mm/compaction.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/compaction.c
@@ -654,7 +654,7 @@ static unsigned long isolate_freepages_b
/* Found a free page, will break it into order-0 pages */
order = buddy_order(page);
- isolated = __isolate_free_page(page, order);
+ isolated = __isolate_free_page(page, order, cc->alloc_flags);
if (!isolated)
break;
set_page_private(page, order);
@@ -1633,7 +1633,7 @@ static void fast_isolate_freepages(struc
/* Isolate the page if available */
if (page) {
- if (__isolate_free_page(page, order)) {
+ if (__isolate_free_page(page, order, cc->alloc_flags)) {
set_page_private(page, order);
nr_isolated = 1 << order;
nr_scanned += nr_isolated - 1;
@@ -2379,6 +2379,7 @@ static enum compact_result compact_finis
static bool __compaction_suitable(struct zone *zone, int order,
int highest_zoneidx,
+ unsigned int alloc_flags,
unsigned long wmark_target)
{
unsigned long watermark;
@@ -2393,25 +2394,26 @@ static bool __compaction_suitable(struct
* even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * In addition to long term GUP flow, ALLOC_CMA is used, as pages in
+ * CMA pageblocks are considered suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ alloc_flags & ALLOC_CMA, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx,
+ unsigned int alloc_flags)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2472,7 +2474,7 @@ bool compaction_zonelist_suitable(struct
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ alloc_flags, available))
return true;
}
@@ -2497,7 +2499,7 @@ compaction_suit_allocation_order(struct
alloc_flags))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
--- a/mm/internal.h~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/internal.h
@@ -662,7 +662,8 @@ static inline void clear_zone_contiguous
zone->contiguous = false;
}
-extern int __isolate_free_page(struct page *page, unsigned int order);
+extern int __isolate_free_page(struct page *page, unsigned int order,
+ unsigned int alloc_flags);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
--- a/mm/page_alloc.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/page_alloc.c
@@ -2808,7 +2808,8 @@ void split_page(struct page *page, unsig
}
EXPORT_SYMBOL_GPL(split_page);
-int __isolate_free_page(struct page *page, unsigned int order)
+int __isolate_free_page(struct page *page, unsigned int order,
+ unsigned int alloc_flags)
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
@@ -2822,7 +2823,8 @@ int __isolate_free_page(struct page *pag
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ if (!zone_watermark_ok(zone, 0, watermark, 0,
+ alloc_flags & ALLOC_CMA))
return 0;
}
--- a/mm/page_isolation.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/page_isolation.c
@@ -229,7 +229,8 @@ static void unset_migratetype_isolate(st
buddy = find_buddy_page_pfn(page, page_to_pfn(page),
order, NULL);
if (buddy && !is_migrate_isolate_page(buddy)) {
- isolated_page = !!__isolate_free_page(page, order);
+ isolated_page = !!__isolate_free_page(page, order,
+ ALLOC_CMA);
/*
* Isolating a free page in an isolated pageblock
* is expected to always work as watermarks don't
--- a/mm/page_reporting.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/page_reporting.c
@@ -198,7 +198,7 @@ page_reporting_cycle(struct page_reporti
/* Attempt to pull page from list and place in scatterlist */
if (*offset) {
- if (!__isolate_free_page(page, order)) {
+ if (!__isolate_free_page(page, order, ALLOC_CMA)) {
next = page;
break;
}
--- a/mm/vmscan.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/vmscan.c
@@ -5861,7 +5861,7 @@ static inline bool should_continue_recla
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
}
@@ -6089,7 +6089,7 @@ static inline bool compaction_ready(stru
return true;
/* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
/*
_
Patches currently in -mm which might be from yangge1116(a)126.com are
mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow.patch
This is a note to let you know that I've just added the patch titled
iio: adc: ti-ads1119: fix sample size in scan struct for triggered
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 54d394905c92b9ecc65c1f9b2692c8e10716d8e1 Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
Date: Mon, 2 Dec 2024 20:18:44 +0100
Subject: iio: adc: ti-ads1119: fix sample size in scan struct for triggered
buffer
This device returns signed, 16-bit samples as stated in its datasheet
(see 8.5.2 Data Format). That is in line with the scan_type definition
for the IIO_VOLTAGE channel, but 'unsigned int' is being used to read
and push the data to userspace.
Given that the size of that type depends on the architecture (at least
2 bytes to store values up to 65535, but its actual size is often 4
bytes), use the 's16' type to provide the same structure in all cases.
Fixes: a9306887eba4 ("iio: adc: ti-ads1119: Add driver")
Signed-off-by: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
Reviewed-by: Francesco Dolcini <francesco.dolcini(a)toradex.com>
Link: https://patch.msgid.link/20241202-ti-ads1119_s16_chan-v1-1-fafe3136dc90@gma…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ti-ads1119.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iio/adc/ti-ads1119.c b/drivers/iio/adc/ti-ads1119.c
index 2615a275acb3..c268e27eec12 100644
--- a/drivers/iio/adc/ti-ads1119.c
+++ b/drivers/iio/adc/ti-ads1119.c
@@ -500,7 +500,7 @@ static irqreturn_t ads1119_trigger_handler(int irq, void *private)
struct iio_dev *indio_dev = pf->indio_dev;
struct ads1119_state *st = iio_priv(indio_dev);
struct {
- unsigned int sample;
+ s16 sample;
s64 timestamp __aligned(8);
} scan;
unsigned int index;
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: inkern: call iio_device_put() only on mapped devices
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 64f43895b4457532a3cc524ab250b7a30739a1b1 Mon Sep 17 00:00:00 2001
From: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Date: Wed, 4 Dec 2024 20:13:42 +0900
Subject: iio: inkern: call iio_device_put() only on mapped devices
In the error path of iio_channel_get_all(), iio_device_put() is called
on all IIO devices, which can cause a refcount imbalance. Fix this error
by calling iio_device_put() only on IIO devices whose refcounts were
previously incremented by iio_device_get().
Fixes: 314be14bb893 ("iio: Rename _st_ functions to loose the bit that meant the staging version.")
Signed-off-by: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Link: https://patch.msgid.link/20241204111342.1246706-1-joe@pf.is.s.u-tokyo.ac.jp
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/inkern.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 136b225b6bc8..9050a59129e6 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -500,7 +500,7 @@ struct iio_channel *iio_channel_get_all(struct device *dev)
return_ptr(chans);
error_free_chans:
- for (i = 0; i < nummaps; i++)
+ for (i = 0; i < mapind; i++)
iio_device_put(chans[i].indio_dev);
return ERR_PTR(ret);
}
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: adc: at91: call input_free_device() on allocated iio_dev
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From de6a73bad1743e9e81ea5a24c178c67429ff510b Mon Sep 17 00:00:00 2001
From: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Date: Sat, 7 Dec 2024 13:30:45 +0900
Subject: iio: adc: at91: call input_free_device() on allocated iio_dev
Current implementation of at91_ts_register() calls input_free_deivce()
on st->ts_input, however, the err label can be reached before the
allocated iio_dev is stored to st->ts_input. Thus call
input_free_device() on input instead of st->ts_input.
Fixes: 84882b060301 ("iio: adc: at91_adc: Add support for touchscreens without TSMR")
Signed-off-by: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Link: https://patch.msgid.link/20241207043045.1255409-1-joe@pf.is.s.u-tokyo.ac.jp
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/at91_adc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
index a3f0a2321666..5927756b749a 100644
--- a/drivers/iio/adc/at91_adc.c
+++ b/drivers/iio/adc/at91_adc.c
@@ -979,7 +979,7 @@ static int at91_ts_register(struct iio_dev *idev,
return ret;
err:
- input_free_device(st->ts_input);
+ input_free_device(input);
return ret;
}
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: adc: ad7173: fix using shared static info struct
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 36a44e05cd807a54e5ffad4b96d0d67f68ad8576 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner(a)baylibre.com>
Date: Wed, 27 Nov 2024 14:01:53 -0600
Subject: iio: adc: ad7173: fix using shared static info struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Fix a possible race condition during driver probe in the ad7173 driver
due to using a shared static info struct. If more that one instance of
the driver is probed at the same time, some of the info could be
overwritten by the other instance, leading to incorrect operation.
To fix this, make the static info struct const so that it is read-only
and make a copy of the info struct for each instance of the driver that
can be modified.
Reported-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver")
Signed-off-by: David Lechner <dlechner(a)baylibre.com>
Tested-by: Guillaume Ranquet <granquet(a)baylibre.com>
Link: https://patch.msgid.link/20241127-iio-adc-ad7313-fix-non-const-info-struct-…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ad7173.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c
index 8a0c931ca83a..8b03c1e5567e 100644
--- a/drivers/iio/adc/ad7173.c
+++ b/drivers/iio/adc/ad7173.c
@@ -200,6 +200,7 @@ struct ad7173_channel {
struct ad7173_state {
struct ad_sigma_delta sd;
+ struct ad_sigma_delta_info sigma_delta_info;
const struct ad7173_device_info *info;
struct ad7173_channel *channels;
struct regulator_bulk_data regulators[3];
@@ -753,7 +754,7 @@ static int ad7173_disable_one(struct ad_sigma_delta *sd, unsigned int chan)
return ad_sd_write_reg(sd, AD7173_REG_CH(chan), 2, 0);
}
-static struct ad_sigma_delta_info ad7173_sigma_delta_info = {
+static const struct ad_sigma_delta_info ad7173_sigma_delta_info = {
.set_channel = ad7173_set_channel,
.append_status = ad7173_append_status,
.disable_all = ad7173_disable_all,
@@ -1403,7 +1404,7 @@ static int ad7173_fw_parse_device_config(struct iio_dev *indio_dev)
if (ret < 0)
return dev_err_probe(dev, ret, "Interrupt 'rdy' is required\n");
- ad7173_sigma_delta_info.irq_line = ret;
+ st->sigma_delta_info.irq_line = ret;
return ad7173_fw_parse_channel_config(indio_dev);
}
@@ -1436,8 +1437,9 @@ static int ad7173_probe(struct spi_device *spi)
spi->mode = SPI_MODE_3;
spi_setup(spi);
- ad7173_sigma_delta_info.num_slots = st->info->num_configs;
- ret = ad_sd_init(&st->sd, indio_dev, spi, &ad7173_sigma_delta_info);
+ st->sigma_delta_info = ad7173_sigma_delta_info;
+ st->sigma_delta_info.num_slots = st->info->num_configs;
+ ret = ad_sd_init(&st->sd, indio_dev, spi, &st->sigma_delta_info);
if (ret)
return ret;
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: adc: ti-ads1298: Add NULL check in ads1298_init
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From bcb394bb28e55312cace75362b8e489eb0e02a30 Mon Sep 17 00:00:00 2001
From: Charles Han <hanchunchao(a)inspur.com>
Date: Mon, 18 Nov 2024 17:02:08 +0800
Subject: iio: adc: ti-ads1298: Add NULL check in ads1298_init
devm_kasprintf() can return a NULL pointer on failure. A check on the
return value of such a call in ads1298_init() is missing. Add it.
Fixes: 00ef7708fa60 ("iio: adc: ti-ads1298: Add driver")
Signed-off-by: Charles Han <hanchunchao(a)inspur.com>
Link: https://patch.msgid.link/20241118090208.14586-1-hanchunchao@inspur.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ti-ads1298.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/iio/adc/ti-ads1298.c b/drivers/iio/adc/ti-ads1298.c
index 36d43495f603..03f762415fa5 100644
--- a/drivers/iio/adc/ti-ads1298.c
+++ b/drivers/iio/adc/ti-ads1298.c
@@ -613,6 +613,8 @@ static int ads1298_init(struct iio_dev *indio_dev)
}
indio_dev->name = devm_kasprintf(dev, GFP_KERNEL, "ads129%u%s",
indio_dev->num_channels, suffix);
+ if (!indio_dev->name)
+ return -ENOMEM;
/* Enable internal test signal, double amplitude, double frequency */
ret = regmap_write(priv->regmap, ADS1298_REG_CONFIG2,
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: gyro: fxas21002c: Fix missing data update in trigger handler
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From fa13ac6cdf9b6c358e7d77c29fb60145c7a87965 Mon Sep 17 00:00:00 2001
From: Carlos Song <carlos.song(a)nxp.com>
Date: Sat, 16 Nov 2024 10:29:45 -0500
Subject: iio: gyro: fxas21002c: Fix missing data update in trigger handler
The fxas21002c_trigger_handler() may fail to acquire sample data because
the runtime PM enters the autosuspend state and sensor can not return
sample data in standby mode..
Resume the sensor before reading the sample data into the buffer within the
trigger handler. After the data is read, place the sensor back into the
autosuspend state.
Fixes: a0701b6263ae ("iio: gyro: add core driver for fxas21002c")
Signed-off-by: Carlos Song <carlos.song(a)nxp.com>
Signed-off-by: Frank Li <Frank.Li(a)nxp.com>
Link: https://patch.msgid.link/20241116152945.4006374-1-Frank.Li@nxp.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/gyro/fxas21002c_core.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/iio/gyro/fxas21002c_core.c b/drivers/iio/gyro/fxas21002c_core.c
index 0391c78c2f18..754c8a564ba4 100644
--- a/drivers/iio/gyro/fxas21002c_core.c
+++ b/drivers/iio/gyro/fxas21002c_core.c
@@ -730,14 +730,21 @@ static irqreturn_t fxas21002c_trigger_handler(int irq, void *p)
int ret;
mutex_lock(&data->lock);
- ret = regmap_bulk_read(data->regmap, FXAS21002C_REG_OUT_X_MSB,
- data->buffer, CHANNEL_SCAN_MAX * sizeof(s16));
+ ret = fxas21002c_pm_get(data);
if (ret < 0)
goto out_unlock;
+ ret = regmap_bulk_read(data->regmap, FXAS21002C_REG_OUT_X_MSB,
+ data->buffer, CHANNEL_SCAN_MAX * sizeof(s16));
+ if (ret < 0)
+ goto out_pm_put;
+
iio_push_to_buffers_with_timestamp(indio_dev, data->buffer,
data->timestamp);
+out_pm_put:
+ fxas21002c_pm_put(data);
+
out_unlock:
mutex_unlock(&data->lock);
--
2.47.1
This is a note to let you know that I've just added the patch titled
iio: adc: ad7124: Disable all channels at probe time
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 4be339af334c283a1a1af3cb28e7e448a0aa8a7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Mon, 4 Nov 2024 11:19:04 +0100
Subject: iio: adc: ad7124: Disable all channels at probe time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When during a measurement two channels are enabled, two measurements are
done that are reported sequencially in the DATA register. As the code
triggered by reading one of the sysfs properties expects that only one
channel is enabled it only reads the first data set which might or might
not belong to the intended channel.
To prevent this situation disable all channels during probe. This fixes
a problem in practise because the reset default for channel 0 is
enabled. So all measurements before the first measurement on channel 0
(which disables channel 0 at the end) might report wrong values.
Fixes: 7b8d045e497a ("iio: adc: ad7124: allow more than 8 channels")
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://patch.msgid.link/20241104101905.845737-2-u.kleine-koenig@baylibre.c…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ad7124.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index 7314fb32bdec..3d678c420cbf 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -917,6 +917,9 @@ static int ad7124_setup(struct ad7124_state *st)
* set all channels to this default value.
*/
ad7124_set_channel_odr(st, i, 10);
+
+ /* Disable all channels to prevent unintended conversions. */
+ ad_sd_write_reg(&st->sd, AD7124_CHANNEL(i), 2, 0);
}
ret = ad_sd_write_reg(&st->sd, AD7124_ADC_CONTROL, 2, st->adc_control);
--
2.47.1
In commit 892f7237b3ff ("arm64: Delay initialisation of
cpuinfo_arm64::reg_{zcr,smcr}") we moved access to ZCR, SMCR and SMIDR
later in the boot process in order to ensure that we don't attempt to
interact with them if SVE or SME is disabled on the command line.
Unfortunately when initialising the boot CPU in init_cpu_features() we work
on a copy of the struct cpuinfo_arm64 for the boot CPU used only during
boot, not the percpu copy used by the sysfs code.
Fix this by moving the handling for SMIDR_EL1 for the boot CPU to
cpuinfo_store_boot_cpu() so it can operate on the percpu copy of the data.
This reduces the potential for error that could come from having both the
percpu and boot CPU copies in init_cpu_features().
This issue wasn't apparent when testing on emulated platforms that do not
report values in this ID register.
Fixes: 892f7237b3ff ("arm64: Delay initialisation of cpuinfo_arm64::reg_{zcr,smcr}")
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/arm64/kernel/cpufeature.c | 6 ------
arch/arm64/kernel/cpuinfo.c | 11 +++++++++++
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6ce71f444ed84f9056196bb21bbfac61c9687e30..b88102fd2c20f77e25af6df513fda09a484e882e 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1167,12 +1167,6 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
unsigned long cpacr = cpacr_save_enable_kernel_sme();
- /*
- * We mask out SMPS since even if the hardware
- * supports priorities the kernel does not at present
- * and we block access to them.
- */
- info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
vec_init_vq_map(ARM64_VEC_SME);
cpacr_restore(cpacr);
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index d79e88fccdfce427507e7a34c5959ce6309cbd12..b7d403da71e5a01ed3943eb37e7a00af238771a2 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -499,4 +499,15 @@ void __init cpuinfo_store_boot_cpu(void)
boot_cpu_data = *info;
init_cpu_features(&boot_cpu_data);
+
+ /* SMIDR_EL1 needs to be stored in the percpu data for sysfs */
+ if (IS_ENABLED(CONFIG_ARM64_SME) &&
+ id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
+ /*
+ * We mask out SMPS since even if the hardware
+ * supports priorities the kernel does not at present
+ * and we block access to them.
+ */
+ info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
+ }
}
---
base-commit: fac04efc5c793dccbd07e2d59af9f90b7fc0dca4
change-id: 20241213-arm64-fix-boot-cpu-smidr-386b8db292b2
Best regards,
--
Mark Brown <broonie(a)kernel.org>
From: James Morse <james.morse(a)arm.com>
commit 6685f5d572c22e1003e7c0d089afe1c64340ab1f upstream.
commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in
ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to guests,
but didn't add trap handling. A previous patch supplied the missing trap
handling.
Existing VMs that have the MPAM field of ID_AA64PFR0_EL1 set need to
be migratable, but there is little point enabling the MPAM CPU
interface on new VMs until there is something a guest can do with it.
Clear the MPAM field from the guest's ID_AA64PFR0_EL1 and on hardware
that supports MPAM, politely ignore the VMMs attempts to set this bit.
Guests exposed to this bug have the sanitised value of the MPAM field,
so only the correct value needs to be ignored. This means the field
can continue to be used to block migration to incompatible hardware
(between MPAM=1 and MPAM=5), and the VMM can't rely on the field
being ignored.
Signed-off-by: James Morse <james.morse(a)arm.com>
Co-developed-by: Joey Gouly <joey.gouly(a)arm.com>
Signed-off-by: Joey Gouly <joey.gouly(a)arm.com>
Reviewed-by: Gavin Shan <gshan(a)redhat.com>
Tested-by: Shameer Kolothum <shameerali.kolothum.thodi(a)huawei.com>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Link: https://lore.kernel.org/r/20241030160317.2528209-7-joey.gouly@arm.com
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
[maz: adapted to lack of ID_FILTERED()]
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/arm64/kvm/sys_regs.c | 55 ++++++++++++++++++++++++++++++++++++---
1 file changed, 52 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index ff8c4e1b847ed..fbed433283c9b 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1535,6 +1535,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
break;
case SYS_ID_AA64PFR2_EL1:
/* We only expose FPMR */
@@ -1724,6 +1725,13 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
val &= ~ID_AA64PFR0_EL1_AMU_MASK;
+ /*
+ * MPAM is disabled by default as KVM also needs a set of PARTID to
+ * program the MPAMVPMx_EL2 PARTID remapping registers with. But some
+ * older kernels let the guest see the ID bit.
+ */
+ val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+
return val;
}
@@ -1834,6 +1842,42 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
return set_id_reg(vcpu, rd, val);
}
+static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+ u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK;
+
+ /*
+ * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits
+ * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to
+ * guests, but didn't add trap handling. KVM doesn't support MPAM and
+ * always returns an UNDEF for these registers. The guest must see 0
+ * for this field.
+ *
+ * But KVM must also accept values from user-space that were provided
+ * by KVM. On CPUs that support MPAM, permit user-space to write
+ * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field.
+ */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
+static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+ u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK;
+
+ /* See set_id_aa64pfr0_el1 for comment about MPAM */
+ if ((hw_val & mpam_mask) == (user_val & mpam_mask))
+ user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
+
+ return set_id_reg(vcpu, rd, user_val);
+}
+
/*
* cpufeature ID register user accessors
*
@@ -2377,7 +2421,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ID_AA64PFR0_EL1),
.access = access_id_reg,
.get_user = get_id_reg,
- .set_user = set_id_reg,
+ .set_user = set_id_aa64pfr0_el1,
.reset = read_sanitised_id_aa64pfr0_el1,
.val = ~(ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
@@ -2385,7 +2429,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_AdvSIMD |
ID_AA64PFR0_EL1_FP), },
- ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR |
+ { SYS_DESC(SYS_ID_AA64PFR1_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_aa64pfr1_el1,
+ .reset = kvm_read_sanitised_id_reg,
+ .val = ~(ID_AA64PFR1_EL1_PFAR |
ID_AA64PFR1_EL1_DF2 |
ID_AA64PFR1_EL1_MTEX |
ID_AA64PFR1_EL1_THE |
@@ -2397,7 +2446,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_RES0 |
ID_AA64PFR1_EL1_MPAM_frac |
ID_AA64PFR1_EL1_RAS_frac |
- ID_AA64PFR1_EL1_MTE)),
+ ID_AA64PFR1_EL1_MTE), },
ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
ID_UNALLOCATED(4,3),
ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
--
2.39.2
commit b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols")
avoids checking number_of_same_symbols() for module symbol in
__trace_kprobe_create(), but create_local_trace_kprobe() should avoid this
check too. Doing this check leads to ENOENT for module_name:symbol_name
constructions passed over perf_event_open.
No bug in newer kernels as it was fixed more generally by
commit 9d8616034f16 ("tracing/kprobes: Add symbol counting check when module loads")
Link: https://lore.kernel.org/linux-trace-kernel/20240705161030.b3ddb33a8167013b9…
Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols")
Signed-off-by: Nikolay Kuratov <kniv(a)yandex-team.ru>
---
v1 -> v2:
* Reword commit title and message
* Send for stable instead of mainline
kernel/trace/trace_kprobe.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12d997bb3e78..94cb09d44115 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1814,7 +1814,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
int ret;
char *event;
- if (func) {
+ if (func && !strchr(func, ':')) {
unsigned int count;
count = number_of_same_symbols(func);
--
2.34.1
If the clock tu->ick was not enabled in tahvo_usb_probe,
it may still hold a non-error pointer, potentially causing
the clock to be incorrectly disabled later in the function.
Use the devm_clk_get_enabled helper function to ensure proper call balance
for tu->ick.
Found by Linux Verification Center (linuxtesting.org) with Klever.
Fixes: 9ba96ae5074c ("usb: omap1: Tahvo USB transceiver driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Vitalii Mordan <mordan(a)ispras.ru>
---
v2: Corrected a typo in the error handling of the devm_clk_get_enabled
call. This issue was reported by Dan Carpenter <dan.carpenter(a)linaro.org>.
drivers/usb/phy/phy-tahvo.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c
index ae7bf3ff89ee..4182e86dc450 100644
--- a/drivers/usb/phy/phy-tahvo.c
+++ b/drivers/usb/phy/phy-tahvo.c
@@ -341,9 +341,11 @@ static int tahvo_usb_probe(struct platform_device *pdev)
mutex_init(&tu->serialize);
- tu->ick = devm_clk_get(&pdev->dev, "usb_l4_ick");
- if (!IS_ERR(tu->ick))
- clk_enable(tu->ick);
+ tu->ick = devm_clk_get_enabled(&pdev->dev, "usb_l4_ick");
+ if (IS_ERR(tu->ick)) {
+ dev_err(&pdev->dev, "failed to get and enable clock\n");
+ return PTR_ERR(tu->ick);
+ }
/*
* Set initial state, so that we generate kevents only on state changes.
@@ -353,15 +355,14 @@ static int tahvo_usb_probe(struct platform_device *pdev)
tu->extcon = devm_extcon_dev_allocate(&pdev->dev, tahvo_cable);
if (IS_ERR(tu->extcon)) {
dev_err(&pdev->dev, "failed to allocate memory for extcon\n");
- ret = PTR_ERR(tu->extcon);
- goto err_disable_clk;
+ return PTR_ERR(tu->extcon);
}
ret = devm_extcon_dev_register(&pdev->dev, tu->extcon);
if (ret) {
dev_err(&pdev->dev, "could not register extcon device: %d\n",
ret);
- goto err_disable_clk;
+ return ret;
}
/* Set the initial cable state. */
@@ -384,7 +385,7 @@ static int tahvo_usb_probe(struct platform_device *pdev)
if (ret < 0) {
dev_err(&pdev->dev, "cannot register USB transceiver: %d\n",
ret);
- goto err_disable_clk;
+ return ret;
}
dev_set_drvdata(&pdev->dev, tu);
@@ -405,9 +406,6 @@ static int tahvo_usb_probe(struct platform_device *pdev)
err_remove_phy:
usb_remove_phy(&tu->phy);
-err_disable_clk:
- if (!IS_ERR(tu->ick))
- clk_disable(tu->ick);
return ret;
}
@@ -418,8 +416,6 @@ static void tahvo_usb_remove(struct platform_device *pdev)
free_irq(tu->irq, tu);
usb_remove_phy(&tu->phy);
- if (!IS_ERR(tu->ick))
- clk_disable(tu->ick);
}
static struct platform_driver tahvo_usb_driver = {
--
2.25.1
If the clock tu->ick was not enabled in tahvo_usb_probe,
it may still hold a non-error pointer, potentially causing
the clock to be incorrectly disabled later in the function.
Use the devm_clk_get_enabled helper function to ensure proper call balance
for tu->ick.
Found by Linux Verification Center (linuxtesting.org) with Klever.
Fixes: 9ba96ae5074c ("usb: omap1: Tahvo USB transceiver driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Vitalii Mordan <mordan(a)ispras.ru>
---
drivers/usb/phy/phy-tahvo.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c
index ae7bf3ff89ee..d393308d23d4 100644
--- a/drivers/usb/phy/phy-tahvo.c
+++ b/drivers/usb/phy/phy-tahvo.c
@@ -341,9 +341,11 @@ static int tahvo_usb_probe(struct platform_device *pdev)
mutex_init(&tu->serialize);
- tu->ick = devm_clk_get(&pdev->dev, "usb_l4_ick");
- if (!IS_ERR(tu->ick))
- clk_enable(tu->ick);
+ tu->ick = devm_clk_get_enabled(&pdev->dev, "usb_l4_ick");
+ if (!IS_ERR(tu->ick)) {
+ dev_err(&pdev->dev, "failed to get and enable clock\n");
+ return PTR_ERR(tu->ick);
+ }
/*
* Set initial state, so that we generate kevents only on state changes.
@@ -353,15 +355,14 @@ static int tahvo_usb_probe(struct platform_device *pdev)
tu->extcon = devm_extcon_dev_allocate(&pdev->dev, tahvo_cable);
if (IS_ERR(tu->extcon)) {
dev_err(&pdev->dev, "failed to allocate memory for extcon\n");
- ret = PTR_ERR(tu->extcon);
- goto err_disable_clk;
+ return PTR_ERR(tu->extcon);
}
ret = devm_extcon_dev_register(&pdev->dev, tu->extcon);
if (ret) {
dev_err(&pdev->dev, "could not register extcon device: %d\n",
ret);
- goto err_disable_clk;
+ return ret;
}
/* Set the initial cable state. */
@@ -384,7 +385,7 @@ static int tahvo_usb_probe(struct platform_device *pdev)
if (ret < 0) {
dev_err(&pdev->dev, "cannot register USB transceiver: %d\n",
ret);
- goto err_disable_clk;
+ return ret;
}
dev_set_drvdata(&pdev->dev, tu);
@@ -405,9 +406,6 @@ static int tahvo_usb_probe(struct platform_device *pdev)
err_remove_phy:
usb_remove_phy(&tu->phy);
-err_disable_clk:
- if (!IS_ERR(tu->ick))
- clk_disable(tu->ick);
return ret;
}
@@ -418,8 +416,6 @@ static void tahvo_usb_remove(struct platform_device *pdev)
free_irq(tu->irq, tu);
usb_remove_phy(&tu->phy);
- if (!IS_ERR(tu->ick))
- clk_disable(tu->ick);
}
static struct platform_driver tahvo_usb_driver = {
--
2.25.1
From: Chuck Lever <chuck.lever(a)oracle.com>
Testing shows that the EBUSY error return from mtree_alloc_cyclic()
leaks into user space. The ERRORS section of "man creat(2)" says:
> EBUSY O_EXCL was specified in flags and pathname refers
> to a block device that is in use by the system
> (e.g., it is mounted).
ENOSPC is closer to what applications expect in this situation.
Note that the normal range of simple directory offset values is
2..2^63, so hitting this error is going to be rare to impossible.
Fixes: 6faddda69f62 ("libfs: Add directory operations for stable offsets")
Cc: <stable(a)vger.kernel.org> # v6.9+
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
---
fs/libfs.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/libfs.c b/fs/libfs.c
index 748ac5923154..f6d04c69f195 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -292,7 +292,9 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
LONG_MAX, &octx->next_offset, GFP_KERNEL);
- if (ret < 0)
+ if (unlikely(ret == -EBUSY))
+ return -ENOSPC;
+ if (unlikely(ret < 0))
return ret;
offset_set(dentry, offset);
--
2.47.0
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 978c4486cca5c7b9253d3ab98a88c8e769cb9bbd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121506-pancreas-mosaic-0ae0@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 978c4486cca5c7b9253d3ab98a88c8e769cb9bbd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa(a)kernel.org>
Date: Sun, 8 Dec 2024 15:25:07 +0100
Subject: [PATCH] bpf,perf: Fix invalid prog_array access in
perf_event_detach_bpf_prog
Syzbot reported [1] crash that happens for following tracing scenario:
- create tracepoint perf event with attr.inherit=1, attach it to the
process and set bpf program to it
- attached process forks -> chid creates inherited event
the new child event shares the parent's bpf program and tp_event
(hence prog_array) which is global for tracepoint
- exit both process and its child -> release both events
- first perf_event_detach_bpf_prog call will release tp_event->prog_array
and second perf_event_detach_bpf_prog will crash, because
tp_event->prog_array is NULL
The fix makes sure the perf_event_detach_bpf_prog checks prog_array
is valid before it tries to remove the bpf program from it.
[1] https://lore.kernel.org/bpf/Z1MR6dCIKajNS6nU@krava/T/#m91dbf0688221ec7a7fc9…
Fixes: 0ee288e69d03 ("bpf,perf: Fix perf_event_detach_bpf_prog error handling")
Reported-by: syzbot+2e0d2840414ce817aaac(a)syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa(a)kernel.org>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Link: https://lore.kernel.org/bpf/20241208142507.1207698-1-jolsa@kernel.org
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a403b05a7091..1b8db5aee9d3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2250,6 +2250,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
+ if (!old_array)
+ goto put;
+
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
@@ -2258,6 +2261,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_free_sleepable(old_array);
}
+put:
/*
* It could be that the bpf_prog is not sleepable (and will be freed
* via normal RCU), but is called from a point that supports sleepable
From: yangge <yangge1116(a)126.com>
Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags
in __compaction_suitable()") allow compaction to proceed when free
pages required for compaction reside in the CMA pageblocks, it's
possible that __compaction_suitable() always returns true, and in
some cases, it's not acceptable.
There are 4 NUMA nodes on my machine, and each NUMA node has 32GB
of memory. I have configured 16GB of CMA memory on each NUMA node,
and starting a 32GB virtual machine with device passthrough is
extremely slow, taking almost an hour.
During the start-up of the virtual machine, it will call
pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory.
Long term GUP cannot allocate memory from CMA area, so a maximum
of 16 GB of no-CMA memory on a NUMA node can be used as virtual
machine memory. Since there is 16G of free CMA memory on the NUMA
node, watermark for order-0 always be met for compaction, so
__compaction_suitable() always returns true, even if the node is
unable to allocate non-CMA memory for the virtual machine.
For costly allocations, because __compaction_suitable() always
returns true, __alloc_pages_slowpath() can't exit at the appropriate
place, resulting in excessively long virtual machine startup times.
Call trace:
__alloc_pages_slowpath
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage; // should exit __alloc_pages_slowpath() from here
In order to quickly fall back to remote node, we should remove
ALLOC_CMA both in __compaction_suitable() and __isolate_free_page()
in long term GUP flow. After this fix, starting a 32GB virtual machine
with device passthrough takes only a few seconds.
Fixes: 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: yangge <yangge1116(a)126.com>
---
V4:
- rich the commit log description
V3:
- fix build errors
- add ALLOC_CMA both in should_continue_reclaim() and compaction_ready()
V2:
- using the 'cc->alloc_flags' to determin if 'ALLOC_CMA' is needed
- rich the commit log description
include/linux/compaction.h | 6 ++++--
mm/compaction.c | 18 +++++++++++-------
mm/page_alloc.c | 4 +++-
mm/vmscan.c | 4 ++--
4 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index e947764..b4c3ac3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -90,7 +90,8 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ int highest_zoneidx,
+ unsigned int alloc_flags);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
@@ -108,7 +109,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
}
static inline bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx)
+ int highest_zoneidx,
+ unsigned int alloc_flags)
{
return false;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 07bd227..585f5ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2381,9 +2381,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
static bool __compaction_suitable(struct zone *zone, int order,
int highest_zoneidx,
+ unsigned int alloc_flags,
unsigned long wmark_target)
{
unsigned long watermark;
+ bool use_cma;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2395,25 +2397,27 @@ static bool __compaction_suitable(struct zone *zone, int order,
* even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * In addition to long term GUP flow, ALLOC_CMA is used, as pages in
+ * CMA pageblocks are considered suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ use_cma = !!(alloc_flags & ALLOC_CMA);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ use_cma ? ALLOC_CMA : 0, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx,
+ unsigned int alloc_flags)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2474,7 +2478,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ alloc_flags, available))
return true;
}
@@ -2499,7 +2503,7 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
alloc_flags))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde19db..9a5dfda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2813,6 +2813,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
+ bool pin;
if (!is_migrate_isolate(mt)) {
unsigned long watermark;
@@ -2823,7 +2824,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ pin = !!(current->flags & PF_MEMALLOC_PIN);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, pin ? 0 : ALLOC_CMA))
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e03a61..33f5b46 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5815,7 +5815,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
}
@@ -6043,7 +6043,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return true;
/* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
/*
--
2.7.4
Ensure a non-interruptible wait is used when moving a bo to
XE_PL_SYSTEM. This prevents dma_mappings from being removed prematurely
while a GPU job is still in progress, even if the CPU receives a
signal during the operation.
Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT")
Cc: Thomas Hellström <thomas.hellstrom(a)linux.intel.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: Lucas De Marchi <lucas.demarchi(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v6.11+
Suggested-by: Matthew Auld <matthew.auld(a)intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das(a)intel.com>
Reviewed-by: Matthew Auld <matthew.auld(a)intel.com>
---
drivers/gpu/drm/xe/xe_bo.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 283cd0294570..06931df876ab 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -733,7 +733,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
new_mem->mem_type == XE_PL_SYSTEM) {
long timeout = dma_resv_wait_timeout(ttm_bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP,
- true,
+ false,
MAX_SCHEDULE_TIMEOUT);
if (timeout < 0) {
ret = timeout;
--
2.46.0
This change is specific to Hyper-V VM user.
If the Virtual Machine Connection window is focused,
a Hyper-V VM user can unintentionally touch the keyboard/mouse
when the VM is hibernating or resuming, and consequently the
hibernation or resume operation can be aborted unexpectedly.
Fix the issue by no longer registering the keyboard/mouse as
wakeup devices (see the other two patches for the
changes to drivers/input/serio/hyperv-keyboard.c and
drivers/hid/hid-hyperv.c).
The keyboard/mouse were registered as wakeup devices because the
VM needs to be woken up from the Suspend-to-Idle state after
a user runs "echo freeze > /sys/power/state". It seems like
the Suspend-to-Idle feature has no real users in practice, so
let's no longer support that by returning -EOPNOTSUPP if a
user tries to use that.
Fixes: 1a06d017fb3f ("Drivers: hv: vmbus: Fix Suspend-to-Idle for Generation-2 VM")
Cc: stable(a)vger.kernel.org
Signed-off-by: Saurabh Sengar <ssengar(a)linux.microsoft.com>
Signed-off-by: Erni Sri Satya Vennela <ernis(a)linux.microsoft.com>>
---
Changes in v4:
* No change
Changes in v3:
* Add "Cc: stable(a)vger.kernel.org" in sign-off area.
Changes in v2:
* Add "#define vmbus_freeze NULL" when CONFIG_PM_SLEEP is not
enabled.
---
drivers/hv/vmbus_drv.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 6d89d37b069a..4df6b12bf6a1 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -900,6 +900,19 @@ static void vmbus_shutdown(struct device *child_device)
}
#ifdef CONFIG_PM_SLEEP
+/*
+ * vmbus_freeze - Suspend-to-Idle
+ */
+static int vmbus_freeze(struct device *child_device)
+{
+/*
+ * Do not support Suspend-to-Idle ("echo freeze > /sys/power/state") as
+ * that would require registering the Hyper-V synthetic mouse/keyboard
+ * devices as wakeup devices, which can abort hibernation/resume unexpectedly.
+ */
+ return -EOPNOTSUPP;
+}
+
/*
* vmbus_suspend - Suspend a vmbus device
*/
@@ -938,6 +951,7 @@ static int vmbus_resume(struct device *child_device)
return drv->resume(dev);
}
#else
+#define vmbus_freeze NULL
#define vmbus_suspend NULL
#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */
@@ -969,7 +983,7 @@ static void vmbus_device_release(struct device *device)
*/
static const struct dev_pm_ops vmbus_pm = {
- .suspend_noirq = NULL,
+ .suspend_noirq = vmbus_freeze,
.resume_noirq = NULL,
.freeze_noirq = vmbus_suspend,
.thaw_noirq = vmbus_resume,
--
2.34.1
The quilt patch titled
Subject: mm/hugetlb: change ENOSPC to ENOMEM in alloc_hugetlb_folio
has been removed from the -mm tree. Its filename was
mm-hugetlb-change-enospc-to-enomem-in-alloc_hugetlb_folio.patch
This patch was dropped because it was nacked
------------------------------------------------------
From: Dafna Hirschfeld <dafna.hirschfeld(a)intel.com>
Subject: mm/hugetlb: change ENOSPC to ENOMEM in alloc_hugetlb_folio
Date: Sun, 1 Dec 2024 03:03:41 +0200
The error ENOSPC is translated in vmf_error to VM_FAULT_SIGBUS which is
further translated in EFAULT in i.e. pin/get_user_pages. But when
running out of pages/hugepages we expect to see ENOMEM and not EFAULT.
Link: https://lkml.kernel.org/r/20241201010341.1382431-1-dafna.hirschfeld@intel.c…
Fixes: 8f34af6f93ae ("mm, hugetlb: move the error handle logic out of normal code path")
Signed-off-by: Dafna Hirschfeld <dafna.hirschfeld(a)intel.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/hugetlb.c~mm-hugetlb-change-enospc-to-enomem-in-alloc_hugetlb_folio
+++ a/mm/hugetlb.c
@@ -3113,7 +3113,7 @@ out_end_reservation:
if (!memcg_charge_ret)
mem_cgroup_cancel_charge(memcg, nr_pages);
mem_cgroup_put(memcg);
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-ENOMEM);
}
int alloc_bootmem_huge_page(struct hstate *h, int nid)
_
Patches currently in -mm which might be from dafna.hirschfeld(a)intel.com are
The patch titled
Subject: maple_tree: reload mas before the second call for mas_empty_area
has been added to the -mm mm-unstable branch. Its filename is
maple_tree-reload-mas-before-the-second-call-for-mas_empty_area.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yang Erkun <yangerkun(a)huawei.com>
Subject: maple_tree: reload mas before the second call for mas_empty_area
Date: Sat, 14 Dec 2024 17:30:05 +0800
Change the LONG_MAX in simple_offset_add to 1024, and do latter:
[root@fedora ~]# mkdir /tmp/dir
[root@fedora ~]# for i in {1..1024}; do touch /tmp/dir/$i; done
touch: cannot touch '/tmp/dir/1024': Device or resource busy
[root@fedora ~]# rm /tmp/dir/123
[root@fedora ~]# touch /tmp/dir/1024
[root@fedora ~]# rm /tmp/dir/100
[root@fedora ~]# touch /tmp/dir/1025
touch: cannot touch '/tmp/dir/1025': Device or resource busy
After we delete file 100, actually this is a empty entry, but the latter
create failed unexpected.
mas_alloc_cyclic has two chance to find empty entry. First find the entry
with range range_lo and range_hi, if no empty entry exist, and range_lo >
min, retry find with range min and range_hi. However, the first call
mas_empty_area may mark mas as EBUSY, and the second call for
mas_empty_area will return false directly. Fix this by reload mas before
second call for mas_empty_area.
Link: https://lkml.kernel.org/r/20241214093005.72284-1-yangerkun@huaweicloud.com
Fixes: 9b6713cc7522 ("maple_tree: Add mtree_alloc_cyclic()")
Signed-off-by: Yang Erkun <yangerkun(a)huawei.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Chuck Lever <chuck.lever(a)oracle.com> says:
Cc: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/maple_tree.c | 2 ++
1 file changed, 2 insertions(+)
--- a/lib/maple_tree.c~maple_tree-reload-mas-before-the-second-call-for-mas_empty_area
+++ a/lib/maple_tree.c
@@ -4335,6 +4335,7 @@ int mas_alloc_cyclic(struct ma_state *ma
{
unsigned long min = range_lo;
int ret = 0;
+ struct ma_state m = *mas;
range_lo = max(min, *next);
ret = mas_empty_area(mas, range_lo, range_hi, 1);
@@ -4343,6 +4344,7 @@ int mas_alloc_cyclic(struct ma_state *ma
ret = 1;
}
if (ret < 0 && range_lo > min) {
+ *mas = m;
ret = mas_empty_area(mas, min, range_hi, 1);
if (ret == 0)
ret = 1;
_
Patches currently in -mm which might be from yangerkun(a)huawei.com are
maple_tree-reload-mas-before-the-second-call-for-mas_empty_area.patch
The patch titled
Subject: ocfs2: handle a symlink read error correctly
has been added to the -mm mm-nonmm-unstable branch. Its filename is
ocfs2-handle-a-symlink-read-error-correctly.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Subject: ocfs2: handle a symlink read error correctly
Date: Thu, 5 Dec 2024 17:16:29 +0000
Patch series "Convert ocfs2 to use folios".
Mark did a conversion of ocfs2 to use folios and sent it to me as a
giant patch for review ;-)
So I've redone it as individual patches, and credited Mark for the patches
where his code is substantially the same. It's not a bad way to do it;
his patch had some bugs and my patches had some bugs. Hopefully all our
bugs were different from each other. And hopefully Mark likes all the
changes I made to his code!
This patch (of 23):
If we can't read the buffer, be sure to unlock the page before returning.
Link: https://lkml.kernel.org/r/20241205171653.3179945-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20241205171653.3179945-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Mark Tinguely <mark.tinguely(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/symlink.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/fs/ocfs2/symlink.c~ocfs2-handle-a-symlink-read-error-correctly
+++ a/fs/ocfs2/symlink.c
@@ -65,7 +65,7 @@ static int ocfs2_fast_symlink_read_folio
if (status < 0) {
mlog_errno(status);
- return status;
+ goto out;
}
fe = (struct ocfs2_dinode *) bh->b_data;
@@ -76,9 +76,10 @@ static int ocfs2_fast_symlink_read_folio
memcpy(kaddr, link, len + 1);
kunmap_atomic(kaddr);
SetPageUptodate(page);
+out:
unlock_page(page);
brelse(bh);
- return 0;
+ return status;
}
const struct address_space_operations ocfs2_fast_symlink_aops = {
_
Patches currently in -mm which might be from willy(a)infradead.org are
vmalloc-fix-accounting-with-i915.patch
mm-page_alloc-cache-page_zone-result-in-free_unref_page.patch
mm-make-alloc_pages_mpol-static.patch
mm-page_alloc-export-free_frozen_pages-instead-of-free_unref_page.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-post_alloc_hook.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-prep_new_page.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-get_page_from_freelist.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-__alloc_pages_cpuset_fallback.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-__alloc_pages_may_oom.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-__alloc_pages_direct_compact.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-__alloc_pages_direct_reclaim.patch
mm-page_alloc-move-set_page_refcounted-to-callers-of-__alloc_pages_slowpath.patch
mm-page_alloc-move-set_page_refcounted-to-end-of-__alloc_pages.patch
mm-page_alloc-add-__alloc_frozen_pages.patch
mm-mempolicy-add-alloc_frozen_pages.patch
slab-allocate-frozen-pages.patch
ocfs2-handle-a-symlink-read-error-correctly.patch
ocfs2-convert-ocfs2_page_mkwrite-to-use-a-folio.patch
ocfs2-pass-mmap_folio-around-instead-of-mmap_page.patch
ocfs2-convert-ocfs2_read_inline_data-to-take-a-folio.patch
ocfs2-use-a-folio-in-ocfs2_fast_symlink_read_folio.patch
ocfs2-remove-ocfs2_start_walk_page_trans-prototype.patch
The patch titled
Subject: mm/readahead: fix large folio support in async readahead
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-readahead-fix-large-folio-support-in-async-readahead.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yafang Shao <laoar.shao(a)gmail.com>
Subject: mm/readahead: fix large folio support in async readahead
Date: Fri, 6 Dec 2024 16:30:25 +0800
When testing large folio support with XFS on our servers, we observed that
only a few large folios are mapped when reading large files via mmap.
After a thorough analysis, I identified it was caused by the
`/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this
parameter is set to 128KB. After I tune it to 2MB, the large folio can
work as expected. However, I believe the large folio behavior should not
be dependent on the value of read_ahead_kb. It would be more robust if
the kernel can automatically adopt to it.
With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a
sequential read on a 1GB file using MADV_HUGEPAGE, the differences in
/proc/meminfo are as follows:
- before this patch
FileHugePages: 18432 kB
FilePmdMapped: 4096 kB
- after this patch
FileHugePages: 1067008 kB
FilePmdMapped: 1048576 kB
This shows that after applying the patch, the entire 1GB file is mapped to
huge pages. The stable list is CCed, as without this patch, large folios
don't function optimally in the readahead path.
It's worth noting that if read_ahead_kb is set to a larger value that
isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail
to map to hugepages.
Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com
Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Signed-off-by: Yafang Shao <laoar.shao(a)gmail.com>
Tested-by: kernel test robot <oliver.sang(a)intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/readahead.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
--- a/mm/readahead.c~mm-readahead-fix-large-folio-support-in-async-readahead
+++ a/mm/readahead.c
@@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahea
1UL << order);
if (index == expected) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ /*
+ * In the case of MADV_HUGEPAGE, the actual size might exceed
+ * the readahead window.
+ */
+ ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
ra->async_size = ra->size;
goto readit;
}
_
Patches currently in -mm which might be from laoar.shao(a)gmail.com are
mm-readahead-fix-large-folio-support-in-async-readahead.patch
The quilt patch titled
Subject: mm/readahead: fix large folio support in async readahead
has been removed from the -mm tree. Its filename was
mm-readahead-fix-large-folio-support-in-async-readahead.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Yafang Shao <laoar.shao(a)gmail.com>
Subject: mm/readahead: fix large folio support in async readahead
Date: Fri, 8 Nov 2024 22:17:10 +0800
When testing large folio support with XFS on our servers, we observed that
only a few large folios are mapped when reading large files via mmap.
After a thorough analysis, I identified it was caused by the
`/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this
parameter is set to 128KB. After I tune it to 2MB, the large folio can
work as expected. However, I believe the large folio behavior should not
be dependent on the value of read_ahead_kb. It would be more robust if
the kernel can automatically adopt to it.
With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a
sequential read on a 1GB file using MADV_HUGEPAGE, the differences in
/proc/meminfo are as follows:
- before this patch
FileHugePages: 18432 kB
FilePmdMapped: 4096 kB
- after this patch
FileHugePages: 1067008 kB
FilePmdMapped: 1048576 kB
This shows that after applying the patch, the entire 1GB file is mapped to
huge pages. The stable list is CCed, as without this patch, large folios
don't function optimally in the readahead path.
It's worth noting that if read_ahead_kb is set to a larger value that
isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail
to map to hugepages.
Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Suggested-by: Matthew Wilcox <willy(a)infradead.org>
Signed-off-by: Yafang Shao <laoar.shao(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/readahead.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/readahead.c~mm-readahead-fix-large-folio-support-in-async-readahead
+++ a/mm/readahead.c
@@ -390,6 +390,8 @@ static unsigned long get_next_ra_size(st
return 4 * cur;
if (cur <= max / 2)
return 2 * cur;
+ if (cur > max)
+ return cur;
return max;
}
_
Patches currently in -mm which might be from laoar.shao(a)gmail.com are
mm-readahead-fix-large-folio-support-in-async-readahead-v3.patch
The patch titled
Subject: mm, compaction: don't use ALLOC_CMA in long term GUP flow
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm, compaction: don't use ALLOC_CMA in long term GUP flow
Date: Sun, 15 Dec 2024 18:01:07 +0800
Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags in
__compaction_suitable()") allow compaction to proceed when free pages
required for compaction reside in the CMA pageblocks, it's possible that
__compaction_suitable() always returns true, and in some cases, it's not
acceptable.
There are 4 NUMA nodes on my machine, and each NUMA node has 32GB of
memory. I have configured 16GB of CMA memory on each NUMA node, and
starting a 32GB virtual machine with device passthrough is extremely slow,
taking almost an hour.
During the start-up of the virtual machine, it will call
pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory. Long
term GUP cannot allocate memory from CMA area, so a maximum of 16 GB of
no-CMA memory on a NUMA node can be used as virtual machine memory. Since
there is 16G of free CMA memory on the NUMA node, watermark for order-0
always be met for compaction, so __compaction_suitable() always returns
true, even if the node is unable to allocate non-CMA memory for the
virtual machine.
For costly allocations, because __compaction_suitable() always returns
true, __alloc_pages_slowpath() can't exit at the appropriate place,
resulting in excessively long virtual machine startup times.
Call trace:
__alloc_pages_slowpath
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage; // should exit __alloc_pages_slowpath() from here
In order to quickly fall back to remote node, we should remove ALLOC_CMA
both in __compaction_suitable() and __isolate_free_page() in long term GUP
flow. After this fix, starting a 32GB virtual machine with device
passthrough takes only a few seconds.
Link: https://lkml.kernel.org/r/1734256867-19614-1-git-send-email-yangge1116@126.…
Fixes: 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()")
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/compaction.h | 6 ++++--
mm/compaction.c | 18 +++++++++++-------
mm/page_alloc.c | 4 +++-
mm/vmscan.c | 4 ++--
4 files changed, 20 insertions(+), 12 deletions(-)
--- a/include/linux/compaction.h~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/include/linux/compaction.h
@@ -90,7 +90,8 @@ extern enum compact_result try_to_compac
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ int highest_zoneidx,
+ unsigned int alloc_flags);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
@@ -108,7 +109,8 @@ static inline void reset_isolation_suita
}
static inline bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx)
+ int highest_zoneidx,
+ unsigned int alloc_flags)
{
return false;
}
--- a/mm/compaction.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/compaction.c
@@ -2379,9 +2379,11 @@ static enum compact_result compact_finis
static bool __compaction_suitable(struct zone *zone, int order,
int highest_zoneidx,
+ unsigned int alloc_flags,
unsigned long wmark_target)
{
unsigned long watermark;
+ bool use_cma;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2393,25 +2395,27 @@ static bool __compaction_suitable(struct
* even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * In addition to long term GUP flow, ALLOC_CMA is used, as pages in
+ * CMA pageblocks are considered suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ use_cma = !!(alloc_flags & ALLOC_CMA);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ use_cma ? ALLOC_CMA : 0, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx,
+ unsigned int alloc_flags)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2472,7 +2476,7 @@ bool compaction_zonelist_suitable(struct
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ alloc_flags, available))
return true;
}
@@ -2497,7 +2501,7 @@ compaction_suit_allocation_order(struct
alloc_flags))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
--- a/mm/page_alloc.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/page_alloc.c
@@ -2812,6 +2812,7 @@ int __isolate_free_page(struct page *pag
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
+ bool pin;
if (!is_migrate_isolate(mt)) {
unsigned long watermark;
@@ -2822,7 +2823,8 @@ int __isolate_free_page(struct page *pag
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ pin = !!(current->flags & PF_MEMALLOC_PIN);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, pin ? 0 : ALLOC_CMA))
return 0;
}
--- a/mm/vmscan.c~mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow
+++ a/mm/vmscan.c
@@ -5861,7 +5861,7 @@ static inline bool should_continue_recla
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
}
@@ -6089,7 +6089,7 @@ static inline bool compaction_ready(stru
return true;
/* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
/*
_
Patches currently in -mm which might be from yangge1116(a)126.com are
mm-compaction-dont-use-alloc_cma-in-long-term-gup-flow.patch
This patch series is to fix bugs and improve codes for drivers/of/*.
Signed-off-by: Zijun Hu <quic_zijuhu(a)quicinc.com>
---
Changes in v2:
- Drop applied/conflict/TBD patches.
- Correct based on Rob's comments.
- Link to v1: https://lore.kernel.org/r/20241206-of_core_fix-v1-0-dc28ed56bec3@quicinc.com
---
Zijun Hu (7):
of: Fix API of_find_node_opts_by_path() finding OF device node failure
of: unittest: Add a test case for API of_find_node_opts_by_path()
of: Correct child specifier used as input of the 2nd nexus node
of: Exchange implementation between of_property_present() and of_property_read_bool()
of: Fix available buffer size calculating error in API of_device_uevent_modalias()
of: Fix potential wrong MODALIAS uevent value
of: Do not expose of_alias_scan() and correct its comments
drivers/of/base.c | 13 +++---
drivers/of/device.c | 33 +++++++---------
drivers/of/module.c | 103 +++++++++++++++++++++++++++++-------------------
drivers/of/of_private.h | 2 +
drivers/of/pdt.c | 2 +
drivers/of/unittest.c | 9 +++++
include/linux/of.h | 29 +++++++-------
7 files changed, 109 insertions(+), 82 deletions(-)
---
base-commit: 0f7ca6f69354e0c3923bbc28c92d0ecab4d50a3e
change-id: 20241206-of_core_fix-dc3021a06418
Best regards,
--
Zijun Hu <quic_zijuhu(a)quicinc.com>
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 7a5f93ea5862da91488975acaa0c7abd508f192b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121500-switch-jab-65fc@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7a5f93ea5862da91488975acaa0c7abd508f192b Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda(a)kernel.org>
Date: Sat, 23 Nov 2024 19:03:23 +0100
Subject: [PATCH] rust: kbuild: set `bindgen`'s Rust target version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Each `bindgen` release may upgrade the list of Rust targets. For instance,
currently, in their master branch [1], the latest ones are:
Nightly => {
vectorcall_abi: #124485,
ptr_metadata: #81513,
layout_for_ptr: #69835,
},
Stable_1_77(77) => { offset_of: #106655 },
Stable_1_73(73) => { thiscall_abi: #42202 },
Stable_1_71(71) => { c_unwind_abi: #106075 },
Stable_1_68(68) => { abi_efiapi: #105795 },
By default, the highest stable release in their list is used, and users
are expected to set one if they need to support older Rust versions
(e.g. see [2]).
Thus, over time, new Rust features are used by default, and at some
point, it is likely that `bindgen` will emit Rust code that requires a
Rust version higher than our minimum (or perhaps enabling an unstable
feature). Currently, there is no problem because the maximum they have,
as seen above, is Rust 1.77.0, and our current minimum is Rust 1.78.0.
Therefore, set a Rust target explicitly now to prevent going forward in
time too much and thus getting potential build failures at some point.
Since we also support a minimum `bindgen` version, and since `bindgen`
does not support passing unknown Rust target versions, we need to use
the list of our minimum `bindgen` version, rather than the latest. So,
since `bindgen` 0.65.1 had this list [3], we need to use Rust 1.68.0:
/// Rust stable 1.64
/// * `core_ffi_c` ([Tracking issue](https://github.com/rust-lang/rust/issues/94501))
=> Stable_1_64 => 1.64;
/// Rust stable 1.68
/// * `abi_efiapi` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/65815))
=> Stable_1_68 => 1.68;
/// Nightly rust
/// * `thiscall` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/42202))
/// * `vectorcall` calling convention (no tracking issue)
/// * `c_unwind` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/74990))
=> Nightly => nightly;
...
/// Latest stable release of Rust
pub const LATEST_STABLE_RUST: RustTarget = RustTarget::Stable_1_68;
Thus add the `--rust-target 1.68` parameter. Add a comment as well
explaining this.
An alternative would be to use the currently running (i.e. actual) `rustc`
and `bindgen` versions to pick a "better" Rust target version. However,
that would introduce more moving parts depending on the user setup and
is also more complex to implement.
Starting with `bindgen` 0.71.0 [4], we will be able to set any future
Rust version instead, i.e. we will be able to set here our minimum
supported Rust version. Christian implemented it [5] after seeing this
patch. Thanks!
Cc: Christian Poveda <git(a)pvdrz.com>
Cc: Emilio Cobos Álvarez <emilio(a)crisal.io>
Cc: stable(a)vger.kernel.org # needed for 6.12.y; unneeded for 6.6.y; do not apply to 6.1.y
Fixes: c844fa64a2d4 ("rust: start supporting several `bindgen` versions")
Link: https://github.com/rust-lang/rust-bindgen/blob/21c60f473f4e824d4aa9b2b50805… [1]
Link: https://github.com/rust-lang/rust-bindgen/issues/2960 [2]
Link: https://github.com/rust-lang/rust-bindgen/blob/7d243056d335fdc4537f7bca73c0… [3]
Link: https://github.com/rust-lang/rust-bindgen/blob/main/CHANGELOG.md#0710-2024-… [4]
Link: https://github.com/rust-lang/rust-bindgen/pull/2993 [5]
Reviewed-by: Alice Ryhl <aliceryhl(a)google.com>
Link: https://lore.kernel.org/r/20241123180323.255997-1-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
diff --git a/rust/Makefile b/rust/Makefile
index 9da9042fd627..a40a3936126d 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -280,9 +280,22 @@ endif
# architecture instead of generating `usize`.
bindgen_c_flags_final = $(bindgen_c_flags_lto) -fno-builtin -D__BINDGEN__
+# Each `bindgen` release may upgrade the list of Rust target versions. By
+# default, the highest stable release in their list is used. Thus we need to set
+# a `--rust-target` to avoid future `bindgen` releases emitting code that
+# `rustc` may not understand. On top of that, `bindgen` does not support passing
+# an unknown Rust target version.
+#
+# Therefore, the Rust target for `bindgen` can be only as high as the minimum
+# Rust version the kernel supports and only as high as the greatest stable Rust
+# target supported by the minimum `bindgen` version the kernel supports (that
+# is, if we do not test the actual `rustc`/`bindgen` versions running).
+#
+# Starting with `bindgen` 0.71.0, we will be able to set any future Rust version
+# instead, i.e. we will be able to set here our minimum supported Rust version.
quiet_cmd_bindgen = BINDGEN $@
cmd_bindgen = \
- $(BINDGEN) $< $(bindgen_target_flags) \
+ $(BINDGEN) $< $(bindgen_target_flags) --rust-target 1.68 \
--use-core --with-derive-default --ctypes-prefix ffi --no-layout-tests \
--no-debug '.*' --enable-function-attribute-detection \
-o $@ -- $(bindgen_c_flags_final) -DMODULE \
It should not be possible for every user to override the EDID.
Limit it to the system administrator.
Fixes: 8ef8cc4fca4a ("staging: udlfb: support for writing backup EDID to sysfs file")
Cc: stable(a)vger.kernel.org
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
---
The EDID passed through sysfs is only used as a fallback if the hardware
does not provide one. To me it still feels incorrect to have this
world-writable.
---
drivers/video/fbdev/udlfb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/video/fbdev/udlfb.c b/drivers/video/fbdev/udlfb.c
index 71ac9e36f67c68aa7a54dce32323047a2a9a48bf..391bdb71197549caa839d862f0ce7456dc7bf9ec 100644
--- a/drivers/video/fbdev/udlfb.c
+++ b/drivers/video/fbdev/udlfb.c
@@ -1480,7 +1480,7 @@ static ssize_t metrics_reset_store(struct device *fbdev,
static const struct bin_attribute edid_attr = {
.attr.name = "edid",
- .attr.mode = 0666,
+ .attr.mode = 0644,
.size = EDID_LENGTH,
.read = edid_show,
.write = edid_store
---
base-commit: 2d8308bf5b67dff50262d8a9260a50113b3628c6
change-id: 20241215-udlfb-perms-bb6ed270facf
Best regards,
--
Thomas Weißschuh <linux(a)weissschuh.net>
The channel index is off by one unit if AS73211_SCAN_MASK_ALL is not
set (optimized path for color channel readings), and it must be shifted
instead of leaving an empty channel for the temperature when it is off.
Once the channel index is fixed, the uninitialized channel must be set
to zero to avoid pushing uninitialized data.
Add available_scan_masks for all channels and only-color channels to let
the IIO core demux and repack the enabled channels.
Cc: stable(a)vger.kernel.org
Fixes: 403e5586b52e ("iio: light: as73211: New driver")
Tested-by: Christian Eggers <ceggers(a)arri.de>
Signed-off-by: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
---
This issue was found after attempting to make the same mistake for
a driver I maintain, which was fortunately spotted by Jonathan [1].
Keeping old sensor values if the channel configuration changes is known
and not considered an issue, which is also mentioned in [1], so it has
not been addressed by this series. That keeps most of the drivers out
of the way because they store the scan element in iio private data,
which is kzalloc() allocated.
This series only addresses cases where uninitialized i.e. unknown data
is pushed to the userspace, either due to holes in structs or
uninitialized struct members/array elements.
While analyzing involved functions, I found and fixed some triviality
(wrong function name) in the documentation of iio_dev_opaque.
Link: https://lore.kernel.org/linux-iio/20241123151634.303aa860@jic23-huawei/ [1]
---
Changes in v4:
- Fix as73211_scan_masks[] (first MASK_COLOR, then MASK_ALL, no comma
after 0 i.e. the last element).
- Link to v3: https://lore.kernel.org/r/20241212-iio_memset_scan_holes-v3-1-7f496b6f7222@…
Changes in v3:
- as73211.c: add available_scan_masks for all channels and only-color
channels to let the IIO core demux and repack the enabled channels.
- Link to v2: https://lore.kernel.org/r/20241204-iio_memset_scan_holes-v2-0-3f941592a76d@…
Changes in v2:
- as73211.c: shift channels if no temperature is available and
initialize chan[3] to zero.
- Link to v1: https://lore.kernel.org/r/20241125-iio_memset_scan_holes-v1-0-0cb6e98d895c@…
---
drivers/iio/light/as73211.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/drivers/iio/light/as73211.c b/drivers/iio/light/as73211.c
index be0068081ebb..11fbdcdd26d6 100644
--- a/drivers/iio/light/as73211.c
+++ b/drivers/iio/light/as73211.c
@@ -177,6 +177,12 @@ struct as73211_data {
BIT(AS73211_SCAN_INDEX_TEMP) | \
AS73211_SCAN_MASK_COLOR)
+static const unsigned long as73211_scan_masks[] = {
+ AS73211_SCAN_MASK_COLOR,
+ AS73211_SCAN_MASK_ALL,
+ 0
+};
+
static const struct iio_chan_spec as73211_channels[] = {
{
.type = IIO_TEMP,
@@ -672,9 +678,12 @@ static irqreturn_t as73211_trigger_handler(int irq __always_unused, void *p)
/* AS73211 starts reading at address 2 */
ret = i2c_master_recv(data->client,
- (char *)&scan.chan[1], 3 * sizeof(scan.chan[1]));
+ (char *)&scan.chan[0], 3 * sizeof(scan.chan[0]));
if (ret < 0)
goto done;
+
+ /* Avoid pushing uninitialized data */
+ scan.chan[3] = 0;
}
if (data_result) {
@@ -682,9 +691,15 @@ static irqreturn_t as73211_trigger_handler(int irq __always_unused, void *p)
* Saturate all channels (in case of overflows). Temperature channel
* is not affected by overflows.
*/
- scan.chan[1] = cpu_to_le16(U16_MAX);
- scan.chan[2] = cpu_to_le16(U16_MAX);
- scan.chan[3] = cpu_to_le16(U16_MAX);
+ if (*indio_dev->active_scan_mask == AS73211_SCAN_MASK_ALL) {
+ scan.chan[1] = cpu_to_le16(U16_MAX);
+ scan.chan[2] = cpu_to_le16(U16_MAX);
+ scan.chan[3] = cpu_to_le16(U16_MAX);
+ } else {
+ scan.chan[0] = cpu_to_le16(U16_MAX);
+ scan.chan[1] = cpu_to_le16(U16_MAX);
+ scan.chan[2] = cpu_to_le16(U16_MAX);
+ }
}
iio_push_to_buffers_with_timestamp(indio_dev, &scan, iio_get_time_ns(indio_dev));
@@ -758,6 +773,7 @@ static int as73211_probe(struct i2c_client *client)
indio_dev->channels = data->spec_dev->channels;
indio_dev->num_channels = data->spec_dev->num_channels;
indio_dev->modes = INDIO_DIRECT_MODE;
+ indio_dev->available_scan_masks = as73211_scan_masks;
ret = i2c_smbus_read_byte_data(data->client, AS73211_REG_OSR);
if (ret < 0)
---
base-commit: 91e71d606356e50f238d7a87aacdee4abc427f07
change-id: 20241123-iio_memset_scan_holes-a673833ef932
Best regards,
--
Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 24a9799aa8efecd0eb55a75e35f9d8e6400063aa
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024040834-magazine-audience-8aa4@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
24a9799aa8ef ("smb: client: fix UAF in smb2_reconnect_server()")
7257bcf3bdc7 ("cifs: cifs_chan_is_iface_active should be called with chan_lock held")
27e1fd343f80 ("cifs: after disabling multichannel, mark tcon for reconnect")
fa1d0508bdd4 ("cifs: account for primary channel in the interface list")
a6d8fb54a515 ("cifs: distribute channels across interfaces based on speed")
c37ed2d7d098 ("smb: client: remove extra @chan_count check in __cifs_put_smb_ses()")
ff7d80a9f271 ("cifs: fix session state transition to avoid use-after-free issue")
38c8a9a52082 ("smb: move client and server files to common directory fs/smb")
943fb67b0902 ("cifs: missing lock when updating session status")
bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects")
1bcd548d935a ("cifs: prevent data race in cifs_reconnect_tcon()")
e77978de4765 ("cifs: update ip_addr for ses only for primary chan setup")
3c0070f54b31 ("cifs: prevent data race in smb2_reconnect()")
05844bd661d9 ("cifs: print last update time for interface list")
25cf01b7c920 ("cifs: set correct status of tcon ipc when reconnecting")
abdb1742a312 ("cifs: get rid of mount options string parsing")
9fd29a5bae6e ("cifs: use fs_context for automounts")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 24a9799aa8efecd0eb55a75e35f9d8e6400063aa Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc(a)manguebit.com>
Date: Mon, 1 Apr 2024 14:13:10 -0300
Subject: [PATCH] smb: client: fix UAF in smb2_reconnect_server()
The UAF bug is due to smb2_reconnect_server() accessing a session that
is already being teared down by another thread that is executing
__cifs_put_smb_ses(). This can happen when (a) the client has
connection to the server but no session or (b) another thread ends up
setting @ses->ses_status again to something different than
SES_EXITING.
To fix this, we need to make sure to unconditionally set
@ses->ses_status to SES_EXITING and prevent any other threads from
setting a new status while we're still tearing it down.
The following can be reproduced by adding some delay to right after
the ipc is freed in __cifs_put_smb_ses() - which will give
smb2_reconnect_server() worker a chance to run and then accessing
@ses->ipc:
kinit ...
mount.cifs //srv/share /mnt/1 -o sec=krb5,nohandlecache,echo_interval=10
[disconnect srv]
ls /mnt/1 &>/dev/null
sleep 30
kdestroy
[reconnect srv]
sleep 10
umount /mnt/1
...
CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed
CIFS: VFS: \\srv Send error in SessSetup = -126
CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed
CIFS: VFS: \\srv Send error in SessSetup = -126
general protection fault, probably for non-canonical address
0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP NOPTI
CPU: 3 PID: 50 Comm: kworker/3:1 Not tainted 6.9.0-rc2 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39
04/01/2014
Workqueue: cifsiod smb2_reconnect_server [cifs]
RIP: 0010:__list_del_entry_valid_or_report+0x33/0xf0
Code: 4f 08 48 85 d2 74 42 48 85 c9 74 59 48 b8 00 01 00 00 00 00 ad
de 48 39 c2 74 61 48 b8 22 01 00 00 00 00 74 69 <48> 8b 01 48 39 f8 75
7b 48 8b 72 08 48 39 c6 0f 85 88 00 00 00 b8
RSP: 0018:ffffc900001bfd70 EFLAGS: 00010a83
RAX: dead000000000122 RBX: ffff88810da53838 RCX: 6b6b6b6b6b6b6b6b
RDX: 6b6b6b6b6b6b6b6b RSI: ffffffffc02f6878 RDI: ffff88810da53800
RBP: ffff88810da53800 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff88810c064000
R13: 0000000000000001 R14: ffff88810c064000 R15: ffff8881039cc000
FS: 0000000000000000(0000) GS:ffff888157c00000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fe3728b1000 CR3: 000000010caa4000 CR4: 0000000000750ef0
PKRU: 55555554
Call Trace:
<TASK>
? die_addr+0x36/0x90
? exc_general_protection+0x1c1/0x3f0
? asm_exc_general_protection+0x26/0x30
? __list_del_entry_valid_or_report+0x33/0xf0
__cifs_put_smb_ses+0x1ae/0x500 [cifs]
smb2_reconnect_server+0x4ed/0x710 [cifs]
process_one_work+0x205/0x6b0
worker_thread+0x191/0x360
? __pfx_worker_thread+0x10/0x10
kthread+0xe2/0x110
? __pfx_kthread+0x10/0x10
ret_from_fork+0x34/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paulo Alcantara (Red Hat) <pc(a)manguebit.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 9b85b5341822..ee29bc57300c 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -232,7 +232,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
- /* check if iface is still active */
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
+ continue;
+ }
+ spin_unlock(&ses->ses_lock);
+
spin_lock(&ses->chan_lock);
if (cifs_ses_get_chan_index(ses, server) ==
CIFS_INVAL_CHAN_INDEX) {
@@ -1963,31 +1969,6 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
return rc;
}
-/**
- * cifs_free_ipc - helper to release the session IPC tcon
- * @ses: smb session to unmount the IPC from
- *
- * Needs to be called everytime a session is destroyed.
- *
- * On session close, the IPC is closed and the server must release all tcons of the session.
- * No need to send a tree disconnect here.
- *
- * Besides, it will make the server to not close durable and resilient files on session close, as
- * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request.
- */
-static int
-cifs_free_ipc(struct cifs_ses *ses)
-{
- struct cifs_tcon *tcon = ses->tcon_ipc;
-
- if (tcon == NULL)
- return 0;
-
- tconInfoFree(tcon);
- ses->tcon_ipc = NULL;
- return 0;
-}
-
static struct cifs_ses *
cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
@@ -2019,48 +2000,52 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void __cifs_put_smb_ses(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
+ struct cifs_tcon *tcon;
unsigned int xid;
size_t i;
+ bool do_logoff;
int rc;
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_EXITING) {
- spin_unlock(&ses->ses_lock);
- return;
- }
- spin_unlock(&ses->ses_lock);
-
- cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
- cifs_dbg(FYI,
- "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE");
-
spin_lock(&cifs_tcp_ses_lock);
- if (--ses->ses_count > 0) {
+ spin_lock(&ses->ses_lock);
+ cifs_dbg(FYI, "%s: id=0x%llx ses_count=%d ses_status=%u ipc=%s\n",
+ __func__, ses->Suid, ses->ses_count, ses->ses_status,
+ ses->tcon_ipc ? ses->tcon_ipc->tree_name : "none");
+ if (ses->ses_status == SES_EXITING || --ses->ses_count > 0) {
+ spin_unlock(&ses->ses_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
}
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_GOOD)
- ses->ses_status = SES_EXITING;
- spin_unlock(&ses->ses_lock);
- spin_unlock(&cifs_tcp_ses_lock);
-
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_EXITING && server->ops->logoff) {
- spin_unlock(&ses->ses_lock);
- cifs_free_ipc(ses);
+ spin_lock(&ses->chan_lock);
+ cifs_chan_clear_need_reconnect(ses, server);
+ spin_unlock(&ses->chan_lock);
+
+ do_logoff = ses->ses_status == SES_GOOD && server->ops->logoff;
+ ses->ses_status = SES_EXITING;
+ tcon = ses->tcon_ipc;
+ ses->tcon_ipc = NULL;
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ /*
+ * On session close, the IPC is closed and the server must release all
+ * tcons of the session. No need to send a tree disconnect here.
+ *
+ * Besides, it will make the server to not close durable and resilient
+ * files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an
+ * SMB2 LOGOFF Request.
+ */
+ tconInfoFree(tcon);
+ if (do_logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
if (rc)
cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
__func__, rc);
_free_xid(xid);
- } else {
- spin_unlock(&ses->ses_lock);
- cifs_free_ipc(ses);
}
spin_lock(&cifs_tcp_ses_lock);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 659b9ba7cb2d7adb64618b87ddfaa528a143766e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121504-cloak-campfire-b8a9@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 659b9ba7cb2d7adb64618b87ddfaa528a143766e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Date: Thu, 12 Dec 2024 01:20:49 -0800
Subject: [PATCH] bpf: Check size for BTF-based ctx access of pointer members
Robert Morris reported the following program type which passes the
verifier in [0]:
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
asm volatile("r2 = *(u16*)(r1 + 0)"); // verifier should demand u64
asm volatile("*(u32 *)(r2 +1504) = 0"); // 1280 in some configs
}
The second line may or may not work, but the first instruction shouldn't
pass, as it's a narrow load into the context structure of the struct ops
callback. The code falls back to btf_ctx_access to ensure correctness
and obtaining the types of pointers. Ensure that the size of the access
is correctly checked to be 8 bytes, otherwise the verifier thinks the
narrow load obtained a trusted BTF pointer and will permit loads/stores
as it sees fit.
Perform the check on size after we've verified that the load is for a
pointer field, as for scalar values narrow loads are fine. Access to
structs passed as arguments to a BPF program are also treated as
scalars, therefore no adjustment is needed in their case.
Existing verifier selftests are broken by this change, but because they
were incorrect. Verifier tests for d_path were performing narrow load
into context to obtain path pointer, had this program actually run it
would cause a crash. The same holds for verifier_btf_ctx_access tests.
[0]: https://lore.kernel.org/bpf/51338.1732985814@localhost
Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF")
Reported-by: Robert Morris <rtm(a)mit.edu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Link: https://lore.kernel.org/r/20241212092050.3204165-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e7a59e6462a9..a63a03582f02 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6543,6 +6543,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
index a570e48b917a..bfc3bf18fed4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void btf_ctx_access_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 8); /* load 2nd argument value (int pointer) */\
+ r2 = *(u64 *)(r1 + 8); /* load 2nd argument value (int pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
@@ -23,7 +23,7 @@ __success __retval(0)
__naked void ctx_access_u32_pointer_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
+ r2 = *(u64 *)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
diff --git a/tools/testing/selftests/bpf/progs/verifier_d_path.c b/tools/testing/selftests/bpf/progs/verifier_d_path.c
index ec79cbcfde91..87e51a215558 100644
--- a/tools/testing/selftests/bpf/progs/verifier_d_path.c
+++ b/tools/testing/selftests/bpf/progs/verifier_d_path.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void d_path_accept(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
@@ -31,7 +31,7 @@ __failure __msg("helper call is not allowed in probe")
__naked void d_path_reject(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 659b9ba7cb2d7adb64618b87ddfaa528a143766e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121503-footwork-harness-0128@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 659b9ba7cb2d7adb64618b87ddfaa528a143766e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Date: Thu, 12 Dec 2024 01:20:49 -0800
Subject: [PATCH] bpf: Check size for BTF-based ctx access of pointer members
Robert Morris reported the following program type which passes the
verifier in [0]:
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
asm volatile("r2 = *(u16*)(r1 + 0)"); // verifier should demand u64
asm volatile("*(u32 *)(r2 +1504) = 0"); // 1280 in some configs
}
The second line may or may not work, but the first instruction shouldn't
pass, as it's a narrow load into the context structure of the struct ops
callback. The code falls back to btf_ctx_access to ensure correctness
and obtaining the types of pointers. Ensure that the size of the access
is correctly checked to be 8 bytes, otherwise the verifier thinks the
narrow load obtained a trusted BTF pointer and will permit loads/stores
as it sees fit.
Perform the check on size after we've verified that the load is for a
pointer field, as for scalar values narrow loads are fine. Access to
structs passed as arguments to a BPF program are also treated as
scalars, therefore no adjustment is needed in their case.
Existing verifier selftests are broken by this change, but because they
were incorrect. Verifier tests for d_path were performing narrow load
into context to obtain path pointer, had this program actually run it
would cause a crash. The same holds for verifier_btf_ctx_access tests.
[0]: https://lore.kernel.org/bpf/51338.1732985814@localhost
Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF")
Reported-by: Robert Morris <rtm(a)mit.edu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Link: https://lore.kernel.org/r/20241212092050.3204165-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e7a59e6462a9..a63a03582f02 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6543,6 +6543,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
index a570e48b917a..bfc3bf18fed4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void btf_ctx_access_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 8); /* load 2nd argument value (int pointer) */\
+ r2 = *(u64 *)(r1 + 8); /* load 2nd argument value (int pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
@@ -23,7 +23,7 @@ __success __retval(0)
__naked void ctx_access_u32_pointer_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
+ r2 = *(u64 *)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
diff --git a/tools/testing/selftests/bpf/progs/verifier_d_path.c b/tools/testing/selftests/bpf/progs/verifier_d_path.c
index ec79cbcfde91..87e51a215558 100644
--- a/tools/testing/selftests/bpf/progs/verifier_d_path.c
+++ b/tools/testing/selftests/bpf/progs/verifier_d_path.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void d_path_accept(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
@@ -31,7 +31,7 @@ __failure __msg("helper call is not allowed in probe")
__naked void d_path_reject(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 659b9ba7cb2d7adb64618b87ddfaa528a143766e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121502-patriarch-unguarded-b4c5@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 659b9ba7cb2d7adb64618b87ddfaa528a143766e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Date: Thu, 12 Dec 2024 01:20:49 -0800
Subject: [PATCH] bpf: Check size for BTF-based ctx access of pointer members
Robert Morris reported the following program type which passes the
verifier in [0]:
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
asm volatile("r2 = *(u16*)(r1 + 0)"); // verifier should demand u64
asm volatile("*(u32 *)(r2 +1504) = 0"); // 1280 in some configs
}
The second line may or may not work, but the first instruction shouldn't
pass, as it's a narrow load into the context structure of the struct ops
callback. The code falls back to btf_ctx_access to ensure correctness
and obtaining the types of pointers. Ensure that the size of the access
is correctly checked to be 8 bytes, otherwise the verifier thinks the
narrow load obtained a trusted BTF pointer and will permit loads/stores
as it sees fit.
Perform the check on size after we've verified that the load is for a
pointer field, as for scalar values narrow loads are fine. Access to
structs passed as arguments to a BPF program are also treated as
scalars, therefore no adjustment is needed in their case.
Existing verifier selftests are broken by this change, but because they
were incorrect. Verifier tests for d_path were performing narrow load
into context to obtain path pointer, had this program actually run it
would cause a crash. The same holds for verifier_btf_ctx_access tests.
[0]: https://lore.kernel.org/bpf/51338.1732985814@localhost
Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF")
Reported-by: Robert Morris <rtm(a)mit.edu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor(a)gmail.com>
Link: https://lore.kernel.org/r/20241212092050.3204165-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e7a59e6462a9..a63a03582f02 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6543,6 +6543,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
index a570e48b917a..bfc3bf18fed4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void btf_ctx_access_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 8); /* load 2nd argument value (int pointer) */\
+ r2 = *(u64 *)(r1 + 8); /* load 2nd argument value (int pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
@@ -23,7 +23,7 @@ __success __retval(0)
__naked void ctx_access_u32_pointer_accept(void)
{
asm volatile (" \
- r2 = *(u32*)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
+ r2 = *(u64 *)(r1 + 0); /* load 1nd argument value (u32 pointer) */\
r0 = 0; \
exit; \
" ::: __clobber_all);
diff --git a/tools/testing/selftests/bpf/progs/verifier_d_path.c b/tools/testing/selftests/bpf/progs/verifier_d_path.c
index ec79cbcfde91..87e51a215558 100644
--- a/tools/testing/selftests/bpf/progs/verifier_d_path.c
+++ b/tools/testing/selftests/bpf/progs/verifier_d_path.c
@@ -11,7 +11,7 @@ __success __retval(0)
__naked void d_path_accept(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
@@ -31,7 +31,7 @@ __failure __msg("helper call is not allowed in probe")
__naked void d_path_reject(void)
{
asm volatile (" \
- r1 = *(u32*)(r1 + 0); \
+ r1 = *(u64 *)(r1 + 0); \
r2 = r10; \
r2 += -8; \
r6 = 0; \
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x a440a28ddbdcb861150987b4d6e828631656b92f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121553-prison-usage-c221@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a440a28ddbdcb861150987b4d6e828631656b92f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:24 -0800
Subject: [PATCH] xfs: fix off-by-one error in fsmap's end_daddr usage
In commit ca6448aed4f10a, we created an "end_daddr" variable to fix
fsmap reporting when the end of the range requested falls in the middle
of an unknown (aka free on the rmapbt) region. Unfortunately, I didn't
notice that the the code sets end_daddr to the last sector of the device
but then uses that quantity to compute the length of the synthesized
mapping.
Zizhi Wo later observed that when end_daddr isn't set, we still don't
report the last fsblock on a device because in that case (aka when
info->last is true), the info->high mapping that we pass to
xfs_getfsmap_group_helper has a startblock that points to the last
fsblock. This is also wrong because the code uses startblock to
compute the length of the synthesized mapping.
Fix the second problem by setting end_daddr unconditionally, and fix the
first problem by setting start_daddr to one past the end of the range to
query.
Cc: <stable(a)vger.kernel.org> # v6.11
Fixes: ca6448aed4f10a ("xfs: Fix missing interval for missing_owner in xfs fsmap")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reported-by: Zizhi Wo <wozizhi(a)huawei.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 82f2e0dd2249..3290dd8524a6 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -163,7 +163,8 @@ struct xfs_getfsmap_info {
xfs_daddr_t next_daddr; /* next daddr we expect */
/* daddr of low fsmap key when we're using the rtbitmap */
xfs_daddr_t low_daddr;
- xfs_daddr_t end_daddr; /* daddr of high fsmap key */
+ /* daddr of high fsmap key, or the last daddr on the device */
+ xfs_daddr_t end_daddr;
u64 missing_owner; /* owner of holes */
u32 dev; /* device id */
/*
@@ -387,8 +388,8 @@ xfs_getfsmap_group_helper(
* we calculated from userspace's high key to synthesize the record.
* Note that if the btree query found a mapping, there won't be a gap.
*/
- if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL)
- frec->start_daddr = info->end_daddr;
+ if (info->last)
+ frec->start_daddr = info->end_daddr + 1;
else
frec->start_daddr = xfs_gbno_to_daddr(xg, startblock);
@@ -736,11 +737,10 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
* we calculated from userspace's high key to synthesize the record.
* Note that if the btree query found a mapping, there won't be a gap.
*/
- if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) {
- frec.start_daddr = info->end_daddr;
- } else {
+ if (info->last)
+ frec.start_daddr = info->end_daddr + 1;
+ else
frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb);
- }
frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount);
return xfs_getfsmap_helper(tp, info, &frec);
@@ -933,7 +933,10 @@ xfs_getfsmap(
struct xfs_trans *tp = NULL;
struct xfs_fsmap dkeys[2]; /* per-dev keys */
struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
- struct xfs_getfsmap_info info = { NULL };
+ struct xfs_getfsmap_info info = {
+ .fsmap_recs = fsmap_recs,
+ .head = head,
+ };
bool use_rmap;
int i;
int error = 0;
@@ -998,9 +1001,6 @@ xfs_getfsmap(
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
- info.end_daddr = XFS_BUF_DADDR_NULL;
- info.fsmap_recs = fsmap_recs;
- info.head = head;
/* For each device we support... */
for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
@@ -1013,17 +1013,23 @@ xfs_getfsmap(
break;
/*
- * If this device number matches the high key, we have
- * to pass the high key to the handler to limit the
- * query results. If the device number exceeds the
- * low key, zero out the low key so that we get
- * everything from the beginning.
+ * If this device number matches the high key, we have to pass
+ * the high key to the handler to limit the query results, and
+ * set the end_daddr so that we can synthesize records at the
+ * end of the query range or device.
*/
if (handlers[i].dev == head->fmh_keys[1].fmr_device) {
dkeys[1] = head->fmh_keys[1];
info.end_daddr = min(handlers[i].nr_sectors - 1,
dkeys[1].fmr_physical);
+ } else {
+ info.end_daddr = handlers[i].nr_sectors - 1;
}
+
+ /*
+ * If the device number exceeds the low key, zero out the low
+ * key so that we get everything from the beginning.
+ */
if (handlers[i].dev > head->fmh_keys[0].fmr_device)
memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x aa7bfb537edf62085d7718845f6644b0e4efb9df
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121542-fretful-identity-5931@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aa7bfb537edf62085d7718845f6644b0e4efb9df Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:27 -0800
Subject: [PATCH] xfs: separate healthy clearing mask during repair
In commit d9041681dd2f53 we introduced some XFS_SICK_*ZAPPED flags so
that the inode record repair code could clean up a damaged inode record
enough to iget the inode but still be able to remember that the higher
level repair code needs to be called. As part of that, we introduced a
xchk_mark_healthy_if_clean helper that is supposed to cause the ZAPPED
state to be removed if that higher level metadata actually checks out.
This was done by setting additional bits in sick_mask hoping that
xchk_update_health will clear all those bits after a healthy scrub.
Unfortunately, that's not quite what sick_mask means -- bits in that
mask are indeed cleared if the metadata is healthy, but they're set if
the metadata is NOT healthy. fsck is only intended to set the ZAPPED
bits explicitly.
If something else sets the CORRUPT/XCORRUPT state after the
xchk_mark_healthy_if_clean call, we end up marking the metadata zapped.
This can happen if the following sequence happens:
1. Scrub runs, discovers that the metadata is fine but could be
optimized and calls xchk_mark_healthy_if_clean on a ZAPPED flag.
That causes the ZAPPED flag to be set in sick_mask because the
metadata is not CORRUPT or XCORRUPT.
2. Repair runs to optimize the metadata.
3. Some other metadata used for cross-referencing in (1) becomes
corrupt.
4. Post-repair scrub runs, but this time it sets CORRUPT or XCORRUPT due
to the events in (3).
5. Now the xchk_health_update sets the ZAPPED flag on the metadata we
just repaired. This is not the correct state.
Fix this by moving the "if healthy" mask to a separate field, and only
ever using it to clear the sick state.
Cc: <stable(a)vger.kernel.org> # v6.8
Fixes: d9041681dd2f53 ("xfs: set inode sick state flags when we zap either ondisk fork")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index ce86bdad37fa..ccc6ca5934ca 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -71,7 +71,8 @@
/* Map our scrub type to a sick mask and a set of health update functions. */
enum xchk_health_group {
- XHG_FS = 1,
+ XHG_NONE = 1,
+ XHG_FS,
XHG_AG,
XHG_INO,
XHG_RTGROUP,
@@ -83,6 +84,7 @@ struct xchk_health_map {
};
static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 },
[XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB },
[XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF },
[XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL },
@@ -133,7 +135,7 @@ xchk_mark_healthy_if_clean(
{
if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
XFS_SCRUB_OFLAG_XCORRUPT)))
- sc->sick_mask |= mask;
+ sc->healthy_mask |= mask;
}
/*
@@ -189,6 +191,7 @@ xchk_update_health(
{
struct xfs_perag *pag;
struct xfs_rtgroup *rtg;
+ unsigned int mask = sc->sick_mask;
bool bad;
/*
@@ -203,50 +206,56 @@ xchk_update_health(
return;
}
- if (!sc->sick_mask)
- return;
-
bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
XFS_SCRUB_OFLAG_XCORRUPT));
+ if (!bad)
+ mask |= sc->healthy_mask;
switch (type_to_health_flag[sc->sm->sm_type].group) {
+ case XHG_NONE:
+ break;
case XHG_AG:
+ if (!mask)
+ return;
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_group_mark_corrupt(pag_group(pag), sc->sick_mask);
+ xfs_group_mark_corrupt(pag_group(pag), mask);
else
- xfs_group_mark_healthy(pag_group(pag), sc->sick_mask);
+ xfs_group_mark_healthy(pag_group(pag), mask);
xfs_perag_put(pag);
break;
case XHG_INO:
if (!sc->ip)
return;
- if (bad) {
- unsigned int mask = sc->sick_mask;
-
- /*
- * If we're coming in for repairs then we don't want
- * sickness flags to propagate to the incore health
- * status if the inode gets inactivated before we can
- * fix it.
- */
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
- mask |= XFS_SICK_INO_FORGET;
+ /*
+ * If we're coming in for repairs then we don't want sickness
+ * flags to propagate to the incore health status if the inode
+ * gets inactivated before we can fix it.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mask |= XFS_SICK_INO_FORGET;
+ if (!mask)
+ return;
+ if (bad)
xfs_inode_mark_corrupt(sc->ip, mask);
- } else
- xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+ else
+ xfs_inode_mark_healthy(sc->ip, mask);
break;
case XHG_FS:
+ if (!mask)
+ return;
if (bad)
- xfs_fs_mark_corrupt(sc->mp, sc->sick_mask);
+ xfs_fs_mark_corrupt(sc->mp, mask);
else
- xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+ xfs_fs_mark_healthy(sc->mp, mask);
break;
case XHG_RTGROUP:
+ if (!mask)
+ return;
rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_group_mark_corrupt(rtg_group(rtg), sc->sick_mask);
+ xfs_group_mark_corrupt(rtg_group(rtg), mask);
else
- xfs_group_mark_healthy(rtg_group(rtg), sc->sick_mask);
+ xfs_group_mark_healthy(rtg_group(rtg), mask);
xfs_rtgroup_put(rtg);
break;
default:
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index a7fda3e2b013..5dbbe93cb49b 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -184,6 +184,12 @@ struct xfs_scrub {
*/
unsigned int sick_mask;
+ /*
+ * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for
+ * removing ZAPPED flags after a repair.
+ */
+ unsigned int healthy_mask;
+
/* next time we want to cond_resched() */
struct xchk_relax relax;
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x acc8f8628c3737108f36e5637f4d5daeaf96d90e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121529-renewal-passable-b4c9@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From acc8f8628c3737108f36e5637f4d5daeaf96d90e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:38 -0800
Subject: [PATCH] xfs: attach dquot buffer to dquot log item buffer
Ever since 6.12-rc1, I've observed a pile of warnings from the kernel
when running fstests with quotas enabled:
WARNING: CPU: 1 PID: 458580 at mm/page_alloc.c:4221 __alloc_pages_noprof+0xc9c/0xf18
CPU: 1 UID: 0 PID: 458580 Comm: xfsaild/sda3 Tainted: G W 6.12.0-rc6-djwa #rc6 6ee3e0e531f6457e2d26aa008a3b65ff184b377c
<snip>
Call trace:
__alloc_pages_noprof+0xc9c/0xf18
alloc_pages_mpol_noprof+0x94/0x240
alloc_pages_noprof+0x68/0xf8
new_slab+0x3e0/0x568
___slab_alloc+0x5a0/0xb88
__slab_alloc.constprop.0+0x7c/0xf8
__kmalloc_noprof+0x404/0x4d0
xfs_buf_get_map+0x594/0xde0 [xfs 384cb02810558b4c490343c164e9407332118f88]
xfs_buf_read_map+0x64/0x2e0 [xfs 384cb02810558b4c490343c164e9407332118f88]
xfs_trans_read_buf_map+0x1dc/0x518 [xfs 384cb02810558b4c490343c164e9407332118f88]
xfs_qm_dqflush+0xac/0x468 [xfs 384cb02810558b4c490343c164e9407332118f88]
xfs_qm_dquot_logitem_push+0xe4/0x148 [xfs 384cb02810558b4c490343c164e9407332118f88]
xfsaild+0x3f4/0xde8 [xfs 384cb02810558b4c490343c164e9407332118f88]
kthread+0x110/0x128
ret_from_fork+0x10/0x20
---[ end trace 0000000000000000 ]---
This corresponds to the line:
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
within the NOFAIL checks. What's happening here is that the XFS AIL is
trying to write a disk quota update back into the filesystem, but for
that it needs to read the ondisk buffer for the dquot. The buffer is
not in memory anymore, probably because it was evicted. Regardless, the
buffer cache tries to allocate a new buffer, but those allocations are
NOFAIL. The AIL thread has marked itself PF_MEMALLOC (aka noreclaim)
since commit 43ff2122e6492b ("xfs: on-stack delayed write buffer lists")
presumably because reclaim can push on XFS to push on the AIL.
An easy way to fix this probably would have been to drop the NOFAIL flag
from the xfs_buf allocation and open code a retry loop, but then there's
still the problem that for bs>ps filesystems, the buffer itself could
require up to 64k worth of pages.
Inode items had similar behavior (multi-page cluster buffers that we
don't want to allocate in the AIL) which we solved by making transaction
precommit attach the inode cluster buffers to the dirty log item. Let's
solve the dquot problem in the same way.
So: Make a real precommit handler to read the dquot buffer and attach it
to the log item; pass it to dqflush in the push method; and have the
iodone function detach the buffer once we've flushed everything. Add a
state flag to the log item to track when a thread has entered the
precommit -> push mechanism to skip the detaching if it turns out that
the dquot is very busy, as we don't hold the dquot lock between log item
commit and AIL push).
Reading and attaching the dquot buffer in the precommit hook is inspired
by the work done for inode cluster buffers some time ago.
Cc: <stable(a)vger.kernel.org> # v6.12
Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1dc85de58e59..708fd3358375 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -68,6 +68,30 @@ xfs_dquot_mark_sick(
}
}
+/*
+ * Detach the dquot buffer if it's still attached, because we can get called
+ * through dqpurge after a log shutdown. Caller must hold the dqflock or have
+ * otherwise isolated the dquot.
+ */
+void
+xfs_dquot_detach_buf(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_dq_logitem *qlip = &dqp->q_logitem;
+ struct xfs_buf *bp = NULL;
+
+ spin_lock(&qlip->qli_lock);
+ if (qlip->qli_item.li_buf) {
+ bp = qlip->qli_item.li_buf;
+ qlip->qli_item.li_buf = NULL;
+ }
+ spin_unlock(&qlip->qli_lock);
+ if (bp) {
+ list_del_init(&qlip->qli_item.li_bio_list);
+ xfs_buf_rele(bp);
+ }
+}
+
/*
* This is called to free all the memory associated with a dquot
*/
@@ -76,6 +100,7 @@ xfs_qm_dqdestroy(
struct xfs_dquot *dqp)
{
ASSERT(list_empty(&dqp->q_lru));
+ ASSERT(dqp->q_logitem.qli_item.li_buf == NULL);
kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
@@ -1146,6 +1171,7 @@ xfs_qm_dqflush_done(
container_of(lip, struct xfs_dq_logitem, qli_item);
struct xfs_dquot *dqp = qlip->qli_dquot;
struct xfs_ail *ailp = lip->li_ailp;
+ struct xfs_buf *bp = NULL;
xfs_lsn_t tail_lsn;
/*
@@ -1171,6 +1197,20 @@ xfs_qm_dqflush_done(
}
}
+ /*
+ * If this dquot hasn't been dirtied since initiating the last dqflush,
+ * release the buffer reference. We already unlinked this dquot item
+ * from the buffer.
+ */
+ spin_lock(&qlip->qli_lock);
+ if (!qlip->qli_dirty) {
+ bp = lip->li_buf;
+ lip->li_buf = NULL;
+ }
+ spin_unlock(&qlip->qli_lock);
+ if (bp)
+ xfs_buf_rele(bp);
+
/*
* Release the dq's flush lock since we're done with it.
*/
@@ -1197,7 +1237,7 @@ xfs_buf_dquot_io_fail(
spin_lock(&bp->b_mount->m_ail->ail_lock);
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
- xfs_set_li_failed(lip, bp);
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
spin_unlock(&bp->b_mount->m_ail->ail_lock);
}
@@ -1249,6 +1289,7 @@ int
xfs_dquot_read_buf(
struct xfs_trans *tp,
struct xfs_dquot *dqp,
+ xfs_buf_flags_t xbf_flags,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
@@ -1256,7 +1297,7 @@ xfs_dquot_read_buf(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ mp->m_quotainfo->qi_dqchunklen, xbf_flags,
&bp, &xfs_dquot_buf_ops);
if (error == -EAGAIN)
return error;
@@ -1275,6 +1316,77 @@ xfs_dquot_read_buf(
return error;
}
+/*
+ * Attach a dquot buffer to this dquot to avoid allocating a buffer during a
+ * dqflush, since dqflush can be called from reclaim context.
+ */
+int
+xfs_dquot_attach_buf(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ struct xfs_dq_logitem *qlip = &dqp->q_logitem;
+ struct xfs_log_item *lip = &qlip->qli_item;
+ int error;
+
+ spin_lock(&qlip->qli_lock);
+ if (!lip->li_buf) {
+ struct xfs_buf *bp = NULL;
+
+ spin_unlock(&qlip->qli_lock);
+ error = xfs_dquot_read_buf(tp, dqp, 0, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Attach the dquot to the buffer so that the AIL does not have
+ * to read the dquot buffer to push this item.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&qlip->qli_lock);
+ lip->li_buf = bp;
+ xfs_trans_brelse(tp, bp);
+ }
+ qlip->qli_dirty = true;
+ spin_unlock(&qlip->qli_lock);
+
+ return 0;
+}
+
+/*
+ * Get a new reference the dquot buffer attached to this dquot for a dqflush
+ * operation.
+ *
+ * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked
+ * bp; or -EAGAIN if the buffer could not be locked.
+ */
+int
+xfs_dquot_use_attached_buf(
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf;
+
+ /*
+ * A NULL buffer can happen if the dquot dirty flag was set but the
+ * filesystem shut down before transaction commit happened. In that
+ * case we're not going to flush anyway.
+ */
+ if (!bp) {
+ ASSERT(xfs_is_shutdown(dqp->q_mount));
+
+ *bpp = NULL;
+ return 0;
+ }
+
+ if (!xfs_buf_trylock(bp))
+ return -EAGAIN;
+
+ xfs_buf_hold(bp);
+ *bpp = bp;
+ return 0;
+}
+
/*
* Write a modified dquot to disk.
* The dquot must be locked and the flush lock too taken by caller.
@@ -1289,7 +1401,8 @@ xfs_qm_dqflush(
struct xfs_buf *bp)
{
struct xfs_mount *mp = dqp->q_mount;
- struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
+ struct xfs_dq_logitem *qlip = &dqp->q_logitem;
+ struct xfs_log_item *lip = &qlip->qli_item;
struct xfs_dqblk *dqblk;
xfs_failaddr_t fa;
int error;
@@ -1319,8 +1432,15 @@ xfs_qm_dqflush(
*/
dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
- xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
- &lip->li_lsn);
+ /*
+ * We hold the dquot lock, so nobody can dirty it while we're
+ * scheduling the write out. Clear the dirty-since-flush flag.
+ */
+ spin_lock(&qlip->qli_lock);
+ qlip->qli_dirty = false;
+ spin_unlock(&qlip->qli_lock);
+
+ xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn);
/*
* copy the lsn into the on-disk dquot now while we have the in memory
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 50f8404c4117..c7e80fc90823 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -215,7 +215,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp,
- struct xfs_buf **bpp);
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp);
void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
@@ -239,6 +239,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q);
void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp);
+int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp);
+void xfs_dquot_detach_buf(struct xfs_dquot *dqp);
+
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
xfs_dqlock(dqp);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 56ecc5ed0193..271b195ebb93 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push(
__releases(&lip->li_ailp->ail_lock)
__acquires(&lip->li_ailp->ail_lock)
{
- struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- struct xfs_buf *bp = lip->li_buf;
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_dquot *dqp = qlip->qli_dquot;
+ struct xfs_buf *bp;
uint rval = XFS_ITEM_SUCCESS;
int error;
@@ -155,11 +156,10 @@ xfs_qm_dquot_logitem_push(
spin_unlock(&lip->li_ailp->ail_lock);
- error = xfs_dquot_read_buf(NULL, dqp, &bp);
- if (error) {
- if (error == -EAGAIN)
- rval = XFS_ITEM_LOCKED;
+ error = xfs_dquot_use_attached_buf(dqp, &bp);
+ if (error == -EAGAIN) {
xfs_dqfunlock(dqp);
+ rval = XFS_ITEM_LOCKED;
goto out_relock_ail;
}
@@ -207,12 +207,10 @@ xfs_qm_dquot_logitem_committing(
}
#ifdef DEBUG_EXPENSIVE
-static int
-xfs_qm_dquot_logitem_precommit(
- struct xfs_trans *tp,
- struct xfs_log_item *lip)
+static void
+xfs_qm_dquot_logitem_precommit_check(
+ struct xfs_dquot *dqp)
{
- struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
struct xfs_mount *mp = dqp->q_mount;
struct xfs_disk_dquot ddq = { };
xfs_failaddr_t fa;
@@ -228,13 +226,24 @@ xfs_qm_dquot_logitem_precommit(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
ASSERT(fa == NULL);
}
-
- return 0;
}
#else
-# define xfs_qm_dquot_logitem_precommit NULL
+# define xfs_qm_dquot_logitem_precommit_check(...) ((void)0)
#endif
+static int
+xfs_qm_dquot_logitem_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_dquot *dqp = qlip->qli_dquot;
+
+ xfs_qm_dquot_logitem_precommit_check(dqp);
+
+ return xfs_dquot_attach_buf(tp, dqp);
+}
+
static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
.iop_precommit = xfs_qm_dquot_logitem_precommit,
@@ -259,5 +268,7 @@ xfs_qm_dquot_logitem_init(
xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
&xfs_dquot_item_ops);
+ spin_lock_init(&lp->qli_lock);
lp->qli_dquot = dqp;
+ lp->qli_dirty = false;
}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 794710c24474..d66e52807d76 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -14,6 +14,13 @@ struct xfs_dq_logitem {
struct xfs_log_item qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+
+ /*
+ * We use this spinlock to coordinate access to the li_buf pointer in
+ * the log item and the qli_dirty flag.
+ */
+ spinlock_t qli_lock;
+ bool qli_dirty; /* dirtied since last flush? */
};
void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d9ac50a33c57..7d07d4b5c339 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -148,7 +148,7 @@ xfs_qm_dqpurge(
* We don't care about getting disk errors here. We need
* to purge this dquot anyway, so we go ahead regardless.
*/
- error = xfs_dquot_read_buf(NULL, dqp, &bp);
+ error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
if (error == -EAGAIN) {
xfs_dqfunlock(dqp);
dqp->q_flags &= ~XFS_DQFLAG_FREEING;
@@ -168,6 +168,7 @@ xfs_qm_dqpurge(
}
xfs_dqflock(dqp);
}
+ xfs_dquot_detach_buf(dqp);
out_funlock:
ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -505,7 +506,7 @@ xfs_qm_dquot_isolate(
/* we have to drop the LRU lock to flush the dquot */
spin_unlock(&lru->lock);
- error = xfs_dquot_read_buf(NULL, dqp, &bp);
+ error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
if (error) {
xfs_dqfunlock(dqp);
goto out_unlock_dirty;
@@ -523,6 +524,8 @@ xfs_qm_dquot_isolate(
xfs_buf_relse(bp);
goto out_unlock_dirty;
}
+
+ xfs_dquot_detach_buf(dqp);
xfs_dqfunlock(dqp);
/*
@@ -1510,7 +1513,7 @@ xfs_qm_flush_one(
goto out_unlock;
}
- error = xfs_dquot_read_buf(NULL, dqp, &bp);
+ error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 8ede9d099d1f..f56d62dced97 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -360,7 +360,7 @@ xfsaild_resubmit_item(
/* protected by ail_lock */
list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (bp->b_flags & _XBF_INODES)
+ if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS))
clear_bit(XFS_LI_FAILED, &lip->li_flags);
else
xfs_clear_li_failed(lip);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121515-delta-gutter-495f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121515-pursuant-reversion-cf82@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121514-bagging-surplus-f572@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121513-porcupine-corporate-5dfe@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121512-nimbly-playlist-a34e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 07137e925fa951646325762bda6bd2503dfe64c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121511-savings-proactive-9581@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:36 -0800
Subject: [PATCH] xfs: don't lose solo dquot update transactions
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to commiting the log items to
the CIL.
However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fa1317cc396c..d7565462af3d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 427a8ba0ab99..4cd25717c9d1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -866,6 +866,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -894,11 +895,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -924,7 +920,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1018,7 +1014,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 481ba3dc9f19..713b6d243e56 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x ca378189fdfa890a4f0622f85ee41b710bbac271
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121557-herring-copartner-3013@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ca378189fdfa890a4f0622f85ee41b710bbac271 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:39 -0800
Subject: [PATCH] xfs: convert quotacheck to attach dquot buffers
Now that we've converted the dquot logging machinery to attach the dquot
buffer to the li_buf pointer so that the AIL dqflush doesn't have to
allocate or read buffers in a reclaim path, do the same for the
quotacheck code so that the reclaim shrinker dqflush call doesn't have
to do that either.
Cc: <stable(a)vger.kernel.org> # v6.12
Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 708fd3358375..f11d475898f2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1285,11 +1285,10 @@ xfs_qm_dqflush_check(
* Requires dquot flush lock, will clear the dirty flag, delete the quota log
* item from the AIL, and shut down the system if something goes wrong.
*/
-int
+static int
xfs_dquot_read_buf(
struct xfs_trans *tp,
struct xfs_dquot *dqp,
- xfs_buf_flags_t xbf_flags,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
@@ -1297,10 +1296,8 @@ xfs_dquot_read_buf(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, xbf_flags,
+ mp->m_quotainfo->qi_dqchunklen, 0,
&bp, &xfs_dquot_buf_ops);
- if (error == -EAGAIN)
- return error;
if (xfs_metadata_is_sick(error))
xfs_dquot_mark_sick(dqp);
if (error)
@@ -1334,7 +1331,7 @@ xfs_dquot_attach_buf(
struct xfs_buf *bp = NULL;
spin_unlock(&qlip->qli_lock);
- error = xfs_dquot_read_buf(tp, dqp, 0, &bp);
+ error = xfs_dquot_read_buf(tp, dqp, &bp);
if (error)
return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c7e80fc90823..c617bac75361 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -214,8 +214,6 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY)
void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
-int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp,
- xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp);
void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 7d07d4b5c339..69b70c3e999d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -148,13 +148,13 @@ xfs_qm_dqpurge(
* We don't care about getting disk errors here. We need
* to purge this dquot anyway, so we go ahead regardless.
*/
- error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
+ error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error == -EAGAIN) {
xfs_dqfunlock(dqp);
dqp->q_flags &= ~XFS_DQFLAG_FREEING;
goto out_unlock;
}
- if (error)
+ if (!bp)
goto out_funlock;
/*
@@ -506,8 +506,8 @@ xfs_qm_dquot_isolate(
/* we have to drop the LRU lock to flush the dquot */
spin_unlock(&lru->lock);
- error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
- if (error) {
+ error = xfs_dquot_use_attached_buf(dqp, &bp);
+ if (!bp || error == -EAGAIN) {
xfs_dqfunlock(dqp);
goto out_unlock_dirty;
}
@@ -1331,6 +1331,10 @@ xfs_qm_quotacheck_dqadjust(
return error;
}
+ error = xfs_dquot_attach_buf(NULL, dqp);
+ if (error)
+ return error;
+
trace_xfs_dqadjust(dqp);
/*
@@ -1513,9 +1517,13 @@ xfs_qm_flush_one(
goto out_unlock;
}
- error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
+ error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error)
goto out_unlock;
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
error = xfs_qm_dqflush(dqp, bp);
if (!error)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121523-ought-imminent-d0a7@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121523-campus-alike-0706@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121522-parcel-pebble-9775@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121521-unrefined-numerate-45fa@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121521-squeegee-certified-9462@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121520-distract-perish-f190@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 7f8b718c58783f3ff0810b39e2f62f50ba2549f6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121530-postwar-percent-fbe9@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7f8b718c58783f3ff0810b39e2f62f50ba2549f6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:43 -0800
Subject: [PATCH] xfs: return from xfs_symlink_verify early on V4 filesystems
V4 symlink blocks didn't have headers, so return early if this is a V4
filesystem.
Cc: <stable(a)vger.kernel.org> # v5.1
Fixes: 39708c20ab5133 ("xfs: miscellaneous verifier magic value fixups")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index f228127a88ff..fb47a76ead18 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -92,8 +92,10 @@ xfs_symlink_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ /* no verification of non-crc buffers */
if (!xfs_has_crc(mp))
- return __this_address;
+ return NULL;
+
if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 7f8b718c58783f3ff0810b39e2f62f50ba2549f6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121529-cornbread-brink-915a@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7f8b718c58783f3ff0810b39e2f62f50ba2549f6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:43 -0800
Subject: [PATCH] xfs: return from xfs_symlink_verify early on V4 filesystems
V4 symlink blocks didn't have headers, so return early if this is a V4
filesystem.
Cc: <stable(a)vger.kernel.org> # v5.1
Fixes: 39708c20ab5133 ("xfs: miscellaneous verifier magic value fixups")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index f228127a88ff..fb47a76ead18 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -92,8 +92,10 @@ xfs_symlink_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ /* no verification of non-crc buffers */
if (!xfs_has_crc(mp))
- return __this_address;
+ return NULL;
+
if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 6d7b4bc1c3e00b1a25b7a05141a64337b4629337
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121558-humid-untagged-523e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6d7b4bc1c3e00b1a25b7a05141a64337b4629337 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:31 -0800
Subject: [PATCH] xfs: update btree keys correctly when _insrec splits an inode
root block
In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec
would erroneously try to update the parent's key for a block that had
been split if we decided to insert the new record into the new block.
The solution was to detect this situation and update the in-core key
value that we pass up to the caller so that the caller will (eventually)
add the new block to the parent level of the tree with the correct key.
However, I missed a subtlety about the way inode-rooted btrees work. If
the full block was a maximally sized inode root block, we'll solve that
fullness by moving the root block's records to a new block, resizing the
root block, and updating the root to point to the new block. We don't
pass a pointer to the new block to the caller because that work has
already been done. The new record will /always/ land in the new block,
so in this case we need to use xfs_btree_update_keys to update the keys.
This bug can theoretically manifest itself in the very rare case that we
split a bmbt root block and the new record lands in the very first slot
of the new block, though I've never managed to trigger it in practice.
However, it is very easy to reproduce by running generic/522 with the
realtime rmapbt patchset if rtinherit=1.
Cc: <stable(a)vger.kernel.org> # v4.8
Fixes: 2c813ad66a7218 ("xfs: support btrees with overlapping intervals for keys")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c748866ef923..68ee1c299c25 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3557,14 +3557,31 @@ xfs_btree_insrec(
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
/*
- * If we just inserted into a new tree block, we have to
- * recalculate nkey here because nkey is out of date.
+ * Update btree keys to reflect the newly added record or keyptr.
+ * There are three cases here to be aware of. Normally, all we have to
+ * do is walk towards the root, updating keys as necessary.
*
- * Otherwise we're just updating an existing block (having shoved
- * some records into the new tree block), so use the regular key
- * update mechanism.
+ * If the caller had us target a full block for the insertion, we dealt
+ * with that by calling the _make_block_unfull function. If the
+ * "make unfull" function splits the block, it'll hand us back the key
+ * and pointer of the new block. We haven't yet added the new block to
+ * the next level up, so if we decide to add the new record to the new
+ * block (bp->b_bn != old_bn), we have to update the caller's pointer
+ * so that the caller adds the new block with the correct key.
+ *
+ * However, there is a third possibility-- if the selected block is the
+ * root block of an inode-rooted btree and cannot be expanded further,
+ * the "make unfull" function moves the root block contents to a new
+ * block and updates the root block to point to the new block. In this
+ * case, no block pointer is passed back because the block has already
+ * been added to the btree. In this case, we need to use the regular
+ * key update function, just like the first case. This is critical for
+ * overlapping btrees, because the high key must be updated to reflect
+ * the entire tree, not just the subtree accessible through the first
+ * child of the root (which is now two levels down from the root).
*/
- if (bp && xfs_buf_daddr(bp) != old_bn) {
+ if (!xfs_btree_ptr_is_null(cur, &nptr) &&
+ bp && xfs_buf_daddr(bp) != old_bn) {
xfs_btree_get_keys(cur, block, lkey);
} else if (xfs_btree_needs_key_update(cur, optr)) {
error = xfs_btree_update_keys(cur, level);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 6d7b4bc1c3e00b1a25b7a05141a64337b4629337
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121557-maker-flint-95f1@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6d7b4bc1c3e00b1a25b7a05141a64337b4629337 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:31 -0800
Subject: [PATCH] xfs: update btree keys correctly when _insrec splits an inode
root block
In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec
would erroneously try to update the parent's key for a block that had
been split if we decided to insert the new record into the new block.
The solution was to detect this situation and update the in-core key
value that we pass up to the caller so that the caller will (eventually)
add the new block to the parent level of the tree with the correct key.
However, I missed a subtlety about the way inode-rooted btrees work. If
the full block was a maximally sized inode root block, we'll solve that
fullness by moving the root block's records to a new block, resizing the
root block, and updating the root to point to the new block. We don't
pass a pointer to the new block to the caller because that work has
already been done. The new record will /always/ land in the new block,
so in this case we need to use xfs_btree_update_keys to update the keys.
This bug can theoretically manifest itself in the very rare case that we
split a bmbt root block and the new record lands in the very first slot
of the new block, though I've never managed to trigger it in practice.
However, it is very easy to reproduce by running generic/522 with the
realtime rmapbt patchset if rtinherit=1.
Cc: <stable(a)vger.kernel.org> # v4.8
Fixes: 2c813ad66a7218 ("xfs: support btrees with overlapping intervals for keys")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c748866ef923..68ee1c299c25 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3557,14 +3557,31 @@ xfs_btree_insrec(
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
/*
- * If we just inserted into a new tree block, we have to
- * recalculate nkey here because nkey is out of date.
+ * Update btree keys to reflect the newly added record or keyptr.
+ * There are three cases here to be aware of. Normally, all we have to
+ * do is walk towards the root, updating keys as necessary.
*
- * Otherwise we're just updating an existing block (having shoved
- * some records into the new tree block), so use the regular key
- * update mechanism.
+ * If the caller had us target a full block for the insertion, we dealt
+ * with that by calling the _make_block_unfull function. If the
+ * "make unfull" function splits the block, it'll hand us back the key
+ * and pointer of the new block. We haven't yet added the new block to
+ * the next level up, so if we decide to add the new record to the new
+ * block (bp->b_bn != old_bn), we have to update the caller's pointer
+ * so that the caller adds the new block with the correct key.
+ *
+ * However, there is a third possibility-- if the selected block is the
+ * root block of an inode-rooted btree and cannot be expanded further,
+ * the "make unfull" function moves the root block contents to a new
+ * block and updates the root block to point to the new block. In this
+ * case, no block pointer is passed back because the block has already
+ * been added to the btree. In this case, we need to use the regular
+ * key update function, just like the first case. This is critical for
+ * overlapping btrees, because the high key must be updated to reflect
+ * the entire tree, not just the subtree accessible through the first
+ * child of the root (which is now two levels down from the root).
*/
- if (bp && xfs_buf_daddr(bp) != old_bn) {
+ if (!xfs_btree_ptr_is_null(cur, &nptr) &&
+ bp && xfs_buf_daddr(bp) != old_bn) {
xfs_btree_get_keys(cur, block, lkey);
} else if (xfs_btree_needs_key_update(cur, optr)) {
error = xfs_btree_update_keys(cur, level);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121512-dazzling-encounter-7d53@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c004a793e0ec34047c3bd423bcd8966f5fac88dc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121511-citable-photo-455b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 2 Dec 2024 10:57:42 -0800
Subject: [PATCH] xfs: fix zero byte checking in the superblock scrubber
The logic to check that the region past the end of the superblock is all
zeroes is wrong -- we don't want to check only the bytes past the end of
the maximally sized ondisk superblock structure as currently defined in
xfs_format.h; we want to check the bytes beyond the end of the ondisk as
defined by the feature bits.
Port the superblock size logic from xfs_repair and then put it to use in
xfs_scrub.
Cc: <stable(a)vger.kernel.org> # v4.15
Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 88063d67cb5f..9f8c312dfd3c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -59,6 +59,32 @@ xchk_superblock_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
/*
* Scrub the filesystem superblock.
*
@@ -75,6 +101,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -388,8 +415,8 @@ xchk_superblock(
}
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 70ec2e8be72c8cb71eb6a18f223484d2a39b708f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121546-path-thicket-fc09@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70ec2e8be72c8cb71eb6a18f223484d2a39b708f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala(a)linux.intel.com>
Date: Wed, 20 Nov 2024 18:41:20 +0200
Subject: [PATCH] drm/i915/dsb: Don't use indexed register writes needlessly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Turns out the DSB indexed register write command has
rather significant initial overhead compared to the normal
MMIO write command. Based on some quick experiments on TGL
you have to write the register at least ~5 times for the
indexed write command to come out ahead. If you write the
register less times than that the MMIO write is faster.
So it seems my automagic indexed write logic was a bit
misguided. Go back to the original approach only use
indexed writes for the cases we know will benefit from
it (indexed LUT register updates).
Currently we shouldn't have any cases where this truly
matters (just some rare double writes to the precision
LUT index registers), but we will need to switch the
legacy LUT updates to write each LUT register twice (to
avoid some palette anti-collision logic troubles).
This would be close to the worst case for using indexed
writes (two writes per register, and 256 separate registers).
Using the MMIO write command should shave off around 30%
of the execution time compared to using the indexed write
command.
Cc: stable(a)vger.kernel.org
Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates")
Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy LUT")
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241120164123.12706-2-ville.…
Reviewed-by: Uma Shankar <uma.shankar(a)intel.com>
(cherry picked from commit ecba559a88ab8399a41893d7828caf4dccbeab6c)
Signed-off-by: Tvrtko Ursulin <tursulin(a)ursulin.net>
diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c
index 174753625bca..6ea3d5c58cb1 100644
--- a/drivers/gpu/drm/i915/display/intel_color.c
+++ b/drivers/gpu/drm/i915/display/intel_color.c
@@ -1343,6 +1343,17 @@ static void ilk_lut_write(const struct intel_crtc_state *crtc_state,
intel_de_write_fw(display, reg, val);
}
+static void ilk_lut_write_indexed(const struct intel_crtc_state *crtc_state,
+ i915_reg_t reg, u32 val)
+{
+ struct intel_display *display = to_intel_display(crtc_state);
+
+ if (crtc_state->dsb_color_vblank)
+ intel_dsb_reg_write_indexed(crtc_state->dsb_color_vblank, reg, val);
+ else
+ intel_de_write_fw(display, reg, val);
+}
+
static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state,
const struct drm_property_blob *blob)
{
@@ -1458,8 +1469,8 @@ static void bdw_load_lut_10(const struct intel_crtc_state *crtc_state,
prec_index);
for (i = 0; i < lut_size; i++)
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
- ilk_lut_10(&lut[i]));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
+ ilk_lut_10(&lut[i]));
/*
* Reset the index, otherwise it prevents the legacy palette to be
@@ -1612,16 +1623,16 @@ static void glk_load_degamma_lut(const struct intel_crtc_state *crtc_state,
* ToDo: Extend to max 7.0. Enable 32 bit input value
* as compared to just 16 to achieve this.
*/
- ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe),
- DISPLAY_VER(display) >= 14 ?
- mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i]));
+ ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe),
+ DISPLAY_VER(display) >= 14 ?
+ mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i]));
}
/* Clamp values > 1.0. */
while (i++ < glk_degamma_lut_size(display))
- ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe),
- DISPLAY_VER(display) >= 14 ?
- 1 << 24 : 1 << 16);
+ ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe),
+ DISPLAY_VER(display) >= 14 ?
+ 1 << 24 : 1 << 16);
ilk_lut_write(crtc_state, PRE_CSC_GAMC_INDEX(pipe), 0);
}
@@ -1687,10 +1698,10 @@ icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
for (i = 0; i < 9; i++) {
const struct drm_color_lut *entry = &lut[i];
- ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
- ilk_lut_12p4_ldw(entry));
- ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
- ilk_lut_12p4_udw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
+ ilk_lut_12p4_ldw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
+ ilk_lut_12p4_udw(entry));
}
ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_INDEX(pipe),
@@ -1726,10 +1737,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
for (i = 1; i < 257; i++) {
entry = &lut[i * 8];
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
- ilk_lut_12p4_ldw(entry));
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
- ilk_lut_12p4_udw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
+ ilk_lut_12p4_ldw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
+ ilk_lut_12p4_udw(entry));
}
/*
@@ -1747,10 +1758,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
for (i = 0; i < 256; i++) {
entry = &lut[i * 8 * 128];
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
- ilk_lut_12p4_ldw(entry));
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
- ilk_lut_12p4_udw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
+ ilk_lut_12p4_ldw(entry));
+ ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
+ ilk_lut_12p4_udw(entry));
}
ilk_lut_write(crtc_state, PREC_PAL_INDEX(pipe),
diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c
index b7b44399adaa..4d3785f5cb52 100644
--- a/drivers/gpu/drm/i915/display/intel_dsb.c
+++ b/drivers/gpu/drm/i915/display/intel_dsb.c
@@ -273,16 +273,20 @@ static bool intel_dsb_prev_ins_is_indexed_write(struct intel_dsb *dsb, i915_reg_
}
/**
- * intel_dsb_reg_write() - Emit register wriite to the DSB context
+ * intel_dsb_reg_write_indexed() - Emit register wriite to the DSB context
* @dsb: DSB context
* @reg: register address.
* @val: value.
*
* This function is used for writing register-value pair in command
* buffer of DSB.
+ *
+ * Note that indexed writes are slower than normal MMIO writes
+ * for a small number (less than 5 or so) of writes to the same
+ * register.
*/
-void intel_dsb_reg_write(struct intel_dsb *dsb,
- i915_reg_t reg, u32 val)
+void intel_dsb_reg_write_indexed(struct intel_dsb *dsb,
+ i915_reg_t reg, u32 val)
{
/*
* For example the buffer will look like below for 3 dwords for auto
@@ -340,6 +344,15 @@ void intel_dsb_reg_write(struct intel_dsb *dsb,
}
}
+void intel_dsb_reg_write(struct intel_dsb *dsb,
+ i915_reg_t reg, u32 val)
+{
+ intel_dsb_emit(dsb, val,
+ (DSB_OPCODE_MMIO_WRITE << DSB_OPCODE_SHIFT) |
+ (DSB_BYTE_EN << DSB_BYTE_EN_SHIFT) |
+ i915_mmio_reg_offset(reg));
+}
+
static u32 intel_dsb_mask_to_byte_en(u32 mask)
{
return (!!(mask & 0xff000000) << 3 |
diff --git a/drivers/gpu/drm/i915/display/intel_dsb.h b/drivers/gpu/drm/i915/display/intel_dsb.h
index 33e0fc2ab380..da6df07a3c83 100644
--- a/drivers/gpu/drm/i915/display/intel_dsb.h
+++ b/drivers/gpu/drm/i915/display/intel_dsb.h
@@ -34,6 +34,8 @@ void intel_dsb_finish(struct intel_dsb *dsb);
void intel_dsb_cleanup(struct intel_dsb *dsb);
void intel_dsb_reg_write(struct intel_dsb *dsb,
i915_reg_t reg, u32 val);
+void intel_dsb_reg_write_indexed(struct intel_dsb *dsb,
+ i915_reg_t reg, u32 val);
void intel_dsb_reg_write_masked(struct intel_dsb *dsb,
i915_reg_t reg, u32 mask, u32 val);
void intel_dsb_noop(struct intel_dsb *dsb, int count);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x a48f744bef9ee74814a9eccb030b02223e48c76c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121529-embellish-ruby-d51f@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a48f744bef9ee74814a9eccb030b02223e48c76c Mon Sep 17 00:00:00 2001
From: Neal Frager <neal.frager(a)amd.com>
Date: Mon, 2 Dec 2024 23:41:51 +0530
Subject: [PATCH] usb: dwc3: xilinx: make sure pipe clock is deselected in usb2
only mode
When the USB3 PHY is not defined in the Linux device tree, there could
still be a case where there is a USB3 PHY active on the board and enabled
by the first stage bootloader. If serdes clock is being used then the USB
will fail to enumerate devices in 2.0 only mode.
To solve this, make sure that the PIPE clock is deselected whenever the
USB3 PHY is not defined and guarantees that the USB2 only mode will work
in all cases.
Fixes: 9678f3361afc ("usb: dwc3: xilinx: Skip resets and USB3 register settings for USB2.0 mode")
Cc: stable(a)vger.kernel.org
Signed-off-by: Neal Frager <neal.frager(a)amd.com>
Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey(a)amd.com>
Acked-by: Peter Korsgaard <peter(a)korsgaard.com>
Link: https://lore.kernel.org/r/1733163111-1414816-1-git-send-email-radhey.shyam.…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c
index e3738e1610db..a33a42ba0249 100644
--- a/drivers/usb/dwc3/dwc3-xilinx.c
+++ b/drivers/usb/dwc3/dwc3-xilinx.c
@@ -121,8 +121,11 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data)
* in use but the usb3-phy entry is missing from the device tree.
* Therefore, skip these operations in this case.
*/
- if (!priv_data->usb3_phy)
+ if (!priv_data->usb3_phy) {
+ /* Deselect the PIPE Clock Select bit in FPD PIPE Clock register */
+ writel(PIPE_CLK_DESELECT, priv_data->regs + XLNX_USB_FPD_PIPE_CLK);
goto skip_usb3_phy;
+ }
crst = devm_reset_control_get_exclusive(dev, "usb_crst");
if (IS_ERR(crst)) {
From: yangge <yangge1116(a)126.com>
Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags
in __compaction_suitable()") allow compaction to proceed when free
pages required for compaction reside in the CMA pageblocks, it's
possible that __compaction_suitable() always returns true, and in
some cases, it's not acceptable.
There are 4 NUMA nodes on my machine, and each NUMA node has 32GB
of memory. I have configured 16GB of CMA memory on each NUMA node,
and starting a 32GB virtual machine with device passthrough is
extremely slow, taking almost an hour.
During the start-up of the virtual machine, it will call
pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory.
Long term GUP cannot allocate memory from CMA area, so a maximum
of 16 GB of no-CMA memory on a NUMA node can be used as virtual
machine memory. Since there is 16G of free CMA memory on the NUMA
node, watermark for order-0 always be met for compaction, so
__compaction_suitable() always returns true, even if the node is
unable to allocate non-CMA memory for the virtual machine.
For costly allocations, because __compaction_suitable() always
returns true, __alloc_pages_slowpath() can't exit at the appropriate
place, resulting in excessively long virtual machine startup times.
Call trace:
__alloc_pages_slowpath
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage; // should exit __alloc_pages_slowpath() from here
To sum up, during long term GUP flow, we should remove ALLOC_CMA
both in __compaction_suitable() and __isolate_free_page().
Signed-off-by: yangge <yangge1116(a)126.com>
---
V3:
- fix build errors
- add ALLOC_CMA both in should_continue_reclaim() and compaction_ready()
V2:
- using the 'cc->alloc_flags' to determin if 'ALLOC_CMA' is needed
- rich the commit log description
include/linux/compaction.h | 6 ++++--
mm/compaction.c | 18 +++++++++++-------
mm/page_alloc.c | 4 +++-
mm/vmscan.c | 4 ++--
4 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index e947764..b4c3ac3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -90,7 +90,8 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ int highest_zoneidx,
+ unsigned int alloc_flags);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
@@ -108,7 +109,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
}
static inline bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx)
+ int highest_zoneidx,
+ unsigned int alloc_flags)
{
return false;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 07bd227..585f5ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2381,9 +2381,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
static bool __compaction_suitable(struct zone *zone, int order,
int highest_zoneidx,
+ unsigned int alloc_flags,
unsigned long wmark_target)
{
unsigned long watermark;
+ bool use_cma;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2395,25 +2397,27 @@ static bool __compaction_suitable(struct zone *zone, int order,
* even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * In addition to long term GUP flow, ALLOC_CMA is used, as pages in
+ * CMA pageblocks are considered suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ use_cma = !!(alloc_flags & ALLOC_CMA);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ use_cma ? ALLOC_CMA : 0, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx,
+ unsigned int alloc_flags)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2474,7 +2478,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ alloc_flags, available))
return true;
}
@@ -2499,7 +2503,7 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
alloc_flags))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde19db..9a5dfda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2813,6 +2813,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
+ bool pin;
if (!is_migrate_isolate(mt)) {
unsigned long watermark;
@@ -2823,7 +2824,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ pin = !!(current->flags & PF_MEMALLOC_PIN);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, pin ? 0 : ALLOC_CMA))
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e03a61..33f5b46 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5815,7 +5815,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
}
@@ -6043,7 +6043,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return true;
/* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA))
return false;
/*
--
2.7.4
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 86e6ca55b83c575ab0f2e105cf08f98e58d3d7af
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121539-nutty-gangly-3dff@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 86e6ca55b83c575ab0f2e105cf08f98e58d3d7af Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Fri, 6 Dec 2024 07:59:51 -1000
Subject: [PATCH] blk-cgroup: Fix UAF in blkcg_unpin_online()
blkcg_unpin_online() walks up the blkcg hierarchy putting the online pin. To
walk up, it uses blkcg_parent(blkcg) but it was calling that after
blkcg_destroy_blkgs(blkcg) which could free the blkcg, leading to the
following UAF:
==================================================================
BUG: KASAN: slab-use-after-free in blkcg_unpin_online+0x15a/0x270
Read of size 8 at addr ffff8881057678c0 by task kworker/9:1/117
CPU: 9 UID: 0 PID: 117 Comm: kworker/9:1 Not tainted 6.13.0-rc1-work-00182-gb8f52214c61a-dirty #48
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 02/02/2022
Workqueue: cgwb_release cgwb_release_workfn
Call Trace:
<TASK>
dump_stack_lvl+0x27/0x80
print_report+0x151/0x710
kasan_report+0xc0/0x100
blkcg_unpin_online+0x15a/0x270
cgwb_release_workfn+0x194/0x480
process_scheduled_works+0x71b/0xe20
worker_thread+0x82a/0xbd0
kthread+0x242/0x2c0
ret_from_fork+0x33/0x70
ret_from_fork_asm+0x1a/0x30
</TASK>
...
Freed by task 1944:
kasan_save_track+0x2b/0x70
kasan_save_free_info+0x3c/0x50
__kasan_slab_free+0x33/0x50
kfree+0x10c/0x330
css_free_rwork_fn+0xe6/0xb30
process_scheduled_works+0x71b/0xe20
worker_thread+0x82a/0xbd0
kthread+0x242/0x2c0
ret_from_fork+0x33/0x70
ret_from_fork_asm+0x1a/0x30
Note that the UAF is not easy to trigger as the free path is indirected
behind a couple RCU grace periods and a work item execution. I could only
trigger it with artifical msleep() injected in blkcg_unpin_online().
Fix it by reading the parent pointer before destroying the blkcg's blkg's.
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Reported-by: Abagail ren <renzezhongucas(a)gmail.com>
Suggested-by: Linus Torvalds <torvalds(a)linuxfoundation.org>
Fixes: 4308a434e5e0 ("blkcg: don't offline parent blkcg first")
Cc: stable(a)vger.kernel.org # v5.7+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e68c725cf8d9..45a395862fbc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1324,10 +1324,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
struct blkcg *blkcg = css_to_blkcg(blkcg_css);
do {
+ struct blkcg *parent;
+
if (!refcount_dec_and_test(&blkcg->online_pin))
break;
+
+ parent = blkcg_parent(blkcg);
blkcg_destroy_blkgs(blkcg);
- blkcg = blkcg_parent(blkcg);
+ blkcg = parent;
} while (blkcg);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 86e6ca55b83c575ab0f2e105cf08f98e58d3d7af
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121538-engorge-overplant-b846@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 86e6ca55b83c575ab0f2e105cf08f98e58d3d7af Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Fri, 6 Dec 2024 07:59:51 -1000
Subject: [PATCH] blk-cgroup: Fix UAF in blkcg_unpin_online()
blkcg_unpin_online() walks up the blkcg hierarchy putting the online pin. To
walk up, it uses blkcg_parent(blkcg) but it was calling that after
blkcg_destroy_blkgs(blkcg) which could free the blkcg, leading to the
following UAF:
==================================================================
BUG: KASAN: slab-use-after-free in blkcg_unpin_online+0x15a/0x270
Read of size 8 at addr ffff8881057678c0 by task kworker/9:1/117
CPU: 9 UID: 0 PID: 117 Comm: kworker/9:1 Not tainted 6.13.0-rc1-work-00182-gb8f52214c61a-dirty #48
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 02/02/2022
Workqueue: cgwb_release cgwb_release_workfn
Call Trace:
<TASK>
dump_stack_lvl+0x27/0x80
print_report+0x151/0x710
kasan_report+0xc0/0x100
blkcg_unpin_online+0x15a/0x270
cgwb_release_workfn+0x194/0x480
process_scheduled_works+0x71b/0xe20
worker_thread+0x82a/0xbd0
kthread+0x242/0x2c0
ret_from_fork+0x33/0x70
ret_from_fork_asm+0x1a/0x30
</TASK>
...
Freed by task 1944:
kasan_save_track+0x2b/0x70
kasan_save_free_info+0x3c/0x50
__kasan_slab_free+0x33/0x50
kfree+0x10c/0x330
css_free_rwork_fn+0xe6/0xb30
process_scheduled_works+0x71b/0xe20
worker_thread+0x82a/0xbd0
kthread+0x242/0x2c0
ret_from_fork+0x33/0x70
ret_from_fork_asm+0x1a/0x30
Note that the UAF is not easy to trigger as the free path is indirected
behind a couple RCU grace periods and a work item execution. I could only
trigger it with artifical msleep() injected in blkcg_unpin_online().
Fix it by reading the parent pointer before destroying the blkcg's blkg's.
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Reported-by: Abagail ren <renzezhongucas(a)gmail.com>
Suggested-by: Linus Torvalds <torvalds(a)linuxfoundation.org>
Fixes: 4308a434e5e0 ("blkcg: don't offline parent blkcg first")
Cc: stable(a)vger.kernel.org # v5.7+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e68c725cf8d9..45a395862fbc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1324,10 +1324,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
struct blkcg *blkcg = css_to_blkcg(blkcg_css);
do {
+ struct blkcg *parent;
+
if (!refcount_dec_and_test(&blkcg->online_pin))
break;
+
+ parent = blkcg_parent(blkcg);
blkcg_destroy_blkgs(blkcg);
- blkcg = blkcg_parent(blkcg);
+ blkcg = parent;
} while (blkcg);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 7cc0e0a43a91052477c2921f924a37d9c3891f0c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121535-obscurity-banking-b3af@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Date: Mon, 25 Nov 2024 13:58:56 +0200
Subject: [PATCH] serial: sh-sci: Check if TX data was written to device in
.tx_empty()
On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port()
is called. The uart_suspend_port() calls 3 times the
struct uart_port::ops::tx_empty() before shutting down the port.
According to the documentation, the struct uart_port::ops::tx_empty()
API tests whether the transmitter FIFO and shifter for the port is
empty.
The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the
transmit FIFO through the FDR (FIFO Data Count Register). The data units
in the FIFOs are written in the shift register and transmitted from there.
The TEND bit in the Serial Status Register reports if the data was
transmitted from the shift register.
In the previous code, in the tx_empty() API implemented by the sh-sci
driver, it is considered that the TX is empty if the hardware reports the
TEND bit set and the number of data units in the FIFO is zero.
According to the HW manual, the TEND bit has the following meaning:
0: Transmission is in the waiting state or in progress.
1: Transmission is completed.
It has been noticed that when opening the serial device w/o using it and
then switch to a power saving mode, the tx_empty() call in the
uart_port_suspend() function fails, leading to the "Unable to drain
transmitter" message being printed on the console. This is because the
TEND=0 if nothing has been transmitted and the FIFOs are empty. As the
TEND=0 has double meaning (waiting state, in progress) we can't
determined the scenario described above.
Add a software workaround for this. This sets a variable if any data has
been sent on the serial console (when using PIO) or if the DMA callback has
been called (meaning something has been transmitted). In the tx_empty()
API the status of the DMA transaction is also checked and if it is
completed or in progress the code falls back in checking the hardware
registers instead of relying on the software variable.
Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.rene…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index df523c744423..924b803af440 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -157,6 +157,7 @@ struct sci_port {
bool has_rtscts;
bool autorts;
+ bool tx_occurred;
};
#define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS
@@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port)
{
struct tty_port *tport = &port->state->port;
unsigned int stopped = uart_tx_stopped(port);
+ struct sci_port *s = to_sci_port(port);
unsigned short status;
unsigned short ctrl;
int count;
@@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port)
}
sci_serial_out(port, SCxTDR, c);
+ s->tx_occurred = true;
port->icount.tx++;
} while (--count > 0);
@@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg)
if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
uart_write_wakeup(port);
+ s->tx_occurred = true;
+
if (!kfifo_is_empty(&tport->xmit_fifo)) {
s->cookie_tx = 0;
schedule_work(&s->work_tx);
@@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port)
s->cookie_tx = -EINVAL;
}
}
+
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+ struct dma_tx_state state;
+ enum dma_status status;
+
+ if (!s->chan_tx)
+ return;
+
+ status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state);
+ if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS)
+ s->tx_occurred = true;
+}
#else /* !CONFIG_SERIAL_SH_SCI_DMA */
static inline void sci_request_dma(struct uart_port *port)
{
@@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port)
{
}
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+}
+
#define sci_flush_buffer NULL
#endif /* !CONFIG_SERIAL_SH_SCI_DMA */
@@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port)
{
unsigned short status = sci_serial_in(port, SCxSR);
unsigned short in_tx_fifo = sci_txfill(port);
+ struct sci_port *s = to_sci_port(port);
+
+ sci_dma_check_tx_occurred(s);
+
+ if (!s->tx_occurred)
+ return TIOCSER_TEMT;
return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0;
}
@@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port)
dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+ s->tx_occurred = false;
sci_request_dma(port);
ret = sci_request_irq(s);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 7cc0e0a43a91052477c2921f924a37d9c3891f0c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121534-getaway-geometric-0284@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Date: Mon, 25 Nov 2024 13:58:56 +0200
Subject: [PATCH] serial: sh-sci: Check if TX data was written to device in
.tx_empty()
On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port()
is called. The uart_suspend_port() calls 3 times the
struct uart_port::ops::tx_empty() before shutting down the port.
According to the documentation, the struct uart_port::ops::tx_empty()
API tests whether the transmitter FIFO and shifter for the port is
empty.
The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the
transmit FIFO through the FDR (FIFO Data Count Register). The data units
in the FIFOs are written in the shift register and transmitted from there.
The TEND bit in the Serial Status Register reports if the data was
transmitted from the shift register.
In the previous code, in the tx_empty() API implemented by the sh-sci
driver, it is considered that the TX is empty if the hardware reports the
TEND bit set and the number of data units in the FIFO is zero.
According to the HW manual, the TEND bit has the following meaning:
0: Transmission is in the waiting state or in progress.
1: Transmission is completed.
It has been noticed that when opening the serial device w/o using it and
then switch to a power saving mode, the tx_empty() call in the
uart_port_suspend() function fails, leading to the "Unable to drain
transmitter" message being printed on the console. This is because the
TEND=0 if nothing has been transmitted and the FIFOs are empty. As the
TEND=0 has double meaning (waiting state, in progress) we can't
determined the scenario described above.
Add a software workaround for this. This sets a variable if any data has
been sent on the serial console (when using PIO) or if the DMA callback has
been called (meaning something has been transmitted). In the tx_empty()
API the status of the DMA transaction is also checked and if it is
completed or in progress the code falls back in checking the hardware
registers instead of relying on the software variable.
Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.rene…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index df523c744423..924b803af440 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -157,6 +157,7 @@ struct sci_port {
bool has_rtscts;
bool autorts;
+ bool tx_occurred;
};
#define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS
@@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port)
{
struct tty_port *tport = &port->state->port;
unsigned int stopped = uart_tx_stopped(port);
+ struct sci_port *s = to_sci_port(port);
unsigned short status;
unsigned short ctrl;
int count;
@@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port)
}
sci_serial_out(port, SCxTDR, c);
+ s->tx_occurred = true;
port->icount.tx++;
} while (--count > 0);
@@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg)
if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
uart_write_wakeup(port);
+ s->tx_occurred = true;
+
if (!kfifo_is_empty(&tport->xmit_fifo)) {
s->cookie_tx = 0;
schedule_work(&s->work_tx);
@@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port)
s->cookie_tx = -EINVAL;
}
}
+
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+ struct dma_tx_state state;
+ enum dma_status status;
+
+ if (!s->chan_tx)
+ return;
+
+ status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state);
+ if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS)
+ s->tx_occurred = true;
+}
#else /* !CONFIG_SERIAL_SH_SCI_DMA */
static inline void sci_request_dma(struct uart_port *port)
{
@@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port)
{
}
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+}
+
#define sci_flush_buffer NULL
#endif /* !CONFIG_SERIAL_SH_SCI_DMA */
@@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port)
{
unsigned short status = sci_serial_in(port, SCxSR);
unsigned short in_tx_fifo = sci_txfill(port);
+ struct sci_port *s = to_sci_port(port);
+
+ sci_dma_check_tx_occurred(s);
+
+ if (!s->tx_occurred)
+ return TIOCSER_TEMT;
return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0;
}
@@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port)
dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+ s->tx_occurred = false;
sci_request_dma(port);
ret = sci_request_irq(s);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 7cc0e0a43a91052477c2921f924a37d9c3891f0c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121532-brunch-radiation-be3a@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Date: Mon, 25 Nov 2024 13:58:56 +0200
Subject: [PATCH] serial: sh-sci: Check if TX data was written to device in
.tx_empty()
On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port()
is called. The uart_suspend_port() calls 3 times the
struct uart_port::ops::tx_empty() before shutting down the port.
According to the documentation, the struct uart_port::ops::tx_empty()
API tests whether the transmitter FIFO and shifter for the port is
empty.
The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the
transmit FIFO through the FDR (FIFO Data Count Register). The data units
in the FIFOs are written in the shift register and transmitted from there.
The TEND bit in the Serial Status Register reports if the data was
transmitted from the shift register.
In the previous code, in the tx_empty() API implemented by the sh-sci
driver, it is considered that the TX is empty if the hardware reports the
TEND bit set and the number of data units in the FIFO is zero.
According to the HW manual, the TEND bit has the following meaning:
0: Transmission is in the waiting state or in progress.
1: Transmission is completed.
It has been noticed that when opening the serial device w/o using it and
then switch to a power saving mode, the tx_empty() call in the
uart_port_suspend() function fails, leading to the "Unable to drain
transmitter" message being printed on the console. This is because the
TEND=0 if nothing has been transmitted and the FIFOs are empty. As the
TEND=0 has double meaning (waiting state, in progress) we can't
determined the scenario described above.
Add a software workaround for this. This sets a variable if any data has
been sent on the serial console (when using PIO) or if the DMA callback has
been called (meaning something has been transmitted). In the tx_empty()
API the status of the DMA transaction is also checked and if it is
completed or in progress the code falls back in checking the hardware
registers instead of relying on the software variable.
Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.rene…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index df523c744423..924b803af440 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -157,6 +157,7 @@ struct sci_port {
bool has_rtscts;
bool autorts;
+ bool tx_occurred;
};
#define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS
@@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port)
{
struct tty_port *tport = &port->state->port;
unsigned int stopped = uart_tx_stopped(port);
+ struct sci_port *s = to_sci_port(port);
unsigned short status;
unsigned short ctrl;
int count;
@@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port)
}
sci_serial_out(port, SCxTDR, c);
+ s->tx_occurred = true;
port->icount.tx++;
} while (--count > 0);
@@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg)
if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
uart_write_wakeup(port);
+ s->tx_occurred = true;
+
if (!kfifo_is_empty(&tport->xmit_fifo)) {
s->cookie_tx = 0;
schedule_work(&s->work_tx);
@@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port)
s->cookie_tx = -EINVAL;
}
}
+
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+ struct dma_tx_state state;
+ enum dma_status status;
+
+ if (!s->chan_tx)
+ return;
+
+ status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state);
+ if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS)
+ s->tx_occurred = true;
+}
#else /* !CONFIG_SERIAL_SH_SCI_DMA */
static inline void sci_request_dma(struct uart_port *port)
{
@@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port)
{
}
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+}
+
#define sci_flush_buffer NULL
#endif /* !CONFIG_SERIAL_SH_SCI_DMA */
@@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port)
{
unsigned short status = sci_serial_in(port, SCxSR);
unsigned short in_tx_fifo = sci_txfill(port);
+ struct sci_port *s = to_sci_port(port);
+
+ sci_dma_check_tx_occurred(s);
+
+ if (!s->tx_occurred)
+ return TIOCSER_TEMT;
return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0;
}
@@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port)
dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+ s->tx_occurred = false;
sci_request_dma(port);
ret = sci_request_irq(s);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 7cc0e0a43a91052477c2921f924a37d9c3891f0c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121531-unlearned-fondue-04e3@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Date: Mon, 25 Nov 2024 13:58:56 +0200
Subject: [PATCH] serial: sh-sci: Check if TX data was written to device in
.tx_empty()
On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port()
is called. The uart_suspend_port() calls 3 times the
struct uart_port::ops::tx_empty() before shutting down the port.
According to the documentation, the struct uart_port::ops::tx_empty()
API tests whether the transmitter FIFO and shifter for the port is
empty.
The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the
transmit FIFO through the FDR (FIFO Data Count Register). The data units
in the FIFOs are written in the shift register and transmitted from there.
The TEND bit in the Serial Status Register reports if the data was
transmitted from the shift register.
In the previous code, in the tx_empty() API implemented by the sh-sci
driver, it is considered that the TX is empty if the hardware reports the
TEND bit set and the number of data units in the FIFO is zero.
According to the HW manual, the TEND bit has the following meaning:
0: Transmission is in the waiting state or in progress.
1: Transmission is completed.
It has been noticed that when opening the serial device w/o using it and
then switch to a power saving mode, the tx_empty() call in the
uart_port_suspend() function fails, leading to the "Unable to drain
transmitter" message being printed on the console. This is because the
TEND=0 if nothing has been transmitted and the FIFOs are empty. As the
TEND=0 has double meaning (waiting state, in progress) we can't
determined the scenario described above.
Add a software workaround for this. This sets a variable if any data has
been sent on the serial console (when using PIO) or if the DMA callback has
been called (meaning something has been transmitted). In the tx_empty()
API the status of the DMA transaction is also checked and if it is
completed or in progress the code falls back in checking the hardware
registers instead of relying on the software variable.
Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.rene…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index df523c744423..924b803af440 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -157,6 +157,7 @@ struct sci_port {
bool has_rtscts;
bool autorts;
+ bool tx_occurred;
};
#define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS
@@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port)
{
struct tty_port *tport = &port->state->port;
unsigned int stopped = uart_tx_stopped(port);
+ struct sci_port *s = to_sci_port(port);
unsigned short status;
unsigned short ctrl;
int count;
@@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port)
}
sci_serial_out(port, SCxTDR, c);
+ s->tx_occurred = true;
port->icount.tx++;
} while (--count > 0);
@@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg)
if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
uart_write_wakeup(port);
+ s->tx_occurred = true;
+
if (!kfifo_is_empty(&tport->xmit_fifo)) {
s->cookie_tx = 0;
schedule_work(&s->work_tx);
@@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port)
s->cookie_tx = -EINVAL;
}
}
+
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+ struct dma_tx_state state;
+ enum dma_status status;
+
+ if (!s->chan_tx)
+ return;
+
+ status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state);
+ if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS)
+ s->tx_occurred = true;
+}
#else /* !CONFIG_SERIAL_SH_SCI_DMA */
static inline void sci_request_dma(struct uart_port *port)
{
@@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port)
{
}
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+}
+
#define sci_flush_buffer NULL
#endif /* !CONFIG_SERIAL_SH_SCI_DMA */
@@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port)
{
unsigned short status = sci_serial_in(port, SCxSR);
unsigned short in_tx_fifo = sci_txfill(port);
+ struct sci_port *s = to_sci_port(port);
+
+ sci_dma_check_tx_occurred(s);
+
+ if (!s->tx_occurred)
+ return TIOCSER_TEMT;
return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0;
}
@@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port)
dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+ s->tx_occurred = false;
sci_request_dma(port);
ret = sci_request_irq(s);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 7cc0e0a43a91052477c2921f924a37d9c3891f0c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024121530-womanlike-motocross-3283@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Date: Mon, 25 Nov 2024 13:58:56 +0200
Subject: [PATCH] serial: sh-sci: Check if TX data was written to device in
.tx_empty()
On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port()
is called. The uart_suspend_port() calls 3 times the
struct uart_port::ops::tx_empty() before shutting down the port.
According to the documentation, the struct uart_port::ops::tx_empty()
API tests whether the transmitter FIFO and shifter for the port is
empty.
The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the
transmit FIFO through the FDR (FIFO Data Count Register). The data units
in the FIFOs are written in the shift register and transmitted from there.
The TEND bit in the Serial Status Register reports if the data was
transmitted from the shift register.
In the previous code, in the tx_empty() API implemented by the sh-sci
driver, it is considered that the TX is empty if the hardware reports the
TEND bit set and the number of data units in the FIFO is zero.
According to the HW manual, the TEND bit has the following meaning:
0: Transmission is in the waiting state or in progress.
1: Transmission is completed.
It has been noticed that when opening the serial device w/o using it and
then switch to a power saving mode, the tx_empty() call in the
uart_port_suspend() function fails, leading to the "Unable to drain
transmitter" message being printed on the console. This is because the
TEND=0 if nothing has been transmitted and the FIFOs are empty. As the
TEND=0 has double meaning (waiting state, in progress) we can't
determined the scenario described above.
Add a software workaround for this. This sets a variable if any data has
been sent on the serial console (when using PIO) or if the DMA callback has
been called (meaning something has been transmitted). In the tx_empty()
API the status of the DMA transaction is also checked and if it is
completed or in progress the code falls back in checking the hardware
registers instead of relying on the software variable.
Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj(a)bp.renesas.com>
Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.rene…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index df523c744423..924b803af440 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -157,6 +157,7 @@ struct sci_port {
bool has_rtscts;
bool autorts;
+ bool tx_occurred;
};
#define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS
@@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port)
{
struct tty_port *tport = &port->state->port;
unsigned int stopped = uart_tx_stopped(port);
+ struct sci_port *s = to_sci_port(port);
unsigned short status;
unsigned short ctrl;
int count;
@@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port)
}
sci_serial_out(port, SCxTDR, c);
+ s->tx_occurred = true;
port->icount.tx++;
} while (--count > 0);
@@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg)
if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
uart_write_wakeup(port);
+ s->tx_occurred = true;
+
if (!kfifo_is_empty(&tport->xmit_fifo)) {
s->cookie_tx = 0;
schedule_work(&s->work_tx);
@@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port)
s->cookie_tx = -EINVAL;
}
}
+
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+ struct dma_tx_state state;
+ enum dma_status status;
+
+ if (!s->chan_tx)
+ return;
+
+ status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state);
+ if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS)
+ s->tx_occurred = true;
+}
#else /* !CONFIG_SERIAL_SH_SCI_DMA */
static inline void sci_request_dma(struct uart_port *port)
{
@@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port)
{
}
+static void sci_dma_check_tx_occurred(struct sci_port *s)
+{
+}
+
#define sci_flush_buffer NULL
#endif /* !CONFIG_SERIAL_SH_SCI_DMA */
@@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port)
{
unsigned short status = sci_serial_in(port, SCxSR);
unsigned short in_tx_fifo = sci_txfill(port);
+ struct sci_port *s = to_sci_port(port);
+
+ sci_dma_check_tx_occurred(s);
+
+ if (!s->tx_occurred)
+ return TIOCSER_TEMT;
return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0;
}
@@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port)
dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+ s->tx_occurred = false;
sci_request_dma(port);
ret = sci_request_irq(s);
From: yangge <yangge1116(a)126.com>
Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags
in __compaction_suitable()") allow compaction to proceed when free
pages required for compaction reside in the CMA pageblocks, it's
possible that __compaction_suitable() always returns true, and in
some cases, it's not acceptable.
There are 4 NUMA nodes on my machine, and each NUMA node has 32GB
of memory. I have configured 16GB of CMA memory on each NUMA node,
and starting a 32GB virtual machine with device passthrough is
extremely slow, taking almost an hour.
During the start-up of the virtual machine, it will call
pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory.
Long term GUP cannot allocate memory from CMA area, so a maximum
of 16 GB of no-CMA memory on a NUMA node can be used as virtual
machine memory. Since there is 16G of free CMA memory on the NUMA
node, watermark for order-0 always be met for compaction, so
__compaction_suitable() always returns true, even if the node is
unable to allocate non-CMA memory for the virtual machine.
For costly allocations, because __compaction_suitable() always
returns true, __alloc_pages_slowpath() can't exit at the appropriate
place, resulting in excessively long virtual machine startup times.
Call trace:
__alloc_pages_slowpath
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage; // should exit __alloc_pages_slowpath() from here
In order to quickly fall back to remote node, we should remove
ALLOC_CMA both in __compaction_suitable() and __isolate_free_page()
in long term GUP flow. After this fix, starting a 32GB virtual machine
with device passthrough takes only a few seconds.
Fixes: 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: yangge <yangge1116(a)126.com>
---
V2:
- using the 'cc->alloc_flags' to determin if 'ALLOC_CMA' is needed
- rich the commit log description
include/linux/compaction.h | 3 ++-
mm/compaction.c | 18 +++++++++++-------
mm/page_alloc.c | 4 +++-
mm/vmscan.c | 4 ++--
4 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index e947764..0c6f97a 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -90,7 +90,8 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ int highest_zoneidx,
+ unsigned int alloc_flags);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
diff --git a/mm/compaction.c b/mm/compaction.c
index 07bd227..585f5ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2381,9 +2381,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
static bool __compaction_suitable(struct zone *zone, int order,
int highest_zoneidx,
+ unsigned int alloc_flags,
unsigned long wmark_target)
{
unsigned long watermark;
+ bool use_cma;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2395,25 +2397,27 @@ static bool __compaction_suitable(struct zone *zone, int order,
* even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * In addition to long term GUP flow, ALLOC_CMA is used, as pages in
+ * CMA pageblocks are considered suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ use_cma = !!(alloc_flags & ALLOC_CMA);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ use_cma ? ALLOC_CMA : 0, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx,
+ unsigned int alloc_flags)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2474,7 +2478,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ alloc_flags, available))
return true;
}
@@ -2499,7 +2503,7 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
alloc_flags))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde19db..9a5dfda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2813,6 +2813,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
+ bool pin;
if (!is_migrate_isolate(mt)) {
unsigned long watermark;
@@ -2823,7 +2824,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ pin = !!(current->flags & PF_MEMALLOC_PIN);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, pin ? 0 : ALLOC_CMA))
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e03a61..806f031 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5815,7 +5815,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx, 0))
return false;
}
@@ -6043,7 +6043,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return true;
/* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, 0))
return false;
/*
--
2.7.4
When testing large folio support with XFS on our servers, we observed that
only a few large folios are mapped when reading large files via mmap.
After a thorough analysis, I identified it was caused by the
`/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this
parameter is set to 128KB. After I tune it to 2MB, the large folio can
work as expected. However, I believe the large folio behavior should not
be dependent on the value of read_ahead_kb. It would be more robust if
the kernel can automatically adopt to it.
With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a
sequential read on a 1GB file using MADV_HUGEPAGE, the differences in
/proc/meminfo are as follows:
- before this patch
FileHugePages: 18432 kB
FilePmdMapped: 4096 kB
- after this patch
FileHugePages: 1067008 kB
FilePmdMapped: 1048576 kB
This shows that after applying the patch, the entire 1GB file is mapped to
huge pages. The stable list is CCed, as without this patch, large folios
don't function optimally in the readahead path.
It's worth noting that if read_ahead_kb is set to a larger value that
isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail
to map to hugepages.
Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Signed-off-by: Yafang Shao <laoar.shao(a)gmail.com>
Tested-by: kernel test robot <oliver.sang(a)intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
---
mm/readahead.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
Changes:
v2->v3:
- Fix the softlockup reported by kernel test robot
https://lore.kernel.org/linux-fsdevel/202411292300.61edbd37-lkp@intel.com/
v1->v2: https://lore.kernel.org/linux-mm/20241108141710.9721-1-laoar.shao@gmail.com/
- Drop the alignment (Matthew)
- Improve commit log (Andrew)
RFC->v1: https://lore.kernel.org/linux-mm/20241106092114.8408-1-laoar.shao@gmail.com/
- Simplify the code as suggested by Matthew
RFC: https://lore.kernel.org/linux-mm/20241104143015.34684-1-laoar.shao@gmail.co…
diff --git a/mm/readahead.c b/mm/readahead.c
index 3dc6c7a128dd..1dc3cffd4843 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -642,7 +642,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
1UL << order);
if (index == expected) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ /*
+ * In the case of MADV_HUGEPAGE, the actual size might exceed
+ * the readahead window.
+ */
+ ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
ra->async_size = ra->size;
goto readit;
}
--
2.43.5
From: Sungjong Seo <sj1557.seo(a)samsung.com>
commit 89fc548767a2155231128cb98726d6d2ea1256c9 upstream.
When accessing a file with more entries than ES_MAX_ENTRY_NUM, the bh-array
is allocated in __exfat_get_entry_set. The problem is that the bh-array is
allocated with GFP_KERNEL. It does not make sense. In the following cases,
a deadlock for sbi->s_lock between the two processes may occur.
CPU0 CPU1
---- ----
kswapd
balance_pgdat
lock(fs_reclaim)
exfat_iterate
lock(&sbi->s_lock)
exfat_readdir
exfat_get_uniname_from_ext_entry
exfat_get_dentry_set
__exfat_get_dentry_set
kmalloc_array
...
lock(fs_reclaim)
...
evict
exfat_evict_inode
lock(&sbi->s_lock)
To fix this, let's allocate bh-array with GFP_NOFS.
Fixes: a3ff29a95fde ("exfat: support dynamic allocate bh for exfat_entry_set_cache")
Cc: stable(a)vger.kernel.org # v6.2+
Reported-by: syzbot+412a392a2cd4a65e71db(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/000000000000fef47e0618c0327f@google.com
Signed-off-by: Sungjong Seo <sj1557.seo(a)samsung.com>
Signed-off-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[Sherry: The problematic commit was backported to 5.15.y and 5.10.y,
thus backport this fix]
Signed-off-by: Sherry Yang <sherry.yang(a)oracle.com>
---
fs/exfat/dir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index be7570d01ae1..0a1b1de032ef 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -878,7 +878,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
if (num_bh > ARRAY_SIZE(es->__bh)) {
- es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL);
+ es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_NOFS);
if (!es->bh) {
brelse(bh);
kfree(es);
--
2.46.0
The channel index is off by one unit if AS73211_SCAN_MASK_ALL is not
set (optimized path for color channel readings), and it must be shifted
instead of leaving an empty channel for the temperature when it is off.
Once the channel index is fixed, the uninitialized channel must be set
to zero to avoid pushing uninitialized data.
Add available_scan_masks for all channels and only-color channels to let
the IIO core demux and repack the enabled channels.
Cc: stable(a)vger.kernel.org
Fixes: 403e5586b52e ("iio: light: as73211: New driver")
Tested-by: Christian Eggers <ceggers(a)arri.de>
Signed-off-by: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
---
This issue was found after attempting to make the same mistake for
a driver I maintain, which was fortunately spotted by Jonathan [1].
Keeping old sensor values if the channel configuration changes is known
and not considered an issue, which is also mentioned in [1], so it has
not been addressed by this series. That keeps most of the drivers out
of the way because they store the scan element in iio private data,
which is kzalloc() allocated.
This series only addresses cases where uninitialized i.e. unknown data
is pushed to the userspace, either due to holes in structs or
uninitialized struct members/array elements.
While analyzing involved functions, I found and fixed some triviality
(wrong function name) in the documentation of iio_dev_opaque.
Link: https://lore.kernel.org/linux-iio/20241123151634.303aa860@jic23-huawei/ [1]
---
Changes in v3:
- as73211.c: add available_scan_masks for all channels and only-color
channels to let the IIO core demux and repack the enabled channels.
- Link to v2: https://lore.kernel.org/r/20241204-iio_memset_scan_holes-v2-0-3f941592a76d@…
Changes in v2:
- as73211.c: shift channels if no temperature is available and
initialize chan[3] to zero.
- Link to v1: https://lore.kernel.org/r/20241125-iio_memset_scan_holes-v1-0-0cb6e98d895c@…
---
drivers/iio/light/as73211.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/drivers/iio/light/as73211.c b/drivers/iio/light/as73211.c
index be0068081ebb..4be2e349a068 100644
--- a/drivers/iio/light/as73211.c
+++ b/drivers/iio/light/as73211.c
@@ -177,6 +177,12 @@ struct as73211_data {
BIT(AS73211_SCAN_INDEX_TEMP) | \
AS73211_SCAN_MASK_COLOR)
+static const unsigned long as73211_scan_masks[] = {
+ AS73211_SCAN_MASK_ALL,
+ AS73211_SCAN_MASK_COLOR,
+ 0,
+};
+
static const struct iio_chan_spec as73211_channels[] = {
{
.type = IIO_TEMP,
@@ -672,9 +678,12 @@ static irqreturn_t as73211_trigger_handler(int irq __always_unused, void *p)
/* AS73211 starts reading at address 2 */
ret = i2c_master_recv(data->client,
- (char *)&scan.chan[1], 3 * sizeof(scan.chan[1]));
+ (char *)&scan.chan[0], 3 * sizeof(scan.chan[0]));
if (ret < 0)
goto done;
+
+ /* Avoid pushing uninitialized data */
+ scan.chan[3] = 0;
}
if (data_result) {
@@ -682,9 +691,15 @@ static irqreturn_t as73211_trigger_handler(int irq __always_unused, void *p)
* Saturate all channels (in case of overflows). Temperature channel
* is not affected by overflows.
*/
- scan.chan[1] = cpu_to_le16(U16_MAX);
- scan.chan[2] = cpu_to_le16(U16_MAX);
- scan.chan[3] = cpu_to_le16(U16_MAX);
+ if (*indio_dev->active_scan_mask == AS73211_SCAN_MASK_ALL) {
+ scan.chan[1] = cpu_to_le16(U16_MAX);
+ scan.chan[2] = cpu_to_le16(U16_MAX);
+ scan.chan[3] = cpu_to_le16(U16_MAX);
+ } else {
+ scan.chan[0] = cpu_to_le16(U16_MAX);
+ scan.chan[1] = cpu_to_le16(U16_MAX);
+ scan.chan[2] = cpu_to_le16(U16_MAX);
+ }
}
iio_push_to_buffers_with_timestamp(indio_dev, &scan, iio_get_time_ns(indio_dev));
@@ -758,6 +773,7 @@ static int as73211_probe(struct i2c_client *client)
indio_dev->channels = data->spec_dev->channels;
indio_dev->num_channels = data->spec_dev->num_channels;
indio_dev->modes = INDIO_DIRECT_MODE;
+ indio_dev->available_scan_masks = as73211_scan_masks;
ret = i2c_smbus_read_byte_data(data->client, AS73211_REG_OSR);
if (ret < 0)
---
base-commit: 91e71d606356e50f238d7a87aacdee4abc427f07
change-id: 20241123-iio_memset_scan_holes-a673833ef932
Best regards,
--
Javier Carrasco <javier.carrasco.cruz(a)gmail.com>