The patch titled
Subject: mm/huge_memory: avoid PMD-size page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/huge_memory: avoid PMD-size page cache if needed
Date: Mon, 15 Jul 2024 10:04:23 +1000
xarray can't support an arbitrary page cache size. The largest supported
page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71
("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However,
it's possible to have a 512MB page cache in the huge memory collapse path
on an ARM64 system whose base page size is 64KB. A 512MB page cache
breaks this limit, and a warning is raised when the xarray entry is
split, as shown in the following example.
[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize: 64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
:
int main(int argc, char **argv)
{
	const char *filename = TEST_XFS_FILENAME;
	int fd = 0;
	void *buf = (void *)-1, *p;
	int pgsize = getpagesize();
	int ret = 0;

	if (pgsize != 0x10000) {
		fprintf(stdout, "System with 64KB base page size is required!\n");
		return -EPERM;
	}

	system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
	system("echo 1 > /proc/sys/vm/drop_caches");

	/* Open the xfs file */
	fd = open(filename, O_RDONLY);
	assert(fd > 0);

	/* Create VMA */
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	assert(buf != (void *)-1);
	fprintf(stdout, "mapped buffer at 0x%p\n", buf);

	/* Populate VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
	assert(ret == 0);

	/* Collapse VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
	if (ret) {
		fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
		goto out;
	}

	/* Split xarray entry. Write permission is needed */
	munmap(buf, TEST_MEM_SIZE);
	buf = (void *)-1;
	close(fd);
	fd = open(filename, O_RDWR);
	assert(fd > 0);
	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
		  TEST_MEM_SIZE - pgsize, pgsize);
out:
	if (buf != (void *)-1)
		munmap(buf, TEST_MEM_SIZE);
	if (fd > 0)
		close(fd);

	return ret;
}
[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
------------[ cut here ]------------
WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \
xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \
sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x780
sp : ffff8000ac32f660
x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x780
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2f0
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by correcting the supported page cache orders, using different sets
for DAX and other files. With this correction, a 512MB page cache becomes
disallowed for all non-DAX files on ARM64 systems where the base page size
is 64KB. After this patch is applied, the test program fails: -EINVAL is
returned from __thp_vma_allowable_orders() and from the madvise() system
call that collapses the page cache.
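For illustration, here is a minimal userspace sketch of the order-mask
arithmetic, assuming XA_CHUNK_SHIFT = 6 (so MAX_PAGECACHE_ORDER caps at 11
per commit 099d90642a71) and a 64KB base page size (PMD_ORDER = 13); the
macros below only mimic the kernel ones and are not the kernel definitions:

#include <stdio.h>

/* Illustrative stand-ins for the kernel macros; the values are assumptions
 * for an ARM64 system with a 64KB base page size. */
#define BIT(n)			(1UL << (n))
#define XA_CHUNK_SHIFT		6			/* typical xarray chunk shift */
#define PMD_ORDER		13			/* 512MB / 64KB pages */
#define MAX_PAGECACHE_ORDER	(2 * XA_CHUNK_SHIFT - 1)	/* 11 */

#define THP_ORDERS_ALL_FILE_DEFAULT \
	((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))

int main(void)
{
	/* Order 13 (a 512MB page cache folio) is outside the default file
	 * mask, so __thp_vma_allowable_orders() filters it out for
	 * non-DAX files. */
	printf("orders mask: 0x%lx\n", THP_ORDERS_ALL_FILE_DEFAULT);
	printf("PMD order %d allowed: %s\n", PMD_ORDER,
	       (THP_ORDERS_ALL_FILE_DEFAULT & BIT(PMD_ORDER)) ? "yes" : "no");
	return 0;
}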
Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts(a)arm.com>
Acked-by: Zi Yan <ziy(a)nvidia.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/huge_mm.h | 12 +++++++++---
mm/huge_memory.c | 12 ++++++++++--
2 files changed, 19 insertions(+), 5 deletions(-)
--- a/include/linux/huge_mm.h~mm-huge_memory-avoid-pmd-size-page-cache-if-needed
+++ a/include/linux/huge_mm.h
@@ -72,14 +72,20 @@ extern struct kobj_attribute shmem_enabl
#define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
/*
- * Mask of all large folio orders supported for file THP.
+ * Mask of all large folio orders supported for file THP. Folios in a DAX
+ * file are never split, and the MAX_PAGECACHE_ORDER limit does not apply
+ * to them.
*/
-#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DAX \
+ (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DEFAULT \
+ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
/*
* Mask of all large folio orders supported for THP.
*/
-#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+#define THP_ORDERS_ALL \
+ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
#define TVA_IN_PF (1 << 1) /* Page fault handler */
--- a/mm/huge_memory.c~mm-huge_memory-avoid-pmd-size-page-cache-if-needed
+++ a/mm/huge_memory.c
@@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders
bool smaps = tva_flags & TVA_SMAPS;
bool in_pf = tva_flags & TVA_IN_PF;
bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ unsigned long supported_orders;
+
/* Check the intersection of requested and supported orders. */
- orders &= vma_is_anonymous(vma) ?
- THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
+ if (vma_is_anonymous(vma))
+ supported_orders = THP_ORDERS_ALL_ANON;
+ else if (vma_is_dax(vma))
+ supported_orders = THP_ORDERS_ALL_FILE_DAX;
+ else
+ supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
+
+ orders &= supported_orders;
if (!orders)
return 0;
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
Hi,
There are two patches for bpf that I think should be backported to the
v6.6 kernel to fix a privilege escalation vulnerability in kernelCTF.
bpf: Fix too early release of tcx_entry
1cb6f0bae50441f4b4b32a28315853b279c7404e
selftests/bpf: Extend tcx tests to cover late tcx_entry release
5f1d18de79180deac2822c93e431bbe547f7d3ce
Thanks!
Best regards,
He
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 310d6c15e9104c99d5d9d0ff8e5383a79da7d5e6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024071546-swerve-grew-de52@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
310d6c15e910 ("mm/damon/core: merge regions aggressively when max_nr_regions is unmet")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 310d6c15e9104c99d5d9d0ff8e5383a79da7d5e6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj(a)kernel.org>
Date: Mon, 24 Jun 2024 10:58:14 -0700
Subject: [PATCH] mm/damon/core: merge regions aggressively when max_nr_regions
is unmet
DAMON keeps the number of regions under max_nr_regions by skipping region
split operations when doing so could push the number above the limit. This
works well for preventing violations of the limit. But, if the violation
somehow happens, DAMON cannot recover well, depending on the situation. In
detail, if the real number of regions having different access patterns is
higher than the limit, the mechanism cannot reduce the number below the
limit. In such a case, the system could suffer from the high monitoring
overhead of DAMON.

The violation can actually happen. For example, the user could reduce
max_nr_regions below the current number of regions while DAMON is running.
Fix the problem by repeating the merge operations with increasing
aggressiveness in kdamond_merge_regions() for that case, until the limit
is met.
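As a rough standalone sketch (not the kernel code; the merge pass and all
numbers below are made up for illustration), the idea is a loop that
doubles the merge threshold each pass until the region count drops under
the limit or the threshold reaches its maximum:

#include <stdio.h>

/* Made-up merge pass: a larger threshold merges more regions together. */
static unsigned int merge_pass(unsigned int nr_regions, unsigned int threshold)
{
	return nr_regions * 10 / (10 + threshold);
}

int main(void)
{
	unsigned int nr_regions = 1000, max_nr_regions = 100;
	unsigned int threshold = 10, max_thres = 200; /* aggr/sample ratio */

	do {
		nr_regions = merge_pass(nr_regions, threshold);
		printf("threshold=%u -> nr_regions=%u\n", threshold, nr_regions);
		/* the real code uses max(1, threshold * 2) to guard 0 */
		threshold *= 2;
	} while (nr_regions > max_nr_regions && threshold / 2 < max_thres);

	return 0;
}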
[sj(a)kernel.org: increase regions merge aggressiveness while respecting min_nr_regions]
Link: https://lkml.kernel.org/r/20240626164753.46270-1-sj@kernel.org
[sj(a)kernel.org: ensure max threshold attempt for max_nr_regions violation]
Link: https://lkml.kernel.org/r/20240627163153.75969-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240624175814.89611-1-sj@kernel.org
Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.15+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 6392f1cc97a3..e66823d6b10b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1358,14 +1358,31 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
* access frequencies are similar. This is for minimizing the monitoring
* overhead under the dynamically changeable access pattern. If a merge was
* unnecessarily made, later 'kdamond_split_regions()' will revert it.
+ *
+ * The total number of regions could be higher than the user-defined limit,
+ * max_nr_regions for some cases. For example, the user can update
+ * max_nr_regions to a number lower than the current number of regions
+ * while DAMON is running. For such a case, repeat merging until the limit
+ * is met while increasing @threshold up to the possible maximum level.
*/
static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
unsigned long sz_limit)
{
struct damon_target *t;
+ unsigned int nr_regions;
+ unsigned int max_thres;
- damon_for_each_target(t, c)
- damon_merge_regions_of(t, threshold, sz_limit);
+ max_thres = c->attrs.aggr_interval /
+ (c->attrs.sample_interval ? c->attrs.sample_interval : 1);
+ do {
+ nr_regions = 0;
+ damon_for_each_target(t, c) {
+ damon_merge_regions_of(t, threshold, sz_limit);
+ nr_regions += damon_nr_regions(t);
+ }
+ threshold = max(1, threshold * 2);
+ } while (nr_regions > c->attrs.max_nr_regions &&
+ threshold / 2 < max_thres);
}
/*
I have created a bz for this issue as well -
https://bugzilla.redhat.com/show_bug.cgi?id=2293600
In my digging, it seems that the 6.9+ kernels do not have the xpad
kernel module (running # sudo modprobe xpad returns no value).
A workaround was to manually install the xone or xpad kernel module and
self-sign the key (keeping Secure Boot intact); however, this is somewhat
tedious since it worked before.
Commit ffd603f21423 ("usb: gadget: u_serial: Add null pointer check in
gs_start_io") adds null pointer checks to gs_start_io(), but it doesn't
fully fix the potential null pointer dereference issue. While
gserial_connect() calls gs_start_io() with port_lock held, gs_start_rx()
and gs_start_tx() release the lock during endpoint request submission.
This creates a window where gs_close() could set port->port_tty to NULL,
leading to a dereference when the lock is reacquired.
This patch adds a null pointer check for port->port_tty after RX/TX
submission, and removes the initial null pointer check in gs_start_io()
since the caller must hold port_lock and guarantee non-null values for
port_usb and port_tty.
Fixes: ffd603f21423 ("usb: gadget: u_serial: Add null pointer check in gs_start_io")
Cc: stable(a)vger.kernel.org
Signed-off-by: Kuen-Han Tsai <khtsai(a)google.com>
---
Explanation:
CPU1:                                   CPU2:
gserial_connect() // lock
                                        gs_close() // await lock
gs_start_rx() // unlock
usb_ep_queue()
                                        gs_close() // lock, reset port_tty and unlock
gs_start_rx() // lock
tty_wakeup() // dereference
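For context, a minimal standalone sketch of the pattern the fix relies on,
with a pthread mutex standing in for port_lock (all names and types here
are illustrative, not the driver's): a pointer guarded by the lock must be
re-validated after any window where the lock was dropped.

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins for the driver objects. */
struct tty_stub { const char *name; };

static pthread_mutex_t port_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tty_stub *port_tty; /* may be cleared by a gs_close()-like path */

static void submit_requests(void)
{
	/* The real driver drops port_lock around usb_ep_queue(); another
	 * thread can take the lock and clear port_tty in this window. */
	pthread_mutex_unlock(&port_lock);
	/* ... endpoint request submission would happen here ... */
	pthread_mutex_lock(&port_lock);
}

static int start_io(void)
{
	int ret = 0;

	pthread_mutex_lock(&port_lock);
	submit_requests();

	/* Re-check after the lock was dropped and reacquired. */
	if (!port_tty) {
		ret = -1; /* bail out; the real code frees the requests */
		goto out;
	}
	printf("waking up %s\n", port_tty->name);
out:
	pthread_mutex_unlock(&port_lock);
	return ret;
}

int main(void)
{
	struct tty_stub tty = { "ttyGS1" };

	port_tty = &tty;
	return start_io();
}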
Stack traces:
[ 51.494375][ T278] ttyGS1: shutdown
[ 51.494817][ T269] android_work: sent uevent USB_STATE=DISCONNECTED
[ 52.115792][ T1508] usb: [dm_bind] generic ttyGS1: super speed IN/ep1in OUT/ep1out
[ 52.516288][ T1026] android_work: sent uevent USB_STATE=CONNECTED
[ 52.551667][ T1533] gserial_connect: start ttyGS1
[ 52.565634][ T1533] [khtsai] enter gs_start_io, ttyGS1, port->port.tty=0000000046bd4060
[ 52.565671][ T1533] [khtsai] gs_start_rx, unlock port ttyGS1
[ 52.591552][ T1533] [khtsai] gs_start_rx, lock port ttyGS1
[ 52.619901][ T1533] [khtsai] gs_start_rx, unlock port ttyGS1
[ 52.638659][ T1325] [khtsai] gs_close, lock port ttyGS1
[ 52.656842][ T1325] gs_close: ttyGS1 (0000000046bd4060,00000000be9750a5) ...
[ 52.683005][ T1325] [khtsai] gs_close, clear ttyGS1
[ 52.683007][ T1325] gs_close: ttyGS1 (0000000046bd4060,00000000be9750a5) done!
[ 52.708643][ T1325] [khtsai] gs_close, unlock port ttyGS1
[ 52.747592][ T1533] [khtsai] gs_start_rx, lock port ttyGS1
[ 52.747616][ T1533] [khtsai] gs_start_io, ttyGS1, going to call tty_wakeup(), port->port.tty=0000000000000000
[ 52.747629][ T1533] Unable to handle kernel NULL pointer dereference at virtual address 00000000000001f8
---
drivers/usb/gadget/function/u_serial.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index a92eb6d90976..2f1890c8f473 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -539,20 +539,16 @@ static int gs_alloc_requests(struct usb_ep *ep, struct list_head *head,
static int gs_start_io(struct gs_port *port)
{
struct list_head *head = &port->read_pool;
- struct usb_ep *ep;
+ struct usb_ep *ep = port->port_usb->out;
int status;
unsigned started;
- if (!port->port_usb || !port->port.tty)
- return -EIO;
-
/* Allocate RX and TX I/O buffers. We can't easily do this much
* earlier (with GFP_KERNEL) because the requests are coupled to
* endpoints, as are the packet sizes we'll be using. Different
* configurations may use different endpoints with a given port;
* and high speed vs full speed changes packet sizes too.
*/
- ep = port->port_usb->out;
status = gs_alloc_requests(ep, head, gs_read_complete,
&port->read_allocated);
if (status)
@@ -569,12 +565,22 @@ static int gs_start_io(struct gs_port *port)
port->n_read = 0;
started = gs_start_rx(port);
+ /*
+ * The TTY may be set to NULL by gs_close() after gs_start_rx() or
+ * gs_start_tx() release locks for endpoint request submission.
+ */
+ if (!port->port.tty)
+ goto out;
+
if (started) {
gs_start_tx(port);
/* Unblock any pending writes into our circular buffer, in case
* we didn't in gs_start_tx() */
+ if (!port->port.tty)
+ goto out;
tty_wakeup(port->port.tty);
} else {
+out:
gs_free_requests(ep, head, &port->read_allocated);
gs_free_requests(port->port_usb->in, &port->write_pool,
&port->write_allocated);
--
2.43.0.275.g3460e3d667-goog
The page cache of the atomic file keeps new data pages which will be
stored in the COW file. It can also keep old data pages when GCing the
atomic file. In this case, new data can be overwritten by old data if a
GC thread sets the old data page as dirty after the new data page was
evicted.
Also, since all writes to the atomic file are redirected to COW inodes,
GC for the atomic file does not work well, as shown below.

f2fs_gc(gc_type=FG_GC)
  - select A as a victim segment
  do_garbage_collect
    - iget atomic file's inode for block B
    move_data_page
      f2fs_do_write_data_page
        - use dn of cow inode
        - set fio->old_blkaddr from cow inode
  - seg_freed is 0 since block B is still valid
  - goto gc_more and A is selected as victim again
To solve the problem, let's separate GC writes and updates in the atomic
file by using the meta inode for GC writes.
Fixes: 3db1de0e582c ("f2fs: change the current atomic write way")
Cc: stable(a)vger.kernel.org #v5.19+
Reviewed-by: Sungjong Seo <sj1557.seo(a)samsung.com>
Reviewed-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sunmin Jeong <s_min.jeong(a)samsung.com>
Reviewed-by: Chao Yu <chao(a)kernel.org>
---
v2:
- replace post_read with meta_gc
fs/f2fs/data.c | 4 ++--
fs/f2fs/f2fs.h | 7 ++++++-
fs/f2fs/gc.c | 6 +++---
fs/f2fs/segment.c | 6 +++---
4 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b6dcb3bcaef7..9a213d03005d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2693,7 +2693,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
}
/* wait for GCed page writeback via META_MAPPING */
- if (fio->post_read)
+ if (fio->meta_gc)
f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
/*
@@ -2788,7 +2788,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.submitted = 0,
.compr_blocks = compr_blocks,
.need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY,
- .post_read = f2fs_post_read_required(inode) ? 1 : 0,
+ .meta_gc = f2fs_meta_inode_gc_required(inode) ? 1 : 0,
.io_type = io_type,
.io_wbc = wbc,
.bio = bio,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f7ee6c5e371e..796ae11c0fa3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1211,7 +1211,7 @@ struct f2fs_io_info {
unsigned int in_list:1; /* indicate fio is in io_list */
unsigned int is_por:1; /* indicate IO is from recovery or not */
unsigned int encrypted:1; /* indicate file is encrypted */
- unsigned int post_read:1; /* require post read */
+ unsigned int meta_gc:1; /* require meta inode GC */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
struct bio **bio; /* bio for ipu */
@@ -4263,6 +4263,11 @@ static inline bool f2fs_post_read_required(struct inode *inode)
f2fs_compressed_file(inode);
}
+static inline bool f2fs_meta_inode_gc_required(struct inode *inode)
+{
+ return f2fs_post_read_required(inode) || f2fs_is_atomic_file(inode);
+}
+
/*
* compress.c
*/
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ef667fec9a12..cb3006551ab5 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1589,7 +1589,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
ofs_in_node;
- if (f2fs_post_read_required(inode)) {
+ if (f2fs_meta_inode_gc_required(inode)) {
int err = ra_data_block(inode, start_bidx);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1640,7 +1640,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
start_bidx = f2fs_start_bidx_of_node(nofs, inode)
+ ofs_in_node;
- if (f2fs_post_read_required(inode))
+ if (f2fs_meta_inode_gc_required(inode))
err = move_data_block(inode, start_bidx,
gc_type, segno, off);
else
@@ -1648,7 +1648,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
segno, off);
if (!err && (gc_type == FG_GC ||
- f2fs_post_read_required(inode)))
+ f2fs_meta_inode_gc_required(inode)))
submitted++;
if (locked) {
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4db1add43e36..77ef46b384b4 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3851,7 +3851,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
- if (fio->post_read)
+ if (fio->meta_gc)
f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1);
stat_inc_inplace_blocks(fio->sbi);
@@ -4021,7 +4021,7 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *cpage;
- if (!f2fs_post_read_required(inode))
+ if (!f2fs_meta_inode_gc_required(inode))
return;
if (!__is_valid_data_blkaddr(blkaddr))
@@ -4040,7 +4040,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
block_t i;
- if (!f2fs_post_read_required(inode))
+ if (!f2fs_meta_inode_gc_required(inode))
return;
for (i = 0; i < len; i++)
--
2.25.1
Since the below commit, there are regressions for legacy setups:
1/ conntracks are created while there is no listener
2/ a listener starts and dumps all conntracks to get the current state
3/ conntracks deleted before the listener has started are not advertised
This is problematic in containers, where conntracks could be created early.
This sysctl is part of the unsafe sysctls and cannot be changed easily in
some environments.
Let's switch back to the legacy behavior.
CC: stable(a)vger.kernel.org
Fixes: 90d1daa45849 ("netfilter: conntrack: add nf_conntrack_events autodetect mode")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
---
Documentation/networking/nf_conntrack-sysctl.rst | 10 ++++++----
net/netfilter/nf_conntrack_ecache.c | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index c383a394c665..edc04f99e1aa 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -34,13 +34,15 @@ nf_conntrack_count - INTEGER (read-only)
nf_conntrack_events - BOOLEAN
- 0 - disabled
- - 1 - enabled
- - 2 - auto (default)
+ - 1 - enabled (default)
+ - 2 - auto
If this option is enabled, the connection tracking code will
provide userspace with connection tracking events via ctnetlink.
- The default allocates the extension if a userspace program is
- listening to ctnetlink events.
+ The 'auto' allocates the extension if a userspace program is
+ listening to ctnetlink events. Note that conntracks created
+ before the first listener has started won't trigger any netlink
+ event.
nf_conntrack_expect_max - INTEGER
Maximum size of expectation table. Default value is
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 69948e1d6974..4c8559529e18 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -334,7 +334,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp
}
EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
-#define NF_CT_EVENTS_DEFAULT 2
+#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
void nf_conntrack_ecache_pernet_init(struct net *net)
--
2.43.1
The ov5675 specification says that the gap between XSHUTDN deassert and the
first I2C transaction should be a minimum of 8192 XVCLK cycles.
Right now we use a usleep_range() that gives a sleep time of between about
430 and 860 microseconds.
On the Lenovo X13s we have observed that in about 1/20 cases the current
timing is too tight and we start transacting before the ov5675's reset
cycle completes, leading to I2C bus transaction failures.
The reset racing is sometimes triggered at initial chip probe but, more
usually, on a subsequent power-off/power-on cycle, e.g.
[ 71.451662] ov5675 24-0010: failed to write reg 0x0103. error = -5
[ 71.451686] ov5675 24-0010: failed to set plls
The current quiescence period we have is too tight. Instead of expressing
the post-reset delay in terms of the current XVCLK, this patch converts the
power-on and power-off delays to the maximum theoretical delay @ 6 MHz,
with an additional buffer:
1.365 milliseconds on the power-on path becomes 1.5 milliseconds with grace.
85.3 microseconds on the power-off path becomes 90 microseconds with grace.
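As a sanity check on those numbers, a small sketch of the worst-case
arithmetic (assuming 6 MHz is the slowest XVCLK the sensor supports, as
the commit message above states):

#include <stdio.h>

int main(void)
{
	const double xvclk_hz = 6000000.0; /* assumed slowest supported XVCLK */

	/* 8192 cycles between XSHUTDN deassert and the first I2C transaction */
	double power_on_us = 8192.0 * 1000000.0 / xvclk_hz;	/* ~1365.3 us */
	/* 512 cycles after the last SCCB transaction before cutting the clock */
	double power_off_us = 512.0 * 1000000.0 / xvclk_hz;	/* ~85.3 us */

	printf("power-on delay:  %.1f us (1500 us with grace)\n", power_on_us);
	printf("power-off delay: %.1f us (90 us with grace)\n", power_off_us);
	return 0;
}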
Fixes: 49d9ad719e89 ("media: ov5675: add device-tree support and support runtime PM")
Cc: stable(a)vger.kernel.org
Signed-off-by: Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
---
v3:
- Fixed my out-by-one 853 -> 85.3 us calc and the 900 us -> 90us calc as a
result.
- Link to v2: https://lore.kernel.org/r/20240711-linux-next-ov5675-v2-1-d0ea6ac2e6e9@lina…
v2:
- Drop patch to read and act on reported XVCLK
- Use worst-case timings + a reasonable grace period in-lieu of previous
xvclk calculations on power-on and power-off.
- Link to v1: https://lore.kernel.org/r/20240711-linux-next-ov5675-v1-0-69e9b6c62c16@lina…
v1:
One long-running saga for me on the Lenovo X13s is the occasional failure
to either probe or subsequently bring up the ov5675 main RGB sensor on the
laptop.
Initially I suspected the PMIC for this part as the PMIC is using a new
interface on an I2C bus instead of an SPMI bus. In particular I thought
perhaps the I2C write to the PMIC had completed but the regulator output hadn't
become stable from the perspective of the SoC. This however doesn't appear
to be the case - I can introduce a delay of milliseconds on the PMIC path
without resolving the sensor reset problem.
Secondly I thought about reset pin polarity or drive-strength but, again
playing about with both didn't yield decent results.
I also played with the duration of reset to no avail.
The error manifested as an I2C write timeout to the sensor, which indicated
that the chip likely hadn't come out of reset. An intermittent fault appearing
in perhaps 1/10 or 1/20 reset cycles.
Looking at the expression of the reset we see that there is a minimum time
expressed in XVCLK cycles between reset completion and first I2C
transaction to the sensor. The specification calls out the minimum delay @
8192 XVCLK cycles and the ov5675 driver meets that timing almost exactly.
A little too exactly - testing finally showed that we were too racy with
respect to the minimum quiescence between reset completion and first
command to the chip.
Fixing this error, I chose to base the fix again on the number of clocks,
but also to support any clock rate the chip could support, by moving away
from a define to reading and using the XVCLK.
True enough only 19.2 MHz is currently supported but for the hypothetical
case where some other frequency is supported in the future, I wanted the
fix introduced in this series to still hold.
Hence this series:
1. Allows for any clock rate to be used in the valid range for the reset.
2. Elongates the post-reset period based on clock cycles which can now
vary.
Patch #2 can still be backported to stable irrespective of patch #1.
---
drivers/media/i2c/ov5675.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/media/i2c/ov5675.c b/drivers/media/i2c/ov5675.c
index 3641911bc73f..5b5127f8953f 100644
--- a/drivers/media/i2c/ov5675.c
+++ b/drivers/media/i2c/ov5675.c
@@ -972,12 +972,10 @@ static int ov5675_set_stream(struct v4l2_subdev *sd, int enable)
static int ov5675_power_off(struct device *dev)
{
- /* 512 xvclk cycles after the last SCCB transation or MIPI frame end */
- u32 delay_us = DIV_ROUND_UP(512, OV5675_XVCLK_19_2 / 1000 / 1000);
struct v4l2_subdev *sd = dev_get_drvdata(dev);
struct ov5675 *ov5675 = to_ov5675(sd);
- usleep_range(delay_us, delay_us * 2);
+ usleep_range(90, 100);
clk_disable_unprepare(ov5675->xvclk);
gpiod_set_value_cansleep(ov5675->reset_gpio, 1);
@@ -988,7 +986,6 @@ static int ov5675_power_off(struct device *dev)
static int ov5675_power_on(struct device *dev)
{
- u32 delay_us = DIV_ROUND_UP(8192, OV5675_XVCLK_19_2 / 1000 / 1000);
struct v4l2_subdev *sd = dev_get_drvdata(dev);
struct ov5675 *ov5675 = to_ov5675(sd);
int ret;
@@ -1014,8 +1011,11 @@ static int ov5675_power_on(struct device *dev)
gpiod_set_value_cansleep(ov5675->reset_gpio, 0);
- /* 8192 xvclk cycles prior to the first SCCB transation */
- usleep_range(delay_us, delay_us * 2);
+ /* Worst case quiescence gap is 1.365 milliseconds @ 6MHz XVCLK
+ * Add an additional threshold grace period to ensure reset
+ * completion before initiating our first I2C transaction.
+ */
+ usleep_range(1500, 1600);
return 0;
}
---
base-commit: 523b23f0bee3014a7a752c9bb9f5c54f0eddae88
change-id: 20240710-linux-next-ov5675-60b0e83c73f1
Best regards,
--
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>