From: Zhang Yi <yi.zhang@huawei.com>
After we factor out the inline data write procedure from
ext4_da_write_end(), we no longer need to start a journal handle for
either the buffer-overwrite or the append-write case. If we need to
update i_disksize, mark_inode_dirty() starts a handle and updates the
inode buffer itself (as sketched below), so we can simply remove all
the journal handle code from the delalloc write path.
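For context, this works because ext4's ->dirty_inode callback starts
its own handle when the inode is marked dirty; a simplified sketch of
ext4_dirty_inode() (flag checks and error handling trimmed):

	/*
	 * Marking the inode dirty starts and stops its own handle, so the
	 * delalloc write path no longer needs one for the i_disksize update.
	 */
	void ext4_dirty_inode(struct inode *inode, int flags)
	{
		handle_t *handle;

		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle))
			return;
		ext4_mark_inode_dirty(handle, inode);
		ext4_journal_stop(handle);
	}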
This patch yields a significant performance improvement. Below are the
Unixbench comparison results from my machine with an 'Intel Xeon
Gold 5120' CPU and an NVMe SSD backend.
Test cmd:
./Run -c 56 -i 3 fstime fsbuffer fsdisk
Before this patch:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
File Copy 1024 bufsize 2000 maxblocks          3960.0     422965.0   1068.1
File Copy 256 bufsize 500 maxblocks            1655.0     105077.0    634.9
File Copy 4096 bufsize 8000 maxblocks          5800.0    1429092.0   2464.0
======
System Benchmarks Index Score (Partial Only)                         1186.6
After this patch:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
File Copy 1024 bufsize 2000 maxblocks          3960.0     732716.0   1850.3
File Copy 256 bufsize 500 maxblocks            1655.0     184940.0   1117.5
File Copy 4096 bufsize 8000 maxblocks          5800.0    2427152.0   4184.7
======
System Benchmarks Index Score (Partial Only)                         2053.0
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20210716122024.1105856-5-yi.zhang@huawei.com
Reviewed-by: Cheng Nie <niecheng1@uniontech.com>
Signed-off-by: Dandan Zhang <zhangdandan@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
---
fs/ext4/inode.c | 60 +++++--------------------------------------------
1 file changed, 5 insertions(+), 55 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e6753084a00..597c2f49c889 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3032,19 +3032,6 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-/* We always reserve for an inode update; the superblock could be there too */
-static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
-{
- if (likely(ext4_has_feature_large_file(inode->i_sb)))
- return 1;
-
- if (pos + len <= 0x7fffffffULL)
- return 1;
-
- /* We might need to update the superblock to set LARGE_FILE */
- return 2;
-}
-
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -3053,7 +3040,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
pgoff_t index;
struct inode *inode = mapping->host;
- handle_t *handle;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -3079,41 +3065,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
- /*
- * grab_cache_page_write_begin() can take a long time if the
- * system is thrashing due to memory pressure, or if the page
- * is being written back. So grab it first before we start
- * the transaction handle. This also allows us to allocate
- * the page (if needed) without using GFP_NOFS.
- */
-retry_grab:
+retry:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
- unlock_page(page);
-
- /*
- * With delayed allocation, we don't log the i_disksize update
- * if there is delayed block allocation. But we still need
- * to journalling the i_disksize update if writes to the end
- * of file which has an already mapped buffer.
- */
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_da_write_credits(inode, pos, len));
- if (IS_ERR(handle)) {
- put_page(page);
- return PTR_ERR(handle);
- }
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- unlock_page(page);
- put_page(page);
- ext4_journal_stop(handle);
- goto retry_grab;
- }
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
@@ -3125,20 +3081,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
#endif
if (ret < 0) {
unlock_page(page);
- ext4_journal_stop(handle);
+ put_page(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold inode lock.
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
-
- put_page(page);
+ goto retry;
return ret;
}
@@ -3175,8 +3129,6 @@ static int ext4_da_write_end(struct file *file,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- int ret;
- handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
@@ -3215,9 +3167,7 @@ static int ext4_da_write_end(struct file *file,
ext4_da_should_update_i_disksize(page, end))
ext4_update_i_disksize(inode, new_i_size);
- copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- ret = ext4_journal_stop(handle);
- return ret ? ret : copied;
+ return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}
static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
--
2.43.4
From: Zhang Yi <yi.zhang@huawei.com>
The current error path of ext4_write_inline_data_end() is not correct.
First, it should pass the error value out if ext4_get_inode_loc()
fails; otherwise, injecting an error here could trigger an infinite
loop. Second, it is better to add the inode to the orphan list if
ext4_journal_stop() fails, otherwise we could not restore the inline
xattr entry after a power failure. Finally, we need to reset 'ret' when
ext4_write_inline_data_end() succeeds in ext4_write_end() and
ext4_journalled_write_end(), otherwise we would lose the error return
value of ext4_journal_stop().
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20210716122024.1105856-3-yi.zhang@huawei.com
Reviewed-by: Cheng Nie <niecheng1@uniontech.com>
Signed-off-by: Dandan Zhang <zhangdandan@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
---
fs/ext4/inline.c | 15 +++++----------
fs/ext4/inode.c | 7 +++++--
2 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 71bb3cfc5933..de04bd5fb551 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -745,18 +745,13 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
void *kaddr;
struct ext4_iloc iloc;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
- goto out;
- }
- }
+ if (unlikely(copied < len) && !PageUptodate(page))
+ return 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret) {
ext4_std_error(inode->i_sb, ret);
- copied = 0;
- goto out;
+ return ret;
}
ext4_write_lock_xattr(inode, &no_expand);
@@ -769,7 +764,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
(void) ext4_find_inline_data_nolock(inode);
kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
kunmap_atomic(kaddr);
SetPageUptodate(page);
/* clear page dirty so that writepages wouldn't work for us. */
@@ -778,7 +773,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
ext4_write_unlock_xattr(inode, &no_expand);
brelse(iloc.bh);
mark_inode_dirty(inode);
-out:
+
return copied;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8a8e4ee5ff8..44a715e6aae1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1428,6 +1428,7 @@ static int ext4_write_end(struct file *file,
goto errout;
}
copied = ret;
+ ret = 0;
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1450,13 +1451,14 @@ static int ext4_write_end(struct file *file,
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
+errout:
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1538,6 +1540,7 @@ static int ext4_journalled_write_end(struct file *file,
goto errout;
}
copied = ret;
+ ret = 0;
} else if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1566,6 +1569,7 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+errout:
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1573,7 +1577,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
--
2.43.4
This patchset from linux-5.15 should be backported to 4.19, as it can
significantly improve ext4 read and write performance. Unixbench test
results for linux-4.19.318 on a Phytium D2000 CPU are shown below.
Test cmd (the Phytium D2000 has only 8 cores):
./Run fs -c 8
Before this patch set:
File Copy 1024 bufsize 2000 maxblocks    1124181
File Copy 256 bufsize 500 maxblocks       281885
File Copy 4096 bufsize 8000 maxblocks    3383785
File Read 1024 bufsize 2000 maxblocks    8702173
File Read 256 bufsize 500 maxblocks      3869384
File Read 4096 bufsize 8000 maxblocks   13043151
File Write 1024 bufsize 2000 maxblocks   1107185
File Write 256 bufsize 500 maxblocks      270493
File Write 4096 bufsize 8000 maxblocks   4018084
After this patch set:
File Copy 1024 bufsize 2000 maxblocks    2026206
File Copy 256 bufsize 500 maxblocks       829534
File Copy 4096 bufsize 8000 maxblocks    4066659
File Read 1024 bufsize 2000 maxblocks    8877219
File Read 256 bufsize 500 maxblocks      3997445
File Read 4096 bufsize 8000 maxblocks   13179885
File Write 1024 bufsize 2000 maxblocks   4256929
File Write 256 bufsize 500 maxblocks     1305320
File Write 4096 bufsize 8000 maxblocks  10721052
We can observe a dramatic improvement in the test results after
applying this patchset.
Link: https://lore.kernel.org/all/20210716122024.1105856-1-yi.zhang@huawei.com/
Original description:
This patchset aims to improve buffer write performance with delalloc.
The first patch reduces unnecessary i_disksize updates, the next two
patches refactor the inline data write procedure and include some
small fixes, and the last patch improves performance by removing all
unnecessary journal handles from the delalloc write procedure.
After this patch set, we get a significant performance improvement.
Below are the Unixbench comparison results from my machine with an
'Intel Xeon Gold 5120' CPU and an NVMe SSD backend.
Test cmd:
./Run -c 56 -i 3 fstime fsbuffer fsdisk
Before this patch set:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
File Copy 1024 bufsize 2000 maxblocks          3960.0     422965.0   1068.1
File Copy 256 bufsize 500 maxblocks            1655.0     105077.0    634.9
File Copy 4096 bufsize 8000 maxblocks          5800.0    1429092.0   2464.0
========
System Benchmarks Index Score (Partial Only)                         1186.6
After this patch set:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
File Copy 1024 bufsize 2000 maxblocks          3960.0     732716.0   1850.3
File Copy 256 bufsize 500 maxblocks            1655.0     184940.0   1117.5
File Copy 4096 bufsize 8000 maxblocks          5800.0    2427152.0   4184.7
========
System Benchmarks Index Score (Partial Only)                         2053.0
Zhang Yi (4):
ext4: check and update i_disksize properly
ext4: correct the error path of ext4_write_inline_data_end()
ext4: factor out write end code of inline file
ext4: drop unnecessary journal handle in delalloc write
fs/ext4/ext4.h | 3 -
fs/ext4/inline.c | 120 ++++++++++++++++++-------------------
fs/ext4/inode.c | 150 ++++++++++++-----------------------------------
3 files changed, 99 insertions(+), 174 deletions(-)
--
2.31.1
Originally, the check_unaligned_access_emulated_all_cpus function
only checked the boot hart. This fixes the function to check all
harts.
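For context, schedule_on_each_cpu() queues the given callback on every
online CPU's workqueue and blocks until all of them have run, so each
hart probes itself before the caller inspects the per-CPU results. A
minimal sketch of the pattern (do_probe() and EXPECTED are
placeholders, not kernel symbols):

	static DEFINE_PER_CPU(long, probe_result);

	/* Runs once on each online CPU; records that CPU's own result. */
	static void probe_self(struct work_struct *work)
	{
		this_cpu_write(probe_result, do_probe());
	}

	static bool all_cpus_match(void)
	{
		int cpu;

		schedule_on_each_cpu(probe_self);	/* waits for every CPU */
		for_each_online_cpu(cpu)
			if (per_cpu(probe_result, cpu) != EXPECTED)
				return false;
		return true;
	}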
Fixes: 71c54b3d169d ("riscv: report misaligned accesses emulation to hwprobe")
Signed-off-by: Jesse Taube <jesse@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Cc: stable@vger.kernel.org
---
V1 -> V2:
- New patch
V2 -> V3:
- Split patch
V3 -> V4:
- Re-add check for a system where a heterogeneous
CPU is hotplugged into a previously homogenous
system.
V4 -> V5:
- Change work_struct *unused to work_struct *work __always_unused
---
arch/riscv/kernel/traps_misaligned.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index b62d5a2f4541..9a1e94383d6d 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -526,11 +526,11 @@ int handle_misaligned_store(struct pt_regs *regs)
return 0;
}
-static bool check_unaligned_access_emulated(int cpu)
+static void check_unaligned_access_emulated(struct work_struct *work __always_unused)
{
+ int cpu = smp_processor_id();
long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
unsigned long tmp_var, tmp_val;
- bool misaligned_emu_detected;
*mas_ptr = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
@@ -538,19 +538,16 @@ static bool check_unaligned_access_emulated(int cpu)
" "REG_L" %[tmp], 1(%[ptr])\n"
: [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory");
- misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_EMULATED);
/*
* If unaligned_ctl is already set, this means that we detected that all
* CPUS uses emulated misaligned access at boot time. If that changed
* when hotplugging the new cpu, this is something we don't handle.
*/
- if (unlikely(unaligned_ctl && !misaligned_emu_detected)) {
+ if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_EMULATED))) {
pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n");
while (true)
cpu_relax();
}
-
- return misaligned_emu_detected;
}
bool check_unaligned_access_emulated_all_cpus(void)
@@ -562,8 +559,11 @@ bool check_unaligned_access_emulated_all_cpus(void)
* accesses emulated since tasks requesting such control can run on any
* CPU.
*/
+ schedule_on_each_cpu(check_unaligned_access_emulated);
+
for_each_online_cpu(cpu)
- if (!check_unaligned_access_emulated(cpu))
+ if (per_cpu(misaligned_access_speed, cpu)
+ != RISCV_HWPROBE_MISALIGNED_EMULATED)
return false;
unaligned_ctl = true;
--
2.45.2
Before this change, network restrictions were enforced according to
the calling thread's Landlock domain, leading to potentially
inconsistent results when the same socket was used by different
threads or processes (with different domains). This change fixes such
access control inconsistency by enforcing the socket's Landlock domain
instead of the caller's Landlock domain.
A socket's Landlock domain is inherited from the thread that created
it. This means that a socket created without sandboxing is free to
connect and bind without limitation. It also means that a socket
created by a sandboxed thread inherits the thread's policy, which is
enforced on this socket even when it is used by another thread or
passed to another process.
The initial rationale [1] was that a socket does not directly grant
access to data; rather, it is an object used to define an access (e.g.
a connection to a peer). Contrary to my initial assumption, we can
identify which protocol/port a newly created socket can give access to
via the socket's file->f_cred, inherited from its creator. Moreover,
from a kernel point of view, especially for shared objects, we need a
more consistent access model: the same action on the same socket
performed by different threads will have the same effect. This follows
the same approach as for file descriptors tied to the file system
(e.g. LANDLOCK_ACCESS_FS_TRUNCATE).
One potential risk of this change is for unsandboxed processes to send
socket file descriptors to sandboxed processes, which could give
unrestricted network access to the sandboxed process (by reconfiguring
the socket). While it makes sense for processes to transfer (AF_UNIX)
socketpairs, which is fine because they can only exchange data between
themselves, it should be rare for processes to legitimately pass other
kinds of sockets (e.g. AF_INET).
Another potential risk of this approach is socket file descriptor leaks.
This is the same risk as with regular file descriptor leaks giving
access to the content of a file, which is well known and documented.
This could be mitigated with a future complementary restriction on
received or inherited file descriptors.
One interesting side effect of this new approach is that a process can
create a socket that only allows connecting to a set of ports. This
can be done by creating a thread, sandboxing it, creating a socket,
and using the related file descriptor (in the same process). Passing
this restricted socket to a more sandboxed process makes it possible
to have a more dynamic security policy, as sketched below.
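As an illustration only (not part of this patch), here is a minimal
userspace sketch of that pattern, assuming the Landlock network ABI
(v4 or later); the helper name and port number are invented:

	/*
	 * The calling thread sandboxes itself so that any socket it creates
	 * afterwards carries a domain limited to connecting to TCP port 443.
	 */
	#include <linux/landlock.h>
	#include <sys/prctl.h>
	#include <sys/socket.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_connect_443_socket(void)
	{
		struct landlock_ruleset_attr ruleset_attr = {
			.handled_access_net = LANDLOCK_ACCESS_NET_CONNECT_TCP,
		};
		struct landlock_net_port_attr net_port = {
			.allowed_access = LANDLOCK_ACCESS_NET_CONNECT_TCP,
			.port = 443,
		};
		int ruleset_fd;

		ruleset_fd = syscall(SYS_landlock_create_ruleset, &ruleset_attr,
				     sizeof(ruleset_attr), 0);
		if (ruleset_fd < 0)
			return -1;
		if (syscall(SYS_landlock_add_rule, ruleset_fd,
			    LANDLOCK_RULE_NET_PORT, &net_port, 0) ||
		    prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
		    syscall(SYS_landlock_restrict_self, ruleset_fd, 0)) {
			close(ruleset_fd);
			return -1;
		}
		close(ruleset_fd);

		/* The restriction now travels with the socket's f_cred. */
		return socket(AF_INET, SOCK_STREAM, 0);
	}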
This new approach aligns with SELinux and Smack instead of AppArmor and
Tomoyo. It is also in line with capability-based security mechanisms
such as Capsicum.
This slight semantic change is important for Landlock's current and
future consistency, and it must be backported.
Current tests are still OK because this behavior wasn't covered. A
following commit adds new tests.
Cc: Günther Noack <gnoack@google.com>
Cc: Ivanov Mikhail <ivanov.mikhail1@huawei-partners.com>
Cc: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Tahera Fahimi <fahimitahera@gmail.com>
Cc: <stable@vger.kernel.org> # 6.7.x: 088e2efaf3d2: landlock: Simplify current_check_access_socket()
Fixes: fff69fb03dde ("landlock: Support network rules with TCP bind and connect")
Link: https://lore.kernel.org/r/263c1eb3-602f-57fe-8450-3f138581bee7@digikod.net [1]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20240719150618.197991-2-mic@digikod.net
---
security/landlock/net.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/security/landlock/net.c b/security/landlock/net.c
index c8bcd29bde09..78e027a74819 100644
--- a/security/landlock/net.c
+++ b/security/landlock/net.c
@@ -50,10 +50,11 @@ get_raw_handled_net_accesses(const struct landlock_ruleset *const domain)
return access_dom;
}
-static const struct landlock_ruleset *get_current_net_domain(void)
+static const struct landlock_ruleset *
+get_socket_net_domain(const struct socket *const sock)
{
const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
+ landlock_cred(sock->file->f_cred)->domain;
if (!dom || !get_raw_handled_net_accesses(dom))
return NULL;
@@ -61,10 +62,9 @@ static const struct landlock_ruleset *get_current_net_domain(void)
return dom;
}
-static int current_check_access_socket(struct socket *const sock,
- struct sockaddr *const address,
- const int addrlen,
- access_mask_t access_request)
+static int check_access_socket(struct socket *const sock,
+ struct sockaddr *const address,
+ const int addrlen, access_mask_t access_request)
{
__be16 port;
layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_NET] = {};
@@ -72,7 +72,7 @@ static int current_check_access_socket(struct socket *const sock,
struct landlock_id id = {
.type = LANDLOCK_KEY_NET_PORT,
};
- const struct landlock_ruleset *const dom = get_current_net_domain();
+ const struct landlock_ruleset *const dom = get_socket_net_domain(sock);
if (!dom)
return 0;
@@ -175,16 +175,16 @@ static int current_check_access_socket(struct socket *const sock,
static int hook_socket_bind(struct socket *const sock,
struct sockaddr *const address, const int addrlen)
{
- return current_check_access_socket(sock, address, addrlen,
- LANDLOCK_ACCESS_NET_BIND_TCP);
+ return check_access_socket(sock, address, addrlen,
+ LANDLOCK_ACCESS_NET_BIND_TCP);
}
static int hook_socket_connect(struct socket *const sock,
struct sockaddr *const address,
const int addrlen)
{
- return current_check_access_socket(sock, address, addrlen,
- LANDLOCK_ACCESS_NET_CONNECT_TCP);
+ return check_access_socket(sock, address, addrlen,
+ LANDLOCK_ACCESS_NET_CONNECT_TCP);
}
static struct security_hook_list landlock_hooks[] __ro_after_init = {
--
2.45.2