The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 915593a7a663b2ad08b895a5f3ba8b19d89d4ebf Mon Sep 17 00:00:00 2001
From: Tom Rix <trix(a)redhat.com>
Date: Sat, 26 Mar 2022 12:42:36 -0700
Subject: [PATCH] rtc: check if __rtc_read_time was successful
Clang static analysis reports this issue
interface.c:810:8: warning: Passed-by-value struct
argument contains uninitialized data
now = rtc_tm_to_ktime(tm);
^~~~~~~~~~~~~~~~~~~
tm is set by a successful call to __rtc_read_time()
but its return status is not checked. Check if
it was successful before setting the enabled flag.
Move the decl of err to function scope.
Fixes: 2b2f5ff00f63 ("rtc: interface: ignore expired timers when enqueuing new timers")
Signed-off-by: Tom Rix <trix(a)redhat.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220326194236.2916310-1-trix@redhat.com
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index d8e835798153..9edd662c69ac 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -804,9 +804,13 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
struct rtc_time tm;
ktime_t now;
+ int err;
+
+ err = __rtc_read_time(rtc, &tm);
+ if (err)
+ return err;
timer->enabled = 1;
- __rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
/* Skip over expired timers */
@@ -820,7 +824,6 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
trace_rtc_timer_enqueue(timer);
if (!next || ktime_before(timer->node.expires, next->expires)) {
struct rtc_wkalrm alarm;
- int err;
alarm.time = rtc_ktime_to_tm(timer->node.expires);
alarm.enabled = 1;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 915593a7a663b2ad08b895a5f3ba8b19d89d4ebf Mon Sep 17 00:00:00 2001
From: Tom Rix <trix(a)redhat.com>
Date: Sat, 26 Mar 2022 12:42:36 -0700
Subject: [PATCH] rtc: check if __rtc_read_time was successful
Clang static analysis reports this issue
interface.c:810:8: warning: Passed-by-value struct
argument contains uninitialized data
now = rtc_tm_to_ktime(tm);
^~~~~~~~~~~~~~~~~~~
tm is set by a successful call to __rtc_read_time()
but its return status is not checked. Check if
it was successful before setting the enabled flag.
Move the decl of err to function scope.
Fixes: 2b2f5ff00f63 ("rtc: interface: ignore expired timers when enqueuing new timers")
Signed-off-by: Tom Rix <trix(a)redhat.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220326194236.2916310-1-trix@redhat.com
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index d8e835798153..9edd662c69ac 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -804,9 +804,13 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
struct rtc_time tm;
ktime_t now;
+ int err;
+
+ err = __rtc_read_time(rtc, &tm);
+ if (err)
+ return err;
timer->enabled = 1;
- __rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
/* Skip over expired timers */
@@ -820,7 +824,6 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
trace_rtc_timer_enqueue(timer);
if (!next || ktime_before(timer->node.expires, next->expires)) {
struct rtc_wkalrm alarm;
- int err;
alarm.time = rtc_ktime_to_tm(timer->node.expires);
alarm.enabled = 1;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 915593a7a663b2ad08b895a5f3ba8b19d89d4ebf Mon Sep 17 00:00:00 2001
From: Tom Rix <trix(a)redhat.com>
Date: Sat, 26 Mar 2022 12:42:36 -0700
Subject: [PATCH] rtc: check if __rtc_read_time was successful
Clang static analysis reports this issue
interface.c:810:8: warning: Passed-by-value struct
argument contains uninitialized data
now = rtc_tm_to_ktime(tm);
^~~~~~~~~~~~~~~~~~~
tm is set by a successful call to __rtc_read_time()
but its return status is not checked. Check if
it was successful before setting the enabled flag.
Move the decl of err to function scope.
Fixes: 2b2f5ff00f63 ("rtc: interface: ignore expired timers when enqueuing new timers")
Signed-off-by: Tom Rix <trix(a)redhat.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220326194236.2916310-1-trix@redhat.com
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index d8e835798153..9edd662c69ac 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -804,9 +804,13 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
struct rtc_time tm;
ktime_t now;
+ int err;
+
+ err = __rtc_read_time(rtc, &tm);
+ if (err)
+ return err;
timer->enabled = 1;
- __rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
/* Skip over expired timers */
@@ -820,7 +824,6 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
trace_rtc_timer_enqueue(timer);
if (!next || ktime_before(timer->node.expires, next->expires)) {
struct rtc_wkalrm alarm;
- int err;
alarm.time = rtc_ktime_to_tm(timer->node.expires);
alarm.enabled = 1;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 27 Dec 2021 11:22:41 +0800
Subject: [PATCH] ubifs: Fix to add refcount once page is set private
MM defined the rule [1] very clearly that once page was set with PG_private
flag, we should increment the refcount in that page, also main flows like
pageout(), migrate_page() will assume there is one additional page
reference count if page_has_private() returns true. Otherwise, we may
get a BUG in page migration:
page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8
index:0xe2 pfn:0x14c12
aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e"
flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0|
zone=1|lastcpupid=0x1fffff)
page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0)
------------[ cut here ]------------
kernel BUG at include/linux/page_ref.h:184!
invalid opcode: 0000 [#1] SMP
CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5
RIP: 0010:migrate_page_move_mapping+0xac3/0xe70
Call Trace:
ubifs_migrate_page+0x22/0xc0 [ubifs]
move_to_new_page+0xb4/0x600
migrate_pages+0x1523/0x1cc0
compact_zone+0x8c5/0x14b0
kcompactd+0x2bc/0x560
kthread+0x18c/0x1e0
ret_from_fork+0x1f/0x30
Before the time, we should make clean a concept, what does refcount means
in page gotten from grab_cache_page_write_begin(). There are 2 situations:
Situation 1: refcount is 3, page is created by __page_cache_alloc.
TYPE_A - the write process is using this page
TYPE_B - page is assigned to one certain mapping by calling
__add_to_page_cache_locked()
TYPE_C - page is added into pagevec list corresponding current cpu by
calling lru_cache_add()
Situation 2: refcount is 2, page is gotten from the mapping's tree
TYPE_B - page has been assigned to one certain mapping
TYPE_A - the write process is using this page (by calling
page_cache_get_speculative())
Filesystem releases one refcount by calling put_page() in xxx_write_end(),
the released refcount corresponds to TYPE_A (write task is using it). If
there are any processes using a page, page migration process will skip the
page by judging whether expected_page_refs() equals to page refcount.
The BUG is caused by following process:
PA(cpu 0) kcompactd(cpu 1)
compact_zone
ubifs_write_begin
page_a = grab_cache_page_write_begin
add_to_page_cache_lru
lru_cache_add
pagevec_add // put page into cpu 0's pagevec
(refcnf = 3, for page creation process)
ubifs_write_end
SetPagePrivate(page_a) // doesn't increase page count !
unlock_page(page_a)
put_page(page_a) // refcnt = 2
[...]
PB(cpu 0)
filemap_read
filemap_get_pages
add_to_page_cache_lru
lru_cache_add
__pagevec_lru_add // traverse all pages in cpu 0's pagevec
__pagevec_lru_add_fn
SetPageLRU(page_a)
isolate_migratepages
isolate_migratepages_block
get_page_unless_zero(page_a)
// refcnt = 3
list_add(page_a, from_list)
migrate_pages(from_list)
__unmap_and_move
move_to_new_page
ubifs_migrate_page(page_a)
migrate_page_move_mapping
expected_page_refs get 3
(migration[1] + mapping[1] + private[1])
release_pages
put_page_testzero(page_a) // refcnt = 3
page_ref_freeze // refcnt = 0
page_ref_dec_and_test(0 - 1 = -1)
page_ref_unfreeze
VM_BUG_ON_PAGE(-1 != 0, page)
UBIFS doesn't increase the page refcount after setting private flag, which
leads to page migration task believes the page is not used by any other
processes, so the page is migrated. This causes concurrent accessing on
page refcount between put_page() called by other process(eg. read process
calls lru_cache_add) and page_ref_unfreeze() called by migration task.
Actually zhangjun has tried to fix this problem [2] by recalculating page
refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because
just like Kirill suggested in [2], we need to check all users of
page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting
refcount when setting/clearing private for a page. BTW, according to [4],
we set 'page->private' as 1 because ubifs just simply SetPagePrivate().
And, [5] provided a common helper to set/clear page private, ubifs can
use this helper following the example of iomap, afs, btrfs, etc.
Jump [6] to find a reproducer.
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com
[2] https://www.spinics.net/lists/linux-mtd/msg04018.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html
[4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.in…
[5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.…
[6] https://bugzilla.kernel.org/show_bug.cgi?id=214961
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..6b45a037a047 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len)
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
}
@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping,
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 27 Dec 2021 11:22:41 +0800
Subject: [PATCH] ubifs: Fix to add refcount once page is set private
MM defined the rule [1] very clearly that once page was set with PG_private
flag, we should increment the refcount in that page, also main flows like
pageout(), migrate_page() will assume there is one additional page
reference count if page_has_private() returns true. Otherwise, we may
get a BUG in page migration:
page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8
index:0xe2 pfn:0x14c12
aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e"
flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0|
zone=1|lastcpupid=0x1fffff)
page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0)
------------[ cut here ]------------
kernel BUG at include/linux/page_ref.h:184!
invalid opcode: 0000 [#1] SMP
CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5
RIP: 0010:migrate_page_move_mapping+0xac3/0xe70
Call Trace:
ubifs_migrate_page+0x22/0xc0 [ubifs]
move_to_new_page+0xb4/0x600
migrate_pages+0x1523/0x1cc0
compact_zone+0x8c5/0x14b0
kcompactd+0x2bc/0x560
kthread+0x18c/0x1e0
ret_from_fork+0x1f/0x30
Before the time, we should make clean a concept, what does refcount means
in page gotten from grab_cache_page_write_begin(). There are 2 situations:
Situation 1: refcount is 3, page is created by __page_cache_alloc.
TYPE_A - the write process is using this page
TYPE_B - page is assigned to one certain mapping by calling
__add_to_page_cache_locked()
TYPE_C - page is added into pagevec list corresponding current cpu by
calling lru_cache_add()
Situation 2: refcount is 2, page is gotten from the mapping's tree
TYPE_B - page has been assigned to one certain mapping
TYPE_A - the write process is using this page (by calling
page_cache_get_speculative())
Filesystem releases one refcount by calling put_page() in xxx_write_end(),
the released refcount corresponds to TYPE_A (write task is using it). If
there are any processes using a page, page migration process will skip the
page by judging whether expected_page_refs() equals to page refcount.
The BUG is caused by following process:
PA(cpu 0) kcompactd(cpu 1)
compact_zone
ubifs_write_begin
page_a = grab_cache_page_write_begin
add_to_page_cache_lru
lru_cache_add
pagevec_add // put page into cpu 0's pagevec
(refcnf = 3, for page creation process)
ubifs_write_end
SetPagePrivate(page_a) // doesn't increase page count !
unlock_page(page_a)
put_page(page_a) // refcnt = 2
[...]
PB(cpu 0)
filemap_read
filemap_get_pages
add_to_page_cache_lru
lru_cache_add
__pagevec_lru_add // traverse all pages in cpu 0's pagevec
__pagevec_lru_add_fn
SetPageLRU(page_a)
isolate_migratepages
isolate_migratepages_block
get_page_unless_zero(page_a)
// refcnt = 3
list_add(page_a, from_list)
migrate_pages(from_list)
__unmap_and_move
move_to_new_page
ubifs_migrate_page(page_a)
migrate_page_move_mapping
expected_page_refs get 3
(migration[1] + mapping[1] + private[1])
release_pages
put_page_testzero(page_a) // refcnt = 3
page_ref_freeze // refcnt = 0
page_ref_dec_and_test(0 - 1 = -1)
page_ref_unfreeze
VM_BUG_ON_PAGE(-1 != 0, page)
UBIFS doesn't increase the page refcount after setting private flag, which
leads to page migration task believes the page is not used by any other
processes, so the page is migrated. This causes concurrent accessing on
page refcount between put_page() called by other process(eg. read process
calls lru_cache_add) and page_ref_unfreeze() called by migration task.
Actually zhangjun has tried to fix this problem [2] by recalculating page
refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because
just like Kirill suggested in [2], we need to check all users of
page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting
refcount when setting/clearing private for a page. BTW, according to [4],
we set 'page->private' as 1 because ubifs just simply SetPagePrivate().
And, [5] provided a common helper to set/clear page private, ubifs can
use this helper following the example of iomap, afs, btrfs, etc.
Jump [6] to find a reproducer.
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com
[2] https://www.spinics.net/lists/linux-mtd/msg04018.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html
[4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.in…
[5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.…
[6] https://bugzilla.kernel.org/show_bug.cgi?id=214961
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..6b45a037a047 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len)
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
}
@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping,
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4f2262a334641e05f645364d5ade1f565c85f20b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 27 Dec 2021 11:22:40 +0800
Subject: [PATCH] ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock()
Function ubifs_wbuf_write_nolock() may access buf out of bounds in
following process:
ubifs_wbuf_write_nolock():
aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096
if (aligned_len <= wbuf->avail) ... // Not satisfy
if (wbuf->used) {
ubifs_leb_write() // Fill some data in avail wbuf
len -= wbuf->avail; // len is still not 8-bytes aligned
aligned_len -= wbuf->avail;
}
n = aligned_len >> c->max_write_shift;
if (n) {
n <<= c->max_write_shift;
err = ubifs_leb_write(c, wbuf->lnum, buf + written,
wbuf->offs, n);
// n > len, read out of bounds less than 8(n-len) bytes
}
, which can be catched by KASAN:
=========================================================
BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0
Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128
Workqueue: writeback wb_workfn (flush-ubifs_0_0)
Call Trace:
kasan_report.cold+0x81/0x165
nand_write_page_swecc+0xa9/0x160
ubifs_leb_write+0xf2/0x1b0 [ubifs]
ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs]
write_head+0xdc/0x1c0 [ubifs]
ubifs_jnl_write_inode+0x627/0x960 [ubifs]
wb_workfn+0x8af/0xb80
Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8
bytes aligned, the 'len' represents the true length of buf (which is
allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so
ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully
to write leb safely.
Fetch a reproducer in [Link].
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785
Reported-by: Chengsong Ke <kechengsong(a)huawei.com>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 789a7813f3fa..1607a3c76681 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The non-written len of buf may be less than 'n' because
+ * parameter 'len' is not 8 bytes aligned, so here we read
+ * min(len, n) bytes from buf.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 705757274599e2e064dd3054aabc74e8af31a095 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Tue, 15 Feb 2022 12:07:36 +0800
Subject: [PATCH] ubifs: rename_whiteout: correct old_dir size computing
When renaming the whiteout file, the old whiteout file is not deleted.
Therefore, we add the old dentry size to the old dir like XFS.
Otherwise, an error may be reported due to `fscki->calc_sz != fscki->size`
in check_indes.
Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
Reported-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ae082a0be2a3..86151889548e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1402,6 +1402,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
iput(whiteout);
goto out_release;
}
+
+ /* Add the old_dentry size to the old_dir size. */
+ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4f2262a334641e05f645364d5ade1f565c85f20b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 27 Dec 2021 11:22:40 +0800
Subject: [PATCH] ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock()
Function ubifs_wbuf_write_nolock() may access buf out of bounds in
following process:
ubifs_wbuf_write_nolock():
aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096
if (aligned_len <= wbuf->avail) ... // Not satisfy
if (wbuf->used) {
ubifs_leb_write() // Fill some data in avail wbuf
len -= wbuf->avail; // len is still not 8-bytes aligned
aligned_len -= wbuf->avail;
}
n = aligned_len >> c->max_write_shift;
if (n) {
n <<= c->max_write_shift;
err = ubifs_leb_write(c, wbuf->lnum, buf + written,
wbuf->offs, n);
// n > len, read out of bounds less than 8(n-len) bytes
}
, which can be catched by KASAN:
=========================================================
BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0
Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128
Workqueue: writeback wb_workfn (flush-ubifs_0_0)
Call Trace:
kasan_report.cold+0x81/0x165
nand_write_page_swecc+0xa9/0x160
ubifs_leb_write+0xf2/0x1b0 [ubifs]
ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs]
write_head+0xdc/0x1c0 [ubifs]
ubifs_jnl_write_inode+0x627/0x960 [ubifs]
wb_workfn+0x8af/0xb80
Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8
bytes aligned, the 'len' represents the true length of buf (which is
allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so
ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully
to write leb safely.
Fetch a reproducer in [Link].
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785
Reported-by: Chengsong Ke <kechengsong(a)huawei.com>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 789a7813f3fa..1607a3c76681 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The non-written len of buf may be less than 'n' because
+ * parameter 'len' is not 8 bytes aligned, so here we read
+ * min(len, n) bytes from buf.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5974ea7ce0f9a5987fc8cf5e08ad6e3e70bb542e Mon Sep 17 00:00:00 2001
From: Sungup Moon <sungup.moon(a)samsung.com>
Date: Mon, 14 Mar 2022 20:05:45 +0900
Subject: [PATCH] nvme: allow duplicate NSIDs for private namespaces
A NVMe subsystem with multiple controller can have private namespaces
that use the same NSID under some conditions:
"If Namespace Management, ANA Reporting, or NVM Sets are supported, the
NSIDs shall be unique within the NVM subsystem. If the Namespace
Management, ANA Reporting, and NVM Sets are not supported, then NSIDs:
a) for shared namespace shall be unique; and
b) for private namespace are not required to be unique."
Reference: Section 6.1.6 NSID and Namespace Usage; NVM Express 1.4c spec.
Make sure this specific setup is supported in Linux.
Fixes: 9ad1927a3bc2 ("nvme: always search for namespace head")
Signed-off-by: Sungup Moon <sungup.moon(a)samsung.com>
[hch: refactored and fixed the controller vs subsystem based naming
conflict]
Signed-off-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Sagi Grimberg <sagi(a)grimberg.me>
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ccc5877d514b..6dc05c1fa7db 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3627,15 +3627,20 @@ static const struct attribute_group *nvme_dev_attr_groups[] = {
NULL,
};
-static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
+static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
unsigned nsid)
{
struct nvme_ns_head *h;
- lockdep_assert_held(&subsys->lock);
+ lockdep_assert_held(&ctrl->subsys->lock);
- list_for_each_entry(h, &subsys->nsheads, entry) {
- if (h->ns_id != nsid)
+ list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
+ /*
+ * Private namespaces can share NSIDs under some conditions.
+ * In that case we can't use the same ns_head for namespaces
+ * with the same NSID.
+ */
+ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
return h;
@@ -3829,7 +3834,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
}
mutex_lock(&ctrl->subsys->lock);
- head = nvme_find_ns_head(ctrl->subsys, nsid);
+ head = nvme_find_ns_head(ctrl, nsid);
if (!head) {
ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
if (ret) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c97d7f843977..ba90555124c4 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -482,10 +482,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
/*
* Add a multipath node if the subsystems supports multiple controllers.
- * We also do this for private namespaces as the namespace sharing data could
- * change after a rescan.
+ * We also do this for private namespaces as the namespace sharing flag
+ * could change after a rescan.
*/
- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
+ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
+ !nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
head->disk = blk_alloc_disk(ctrl->numa_node);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ea908d43e17..1552a48719d6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -722,6 +722,25 @@ static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
return queue_live;
return __nvme_check_ready(ctrl, rq, queue_live);
}
+
+/*
+ * NSID shall be unique for all shared namespaces, or if at least one of the
+ * following conditions is met:
+ * 1. Namespace Management is supported by the controller
+ * 2. ANA is supported by the controller
+ * 3. NVM Set are supported by the controller
+ *
+ * In other case, private namespace are not required to report a unique NSID.
+ */
+static inline bool nvme_is_unique_nsid(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head)
+{
+ return head->shared ||
+ (ctrl->oacs & NVME_CTRL_OACS_NS_MNGT_SUPP) ||
+ (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) ||
+ (ctrl->ctratt & NVME_CTRL_CTRATT_NVM_SETS);
+}
+
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9dbc3ef4daf7..2dcee34d467d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -345,6 +345,7 @@ enum {
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_NS_MNGT_SUPP = 1 << 3,
NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8,
NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1,
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 27 Dec 2021 11:22:41 +0800
Subject: [PATCH] ubifs: Fix to add refcount once page is set private
MM defined the rule [1] very clearly that once page was set with PG_private
flag, we should increment the refcount in that page, also main flows like
pageout(), migrate_page() will assume there is one additional page
reference count if page_has_private() returns true. Otherwise, we may
get a BUG in page migration:
page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8
index:0xe2 pfn:0x14c12
aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e"
flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0|
zone=1|lastcpupid=0x1fffff)
page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0)
------------[ cut here ]------------
kernel BUG at include/linux/page_ref.h:184!
invalid opcode: 0000 [#1] SMP
CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5
RIP: 0010:migrate_page_move_mapping+0xac3/0xe70
Call Trace:
ubifs_migrate_page+0x22/0xc0 [ubifs]
move_to_new_page+0xb4/0x600
migrate_pages+0x1523/0x1cc0
compact_zone+0x8c5/0x14b0
kcompactd+0x2bc/0x560
kthread+0x18c/0x1e0
ret_from_fork+0x1f/0x30
Before the time, we should make clean a concept, what does refcount means
in page gotten from grab_cache_page_write_begin(). There are 2 situations:
Situation 1: refcount is 3, page is created by __page_cache_alloc.
TYPE_A - the write process is using this page
TYPE_B - page is assigned to one certain mapping by calling
__add_to_page_cache_locked()
TYPE_C - page is added into pagevec list corresponding current cpu by
calling lru_cache_add()
Situation 2: refcount is 2, page is gotten from the mapping's tree
TYPE_B - page has been assigned to one certain mapping
TYPE_A - the write process is using this page (by calling
page_cache_get_speculative())
Filesystem releases one refcount by calling put_page() in xxx_write_end(),
the released refcount corresponds to TYPE_A (write task is using it). If
there are any processes using a page, page migration process will skip the
page by judging whether expected_page_refs() equals to page refcount.
The BUG is caused by following process:
PA(cpu 0) kcompactd(cpu 1)
compact_zone
ubifs_write_begin
page_a = grab_cache_page_write_begin
add_to_page_cache_lru
lru_cache_add
pagevec_add // put page into cpu 0's pagevec
(refcnf = 3, for page creation process)
ubifs_write_end
SetPagePrivate(page_a) // doesn't increase page count !
unlock_page(page_a)
put_page(page_a) // refcnt = 2
[...]
PB(cpu 0)
filemap_read
filemap_get_pages
add_to_page_cache_lru
lru_cache_add
__pagevec_lru_add // traverse all pages in cpu 0's pagevec
__pagevec_lru_add_fn
SetPageLRU(page_a)
isolate_migratepages
isolate_migratepages_block
get_page_unless_zero(page_a)
// refcnt = 3
list_add(page_a, from_list)
migrate_pages(from_list)
__unmap_and_move
move_to_new_page
ubifs_migrate_page(page_a)
migrate_page_move_mapping
expected_page_refs get 3
(migration[1] + mapping[1] + private[1])
release_pages
put_page_testzero(page_a) // refcnt = 3
page_ref_freeze // refcnt = 0
page_ref_dec_and_test(0 - 1 = -1)
page_ref_unfreeze
VM_BUG_ON_PAGE(-1 != 0, page)
UBIFS doesn't increase the page refcount after setting private flag, which
leads to page migration task believes the page is not used by any other
processes, so the page is migrated. This causes concurrent accessing on
page refcount between put_page() called by other process(eg. read process
calls lru_cache_add) and page_ref_unfreeze() called by migration task.
Actually zhangjun has tried to fix this problem [2] by recalculating page
refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because
just like Kirill suggested in [2], we need to check all users of
page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting
refcount when setting/clearing private for a page. BTW, according to [4],
we set 'page->private' as 1 because ubifs just simply SetPagePrivate().
And, [5] provided a common helper to set/clear page private, ubifs can
use this helper following the example of iomap, afs, btrfs, etc.
Jump [6] to find a reproducer.
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com
[2] https://www.spinics.net/lists/linux-mtd/msg04018.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html
[4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.in…
[5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.…
[6] https://bugzilla.kernel.org/show_bug.cgi?id=214961
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..6b45a037a047 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len)
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
}
@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping,
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}