This mail needs to be sent to stable(a)vger.kernel.org (now cc'd).
Greg et al: please backport 2df3bae9a6543e90042291707b8db0cbfbae9ee9
Thanks,
Mike
On Mon, Jul 27 2020 at 9:40am -0400,
John Donnelly <John.P.donnelly(a)oracle.com> wrote:
> From: Mike Snitzer <snitzer(a)redhat.com>
>
> Discontinue issuing writethrough write IO in series to the origin and
> then cache.
>
> Use bio_clone_fast() to create a new origin clone bio that will be
> mapped to the origin device and then bio_chain() it to the bio that gets
> remapped to the cache device. The origin clone bio does _not_ have a
> copy of the per_bio_data -- as such check_if_tick_bio_needed() will not
> be called.
>
> The cache bio (parent bio) will not complete until the origin bio has
> completed -- this fulfills bio_clone_fast()'s requirements as well as
> the requirement to not complete the original IO until the write IO has
> completed to both the origin and cache device.
>
> Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
>
> (cherry picked from commit 2df3bae9a6543e90042291707b8db0cbfbae9ee9)
>
> Fixes: 705559706d62038b74c5088114c1799cf2c9dce8 ("dm bio record: save/restore bi_end_io and bi_integrity, version 4.14.188")
>
> 70555970 introduced a mkfs.ext4 hang on an LVM device that has been
> modified with lvconvert --cachemode=writethrough.
>
> Signed-off-by: John Donnelly <john.p.donnelly(a)oracle.com>
> Tested-by: John Donnelly <john.p.donnelly(a)oracle.com>
> Reviewed-by: Somasundaram Krishnasamy <somasundaram.krishnasamy(a)oracle.com>
>
> conflict: drivers/md/dm-cache-target.c - Corrected syntax of
> writethrough_mode(&cache->features) that was caught by
> the arm compiler.
>
> cc: stable(a)vger.kernel.org
> cc: snitzer(a)redhat.com
> ---
> drivers/md/dm-cache-target.c | 54 ++++++++++++++++++++++++------------
> 1 file changed, 37 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
> index 69cdb29ef6be..8241b7c36655 100644
> --- a/drivers/md/dm-cache-target.c
> +++ b/drivers/md/dm-cache-target.c
> @@ -450,6 +450,7 @@ struct cache {
> struct work_struct migration_worker;
> struct delayed_work waker;
> struct dm_bio_prison_v2 *prison;
> + struct bio_set *bs;
> mempool_t *migration_pool;
> @@ -868,16 +869,23 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
> spin_unlock_irqrestore(&cache->lock, flags);
> }
> -static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
> - dm_oblock_t oblock)
> +static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
> + dm_oblock_t oblock, bool bio_has_pbd)
> {
> - // FIXME: this is called way too much.
> - check_if_tick_bio_needed(cache, bio);
> + if (bio_has_pbd)
> + check_if_tick_bio_needed(cache, bio);
> remap_to_origin(cache, bio);
> if (bio_data_dir(bio) == WRITE)
> clear_discard(cache, oblock_to_dblock(cache, oblock));
> }
> +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
> + dm_oblock_t oblock)
> +{
> + // FIXME: check_if_tick_bio_needed() is called way too much through this interface
> + __remap_to_origin_clear_discard(cache, bio, oblock, true);
> +}
> +
> static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
> dm_oblock_t oblock, dm_cblock_t cblock)
> {
> @@ -971,23 +979,25 @@ static void writethrough_endio(struct bio *bio)
> }
> /*
> - * FIXME: send in parallel, huge latency as is.
> * When running in writethrough mode we need to send writes to clean blocks
> - * to both the cache and origin devices. In future we'd like to clone the
> - * bio and send them in parallel, but for now we're doing them in
> - * series as this is easier.
> + * to both the cache and origin devices. Clone the bio and send them in parallel.
> */
> -static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
> - dm_oblock_t oblock, dm_cblock_t cblock)
> +static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
> + dm_oblock_t oblock, dm_cblock_t cblock)
> {
> - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
> + struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
> - pb->cache = cache;
> - pb->cblock = cblock;
> - dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
> - dm_bio_record(&pb->bio_details, bio);
> + BUG_ON(!origin_bio);
> - remap_to_origin_clear_discard(pb->cache, bio, oblock);
> + bio_chain(origin_bio, bio);
> + /*
> + * Passing false to __remap_to_origin_clear_discard() skips
> + * all code that might use per_bio_data (since clone doesn't have it)
> + */
> + __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
> + submit_bio(origin_bio);
> +
> + remap_to_cache(cache, bio, cblock);
> }
> /*----------------------------------------------------------------
> @@ -1873,7 +1883,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
> } else {
> if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
> !is_dirty(cache, cblock)) {
> - remap_to_origin_then_cache(cache, bio, block, cblock);
> + remap_to_origin_and_cache(cache, bio, block, cblock);
> accounted_begin(cache, bio);
> } else
> remap_to_cache_dirty(cache, bio, block, cblock);
> @@ -2132,6 +2142,9 @@ static void destroy(struct cache *cache)
> kfree(cache->ctr_args[i]);
> kfree(cache->ctr_args);
> + if (cache->bs)
> + bioset_free(cache->bs);
> +
> kfree(cache);
> }
> @@ -2589,6 +2602,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
> cache->features = ca->features;
> ti->per_io_data_size = get_per_bio_data_size(cache);
> + if (writethrough_mode(&cache->features)) {
> + /* Create bioset for writethrough bios issued to origin */
> + cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
> + if (!cache->bs)
> + goto bad;
> + }
> +
> cache->callbacks.congested_fn = cache_is_congested;
> dm_table_add_target_callbacks(ti->table, &cache->callbacks);
>
> --
> 2.26.2
>
The routine cma_init_reserved_areas is designed to activate all
reserved CMA areas. It quits when it first encounters an error.
This can leave some areas in a state where they are reserved but
not activated, with no feedback to the code that performed the
reservation. Attempting to allocate memory from areas in such a
state will result in a BUG.
Modify cma_init_reserved_areas to always attempt to activate all
areas. The called routine, cma_activate_area, is responsible for
leaving the area in a valid state. No one makes active use of the
returned error codes, so change the routine's return type to void.
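For readers skimming the diff below, here is roughly how the activation
loop reads with the patch applied (a sketch assembled from the hunks
that follow; cma_areas[] and cma_area_count are the existing globals in
mm/cma.c):

static int __init cma_init_reserved_areas(void)
{
        int i;

        /*
         * Attempt every area. cma_activate_area() now cleans up after its
         * own failure (freeing the bitmap and setting count to 0), so one
         * bad area no longer prevents the remaining areas from activating.
         */
        for (i = 0; i < cma_area_count; i++)
                cma_activate_area(&cma_areas[i]);

        return 0;
}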
How to reproduce: This example uses kernelcore, hugetlb and cma
as an easy way to reproduce. However, this is a more general cma
issue.
Two node x86 VM 16GB total, 8GB per node
Kernel command line parameters, kernelcore=4G hugetlb_cma=8G
Related boot time messages,
hugetlb_cma: reserve 8192 MiB, up to 4096 MiB per node
cma: Reserved 4096 MiB at 0x0000000100000000
hugetlb_cma: reserved 4096 MiB on node 0
cma: Reserved 4096 MiB at 0x0000000300000000
hugetlb_cma: reserved 4096 MiB on node 1
cma: CMA area hugetlb could not be activated
# echo 8 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] SMP PTI
...
Call Trace:
bitmap_find_next_zero_area_off+0x51/0x90
cma_alloc+0x1a5/0x310
alloc_fresh_huge_page+0x78/0x1a0
alloc_pool_huge_page+0x6f/0xf0
set_max_huge_pages+0x10c/0x250
nr_hugepages_store_common+0x92/0x120
? __kmalloc+0x171/0x270
kernfs_fop_write+0xc1/0x1a0
vfs_write+0xc7/0x1f0
ksys_write+0x5f/0xe0
do_syscall_64+0x4d/0x90
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: c64be2bb1c6e ("drivers: add Contiguous Memory Allocator")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
---
mm/cma.c | 23 +++++++++--------------
1 file changed, 9 insertions(+), 14 deletions(-)
diff --git a/mm/cma.c b/mm/cma.c
index 26ecff818881..0963c0f9c502 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -93,17 +93,15 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
mutex_unlock(&cma->lock);
}
-static int __init cma_activate_area(struct cma *cma)
+static void __init cma_activate_area(struct cma *cma)
{
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap) {
- cma->count = 0;
- return -ENOMEM;
- }
+ if (!cma->bitmap)
+ goto out_error;
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -133,25 +131,22 @@ static int __init cma_activate_area(struct cma *cma)
spin_lock_init(&cma->mem_head_lock);
#endif
- return 0;
+ return;
not_in_zone:
- pr_err("CMA area %s could not be activated\n", cma->name);
bitmap_free(cma->bitmap);
+out_error:
cma->count = 0;
- return -EINVAL;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
}
static int __init cma_init_reserved_areas(void)
{
int i;
- for (i = 0; i < cma_area_count; i++) {
- int ret = cma_activate_area(&cma_areas[i]);
-
- if (ret)
- return ret;
- }
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
return 0;
}
--
2.25.4
The following changes since commit 92ed301919932f777713b9172e525674157e983d:
Linux 5.8-rc7 (2020-07-26 14:14:06 -0700)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to a96b0d061d476093cf86ca1c2de06fc57163588d:
virtio-mem: Fix build error due to improper use 'select' (2020-07-30 11:28:17 -0400)
----------------------------------------------------------------
virtio, qemu_fw: bugfixes
A couple of last minute bugfixes.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
----------------------------------------------------------------
Alexander Duyck (1):
virtio-balloon: Document byte ordering of poison_val
Michael S. Tsirkin (2):
vhost/scsi: fix up req type endian-ness
virtio_balloon: fix up endian-ness for free cmd id
Qiushi Wu (1):
firmware: Fix a reference count leak.
Weilong Chen (1):
virtio-mem: Fix build error due to improper use 'select'
drivers/firmware/qemu_fw_cfg.c | 7 ++++---
drivers/vhost/scsi.c | 2 +-
drivers/virtio/Kconfig | 2 +-
drivers/virtio/virtio_balloon.c | 11 ++++++++++-
4 files changed, 16 insertions(+), 6 deletions(-)
Strings declared with __tracepoint_string have their data stored in
.rodata, and an address to that data stored in the "__tracepoint_str"
section. Functions that refer to those strings refer to the symbol of
the address. Compiler optimization can replace those address references
with references directly to the string data. If the address doesn't
appear to have any other uses, it looks dead to the compiler and is
removed. This can break the /tracing/printk_formats sysfs node, which
iterates over the addresses stored in the "__tracepoint_str" section.
Like other strings stored in custom sections in this header, mark these
__used to inform the compiler that there are other non-obvious users of
the address, so they should still be emitted.
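As a concrete illustration of the failure mode (a hedged sketch;
trace_my_event and the TPS() wrapper are illustrative, not taken from
this patch), a typical call site and the expansion it relies on look
roughly like this:

#define TPS(x) tracepoint_string(x)

static void report_state_change(unsigned long val)
{
        /* trace_my_event is a hypothetical trace event, for illustration only. */
        trace_my_event(TPS("threshold crossed"), val);
}

/*
 * tracepoint_string("threshold crossed") expands to roughly:
 *
 *      static const char *___tp_str __tracepoint_string = "threshold crossed";
 *      ___tp_str;
 *
 * The string literal lives in .rodata; ___tp_str (its address) lands in
 * the "__tracepoint_str" section that printk_formats iterates. If the
 * compiler folds the address straight into the trace call and sees no
 * other reader of ___tp_str, the section entry looks dead and can be
 * dropped -- unless the symbol is marked __used.
 */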
Cc: stable(a)vger.kernel.org
Reported-by: Tim Murray <timmurray(a)google.com>
Reported-by: Simon MacMullen <simonmacm(a)google.com>
Suggested-by: Greg Hackmann <ghackmann(a)google.com>
Signed-off-by: Nick Desaulniers <ndesaulniers(a)google.com>
---
No change V1 -> V2.
include/linux/tracepoint.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a1fecf311621..3a5b717d92e8 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -361,7 +361,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
static const char *___tp_str __tracepoint_string = str; \
___tp_str; \
})
-#define __tracepoint_string __attribute__((section("__tracepoint_str")))
+#define __tracepoint_string __attribute__((section("__tracepoint_str"), used))
#else
/*
* tracepoint_string() is used to save the string address for userspace
--
2.28.0.163.g6104cc2f0b6-goog
syzbot report [1] describes a deadlock that occurs when a write
operation against an ashmem fd, executed while ashmem is shrinking its
cache, results in the following lock sequence:
Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(&sb->s_type->i_mutex_key#13);
                               lock(fs_reclaim);
  lock(&sb->s_type->i_mutex_key#13);
kswapd takes fs_reclaim and then inode_lock while generic_perform_write
takes inode_lock and then fs_reclaim. However ashmem does not support
writing into backing shmem with a write syscall. The only way to change
its content is to mmap it and operate on mapped memory. Therefore the race
that lockdep is warning about is not valid. Resolve this by introducing a
separate lockdep class for the backing shmem inodes.
[1]: https://lkml.kernel.org/lkml/0000000000000b5f9d059aa2037f@google.com/
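The fix relies on the standard lockdep re-classing pattern sketched
below (hypothetical names, not the patch itself): a statically
allocated lock_class_key gives one particular lock instance its own
class, so lockdep stops merging its dependency chains with those of
every other inode lock.

#include <linux/fs.h>
#include <linux/lockdep.h>

/* One key per distinct class; it must have static storage duration. */
static struct lock_class_key backing_inode_class;

/*
 * Hypothetical helper: once the backing file has been created, move its
 * inode's i_rwsem into a private lockdep class so dependencies taken
 * through this inode are tracked separately from ordinary inode locks.
 */
static void reclass_backing_inode(struct file *vmfile)
{
        struct inode *inode = file_inode(vmfile);

        lockdep_set_class(&inode->i_rwsem, &backing_inode_class);
}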
Reported-by: syzbot+7a0d9d0b26efefe61780(a)syzkaller.appspotmail.com
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Reviewed-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
drivers/staging/android/ashmem.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index c05a214191da..10b4be1f3e78 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -95,6 +95,15 @@ static DEFINE_MUTEX(ashmem_mutex);
static struct kmem_cache *ashmem_area_cachep __read_mostly;
static struct kmem_cache *ashmem_range_cachep __read_mostly;
+/*
+ * A separate lockdep class for the backing shmem inodes to resolve the lockdep
+ * warning about the race between kswapd taking fs_reclaim before inode_lock
+ * and write syscall taking inode_lock and then fs_reclaim.
+ * Note that such race is impossible because ashmem does not support write
+ * syscalls operating on the backing shmem.
+ */
+static struct lock_class_key backing_shmem_inode_class;
+
static inline unsigned long range_size(struct ashmem_range *range)
{
return range->pgend - range->pgstart + 1;
@@ -396,6 +405,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
if (!asma->file) {
char *name = ASHMEM_NAME_DEF;
struct file *vmfile;
+ struct inode *inode;
if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0')
name = asma->name;
@@ -407,6 +417,8 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
goto out;
}
vmfile->f_mode |= FMODE_LSEEK;
+ inode = file_inode(vmfile);
+ lockdep_set_class(&inode->i_rwsem, &backing_shmem_inode_class);
asma->file = vmfile;
/*
* override mmap operation of the vmfile so that it can't be
--
2.28.0.163.g6104cc2f0b6-goog