The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Mon, 1 Jul 2019 14:09:17 +0900
Subject: [PATCH] sd_zbc: Fix report zones buffer allocation
During disk scan and revalidation done with sd_revalidate(), the zones
of a zoned disk are checked using the helper function
blk_revalidate_disk_zones() if a configuration change is detected
(change in the number of zones or zone size). The function
blk_revalidate_disk_zones() issues report_zones calls that are very
large, that is, to obtain zone information for all zones of the disk
with a single command. The size of the report zones command buffer
necessary for such large request generally is lower than the disk
max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
memory fragmentation), but often fail at run time (e.g. hot-plug
event). This causes the disk revalidation to fail and the disk
capacity to be changed to 0.
This problem can be avoided by using vmalloc() instead of kmalloc() for
the buffer allocation. To limit the amount of memory to be allocated,
this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
maximum number of zones to report with a single report zones command.
This limit may be lowered further to satisfy the disk max_hw_sectors
limit. Finally, to ensure that the vmalloc-ed buffer can always be
mapped in a request, the buffer size is further limited to at most
queue_max_segments() pages, allowing successful mapping of the buffer
even in the worst case scenario where none of the buffer pages are
contiguous.
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ec3764c8f3f1..db16c19e05c4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -9,6 +9,8 @@
*/
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
@@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
/**
* sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
* @sdkp: The target disk
- * @buf: Buffer to use for the reply
+ * @buf: vmalloc-ed buffer to use for the reply
* @buflen: the buffer size
* @lba: Start LBA of the report
* @partial: Do partial report
@@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
put_unaligned_be32(buflen, &cmd[10]);
if (partial)
cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
- memset(buf, 0, buflen);
result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
buf, buflen, &sshdr,
@@ -103,6 +104,53 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
return 0;
}
+/*
+ * Maximum number of zones to get with one report zones command.
+ */
+#define SD_ZBC_REPORT_MAX_ZONES 8192U
+
+/**
+ * Allocate a buffer for report zones reply.
+ * @sdkp: The target disk
+ * @nr_zones: Maximum number of zones to report
+ * @buflen: Size of the buffer allocated
+ *
+ * Try to allocate a reply buffer for the number of requested zones.
+ * The size of the buffer allocated may be smaller than requested to
+ * satify the device constraint (max_hw_sectors, max_segments, etc).
+ *
+ * Return the address of the allocated buffer and update @buflen with
+ * the size of the allocated buffer.
+ */
+static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = sdkp->disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ /*
+ * Report zone buffer size should be at most 64B times the number of
+ * zones requested plus the 64B reply header, but should be at least
+ * SECTOR_SIZE for ATA devices.
+ * Make sure that this size does not exceed the hardware capabilities.
+ * Furthermore, since the report zone command cannot be split, make
+ * sure that the allocated buffer can always be mapped by limiting the
+ * number of pages allocated to the HBA max segments limit.
+ */
+ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+ bufsize = roundup((nr_zones + 1) * 64, 512);
+ bufsize = min_t(size_t, bufsize,
+ queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ buf = vzalloc(bufsize);
+ if (buf)
+ *buflen = bufsize;
+
+ return buf;
+}
+
/**
* sd_zbc_report_zones - Disk report zones operation.
* @disk: The target disk
@@ -116,30 +164,23 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones)
{
struct scsi_disk *sdkp = scsi_disk(disk);
- unsigned int i, buflen, nrz = *nr_zones;
+ unsigned int i, nrz = *nr_zones;
unsigned char *buf;
- size_t offset = 0;
+ size_t buflen = 0, offset = 0;
int ret = 0;
if (!sd_is_zoned(sdkp))
/* Not a zoned device */
return -EOPNOTSUPP;
- /*
- * Get a reply buffer for the number of requested zones plus a header,
- * without exceeding the device maximum command size. For ATA disks,
- * buffers must be aligned to 512B.
- */
- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
- roundup((nrz + 1) * 64, 512));
- buf = kmalloc(buflen, GFP_KERNEL);
+ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
if (!buf)
return -ENOMEM;
ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
sectors_to_logical(sdkp->device, sector), true);
if (ret)
- goto out_free_buf;
+ goto out;
nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
for (i = 0; i < nrz; i++) {
@@ -150,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
*nr_zones = nrz;
-out_free_buf:
- kfree(buf);
+out:
+ kvfree(buf);
return ret;
}
@@ -285,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
return 0;
}
-#define SD_ZBC_BUF_SIZE 131072U
-
/**
* sd_zbc_check_zones - Check the device capacity and zone sizes
* @sdkp: Target disk
@@ -302,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
*/
static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
{
+ size_t bufsize, buflen;
+ unsigned int noio_flag;
u64 zone_blocks = 0;
sector_t max_lba, block = 0;
unsigned char *buf;
unsigned char *rec;
- unsigned int buf_len;
- unsigned int list_length;
int ret;
u8 same;
+ /* Do all memory allocations as if GFP_NOIO was specified */
+ noio_flag = memalloc_noio_save();
+
/* Get a buffer */
- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
+ &bufsize);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* Do a report zone to get max_lba and the same field */
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
if (ret)
goto out_free;
@@ -353,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
do {
/* Parse REPORT ZONES header */
- list_length = get_unaligned_be32(&buf[0]) + 64;
+ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+ bufsize);
rec = buf + 64;
- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
/* Parse zone descriptors */
- while (rec < buf + buf_len) {
+ while (rec < buf + buflen) {
u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
if (zone_blocks == 0) {
@@ -374,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
if (block < sdkp->capacity) {
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
- block, true);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+ true);
if (ret)
goto out_free;
}
@@ -406,7 +451,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
out_free:
- kfree(buf);
+ memalloc_noio_restore(noio_flag);
+ kvfree(buf);
return ret;
}
The patch titled
Subject: /proc/kpageflags: prevent an integer overflow in stable_page_flags()
has been added to the -mm tree. Its filename is
proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/proc-kpageflags-prevent-an-integer…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/proc-kpageflags-prevent-an-integer…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Toshiki Fukasawa <t-fukasawa(a)vx.jp.nec.com>
Subject: /proc/kpageflags: prevent an integer overflow in stable_page_flags()
stable_page_flags() returns kpageflags info in u64, but it uses "1 <<
KPF_*" internally which is considered as int. This type mismatch causes
no visible problem now, but it will if you set bit 32 or more as done in a
subsequent patch. So use BIT_ULL in order to avoid future overflow
issues.
Link: http://lkml.kernel.org/r/20190725023100.31141-2-t-fukasawa@vx.jp.nec.com
Signed-off-by: Toshiki Fukasawa <t-fukasawa(a)vx.jp.nec.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Alexey Dobriyan <adobriyan(a)gmail.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Junichi Nomura <j-nomura(a)ce.jp.nec.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/proc/page.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
--- a/fs/proc/page.c~proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags
+++ a/fs/proc/page.c
@@ -95,7 +95,7 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
k = page->flags;
u = 0;
@@ -107,22 +107,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -133,14 +133,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -148,23 +147,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -177,7 +176,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
_
Patches currently in -mm which might be from t-fukasawa(a)vx.jp.nec.com are
proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
proc-kpageflags-do-not-use-uninitialized-struct-pages.patch
The patch titled
Subject: mm: document zone device struct page field usage
has been removed from the -mm tree. Its filename was
mm-document-zone-device-struct-page-field-usage.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Ralph Campbell <rcampbell(a)nvidia.com>
Subject: mm: document zone device struct page field usage
Patch series "mm/hmm: fixes for device private page migration", v2.
Testing the latest linux git tree turned up a few bugs with page migration
to and from ZONE_DEVICE private and anonymous pages. Hopefully it
clarifies how ZONE_DEVICE private struct page uses the same mapping and
index fields from the source anonymous page mapping.
This patch (of 3):
Struct page for ZONE_DEVICE private pages uses the page->mapping and and
page->index fields while the source anonymous pages are migrated to device
private memory. This is so rmap_walk() can find the page when migrating
the ZONE_DEVICE private page back to system memory. ZONE_DEVICE pmem
backed fsdax pages also use the page->mapping and page->index fields when
files are mapped into a process address space.
Restructure struct page and add comments to make this more clear.
Link: http://lkml.kernel.org/r/20190719192955.30462-2-rcampbell@nvidia.com
Signed-off-by: Ralph Campbell <rcampbell(a)nvidia.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Jérôme Glisse <jglisse(a)redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Lai Jiangshan <jiangshanlai(a)gmail.com>
Cc: Martin Schwidefsky <schwidefsky(a)de.ibm.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Jason Gunthorpe <jgg(a)mellanox.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm_types.h | 42 +++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 13 deletions(-)
--- a/include/linux/mm_types.h~mm-document-zone-device-struct-page-field-usage
+++ a/include/linux/mm_types.h
@@ -76,13 +76,35 @@ struct page {
* avoid collision and false-positive PageTail().
*/
union {
- struct { /* Page cache and anonymous pages */
- /**
- * @lru: Pageout list, eg. active_list protected by
- * pgdat->lru_lock. Sometimes used as a generic list
- * by the page owner.
- */
- struct list_head lru;
+ struct { /* Page cache, anonymous, ZONE_DEVICE pages */
+ union {
+ /**
+ * @lru: Pageout list, e.g., active_list
+ * protected by pgdat->lru_lock. Sometimes
+ * used as a generic list by the page owner.
+ */
+ struct list_head lru;
+ /**
+ * ZONE_DEVICE pages are never on the lru
+ * list so they reuse the list space.
+ * ZONE_DEVICE private pages are counted as
+ * being mapped so the @mapping and @index
+ * fields are used while the page is migrated
+ * to device private memory.
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
+ * use the @mapping and @index fields when pmem
+ * backed DAX files are mapped.
+ */
+ struct {
+ /**
+ * @pgmap: Points to the hosting
+ * device page map.
+ */
+ struct dev_pagemap *pgmap;
+ /** @zone_device_data: opaque data. */
+ void *zone_device_data;
+ };
+ };
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
@@ -155,12 +177,6 @@ struct page {
spinlock_t ptl;
#endif
};
- struct { /* ZONE_DEVICE pages */
- /** @pgmap: Points to the hosting device page map. */
- struct dev_pagemap *pgmap;
- void *zone_device_data;
- unsigned long _zd_pad_1; /* uses mapping */
- };
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
_
Patches currently in -mm which might be from rcampbell(a)nvidia.com are
mm-hmm-fix-zone_device-anon-page-mapping-reuse.patch
mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one.patch
mm-migrate-initialize-pud_entry-in-migrate_vma.patch