This is a note to let you know that I've just added the patch titled
staging: gasket: apex: fix copy-paste typo
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 66665bb9979246729562a09fcdbb101c83127989 Mon Sep 17 00:00:00 2001
From: Ivan Bornyakov <brnkv.i1(a)gmail.com>
Date: Wed, 10 Jul 2019 23:45:18 +0300
Subject: staging: gasket: apex: fix copy-paste typo
In sysfs_show() case-branches ATTR_KERNEL_HIB_PAGE_TABLE_SIZE and
ATTR_KERNEL_HIB_SIMPLE_PAGE_TABLE_SIZE do the same. It looks like
copy-paste mistake.
Signed-off-by: Ivan Bornyakov <brnkv.i1(a)gmail.com>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20190710204518.16814-1-brnkv.i1@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/staging/gasket/apex_driver.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/staging/gasket/apex_driver.c b/drivers/staging/gasket/apex_driver.c
index 2be45ee9d061..464648ee2036 100644
--- a/drivers/staging/gasket/apex_driver.c
+++ b/drivers/staging/gasket/apex_driver.c
@@ -532,7 +532,7 @@ static ssize_t sysfs_show(struct device *device, struct device_attribute *attr,
break;
case ATTR_KERNEL_HIB_SIMPLE_PAGE_TABLE_SIZE:
ret = scnprintf(buf, PAGE_SIZE, "%u\n",
- gasket_page_table_num_entries(
+ gasket_page_table_num_simple_entries(
gasket_dev->page_table[0]));
break;
case ATTR_KERNEL_HIB_NUM_ACTIVE_PAGES:
--
2.22.0
This is a note to let you know that I've just added the patch titled
Staging: fbtft: Fix probing of gpio descriptor
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From dbc4f989c878fe101fb7920e9609e8ec44e097cd Mon Sep 17 00:00:00 2001
From: Phil Reid <preid(a)electromag.com.au>
Date: Tue, 16 Jul 2019 08:24:36 +0800
Subject: Staging: fbtft: Fix probing of gpio descriptor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Conversion to use gpio descriptors broke all gpio lookups as
devm_gpiod_get_index was converted to use dev->driver->name for
the gpio name lookup. Fix this by using the name param. In
addition gpiod_get post-fixes the -gpios to the name so that
shouldn't be included in the call. However this then breaks the
of_find_property call to see if the gpio entry exists as all
fbtft treats all gpios as optional. So use devm_gpiod_get_index_optional
instead which achieves the same thing and is simpler.
Nishad confirmed the changes where only ever compile tested.
Fixes: c440eee1a7a1 ("Staging: fbtft: Switch to the gpio descriptor interface")
Reviewed-by: Nicolas Saenz Julienne <nsaenzjulienne(a)suse.de>
Tested-by: Nicolas Saenz Julienne <nsaenzjulienne(a)suse.de>
Tested-by: Jan Sebastian Götte <linux(a)jaseg.net>
Signed-off-by: Phil Reid <preid(a)electromag.com.au>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/1563236677-5045-2-git-send-email-preid@electromag…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/staging/fbtft/fbtft-core.c | 39 ++++++++++++++----------------
1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/drivers/staging/fbtft/fbtft-core.c b/drivers/staging/fbtft/fbtft-core.c
index 7cbc1bdd2d8a..b963cccdc3f6 100644
--- a/drivers/staging/fbtft/fbtft-core.c
+++ b/drivers/staging/fbtft/fbtft-core.c
@@ -76,21 +76,18 @@ static int fbtft_request_one_gpio(struct fbtft_par *par,
struct gpio_desc **gpiop)
{
struct device *dev = par->info->device;
- struct device_node *node = dev->of_node;
int ret = 0;
- if (of_find_property(node, name, NULL)) {
- *gpiop = devm_gpiod_get_index(dev, dev->driver->name, index,
- GPIOD_OUT_HIGH);
- if (IS_ERR(*gpiop)) {
- ret = PTR_ERR(*gpiop);
- dev_err(dev,
- "Failed to request %s GPIO:%d\n", name, ret);
- return ret;
- }
- fbtft_par_dbg(DEBUG_REQUEST_GPIOS, par, "%s: '%s' GPIO\n",
- __func__, name);
+ *gpiop = devm_gpiod_get_index_optional(dev, name, index,
+ GPIOD_OUT_HIGH);
+ if (IS_ERR(*gpiop)) {
+ ret = PTR_ERR(*gpiop);
+ dev_err(dev,
+ "Failed to request %s GPIO: %d\n", name, ret);
+ return ret;
}
+ fbtft_par_dbg(DEBUG_REQUEST_GPIOS, par, "%s: '%s' GPIO\n",
+ __func__, name);
return ret;
}
@@ -103,34 +100,34 @@ static int fbtft_request_gpios_dt(struct fbtft_par *par)
if (!par->info->device->of_node)
return -EINVAL;
- ret = fbtft_request_one_gpio(par, "reset-gpios", 0, &par->gpio.reset);
+ ret = fbtft_request_one_gpio(par, "reset", 0, &par->gpio.reset);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "dc-gpios", 0, &par->gpio.dc);
+ ret = fbtft_request_one_gpio(par, "dc", 0, &par->gpio.dc);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "rd-gpios", 0, &par->gpio.rd);
+ ret = fbtft_request_one_gpio(par, "rd", 0, &par->gpio.rd);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "wr-gpios", 0, &par->gpio.wr);
+ ret = fbtft_request_one_gpio(par, "wr", 0, &par->gpio.wr);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "cs-gpios", 0, &par->gpio.cs);
+ ret = fbtft_request_one_gpio(par, "cs", 0, &par->gpio.cs);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "latch-gpios", 0, &par->gpio.latch);
+ ret = fbtft_request_one_gpio(par, "latch", 0, &par->gpio.latch);
if (ret)
return ret;
for (i = 0; i < 16; i++) {
- ret = fbtft_request_one_gpio(par, "db-gpios", i,
+ ret = fbtft_request_one_gpio(par, "db", i,
&par->gpio.db[i]);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "led-gpios", i,
+ ret = fbtft_request_one_gpio(par, "led", i,
&par->gpio.led[i]);
if (ret)
return ret;
- ret = fbtft_request_one_gpio(par, "aux-gpios", i,
+ ret = fbtft_request_one_gpio(par, "aux", i,
&par->gpio.aux[i]);
if (ret)
return ret;
--
2.22.0
This conexant codec isn't in the supported codec list yet, the hda
generic driver can drive this codec well, but on a Lenovo machine
with mute/mic-mute leds, we need to apply CXT_FIXUP_THINKPAD_ACPI
to make the leds work. After adding this codec to the list, the
driver patch_conexant.c will apply THINKPAD_ACPI to this machine.
Cc: stable(a)vger.kernel.org
Signed-off-by: Hui Wang <hui.wang(a)canonical.com>
---
sound/pci/hda/patch_conexant.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 4f8d0845ee1e..f299f137eaea 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -1083,6 +1083,7 @@ static int patch_conexant_auto(struct hda_codec *codec)
*/
static const struct hda_device_id snd_hda_id_conexant[] = {
+ HDA_CODEC_ENTRY(0x14f11f86, "CX8070", patch_conexant_auto),
HDA_CODEC_ENTRY(0x14f12008, "CX8200", patch_conexant_auto),
HDA_CODEC_ENTRY(0x14f15045, "CX20549 (Venice)", patch_conexant_auto),
HDA_CODEC_ENTRY(0x14f15047, "CX20551 (Waikiki)", patch_conexant_auto),
--
2.17.1
commit b091ac616846a1da75b1f2566b41255ce7f0e0a6 upstream.
During disk scan and revalidation done with sd_revalidate(), the zones
of a zoned disk are checked using the helper function
blk_revalidate_disk_zones() if a configuration change is detected
(change in the number of zones or zone size). The function
blk_revalidate_disk_zones() issues report_zones calls that are very
large, that is, to obtain zone information for all zones of the disk
with a single command. The size of the report zones command buffer
necessary for such large request generally is lower than the disk
max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
memory fragmentation), but often fail at run time (e.g. hot-plug
event). This causes the disk revalidation to fail and the disk
capacity to be changed to 0.
This problem can be avoided by using vmalloc() instead of kmalloc() for
the buffer allocation. To limit the amount of memory to be allocated,
this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
maximum number of zones to report with a single report zones command.
This limit may be lowered further to satisfy the disk max_hw_sectors
limit. Finally, to ensure that the vmalloc-ed buffer can always be
mapped in a request, the buffer size is further limited to at most
queue_max_segments() pages, allowing successful mapping of the buffer
even in the worst case scenario where none of the buffer pages are
contiguous.
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org # 5.1.x
Cc: stable(a)vger.kernel.org # 5.2.x
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/scsi/sd_zbc.c | 104 ++++++++++++++++++++++++++++++------------
1 file changed, 75 insertions(+), 29 deletions(-)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a340af797a85..a9f3a8d77ee7 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -23,6 +23,8 @@
*/
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
@@ -64,7 +66,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
/**
* sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
* @sdkp: The target disk
- * @buf: Buffer to use for the reply
+ * @buf: vmalloc-ed buffer to use for the reply
* @buflen: the buffer size
* @lba: Start LBA of the report
* @partial: Do partial report
@@ -93,7 +95,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
put_unaligned_be32(buflen, &cmd[10]);
if (partial)
cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
- memset(buf, 0, buflen);
result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
buf, buflen, &sshdr,
@@ -117,6 +118,53 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
return 0;
}
+/*
+ * Maximum number of zones to get with one report zones command.
+ */
+#define SD_ZBC_REPORT_MAX_ZONES 8192U
+
+/**
+ * Allocate a buffer for report zones reply.
+ * @sdkp: The target disk
+ * @nr_zones: Maximum number of zones to report
+ * @buflen: Size of the buffer allocated
+ *
+ * Try to allocate a reply buffer for the number of requested zones.
+ * The size of the buffer allocated may be smaller than requested to
+ * satify the device constraint (max_hw_sectors, max_segments, etc).
+ *
+ * Return the address of the allocated buffer and update @buflen with
+ * the size of the allocated buffer.
+ */
+static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = sdkp->disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ /*
+ * Report zone buffer size should be at most 64B times the number of
+ * zones requested plus the 64B reply header, but should be at least
+ * SECTOR_SIZE for ATA devices.
+ * Make sure that this size does not exceed the hardware capabilities.
+ * Furthermore, since the report zone command cannot be split, make
+ * sure that the allocated buffer can always be mapped by limiting the
+ * number of pages allocated to the HBA max segments limit.
+ */
+ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+ bufsize = roundup((nr_zones + 1) * 64, 512);
+ bufsize = min_t(size_t, bufsize,
+ queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ buf = vzalloc(bufsize);
+ if (buf)
+ *buflen = bufsize;
+
+ return buf;
+}
+
/**
* sd_zbc_report_zones - Disk report zones operation.
* @disk: The target disk
@@ -132,30 +180,23 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
gfp_t gfp_mask)
{
struct scsi_disk *sdkp = scsi_disk(disk);
- unsigned int i, buflen, nrz = *nr_zones;
+ unsigned int i, nrz = *nr_zones;
unsigned char *buf;
- size_t offset = 0;
+ size_t buflen = 0, offset = 0;
int ret = 0;
if (!sd_is_zoned(sdkp))
/* Not a zoned device */
return -EOPNOTSUPP;
- /*
- * Get a reply buffer for the number of requested zones plus a header,
- * without exceeding the device maximum command size. For ATA disks,
- * buffers must be aligned to 512B.
- */
- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
- roundup((nrz + 1) * 64, 512));
- buf = kmalloc(buflen, gfp_mask);
+ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
if (!buf)
return -ENOMEM;
ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
sectors_to_logical(sdkp->device, sector), true);
if (ret)
- goto out_free_buf;
+ goto out;
nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
for (i = 0; i < nrz; i++) {
@@ -166,8 +207,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
*nr_zones = nrz;
-out_free_buf:
- kfree(buf);
+out:
+ kvfree(buf);
return ret;
}
@@ -301,8 +342,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
return 0;
}
-#define SD_ZBC_BUF_SIZE 131072U
-
/**
* sd_zbc_check_zones - Check the device capacity and zone sizes
* @sdkp: Target disk
@@ -318,22 +357,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
*/
static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
{
+ size_t bufsize, buflen;
+ unsigned int noio_flag;
u64 zone_blocks = 0;
sector_t max_lba, block = 0;
unsigned char *buf;
unsigned char *rec;
- unsigned int buf_len;
- unsigned int list_length;
int ret;
u8 same;
+ /* Do all memory allocations as if GFP_NOIO was specified */
+ noio_flag = memalloc_noio_save();
+
/* Get a buffer */
- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
+ &bufsize);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* Do a report zone to get max_lba and the same field */
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
if (ret)
goto out_free;
@@ -369,12 +414,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
do {
/* Parse REPORT ZONES header */
- list_length = get_unaligned_be32(&buf[0]) + 64;
+ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+ bufsize);
rec = buf + 64;
- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
/* Parse zone descriptors */
- while (rec < buf + buf_len) {
+ while (rec < buf + buflen) {
u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
if (zone_blocks == 0) {
@@ -390,8 +435,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
if (block < sdkp->capacity) {
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
- block, true);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+ true);
if (ret)
goto out_free;
}
@@ -422,7 +467,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
out_free:
- kfree(buf);
+ memalloc_noio_restore(noio_flag);
+ kvfree(buf);
return ret;
}
--
2.21.0
The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Mon, 1 Jul 2019 14:09:18 +0900
Subject: [PATCH] block: Limit zone array allocation size
Limit the size of the struct blk_zone array used in
blk_revalidate_disk_zones() to avoid memory allocation failures leading
to disk revalidation failure. Also further reduce the likelyhood of
such failures by using kvcalloc() (that is vmalloc()) instead of
allocating contiguous pages with alloc_pages().
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 58ced170b424..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,8 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include "blk.h"
@@ -371,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
* Allocate an array of struct blk_zone to get nr_zones zone information.
* The allocated array may be smaller than nr_zones.
*/
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
{
- size_t size = *nr_zones * sizeof(struct blk_zone);
- struct page *page;
- int order;
-
- for (order = get_order(size); order >= 0; order--) {
- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
- if (page) {
- *nr_zones = min_t(unsigned int, *nr_zones,
- (PAGE_SIZE << order) / sizeof(struct blk_zone));
- return page_address(page);
- }
+ struct blk_zone *zones;
+ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+ /*
+ * GFP_KERNEL here is meaningless as the caller task context has
+ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+ * with memalloc_noio_save().
+ */
+ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ *nr_zones = 0;
+ return NULL;
}
- return NULL;
+ *nr_zones = nrz;
+
+ return zones;
}
void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -448,7 +453,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
/* Get zone information and initialize seq_zones_bitmap */
rep_nr_zones = nr_zones;
- zones = blk_alloc_zones(q->node, &rep_nr_zones);
+ zones = blk_alloc_zones(&rep_nr_zones);
if (!zones)
goto out;
@@ -487,8 +492,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
out:
memalloc_noio_restore(noio_flag);
- free_pages((unsigned long)zones,
- get_order(rep_nr_zones * sizeof(struct blk_zone)));
+ kvfree(zones);
kfree(seq_zones_wlock);
kfree(seq_zones_bitmap);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05036e3e3458..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,6 +344,11 @@ struct queue_limits {
#ifdef CONFIG_BLK_DEV_ZONED
+/*
+ * Maximum number of zones to report with a single report zones command.
+ */
+#define BLK_ZONED_REPORT_MAX_ZONES 8192U
+
extern unsigned int blkdev_nr_zones(struct block_device *bdev);
extern int blkdev_report_zones(struct block_device *bdev,
sector_t sector, struct blk_zone *zones,
The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Mon, 1 Jul 2019 14:09:17 +0900
Subject: [PATCH] sd_zbc: Fix report zones buffer allocation
During disk scan and revalidation done with sd_revalidate(), the zones
of a zoned disk are checked using the helper function
blk_revalidate_disk_zones() if a configuration change is detected
(change in the number of zones or zone size). The function
blk_revalidate_disk_zones() issues report_zones calls that are very
large, that is, to obtain zone information for all zones of the disk
with a single command. The size of the report zones command buffer
necessary for such large request generally is lower than the disk
max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
memory fragmentation), but often fail at run time (e.g. hot-plug
event). This causes the disk revalidation to fail and the disk
capacity to be changed to 0.
This problem can be avoided by using vmalloc() instead of kmalloc() for
the buffer allocation. To limit the amount of memory to be allocated,
this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
maximum number of zones to report with a single report zones command.
This limit may be lowered further to satisfy the disk max_hw_sectors
limit. Finally, to ensure that the vmalloc-ed buffer can always be
mapped in a request, the buffer size is further limited to at most
queue_max_segments() pages, allowing successful mapping of the buffer
even in the worst case scenario where none of the buffer pages are
contiguous.
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ec3764c8f3f1..db16c19e05c4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -9,6 +9,8 @@
*/
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
@@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
/**
* sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
* @sdkp: The target disk
- * @buf: Buffer to use for the reply
+ * @buf: vmalloc-ed buffer to use for the reply
* @buflen: the buffer size
* @lba: Start LBA of the report
* @partial: Do partial report
@@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
put_unaligned_be32(buflen, &cmd[10]);
if (partial)
cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
- memset(buf, 0, buflen);
result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
buf, buflen, &sshdr,
@@ -103,6 +104,53 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
return 0;
}
+/*
+ * Maximum number of zones to get with one report zones command.
+ */
+#define SD_ZBC_REPORT_MAX_ZONES 8192U
+
+/**
+ * Allocate a buffer for report zones reply.
+ * @sdkp: The target disk
+ * @nr_zones: Maximum number of zones to report
+ * @buflen: Size of the buffer allocated
+ *
+ * Try to allocate a reply buffer for the number of requested zones.
+ * The size of the buffer allocated may be smaller than requested to
+ * satify the device constraint (max_hw_sectors, max_segments, etc).
+ *
+ * Return the address of the allocated buffer and update @buflen with
+ * the size of the allocated buffer.
+ */
+static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = sdkp->disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ /*
+ * Report zone buffer size should be at most 64B times the number of
+ * zones requested plus the 64B reply header, but should be at least
+ * SECTOR_SIZE for ATA devices.
+ * Make sure that this size does not exceed the hardware capabilities.
+ * Furthermore, since the report zone command cannot be split, make
+ * sure that the allocated buffer can always be mapped by limiting the
+ * number of pages allocated to the HBA max segments limit.
+ */
+ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+ bufsize = roundup((nr_zones + 1) * 64, 512);
+ bufsize = min_t(size_t, bufsize,
+ queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ buf = vzalloc(bufsize);
+ if (buf)
+ *buflen = bufsize;
+
+ return buf;
+}
+
/**
* sd_zbc_report_zones - Disk report zones operation.
* @disk: The target disk
@@ -116,30 +164,23 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones)
{
struct scsi_disk *sdkp = scsi_disk(disk);
- unsigned int i, buflen, nrz = *nr_zones;
+ unsigned int i, nrz = *nr_zones;
unsigned char *buf;
- size_t offset = 0;
+ size_t buflen = 0, offset = 0;
int ret = 0;
if (!sd_is_zoned(sdkp))
/* Not a zoned device */
return -EOPNOTSUPP;
- /*
- * Get a reply buffer for the number of requested zones plus a header,
- * without exceeding the device maximum command size. For ATA disks,
- * buffers must be aligned to 512B.
- */
- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
- roundup((nrz + 1) * 64, 512));
- buf = kmalloc(buflen, GFP_KERNEL);
+ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
if (!buf)
return -ENOMEM;
ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
sectors_to_logical(sdkp->device, sector), true);
if (ret)
- goto out_free_buf;
+ goto out;
nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
for (i = 0; i < nrz; i++) {
@@ -150,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
*nr_zones = nrz;
-out_free_buf:
- kfree(buf);
+out:
+ kvfree(buf);
return ret;
}
@@ -285,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
return 0;
}
-#define SD_ZBC_BUF_SIZE 131072U
-
/**
* sd_zbc_check_zones - Check the device capacity and zone sizes
* @sdkp: Target disk
@@ -302,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
*/
static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
{
+ size_t bufsize, buflen;
+ unsigned int noio_flag;
u64 zone_blocks = 0;
sector_t max_lba, block = 0;
unsigned char *buf;
unsigned char *rec;
- unsigned int buf_len;
- unsigned int list_length;
int ret;
u8 same;
+ /* Do all memory allocations as if GFP_NOIO was specified */
+ noio_flag = memalloc_noio_save();
+
/* Get a buffer */
- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
+ &bufsize);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* Do a report zone to get max_lba and the same field */
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
if (ret)
goto out_free;
@@ -353,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
do {
/* Parse REPORT ZONES header */
- list_length = get_unaligned_be32(&buf[0]) + 64;
+ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+ bufsize);
rec = buf + 64;
- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
/* Parse zone descriptors */
- while (rec < buf + buf_len) {
+ while (rec < buf + buflen) {
u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
if (zone_blocks == 0) {
@@ -374,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
if (block < sdkp->capacity) {
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
- block, true);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+ true);
if (ret)
goto out_free;
}
@@ -406,7 +451,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
out_free:
- kfree(buf);
+ memalloc_noio_restore(noio_flag);
+ kvfree(buf);
return ret;
}
The patch titled
Subject: /proc/kpageflags: prevent an integer overflow in stable_page_flags()
has been added to the -mm tree. Its filename is
proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/proc-kpageflags-prevent-an-integer…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/proc-kpageflags-prevent-an-integer…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Toshiki Fukasawa <t-fukasawa(a)vx.jp.nec.com>
Subject: /proc/kpageflags: prevent an integer overflow in stable_page_flags()
stable_page_flags() returns kpageflags info in u64, but it uses "1 <<
KPF_*" internally which is considered as int. This type mismatch causes
no visible problem now, but it will if you set bit 32 or more as done in a
subsequent patch. So use BIT_ULL in order to avoid future overflow
issues.
Link: http://lkml.kernel.org/r/20190725023100.31141-2-t-fukasawa@vx.jp.nec.com
Signed-off-by: Toshiki Fukasawa <t-fukasawa(a)vx.jp.nec.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Alexey Dobriyan <adobriyan(a)gmail.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Junichi Nomura <j-nomura(a)ce.jp.nec.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/proc/page.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
--- a/fs/proc/page.c~proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags
+++ a/fs/proc/page.c
@@ -95,7 +95,7 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
k = page->flags;
u = 0;
@@ -107,22 +107,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -133,14 +133,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -148,23 +147,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -177,7 +176,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
_
Patches currently in -mm which might be from t-fukasawa(a)vx.jp.nec.com are
proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
proc-kpageflags-do-not-use-uninitialized-struct-pages.patch
The patch titled
Subject: mm: document zone device struct page field usage
has been removed from the -mm tree. Its filename was
mm-document-zone-device-struct-page-field-usage.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Ralph Campbell <rcampbell(a)nvidia.com>
Subject: mm: document zone device struct page field usage
Patch series "mm/hmm: fixes for device private page migration", v2.
Testing the latest linux git tree turned up a few bugs with page migration
to and from ZONE_DEVICE private and anonymous pages. Hopefully it
clarifies how ZONE_DEVICE private struct page uses the same mapping and
index fields from the source anonymous page mapping.
This patch (of 3):
Struct page for ZONE_DEVICE private pages uses the page->mapping and and
page->index fields while the source anonymous pages are migrated to device
private memory. This is so rmap_walk() can find the page when migrating
the ZONE_DEVICE private page back to system memory. ZONE_DEVICE pmem
backed fsdax pages also use the page->mapping and page->index fields when
files are mapped into a process address space.
Restructure struct page and add comments to make this more clear.
Link: http://lkml.kernel.org/r/20190719192955.30462-2-rcampbell@nvidia.com
Signed-off-by: Ralph Campbell <rcampbell(a)nvidia.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Jérôme Glisse <jglisse(a)redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Lai Jiangshan <jiangshanlai(a)gmail.com>
Cc: Martin Schwidefsky <schwidefsky(a)de.ibm.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Jason Gunthorpe <jgg(a)mellanox.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm_types.h | 42 +++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 13 deletions(-)
--- a/include/linux/mm_types.h~mm-document-zone-device-struct-page-field-usage
+++ a/include/linux/mm_types.h
@@ -76,13 +76,35 @@ struct page {
* avoid collision and false-positive PageTail().
*/
union {
- struct { /* Page cache and anonymous pages */
- /**
- * @lru: Pageout list, eg. active_list protected by
- * pgdat->lru_lock. Sometimes used as a generic list
- * by the page owner.
- */
- struct list_head lru;
+ struct { /* Page cache, anonymous, ZONE_DEVICE pages */
+ union {
+ /**
+ * @lru: Pageout list, e.g., active_list
+ * protected by pgdat->lru_lock. Sometimes
+ * used as a generic list by the page owner.
+ */
+ struct list_head lru;
+ /**
+ * ZONE_DEVICE pages are never on the lru
+ * list so they reuse the list space.
+ * ZONE_DEVICE private pages are counted as
+ * being mapped so the @mapping and @index
+ * fields are used while the page is migrated
+ * to device private memory.
+ * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
+ * use the @mapping and @index fields when pmem
+ * backed DAX files are mapped.
+ */
+ struct {
+ /**
+ * @pgmap: Points to the hosting
+ * device page map.
+ */
+ struct dev_pagemap *pgmap;
+ /** @zone_device_data: opaque data. */
+ void *zone_device_data;
+ };
+ };
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
@@ -155,12 +177,6 @@ struct page {
spinlock_t ptl;
#endif
};
- struct { /* ZONE_DEVICE pages */
- /** @pgmap: Points to the hosting device page map. */
- struct dev_pagemap *pgmap;
- void *zone_device_data;
- unsigned long _zd_pad_1; /* uses mapping */
- };
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
_
Patches currently in -mm which might be from rcampbell(a)nvidia.com are
mm-hmm-fix-zone_device-anon-page-mapping-reuse.patch
mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one.patch
mm-migrate-initialize-pud_entry-in-migrate_vma.patch
When migrating an anonymous private page to a ZONE_DEVICE private page,
the source page->mapping and page->index fields are copied to the
destination ZONE_DEVICE struct page and the page_mapcount() is increased.
This is so rmap_walk() can be used to unmap and migrate the page back to
system memory. However, try_to_unmap_one() computes the subpage pointer
from a swap pte which computes an invalid page pointer and a kernel panic
results such as:
BUG: unable to handle page fault for address: ffffea1fffffffc8
Currently, only single pages can be migrated to device private memory so
no subpage computation is needed and it can be set to "page".
Fixes: a5430dda8a3a1c ("mm/migrate: support un-addressable ZONE_DEVICE page in migration")
Signed-off-by: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: "Jérôme Glisse" <jglisse(a)redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Jason Gunthorpe <jgg(a)mellanox.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/rmap.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/rmap.c b/mm/rmap.c
index e5dfe2ae6b0d..003377e24232 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1475,7 +1475,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
*/
+ subpage = page;
goto discard;
}
--
2.20.1
Hi,
We would like to learn your interest in acquiring our recently updated
Microsoft Office 365 Alternatives & Competitors contact list which helps
you to improve your business campaign.
We also provide Zoho Office, G Suite (Google Apps), WPS Office,Microsoft
Office Online, SoftMaker Office, LibreOffice 6 and Corel WordPerfect and
many more.
We provide Decision Makers Contacts such as C- Level, VP Level, Directors
and Managers Contact details.
Please let me know your targeted criteria with geography to provide you
with detailed information for your review
Regards,
Nancy Mcmahon
Marketing Specialist
If you don't want to include yourself in our mailing list, please reply
back “Leave Out" in a subject line
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d9771f5ec46c282d518b453c793635dbdc3a2a94 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni(a)redhat.com>
Date: Fri, 14 Jun 2019 15:41:05 -0700
Subject: [PATCH] raid5-cache: Need to do start() part job after adding journal
device
commit d5d885fd514f ("md: introduce new personality funciton start()")
splits the init job to two parts. The first part run() does the jobs that
do not require the md threads. The second part start() does the jobs that
require the md threads.
Now it just does run() in adding new journal device. It needs to do the
second part start() too.
Fixes: d5d885fd514f ("md: introduce new personality funciton start()")
Cc: stable(a)vger.kernel.org #v4.9+
Reported-by: Michal Soltys <soltys(a)ziu.info>
Signed-off-by: Xiao Ni <xni(a)redhat.com>
Signed-off-by: Song Liu <songliubraving(a)fb.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b83bce2beb66..da94cbaa1a9e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7672,7 +7672,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- int err = -EEXIST;
+ int ret, err = -EEXIST;
int disk;
struct disk_info *p;
int first = 0;
@@ -7687,7 +7687,14 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- log_init(conf, rdev, false);
+ ret = log_init(conf, rdev, false);
+ if (ret)
+ return ret;
+
+ ret = r5l_start(conf->log);
+ if (ret)
+ return ret;
+
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
v4.9.y to v5.1.y:
fs/btrfs/file.c: In function 'btrfs_punch_hole':
fs/btrfs/file.c:2787:27: error: invalid initializer
struct timespec64 now = current_time(inode);
^~~~~~~~~~~~
fs/btrfs/file.c:2790:18: error: incompatible types when assigning to type 'struct timespec' from type 'struct timespec64'
v4.19.y, v5.1.y:
fs/btrfs/props.c: In function 'prop_compression_validate':
fs/btrfs/props.c:369:6: error: implicit declaration of function 'btrfs_compression_is_valid_type'
My apologies for the noise if this has already been reported/fixed.
Guenter
From: Francois Buergisser <fbuergisser(a)chromium.org>
The Hantro codec is typically used in platforms with an IOMMU,
so we need to set a proper DMA segment size. Devices without an
IOMMU will still fallback to default 64KiB segments.
Cc: stable(a)vger.kernel.org
Fixes: 775fec69008d3 ("media: add Rockchip VPU JPEG encoder driver")
Signed-off-by: Francois Buergisser <fbuergisser(a)chromium.org>
Signed-off-by: Ezequiel Garcia <ezequiel(a)collabora.com>
---
drivers/staging/media/hantro/hantro_drv.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c
index b71a06e9159e..4eae1dbb1ac8 100644
--- a/drivers/staging/media/hantro/hantro_drv.c
+++ b/drivers/staging/media/hantro/hantro_drv.c
@@ -731,6 +731,7 @@ static int hantro_probe(struct platform_device *pdev)
dev_err(vpu->dev, "Could not set DMA coherent mask.\n");
return ret;
}
+ vb2_dma_contig_set_max_seg_size(&pdev->dev, DMA_BIT_MASK(32));
for (i = 0; i < vpu->variant->num_irqs; i++) {
const char *irq_name = vpu->variant->irqs[i].name;
--
2.22.0
Hi,
When one request is dispatched to LLD via dm-rq, if the result is
BLK_STS_*RESOURCE, dm-rq will free the request. However, LLD may allocate
private data for this request, so this way will cause memory leak.
Add .cleanup_rq() callback and implement it in SCSI for fixing the issue,
since SCSI is the only driver which allocates private requst data in
.queue_rq() path.
Another use case of this callback is to free the request and re-submit
bios during cpu hotplug when the hctx is dead, see the following link:
https://lore.kernel.org/linux-block/f122e8f2-5ede-2d83-9ca0-bc713ce66d01@hu…
V3:
- run .cleanup_rq() from dm-rq because this issue is dm-rq specific,
and even in future it should be still very unusual to free request
in this way. If we call .cleanup_rq() in generic rq free code(fast
path), cost will be introduced unnecessarily, also we have to
consider related race.
V2:
- run .cleanup_rq() in blk_mq_free_request(), as suggested by Mike
Ming Lei (2):
blk-mq: add callback of .cleanup_rq
scsi: implement .cleanup_rq callback
drivers/md/dm-rq.c | 1 +
drivers/scsi/scsi_lib.c | 13 +++++++++++++
include/linux/blk-mq.h | 13 +++++++++++++
3 files changed, 27 insertions(+)
Cc: Ewan D. Milne <emilne(a)redhat.com>
Cc: Bart Van Assche <bvanassche(a)acm.org>
Cc: Hannes Reinecke <hare(a)suse.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Mike Snitzer <snitzer(a)redhat.com>
Cc: dm-devel(a)redhat.com
Cc: <stable(a)vger.kernel.org>
Fixes: 396eaf21ee17 ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback")
--
2.20.1
The patch below does not apply to the 5.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 13b82b746310b51b064bc855993a1c84bf862726 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 22 May 2019 14:34:00 +0300
Subject: [PATCH] xhci: Fix immediate data transfer if buffer is already DMA
mapped
xhci immediate data transfer (IDT) support in 5.2-rc1 caused regression
on various Samsung Exynos boards with ASIX USB 2.0 ethernet dongle.
If the transfer buffer in the URB is already DMA mapped then IDT should
not be used. urb->transfer_dma will already contain a valid dma address,
and there is no guarantee the data in urb->transfer_buffer is valid.
The IDT support patch used urb->transfer_dma as a temporary storage,
copying data from urb->transfer_buffer into it.
Issue was solved by preventing IDT if transfer buffer is already dma
mapped, and by not using urb->transfer_dma as temporary storage.
Fixes: 33e39350ebd2 ("usb: xhci: add Immediate Data Transfer support")
Reported-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
CC: Nicolas Saenz Julienne <nsaenzjulienne(a)suse.de>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index ef7c8698772e..88392aa65722 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -3432,11 +3432,14 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
if (urb->transfer_buffer_length > 0) {
u32 length_field, remainder;
+ u64 addr;
if (xhci_urb_suitable_for_idt(urb)) {
- memcpy(&urb->transfer_dma, urb->transfer_buffer,
+ memcpy(&addr, urb->transfer_buffer,
urb->transfer_buffer_length);
field |= TRB_IDT;
+ } else {
+ addr = (u64) urb->transfer_dma;
}
remainder = xhci_td_remainder(xhci, 0,
@@ -3449,8 +3452,8 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
if (setup->bRequestType & USB_DIR_IN)
field |= TRB_DIR_IN;
queue_trb(xhci, ep_ring, true,
- lower_32_bits(urb->transfer_dma),
- upper_32_bits(urb->transfer_dma),
+ lower_32_bits(addr),
+ upper_32_bits(addr),
length_field,
field | ep_ring->cycle_state);
}
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index a450a99e90eb..7f8b950d1a73 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -2160,7 +2160,8 @@ static inline bool xhci_urb_suitable_for_idt(struct urb *urb)
{
if (!usb_endpoint_xfer_isoc(&urb->ep->desc) && usb_urb_dir_out(urb) &&
usb_endpoint_maxp(&urb->ep->desc) >= TRB_IDT_MAX_SIZE &&
- urb->transfer_buffer_length <= TRB_IDT_MAX_SIZE)
+ urb->transfer_buffer_length <= TRB_IDT_MAX_SIZE &&
+ !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP))
return true;
return false;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4c6d80e1144bdf48cae6b602ae30d41f3e5c76a9 Mon Sep 17 00:00:00 2001
From: Norbert Manthey <nmanthey(a)amazon.de>
Date: Fri, 5 Jul 2019 15:06:00 +0200
Subject: [PATCH] pstore: Fix double-free in pstore_mkfile() failure path
The pstore_mkfile() function is passed a pointer to a struct
pstore_record. On success it consumes this 'record' pointer and
references it from the created inode.
On failure, however, it may or may not free the record. There are even
two different code paths which return -ENOMEM -- one of which does and
the other doesn't free the record.
Make the behaviour deterministic by never consuming and freeing the
record when returning failure, allowing the caller to do the cleanup
consistently.
Signed-off-by: Norbert Manthey <nmanthey(a)amazon.de>
Link: https://lore.kernel.org/r/1562331960-26198-1-git-send-email-nmanthey@amazon…
Fixes: 83f70f0769ddd ("pstore: Do not duplicate record metadata")
Fixes: 1dfff7dd67d1a ("pstore: Pass record contents instead of copying")
Cc: stable(a)vger.kernel.org
[kees: also move "private" allocation location, rename inode cleanup label]
Signed-off-by: Kees Cook <keescook(a)chromium.org>
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 89a80b568a17..7fbe8f058220 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -318,22 +318,21 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
goto fail;
inode->i_mode = S_IFREG | 0444;
inode->i_fop = &pstore_file_operations;
- private = kzalloc(sizeof(*private), GFP_KERNEL);
- if (!private)
- goto fail_alloc;
- private->record = record;
-
scnprintf(name, sizeof(name), "%s-%s-%llu%s",
pstore_type_to_name(record->type),
record->psi->name, record->id,
record->compressed ? ".enc.z" : "");
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ if (!private)
+ goto fail_inode;
+
dentry = d_alloc_name(root, name);
if (!dentry)
goto fail_private;
+ private->record = record;
inode->i_size = private->total_size = size;
-
inode->i_private = private;
if (record->time.tv_sec)
@@ -349,7 +348,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
fail_private:
free_pstore_private(private);
-fail_alloc:
+fail_inode:
iput(inode);
fail:
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4c6d80e1144bdf48cae6b602ae30d41f3e5c76a9 Mon Sep 17 00:00:00 2001
From: Norbert Manthey <nmanthey(a)amazon.de>
Date: Fri, 5 Jul 2019 15:06:00 +0200
Subject: [PATCH] pstore: Fix double-free in pstore_mkfile() failure path
The pstore_mkfile() function is passed a pointer to a struct
pstore_record. On success it consumes this 'record' pointer and
references it from the created inode.
On failure, however, it may or may not free the record. There are even
two different code paths which return -ENOMEM -- one of which does and
the other doesn't free the record.
Make the behaviour deterministic by never consuming and freeing the
record when returning failure, allowing the caller to do the cleanup
consistently.
Signed-off-by: Norbert Manthey <nmanthey(a)amazon.de>
Link: https://lore.kernel.org/r/1562331960-26198-1-git-send-email-nmanthey@amazon…
Fixes: 83f70f0769ddd ("pstore: Do not duplicate record metadata")
Fixes: 1dfff7dd67d1a ("pstore: Pass record contents instead of copying")
Cc: stable(a)vger.kernel.org
[kees: also move "private" allocation location, rename inode cleanup label]
Signed-off-by: Kees Cook <keescook(a)chromium.org>
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 89a80b568a17..7fbe8f058220 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -318,22 +318,21 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
goto fail;
inode->i_mode = S_IFREG | 0444;
inode->i_fop = &pstore_file_operations;
- private = kzalloc(sizeof(*private), GFP_KERNEL);
- if (!private)
- goto fail_alloc;
- private->record = record;
-
scnprintf(name, sizeof(name), "%s-%s-%llu%s",
pstore_type_to_name(record->type),
record->psi->name, record->id,
record->compressed ? ".enc.z" : "");
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ if (!private)
+ goto fail_inode;
+
dentry = d_alloc_name(root, name);
if (!dentry)
goto fail_private;
+ private->record = record;
inode->i_size = private->total_size = size;
-
inode->i_private = private;
if (record->time.tv_sec)
@@ -349,7 +348,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
fail_private:
free_pstore_private(private);
-fail_alloc:
+fail_inode:
iput(inode);
fail:
The patch below does not apply to the 5.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b091ac616846a1da75b1f2566b41255ce7f0e0a6 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Mon, 1 Jul 2019 14:09:17 +0900
Subject: [PATCH] sd_zbc: Fix report zones buffer allocation
During disk scan and revalidation done with sd_revalidate(), the zones
of a zoned disk are checked using the helper function
blk_revalidate_disk_zones() if a configuration change is detected
(change in the number of zones or zone size). The function
blk_revalidate_disk_zones() issues report_zones calls that are very
large, that is, to obtain zone information for all zones of the disk
with a single command. The size of the report zones command buffer
necessary for such large request generally is lower than the disk
max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
memory fragmentation), but often fail at run time (e.g. hot-plug
event). This causes the disk revalidation to fail and the disk
capacity to be changed to 0.
This problem can be avoided by using vmalloc() instead of kmalloc() for
the buffer allocation. To limit the amount of memory to be allocated,
this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
maximum number of zones to report with a single report zones command.
This limit may be lowered further to satisfy the disk max_hw_sectors
limit. Finally, to ensure that the vmalloc-ed buffer can always be
mapped in a request, the buffer size is further limited to at most
queue_max_segments() pages, allowing successful mapping of the buffer
even in the worst case scenario where none of the buffer pages are
contiguous.
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ec3764c8f3f1..db16c19e05c4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -9,6 +9,8 @@
*/
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
@@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
/**
* sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
* @sdkp: The target disk
- * @buf: Buffer to use for the reply
+ * @buf: vmalloc-ed buffer to use for the reply
* @buflen: the buffer size
* @lba: Start LBA of the report
* @partial: Do partial report
@@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
put_unaligned_be32(buflen, &cmd[10]);
if (partial)
cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
- memset(buf, 0, buflen);
result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
buf, buflen, &sshdr,
@@ -103,6 +104,53 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
return 0;
}
+/*
+ * Maximum number of zones to get with one report zones command.
+ */
+#define SD_ZBC_REPORT_MAX_ZONES 8192U
+
+/**
+ * Allocate a buffer for report zones reply.
+ * @sdkp: The target disk
+ * @nr_zones: Maximum number of zones to report
+ * @buflen: Size of the buffer allocated
+ *
+ * Try to allocate a reply buffer for the number of requested zones.
+ * The size of the buffer allocated may be smaller than requested to
+ * satify the device constraint (max_hw_sectors, max_segments, etc).
+ *
+ * Return the address of the allocated buffer and update @buflen with
+ * the size of the allocated buffer.
+ */
+static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = sdkp->disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ /*
+ * Report zone buffer size should be at most 64B times the number of
+ * zones requested plus the 64B reply header, but should be at least
+ * SECTOR_SIZE for ATA devices.
+ * Make sure that this size does not exceed the hardware capabilities.
+ * Furthermore, since the report zone command cannot be split, make
+ * sure that the allocated buffer can always be mapped by limiting the
+ * number of pages allocated to the HBA max segments limit.
+ */
+ nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+ bufsize = roundup((nr_zones + 1) * 64, 512);
+ bufsize = min_t(size_t, bufsize,
+ queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ buf = vzalloc(bufsize);
+ if (buf)
+ *buflen = bufsize;
+
+ return buf;
+}
+
/**
* sd_zbc_report_zones - Disk report zones operation.
* @disk: The target disk
@@ -116,30 +164,23 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones)
{
struct scsi_disk *sdkp = scsi_disk(disk);
- unsigned int i, buflen, nrz = *nr_zones;
+ unsigned int i, nrz = *nr_zones;
unsigned char *buf;
- size_t offset = 0;
+ size_t buflen = 0, offset = 0;
int ret = 0;
if (!sd_is_zoned(sdkp))
/* Not a zoned device */
return -EOPNOTSUPP;
- /*
- * Get a reply buffer for the number of requested zones plus a header,
- * without exceeding the device maximum command size. For ATA disks,
- * buffers must be aligned to 512B.
- */
- buflen = min(queue_max_hw_sectors(disk->queue) << 9,
- roundup((nrz + 1) * 64, 512));
- buf = kmalloc(buflen, GFP_KERNEL);
+ buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
if (!buf)
return -ENOMEM;
ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
sectors_to_logical(sdkp->device, sector), true);
if (ret)
- goto out_free_buf;
+ goto out;
nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
for (i = 0; i < nrz; i++) {
@@ -150,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
*nr_zones = nrz;
-out_free_buf:
- kfree(buf);
+out:
+ kvfree(buf);
return ret;
}
@@ -285,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
return 0;
}
-#define SD_ZBC_BUF_SIZE 131072U
-
/**
* sd_zbc_check_zones - Check the device capacity and zone sizes
* @sdkp: Target disk
@@ -302,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
*/
static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
{
+ size_t bufsize, buflen;
+ unsigned int noio_flag;
u64 zone_blocks = 0;
sector_t max_lba, block = 0;
unsigned char *buf;
unsigned char *rec;
- unsigned int buf_len;
- unsigned int list_length;
int ret;
u8 same;
+ /* Do all memory allocations as if GFP_NOIO was specified */
+ noio_flag = memalloc_noio_save();
+
/* Get a buffer */
- buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
+ &bufsize);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* Do a report zone to get max_lba and the same field */
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
if (ret)
goto out_free;
@@ -353,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
do {
/* Parse REPORT ZONES header */
- list_length = get_unaligned_be32(&buf[0]) + 64;
+ buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+ bufsize);
rec = buf + 64;
- buf_len = min(list_length, SD_ZBC_BUF_SIZE);
/* Parse zone descriptors */
- while (rec < buf + buf_len) {
+ while (rec < buf + buflen) {
u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
if (zone_blocks == 0) {
@@ -374,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
if (block < sdkp->capacity) {
- ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
- block, true);
+ ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+ true);
if (ret)
goto out_free;
}
@@ -406,7 +451,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
}
out_free:
- kfree(buf);
+ memalloc_noio_restore(noio_flag);
+ kvfree(buf);
return ret;
}
From: Stephane Grosjean <s.grosjean(a)peak-system.com>
When closing the CAN device while tx skbs are inflight, echo skb could
be released twice. By calling close_candev() before unlinking all
pending tx urbs, then the internal echo_skb[] array is fully and
correctly cleared before the USB write callback and, therefore,
can_get_echo_skb() are called, for each aborted URB.
Fixes: bb4785551f64 ("can: usb: PEAK-System Technik USB adapters driver core")
Signed-off-by: Stephane Grosjean <s.grosjean(a)peak-system.com>
Cc: linux-stable <stable(a)vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/usb/peak_usb/pcan_usb_core.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 458154c9b482..22b9c8e6d040 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -568,16 +568,16 @@ static int peak_usb_ndo_stop(struct net_device *netdev)
dev->state &= ~PCAN_USB_STATE_STARTED;
netif_stop_queue(netdev);
+ close_candev(netdev);
+
+ dev->can.state = CAN_STATE_STOPPED;
+
/* unlink all pending urbs and free used memory */
peak_usb_unlink_all_urbs(dev);
if (dev->adapter->dev_stop)
dev->adapter->dev_stop(dev);
- close_candev(netdev);
-
- dev->can.state = CAN_STATE_STOPPED;
-
/* can set bus off now */
if (dev->adapter->dev_set_bus) {
int err = dev->adapter->dev_set_bus(dev, 0);
--
2.20.1
From: Nikita Yushchenko <nikita.yoush(a)cogentembedded.com>
We have observed rcar_canfd driver entering IRQ storm under high load,
with following scenario:
- rcar_canfd_global_interrupt() in entered due to Rx available,
- napi_schedule_prep() is called, and sets NAPIF_STATE_SCHED in state
- Rx fifo interrupts are masked,
- rcar_canfd_global_interrupt() is entered again, this time due to
error interrupt (e.g. due to overflow),
- since scheduled napi poller has not yet executed, condition for calling
napi_schedule_prep() from rcar_canfd_global_interrupt() remains true,
thus napi_schedule_prep() gets called and sets NAPIF_STATE_MISSED flag
in state,
- later, napi poller function rcar_canfd_rx_poll() gets executed, and
calls napi_complete_done(),
- due to NAPIF_STATE_MISSED flag in state, this call does not clear
NAPIF_STATE_SCHED flag from state,
- on return from napi_complete_done(), rcar_canfd_rx_poll() unmasks Rx
interrutps,
- Rx interrupt happens, rcar_canfd_global_interrupt() gets called
and calls napi_schedule_prep(),
- since NAPIF_STATE_SCHED is set in state at this time, this call
returns false,
- due to that false return, rcar_canfd_global_interrupt() returns
without masking Rx interrupt
- and this results into IRQ storm: unmasked Rx interrupt happens again
and again is misprocessed in the same way.
This patch fixes that scenario by unmasking Rx interrupts only when
napi_complete_done() returns true, which means it has cleared
NAPIF_STATE_SCHED in state.
Fixes: dd3bd23eb438 ("can: rcar_canfd: Add Renesas R-Car CAN FD driver")
Signed-off-by: Nikita Yushchenko <nikita.yoush(a)cogentembedded.com>
Cc: linux-stable <stable(a)vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/rcar/rcar_canfd.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index 05410008aa6b..de34a4b82d4a 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -1508,10 +1508,11 @@ static int rcar_canfd_rx_poll(struct napi_struct *napi, int quota)
/* All packets processed */
if (num_pkts < quota) {
- napi_complete_done(napi, num_pkts);
- /* Enable Rx FIFO interrupts */
- rcar_canfd_set_bit(priv->base, RCANFD_RFCC(ridx),
- RCANFD_RFCC_RFIE);
+ if (napi_complete_done(napi, num_pkts)) {
+ /* Enable Rx FIFO interrupts */
+ rcar_canfd_set_bit(priv->base, RCANFD_RFCC(ridx),
+ RCANFD_RFCC_RFIE);
+ }
}
return num_pkts;
}
--
2.20.1
I'm announcing the release of the 3.16.71 kernel.
All users of the 3.16 kernel series should upgrade.
The updated 3.16.y git tree can be found at:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-3.16.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git
The diff from 3.16.70 is attached to this message.
Ben.
------------
Makefile | 2 +-
kernel/ptrace.c | 4 +---
2 files changed, 2 insertions(+), 4 deletions(-)
Ben Hutchings (1):
Linux 3.16.71
Jann Horn (1):
ptrace: Fix ->ptracer_cred handling for PTRACE_TRACEME
--
Ben Hutchings
compatible: Gracefully accepts erroneous data from any source
From: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
commit ed527b13d800dd515a9e6c582f0a73eca65b2e1b upstream.
The CAAM driver currently violates an undocumented and slightly
controversial requirement imposed by the crypto stack that a buffer
referred to by the request structure via its virtual address may not
be modified while any scatterlists passed via the same request
structure are mapped for inbound DMA.
This may result in errors like
alg: aead: decryption failed on test 1 for gcm_base(ctr-aes-caam,ghash-generic): ret=74
alg: aead: Failed to load transform for gcm(aes): -2
on non-cache coherent systems, due to the fact that the GCM driver
passes an IV buffer by virtual address which shares a cacheline with
the auth_tag buffer passed via a scatterlist, resulting in corruption
of the auth_tag when the IV is updated while the DMA mapping is live.
Since the IV that is returned to the caller is only valid for CBC mode,
and given that the in-kernel users of CBC (such as CTS) don't trigger the
same issue as the GCM driver, let's just disable the output IV generation
for all modes except CBC for the time being.
Fixes: 854b06f76879 ("crypto: caam - properly set IV after {en,de}crypt")
Cc: Horia Geanta <horia.geanta(a)nxp.com>
Cc: Iuliana Prodan <iuliana.prodan(a)nxp.com>
Reported-by: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Reviewed-by: Horia Geanta <horia.geanta(a)nxp.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
[ Horia: backported to 4.9 ]
Signed-off-by: Horia Geantă <horia.geanta(a)nxp.com>
---
drivers/crypto/caam/caamalg.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 88caca3370f2..f8ac768ed5d7 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -2015,6 +2015,7 @@ static void ablkcipher_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
struct ablkcipher_request *req = context;
struct ablkcipher_edesc *edesc;
struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+ struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
#ifdef DEBUG
@@ -2040,10 +2041,11 @@ static void ablkcipher_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
/*
* The crypto API expects us to set the IV (req->info) to the last
- * ciphertext block. This is used e.g. by the CTS mode.
+ * ciphertext block when running in CBC mode.
*/
- scatterwalk_map_and_copy(req->info, req->dst, req->nbytes - ivsize,
- ivsize, 0);
+ if ((ctx->class1_alg_type & OP_ALG_AAI_MASK) == OP_ALG_AAI_CBC)
+ scatterwalk_map_and_copy(req->info, req->dst, req->nbytes -
+ ivsize, ivsize, 0);
kfree(edesc);
@@ -2056,6 +2058,7 @@ static void ablkcipher_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
struct ablkcipher_request *req = context;
struct ablkcipher_edesc *edesc;
struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+ struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
#ifdef DEBUG
@@ -2080,10 +2083,11 @@ static void ablkcipher_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
/*
* The crypto API expects us to set the IV (req->info) to the last
- * ciphertext block.
+ * ciphertext block when running in CBC mode.
*/
- scatterwalk_map_and_copy(req->info, req->src, req->nbytes - ivsize,
- ivsize, 0);
+ if ((ctx->class1_alg_type & OP_ALG_AAI_MASK) == OP_ALG_AAI_CBC)
+ scatterwalk_map_and_copy(req->info, req->src, req->nbytes -
+ ivsize, ivsize, 0);
kfree(edesc);
--
2.17.1
On Wed, 24 Jul 2019 08:44:19 +0200
Christian Borntraeger <borntraeger(a)de.ibm.com> wrote:
>
>
> On 24.07.19 00:58, Halil Pasic wrote:
> > The access to airq_areas was racy ever since the adapter interrupts got
> > introduced to virtio-ccw, but since commit 39c7dcb15892 ("virtio/s390:
> > make airq summary indicators DMA") this became an issue in practice as
> > well. Namely before that commit the airq_info that got overwritten was
> > still functional. After that commit however the two infos share a
> > summary_indicator, which aggravates the situation. Which means
> > auto-online mechanism occasionally hangs the boot with virtio_blk.
> >
> > Signed-off-by: Halil Pasic <pasic(a)linux.ibm.com>
> > Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
> > Fixes: 96b14536d935 ("virtio-ccw: virtio-ccw adapter interrupt support.")
> > ---
> > * We need definitely this fixed for 5.3. For older stable kernels it is
> > to be discussed. @Connie what do you think: do we need a cc stable?
>
> Unless you can prove that the problem could never happen on old version
> we absolutely do need cc stable.
No I would not like to make an attempt at proving that. I prefer code
race free anyway. CC-ing stable.
>
> >
> > * I have a variant that does not need the extra mutex but uses cmpxchg().
> > Decided to post this one because that one is more complex. But if there
> > is interest we can have a look at it as well.
>
> This is slow path (startup) and never called in hot path. Correct? Mutex should be
> fine.
Right, this is only relevant during device initialization, which is an
infrequent operation.
Thanks,
Halil
> > ---
> > drivers/s390/virtio/virtio_ccw.c | 4 ++++
> > 1 file changed, 4 insertions(+)
> >
> > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > index 1a55e5942d36..d97742662755 100644
> > --- a/drivers/s390/virtio/virtio_ccw.c
> > +++ b/drivers/s390/virtio/virtio_ccw.c
> > @@ -145,6 +145,8 @@ struct airq_info {
> > struct airq_iv *aiv;
> > };
> > static struct airq_info *airq_areas[MAX_AIRQ_AREAS];
> > +DEFINE_MUTEX(airq_areas_lock);
> > +
> > static u8 *summary_indicators;
> >
> > static inline u8 *get_summary_indicator(struct airq_info *info)
> > @@ -265,9 +267,11 @@ static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs,
> > unsigned long bit, flags;
> >
> > for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) {
> > + mutex_lock(&airq_areas_lock);
> > if (!airq_areas[i])
> > airq_areas[i] = new_airq_info(i);
> > info = airq_areas[i];
> > + mutex_unlock(&airq_areas_lock);
> > if (!info)
> > return 0;
> > write_lock_irqsave(&info->lock, flags);
> >
>
This is a note to let you know that I've just added the patch titled
fpga-manager: altera-ps-spi: Fix build error
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 3d139703d397f6281368047ba7ad1c8bf95aa8ab Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing(a)huawei.com>
Date: Mon, 8 Jul 2019 15:13:56 +0800
Subject: fpga-manager: altera-ps-spi: Fix build error
If BITREVERSE is m and FPGA_MGR_ALTERA_PS_SPI is y,
build fails:
drivers/fpga/altera-ps-spi.o: In function `altera_ps_write':
altera-ps-spi.c:(.text+0x4ec): undefined reference to `byte_rev_table'
Select BITREVERSE to fix this.
Reported-by: Hulk Robot <hulkci(a)huawei.com>
Fixes: fcfe18f885f6 ("fpga-manager: altera-ps-spi: use bitrev8x4")
Signed-off-by: YueHaibing <yuehaibing(a)huawei.com>
Cc: stable <stable(a)vger.kernel.org>
Acked-by: Moritz Fischer <mdf(a)kernel.org>
Link: https://lore.kernel.org/r/20190708071356.50928-1-yuehaibing@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/fpga/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
index 474f304ec109..cdd4f73b4869 100644
--- a/drivers/fpga/Kconfig
+++ b/drivers/fpga/Kconfig
@@ -40,6 +40,7 @@ config ALTERA_PR_IP_CORE_PLAT
config FPGA_MGR_ALTERA_PS_SPI
tristate "Altera FPGA Passive Serial over SPI"
depends on SPI
+ select BITREVERSE
help
FPGA manager driver support for Altera Arria/Cyclone/Stratix
using the passive serial interface over SPI.
--
2.22.0
This is a note to let you know that I've just added the patch titled
mei: me: add mule creek canyon (EHL) device ids
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 1be8624a0cbef720e8da39a15971e01abffc865b Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
Date: Fri, 12 Jul 2019 12:58:14 +0300
Subject: mei: me: add mule creek canyon (EHL) device ids
Add Mule Creek Canyon (PCH) MEI device ids for Elkhart Lake (EHL) Platform.
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20190712095814.20746-1-tomas.winkler@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/misc/mei/hw-me-regs.h | 3 +++
drivers/misc/mei/pci-me.c | 3 +++
2 files changed, 6 insertions(+)
diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h
index d74b182e19f3..6c0173772162 100644
--- a/drivers/misc/mei/hw-me-regs.h
+++ b/drivers/misc/mei/hw-me-regs.h
@@ -81,6 +81,9 @@
#define MEI_DEV_ID_ICP_LP 0x34E0 /* Ice Lake Point LP */
+#define MEI_DEV_ID_MCC 0x4B70 /* Mule Creek Canyon (EHL) */
+#define MEI_DEV_ID_MCC_4 0x4B75 /* Mule Creek Canyon 4 (EHL) */
+
/*
* MEI HW Section
*/
diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c
index 7a2b3545a7f9..57cb68f5cc64 100644
--- a/drivers/misc/mei/pci-me.c
+++ b/drivers/misc/mei/pci-me.c
@@ -98,6 +98,9 @@ static const struct pci_device_id mei_me_pci_tbl[] = {
{MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP, MEI_ME_PCH12_CFG)},
+ {MEI_PCI_DEVICE(MEI_DEV_ID_MCC, MEI_ME_PCH12_CFG)},
+ {MEI_PCI_DEVICE(MEI_DEV_ID_MCC_4, MEI_ME_PCH8_CFG)},
+
/* required last entry */
{0, }
};
--
2.22.0
stable rc 4.14 stable rc i386 and armv7 builds failed due to below error.
fs/btrfs/file.c: In function 'btrfs_punch_hole':
fs/btrfs/file.c:2787:27: error: invalid initializer
struct timespec64 now = current_time(inode);
^~~~~~~~~~~~
fs/btrfs/file.c:2790:18: error: incompatible types when assigning to
type 'struct timespec' from type 'struct timespec64'
inode->i_mtime = now;
^
fs/btrfs/file.c:2791:18: error: incompatible types when assigning to
type 'struct timespec' from type 'struct timespec64'
inode->i_ctime = now;
^
Best regards
Naresh Kamboju
This is a note to let you know that I've just added the patch titled
binder: prevent transactions to context manager from its own process.
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 49ed96943a8e0c62cc5a9b0a6cfc88be87d1fcec Mon Sep 17 00:00:00 2001
From: Hridya Valsaraju <hridya(a)google.com>
Date: Mon, 15 Jul 2019 12:18:04 -0700
Subject: binder: prevent transactions to context manager from its own process.
Currently, a transaction to context manager from its own process
is prevented by checking if its binder_proc struct is the same as
that of the sender. However, this would not catch cases where the
process opens the binder device again and uses the new fd to send
a transaction to the context manager.
Reported-by: syzbot+8b3c354d33c4ac78bfad(a)syzkaller.appspotmail.com
Signed-off-by: Hridya Valsaraju <hridya(a)google.com>
Acked-by: Todd Kjos <tkjos(a)google.com>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20190715191804.112933-1-hridya@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/android/binder.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 5bde08603fbc..dc1c83eafc22 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2988,7 +2988,7 @@ static void binder_transaction(struct binder_proc *proc,
else
return_error = BR_DEAD_REPLY;
mutex_unlock(&context->context_mgr_node_lock);
- if (target_node && target_proc == proc) {
+ if (target_node && target_proc->pid == proc->pid) {
binder_user_error("%d:%d got transaction to context manager from process owning it\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
--
2.22.0
The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aa53e3bfac7205fb3a8815ac1c937fd6ed01b41e Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn(a)suse.de>
Date: Thu, 6 Jun 2019 12:07:15 +0200
Subject: [PATCH] btrfs: correctly validate compression type
Nikolay reported the following KASAN splat when running btrfs/048:
[ 1843.470920] ==================================================================
[ 1843.471971] BUG: KASAN: slab-out-of-bounds in strncmp+0x66/0xb0
[ 1843.472775] Read of size 1 at addr ffff888111e369e2 by task btrfs/3979
[ 1843.473904] CPU: 3 PID: 3979 Comm: btrfs Not tainted 5.2.0-rc3-default #536
[ 1843.475009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[ 1843.476322] Call Trace:
[ 1843.476674] dump_stack+0x7c/0xbb
[ 1843.477132] ? strncmp+0x66/0xb0
[ 1843.477587] print_address_description+0x114/0x320
[ 1843.478256] ? strncmp+0x66/0xb0
[ 1843.478740] ? strncmp+0x66/0xb0
[ 1843.479185] __kasan_report+0x14e/0x192
[ 1843.479759] ? strncmp+0x66/0xb0
[ 1843.480209] kasan_report+0xe/0x20
[ 1843.480679] strncmp+0x66/0xb0
[ 1843.481105] prop_compression_validate+0x24/0x70
[ 1843.481798] btrfs_xattr_handler_set_prop+0x65/0x160
[ 1843.482509] __vfs_setxattr+0x71/0x90
[ 1843.483012] __vfs_setxattr_noperm+0x84/0x130
[ 1843.483606] vfs_setxattr+0xac/0xb0
[ 1843.484085] setxattr+0x18c/0x230
[ 1843.484546] ? vfs_setxattr+0xb0/0xb0
[ 1843.485048] ? __mod_node_page_state+0x1f/0xa0
[ 1843.485672] ? _raw_spin_unlock+0x24/0x40
[ 1843.486233] ? __handle_mm_fault+0x988/0x1290
[ 1843.486823] ? lock_acquire+0xb4/0x1e0
[ 1843.487330] ? lock_acquire+0xb4/0x1e0
[ 1843.487842] ? mnt_want_write_file+0x3c/0x80
[ 1843.488442] ? debug_lockdep_rcu_enabled+0x22/0x40
[ 1843.489089] ? rcu_sync_lockdep_assert+0xe/0x70
[ 1843.489707] ? __sb_start_write+0x158/0x200
[ 1843.490278] ? mnt_want_write_file+0x3c/0x80
[ 1843.490855] ? __mnt_want_write+0x98/0xe0
[ 1843.491397] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.492201] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 1843.493201] do_syscall_64+0x6c/0x230
[ 1843.493988] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.495041] RIP: 0033:0x7fa7a8a7707a
[ 1843.495819] Code: 48 8b 0d 21 de 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 be 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ee dd 2b 00 f7 d8 64 89 01 48
[ 1843.499203] RSP: 002b:00007ffcb73bca38 EFLAGS: 00000202 ORIG_RAX: 00000000000000be
[ 1843.500210] RAX: ffffffffffffffda RBX: 00007ffcb73bda9d RCX: 00007fa7a8a7707a
[ 1843.501170] RDX: 00007ffcb73bda9d RSI: 00000000006dc050 RDI: 0000000000000003
[ 1843.502152] RBP: 00000000006dc050 R08: 0000000000000000 R09: 0000000000000000
[ 1843.503109] R10: 0000000000000002 R11: 0000000000000202 R12: 00007ffcb73bda91
[ 1843.504055] R13: 0000000000000003 R14: 00007ffcb73bda82 R15: ffffffffffffffff
[ 1843.505268] Allocated by task 3979:
[ 1843.505771] save_stack+0x19/0x80
[ 1843.506211] __kasan_kmalloc.constprop.5+0xa0/0xd0
[ 1843.506836] setxattr+0xeb/0x230
[ 1843.507264] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.507886] do_syscall_64+0x6c/0x230
[ 1843.508429] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.509558] Freed by task 0:
[ 1843.510188] (stack is not available)
[ 1843.511309] The buggy address belongs to the object at ffff888111e369e0
which belongs to the cache kmalloc-8 of size 8
[ 1843.514095] The buggy address is located 2 bytes inside of
8-byte region [ffff888111e369e0, ffff888111e369e8)
[ 1843.516524] The buggy address belongs to the page:
[ 1843.517561] page:ffff88813f478d80 refcount:1 mapcount:0 mapping:ffff88811940c300 index:0xffff888111e373b8 compound_mapcount: 0
[ 1843.519993] flags: 0x4404000010200(slab|head)
[ 1843.520951] raw: 0004404000010200 ffff88813f48b008 ffff888119403d50 ffff88811940c300
[ 1843.522616] raw: ffff888111e373b8 000000000016000f 00000001ffffffff 0000000000000000
[ 1843.524281] page dumped because: kasan: bad access detected
[ 1843.525936] Memory state around the buggy address:
[ 1843.526975] ffff888111e36880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.528479] ffff888111e36900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.530138] >ffff888111e36980: fc fc fc fc fc fc fc fc fc fc fc fc 02 fc fc fc
[ 1843.531877] ^
[ 1843.533287] ffff888111e36a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.534874] ffff888111e36a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.536468] ==================================================================
This is caused by supplying a too short compression value ('lz') in the
test-case and comparing it to 'lzo' with strncmp() and a length of 3.
strncmp() read past the 'lz' when looking for the 'o' and thus caused an
out-of-bounds read.
Introduce a new check 'btrfs_compress_is_valid_type()' which not only
checks the user-supplied value against known compression types, but also
employs checks for too short values.
Reported-by: Nikolay Borisov <nborisov(a)suse.com>
Fixes: 272e5326c783 ("btrfs: prop: fix vanished compression property after failed set")
CC: stable(a)vger.kernel.org # 5.1+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Johannes Thumshirn <jthumshirn(a)suse.de>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 66e21a4e9ea2..db41315f11eb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -43,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
return NULL;
}
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 191e5f4e3523..2035b8eb1290 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a9e2e66152ee..af109c0ba720 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len)
if (!value)
return 0;
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
+ if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aa53e3bfac7205fb3a8815ac1c937fd6ed01b41e Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn(a)suse.de>
Date: Thu, 6 Jun 2019 12:07:15 +0200
Subject: [PATCH] btrfs: correctly validate compression type
Nikolay reported the following KASAN splat when running btrfs/048:
[ 1843.470920] ==================================================================
[ 1843.471971] BUG: KASAN: slab-out-of-bounds in strncmp+0x66/0xb0
[ 1843.472775] Read of size 1 at addr ffff888111e369e2 by task btrfs/3979
[ 1843.473904] CPU: 3 PID: 3979 Comm: btrfs Not tainted 5.2.0-rc3-default #536
[ 1843.475009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[ 1843.476322] Call Trace:
[ 1843.476674] dump_stack+0x7c/0xbb
[ 1843.477132] ? strncmp+0x66/0xb0
[ 1843.477587] print_address_description+0x114/0x320
[ 1843.478256] ? strncmp+0x66/0xb0
[ 1843.478740] ? strncmp+0x66/0xb0
[ 1843.479185] __kasan_report+0x14e/0x192
[ 1843.479759] ? strncmp+0x66/0xb0
[ 1843.480209] kasan_report+0xe/0x20
[ 1843.480679] strncmp+0x66/0xb0
[ 1843.481105] prop_compression_validate+0x24/0x70
[ 1843.481798] btrfs_xattr_handler_set_prop+0x65/0x160
[ 1843.482509] __vfs_setxattr+0x71/0x90
[ 1843.483012] __vfs_setxattr_noperm+0x84/0x130
[ 1843.483606] vfs_setxattr+0xac/0xb0
[ 1843.484085] setxattr+0x18c/0x230
[ 1843.484546] ? vfs_setxattr+0xb0/0xb0
[ 1843.485048] ? __mod_node_page_state+0x1f/0xa0
[ 1843.485672] ? _raw_spin_unlock+0x24/0x40
[ 1843.486233] ? __handle_mm_fault+0x988/0x1290
[ 1843.486823] ? lock_acquire+0xb4/0x1e0
[ 1843.487330] ? lock_acquire+0xb4/0x1e0
[ 1843.487842] ? mnt_want_write_file+0x3c/0x80
[ 1843.488442] ? debug_lockdep_rcu_enabled+0x22/0x40
[ 1843.489089] ? rcu_sync_lockdep_assert+0xe/0x70
[ 1843.489707] ? __sb_start_write+0x158/0x200
[ 1843.490278] ? mnt_want_write_file+0x3c/0x80
[ 1843.490855] ? __mnt_want_write+0x98/0xe0
[ 1843.491397] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.492201] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 1843.493201] do_syscall_64+0x6c/0x230
[ 1843.493988] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.495041] RIP: 0033:0x7fa7a8a7707a
[ 1843.495819] Code: 48 8b 0d 21 de 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 be 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ee dd 2b 00 f7 d8 64 89 01 48
[ 1843.499203] RSP: 002b:00007ffcb73bca38 EFLAGS: 00000202 ORIG_RAX: 00000000000000be
[ 1843.500210] RAX: ffffffffffffffda RBX: 00007ffcb73bda9d RCX: 00007fa7a8a7707a
[ 1843.501170] RDX: 00007ffcb73bda9d RSI: 00000000006dc050 RDI: 0000000000000003
[ 1843.502152] RBP: 00000000006dc050 R08: 0000000000000000 R09: 0000000000000000
[ 1843.503109] R10: 0000000000000002 R11: 0000000000000202 R12: 00007ffcb73bda91
[ 1843.504055] R13: 0000000000000003 R14: 00007ffcb73bda82 R15: ffffffffffffffff
[ 1843.505268] Allocated by task 3979:
[ 1843.505771] save_stack+0x19/0x80
[ 1843.506211] __kasan_kmalloc.constprop.5+0xa0/0xd0
[ 1843.506836] setxattr+0xeb/0x230
[ 1843.507264] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.507886] do_syscall_64+0x6c/0x230
[ 1843.508429] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.509558] Freed by task 0:
[ 1843.510188] (stack is not available)
[ 1843.511309] The buggy address belongs to the object at ffff888111e369e0
which belongs to the cache kmalloc-8 of size 8
[ 1843.514095] The buggy address is located 2 bytes inside of
8-byte region [ffff888111e369e0, ffff888111e369e8)
[ 1843.516524] The buggy address belongs to the page:
[ 1843.517561] page:ffff88813f478d80 refcount:1 mapcount:0 mapping:ffff88811940c300 index:0xffff888111e373b8 compound_mapcount: 0
[ 1843.519993] flags: 0x4404000010200(slab|head)
[ 1843.520951] raw: 0004404000010200 ffff88813f48b008 ffff888119403d50 ffff88811940c300
[ 1843.522616] raw: ffff888111e373b8 000000000016000f 00000001ffffffff 0000000000000000
[ 1843.524281] page dumped because: kasan: bad access detected
[ 1843.525936] Memory state around the buggy address:
[ 1843.526975] ffff888111e36880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.528479] ffff888111e36900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.530138] >ffff888111e36980: fc fc fc fc fc fc fc fc fc fc fc fc 02 fc fc fc
[ 1843.531877] ^
[ 1843.533287] ffff888111e36a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.534874] ffff888111e36a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.536468] ==================================================================
This is caused by supplying a too short compression value ('lz') in the
test-case and comparing it to 'lzo' with strncmp() and a length of 3.
strncmp() read past the 'lz' when looking for the 'o' and thus caused an
out-of-bounds read.
Introduce a new check 'btrfs_compress_is_valid_type()' which not only
checks the user-supplied value against known compression types, but also
employs checks for too short values.
Reported-by: Nikolay Borisov <nborisov(a)suse.com>
Fixes: 272e5326c783 ("btrfs: prop: fix vanished compression property after failed set")
CC: stable(a)vger.kernel.org # 5.1+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Johannes Thumshirn <jthumshirn(a)suse.de>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 66e21a4e9ea2..db41315f11eb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -43,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
return NULL;
}
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 191e5f4e3523..2035b8eb1290 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a9e2e66152ee..af109c0ba720 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len)
if (!value)
return 0;
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
+ if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aa53e3bfac7205fb3a8815ac1c937fd6ed01b41e Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn(a)suse.de>
Date: Thu, 6 Jun 2019 12:07:15 +0200
Subject: [PATCH] btrfs: correctly validate compression type
Nikolay reported the following KASAN splat when running btrfs/048:
[ 1843.470920] ==================================================================
[ 1843.471971] BUG: KASAN: slab-out-of-bounds in strncmp+0x66/0xb0
[ 1843.472775] Read of size 1 at addr ffff888111e369e2 by task btrfs/3979
[ 1843.473904] CPU: 3 PID: 3979 Comm: btrfs Not tainted 5.2.0-rc3-default #536
[ 1843.475009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[ 1843.476322] Call Trace:
[ 1843.476674] dump_stack+0x7c/0xbb
[ 1843.477132] ? strncmp+0x66/0xb0
[ 1843.477587] print_address_description+0x114/0x320
[ 1843.478256] ? strncmp+0x66/0xb0
[ 1843.478740] ? strncmp+0x66/0xb0
[ 1843.479185] __kasan_report+0x14e/0x192
[ 1843.479759] ? strncmp+0x66/0xb0
[ 1843.480209] kasan_report+0xe/0x20
[ 1843.480679] strncmp+0x66/0xb0
[ 1843.481105] prop_compression_validate+0x24/0x70
[ 1843.481798] btrfs_xattr_handler_set_prop+0x65/0x160
[ 1843.482509] __vfs_setxattr+0x71/0x90
[ 1843.483012] __vfs_setxattr_noperm+0x84/0x130
[ 1843.483606] vfs_setxattr+0xac/0xb0
[ 1843.484085] setxattr+0x18c/0x230
[ 1843.484546] ? vfs_setxattr+0xb0/0xb0
[ 1843.485048] ? __mod_node_page_state+0x1f/0xa0
[ 1843.485672] ? _raw_spin_unlock+0x24/0x40
[ 1843.486233] ? __handle_mm_fault+0x988/0x1290
[ 1843.486823] ? lock_acquire+0xb4/0x1e0
[ 1843.487330] ? lock_acquire+0xb4/0x1e0
[ 1843.487842] ? mnt_want_write_file+0x3c/0x80
[ 1843.488442] ? debug_lockdep_rcu_enabled+0x22/0x40
[ 1843.489089] ? rcu_sync_lockdep_assert+0xe/0x70
[ 1843.489707] ? __sb_start_write+0x158/0x200
[ 1843.490278] ? mnt_want_write_file+0x3c/0x80
[ 1843.490855] ? __mnt_want_write+0x98/0xe0
[ 1843.491397] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.492201] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 1843.493201] do_syscall_64+0x6c/0x230
[ 1843.493988] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.495041] RIP: 0033:0x7fa7a8a7707a
[ 1843.495819] Code: 48 8b 0d 21 de 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 be 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ee dd 2b 00 f7 d8 64 89 01 48
[ 1843.499203] RSP: 002b:00007ffcb73bca38 EFLAGS: 00000202 ORIG_RAX: 00000000000000be
[ 1843.500210] RAX: ffffffffffffffda RBX: 00007ffcb73bda9d RCX: 00007fa7a8a7707a
[ 1843.501170] RDX: 00007ffcb73bda9d RSI: 00000000006dc050 RDI: 0000000000000003
[ 1843.502152] RBP: 00000000006dc050 R08: 0000000000000000 R09: 0000000000000000
[ 1843.503109] R10: 0000000000000002 R11: 0000000000000202 R12: 00007ffcb73bda91
[ 1843.504055] R13: 0000000000000003 R14: 00007ffcb73bda82 R15: ffffffffffffffff
[ 1843.505268] Allocated by task 3979:
[ 1843.505771] save_stack+0x19/0x80
[ 1843.506211] __kasan_kmalloc.constprop.5+0xa0/0xd0
[ 1843.506836] setxattr+0xeb/0x230
[ 1843.507264] __x64_sys_fsetxattr+0xba/0xe0
[ 1843.507886] do_syscall_64+0x6c/0x230
[ 1843.508429] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1843.509558] Freed by task 0:
[ 1843.510188] (stack is not available)
[ 1843.511309] The buggy address belongs to the object at ffff888111e369e0
which belongs to the cache kmalloc-8 of size 8
[ 1843.514095] The buggy address is located 2 bytes inside of
8-byte region [ffff888111e369e0, ffff888111e369e8)
[ 1843.516524] The buggy address belongs to the page:
[ 1843.517561] page:ffff88813f478d80 refcount:1 mapcount:0 mapping:ffff88811940c300 index:0xffff888111e373b8 compound_mapcount: 0
[ 1843.519993] flags: 0x4404000010200(slab|head)
[ 1843.520951] raw: 0004404000010200 ffff88813f48b008 ffff888119403d50 ffff88811940c300
[ 1843.522616] raw: ffff888111e373b8 000000000016000f 00000001ffffffff 0000000000000000
[ 1843.524281] page dumped because: kasan: bad access detected
[ 1843.525936] Memory state around the buggy address:
[ 1843.526975] ffff888111e36880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.528479] ffff888111e36900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.530138] >ffff888111e36980: fc fc fc fc fc fc fc fc fc fc fc fc 02 fc fc fc
[ 1843.531877] ^
[ 1843.533287] ffff888111e36a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.534874] ffff888111e36a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 1843.536468] ==================================================================
This is caused by supplying a too short compression value ('lz') in the
test-case and comparing it to 'lzo' with strncmp() and a length of 3.
strncmp() read past the 'lz' when looking for the 'o' and thus caused an
out-of-bounds read.
Introduce a new check 'btrfs_compress_is_valid_type()' which not only
checks the user-supplied value against known compression types, but also
employs checks for too short values.
Reported-by: Nikolay Borisov <nborisov(a)suse.com>
Fixes: 272e5326c783 ("btrfs: prop: fix vanished compression property after failed set")
CC: stable(a)vger.kernel.org # 5.1+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Johannes Thumshirn <jthumshirn(a)suse.de>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 66e21a4e9ea2..db41315f11eb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -43,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
return NULL;
}
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 191e5f4e3523..2035b8eb1290 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a9e2e66152ee..af109c0ba720 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len)
if (!value)
return 0;
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
+ if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
From: Yingying Tang <yintang(a)codeaurora.org>
[ Upstream commit 9e7251fa38978b85108c44743e1436d48e8d0d76 ]
tx_stats will be freed and set to NULL before debugfs_sta node is
removed in station disconnetion process. So if read the debugfs_sta
node there may be NULL pointer error. Add check for tx_stats before
use it to resove this issue.
Signed-off-by: Yingying Tang <yintang(a)codeaurora.org>
Signed-off-by: Kalle Valo <kvalo(a)codeaurora.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/wireless/ath/ath10k/debugfs_sta.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/net/wireless/ath/ath10k/debugfs_sta.c b/drivers/net/wireless/ath/ath10k/debugfs_sta.c
index c704ae371c4d..42931a669b02 100644
--- a/drivers/net/wireless/ath/ath10k/debugfs_sta.c
+++ b/drivers/net/wireless/ath/ath10k/debugfs_sta.c
@@ -663,6 +663,13 @@ static ssize_t ath10k_dbg_sta_dump_tx_stats(struct file *file,
mutex_lock(&ar->conf_mutex);
+ if (!arsta->tx_stats) {
+ ath10k_warn(ar, "failed to get tx stats");
+ mutex_unlock(&ar->conf_mutex);
+ kfree(buf);
+ return 0;
+ }
+
spin_lock_bh(&ar->data_lock);
for (k = 0; k < ATH10K_STATS_TYPE_MAX; k++) {
for (j = 0; j < ATH10K_COUNTER_TYPE_MAX; j++) {
--
2.20.1
"Oyez Oyez..." its time for a stable update of fixes for XFS. 4 out of the
9 fixes here were recommended by Amir, and tested by both Amir and Sasha.
I've found a few other fixes, and have tested all these changes with
fstests against the following configurations in fstests sections as per
oscheck [0] and found no regressions in comparsin to v4.19.58 and by
running the full set of tests 3 times completely:
* xfs
* xfs_nocrc
* xfs_nocrc_512
* xfs_reflink
* xfs_reflink_1024
* xfs_logdev
* xfs_realtimedev
Known issues are listed on the expunges files, but its no different than
the current baseline.
Worth noting is a now known generic/388 crash on xfs_nocrc, xfs_reflink,
and what may be a new section we should consider to track:
"xfs_reflink_normapbt" with the following resulting filesystem:
# xfs_info /dev/loop5
meta-data=/dev/loop5 isize=512 agcount=4, agsize=1310720 blks
= sectsz=512 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=0
= reflink=1
data = bsize=4096 blocks=5242880, imaxpct=25
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=2560, version=2
= sectsz=512 sunit=0 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
Do we want to create a baseline and track this configuration for stable
as well?
There is a stable bug tracking this, kz#204223 [1], and a respective bug
also present on upstream via kz#204049 [2] which Zorro reported. But,
again, nothing changes from the baseline.
I'd appreciate further reviews from the patches.
I have some other fixes in mind as well, but I'd rather not delay this
set and think this is a first good batch.
This also goes out as the first set of stable fixes using oscheck's
new devops infrastructure built on ansible / vagrant / terraform [3].
For this release I've used vagrant with KVM, perhaps the next one
I'll try terraform on whatever cloud solution someone is willing
to let me use.
You can also find these changes on my 20190718-linux-xfs-4.19.y-v1
branch on kernel.org [4].
Lemme know if you see any issues or have any questions.
[0] https://gitlab.com/mcgrof/oscheck/blob/master/fstests-configs/xfs.config
[1] https://bugzilla.kernel.org/show_bug.cgi?id=204223
[2] https://bugzilla.kernel.org/show_bug.cgi?id=204049
[3] https://gitlab.com/mcgrof/kdevops
[4] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux-stable.git/log…
Brian Foster (1):
xfs: serialize unaligned dio writes against all other dio writes
Darrick J. Wong (6):
xfs: fix pagecache truncation prior to reflink
xfs: don't overflow xattr listent buffer
xfs: rename m_inotbt_nores to m_finobt_nores
xfs: don't ever put nlink > 0 inodes on the unlinked list
xfs: reserve blocks for ifree transaction during log recovery
xfs: abort unaligned nowait directio early
Dave Chinner (1):
xfs: flush removing page cache in xfs_reflink_remap_prep
Luis R. Rodriguez (1):
xfs: fix reporting supported extra file attributes for statx()
fs/xfs/libxfs/xfs_ag_resv.c | 2 +-
fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++--
fs/xfs/xfs_attr_list.c | 1 +
fs/xfs/xfs_bmap_util.c | 2 +-
fs/xfs/xfs_bmap_util.h | 2 ++
fs/xfs/xfs_file.c | 27 +++++++++++++++++----------
fs/xfs/xfs_fsops.c | 1 +
fs/xfs/xfs_inode.c | 18 +++++++-----------
fs/xfs/xfs_iops.c | 21 +++++++++++++++++++--
fs/xfs/xfs_mount.h | 2 +-
fs/xfs/xfs_reflink.c | 16 +++++++++++++---
fs/xfs/xfs_super.c | 7 +++++++
fs/xfs/xfs_xattr.c | 3 +++
13 files changed, 75 insertions(+), 31 deletions(-)
--
2.20.1
The patch titled
Subject: cgroup: kselftest: relax fs_spec checks
has been added to the -mm tree. Its filename is
cgroup-kselftest-relax-fs_spec-checks.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/cgroup-kselftest-relax-fs_spec-che…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/cgroup-kselftest-relax-fs_spec-che…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Chris Down <chris(a)chrisdown.name>
Subject: cgroup: kselftest: relax fs_spec checks
On my laptop most memcg kselftests were being skipped because it claimed
cgroup v2 hierarchy wasn't mounted, but this isn't correct. Instead, it
seems current systemd HEAD mounts it with the name "cgroup2" instead of
"cgroup":
% grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0
I can't think of a reason to need to check fs_spec explicitly
since it's arbitrary, so we can just rely on fs_vfstype.
After these changes, `make TARGETS=cgroup kselftest` actually runs the
cgroup v2 tests in more cases.
Link: http://lkml.kernel.org/r/20190723210737.GA487@chrisdown.name
Signed-off-by: Chris Down <chris(a)chrisdown.name>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/cgroup/cgroup_util.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/tools/testing/selftests/cgroup/cgroup_util.c~cgroup-kselftest-relax-fs_spec-checks
+++ a/tools/testing/selftests/cgroup/cgroup_util.c
@@ -191,8 +191,7 @@ int cg_find_unified_root(char *root, siz
strtok(NULL, delim);
strtok(NULL, delim);
- if (strcmp(fs, "cgroup") == 0 &&
- strcmp(type, "cgroup2") == 0) {
+ if (strcmp(type, "cgroup2") == 0) {
strncpy(root, mount, len);
return 0;
}
_
Patches currently in -mm which might be from chris(a)chrisdown.name are
cgroup-kselftest-relax-fs_spec-checks.patch
mm-throttle-allocators-when-failing-reclaim-over-memoryhigh.patch
mm-proportional-memorylowmin-reclaim.patch
mm-make-memoryemin-the-baseline-for-utilisation-determination.patch
mm-make-memoryemin-the-baseline-for-utilisation-determination-fix.patch
From: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
commit ed527b13d800dd515a9e6c582f0a73eca65b2e1b upstream.
The CAAM driver currently violates an undocumented and slightly
controversial requirement imposed by the crypto stack that a buffer
referred to by the request structure via its virtual address may not
be modified while any scatterlists passed via the same request
structure are mapped for inbound DMA.
This may result in errors like
alg: aead: decryption failed on test 1 for gcm_base(ctr-aes-caam,ghash-generic): ret=74
alg: aead: Failed to load transform for gcm(aes): -2
on non-cache coherent systems, due to the fact that the GCM driver
passes an IV buffer by virtual address which shares a cacheline with
the auth_tag buffer passed via a scatterlist, resulting in corruption
of the auth_tag when the IV is updated while the DMA mapping is live.
Since the IV that is returned to the caller is only valid for CBC mode,
and given that the in-kernel users of CBC (such as CTS) don't trigger the
same issue as the GCM driver, let's just disable the output IV generation
for all modes except CBC for the time being.
Fixes: 854b06f76879 ("crypto: caam - properly set IV after {en,de}crypt")
Cc: Horia Geanta <horia.geanta(a)nxp.com>
Cc: Iuliana Prodan <iuliana.prodan(a)nxp.com>
Reported-by: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Reviewed-by: Horia Geanta <horia.geanta(a)nxp.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
[ Horia: backported to 4.14, 4.19 ]
Signed-off-by: Horia Geantă <horia.geanta(a)nxp.com>
---
drivers/crypto/caam/caamalg.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 9bc54c3c2cb9..1907945f82b7 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -887,6 +887,7 @@ static void ablkcipher_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
struct ablkcipher_request *req = context;
struct ablkcipher_edesc *edesc;
struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+ struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
#ifdef DEBUG
@@ -911,10 +912,11 @@ static void ablkcipher_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
/*
* The crypto API expects us to set the IV (req->info) to the last
- * ciphertext block. This is used e.g. by the CTS mode.
+ * ciphertext block when running in CBC mode.
*/
- scatterwalk_map_and_copy(req->info, req->dst, req->nbytes - ivsize,
- ivsize, 0);
+ if ((ctx->cdata.algtype & OP_ALG_AAI_MASK) == OP_ALG_AAI_CBC)
+ scatterwalk_map_and_copy(req->info, req->dst, req->nbytes -
+ ivsize, ivsize, 0);
/* In case initial IV was generated, copy it in GIVCIPHER request */
if (edesc->iv_dir == DMA_FROM_DEVICE) {
@@ -1651,10 +1653,11 @@ static int ablkcipher_decrypt(struct ablkcipher_request *req)
/*
* The crypto API expects us to set the IV (req->info) to the last
- * ciphertext block.
+ * ciphertext block when running in CBC mode.
*/
- scatterwalk_map_and_copy(req->info, req->src, req->nbytes - ivsize,
- ivsize, 0);
+ if ((ctx->cdata.algtype & OP_ALG_AAI_MASK) == OP_ALG_AAI_CBC)
+ scatterwalk_map_and_copy(req->info, req->src, req->nbytes -
+ ivsize, ivsize, 0);
/* Create and submit job descriptor*/
init_ablkcipher_job(ctx->sh_desc_dec, ctx->sh_desc_dec_dma, edesc, req);
--
2.17.1
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 179006688a7e888cbff39577189f2e034786d06a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 19 Jun 2019 13:05:50 +0100
Subject: [PATCH] Btrfs: add missing inode version, ctime and mtime updates
when punching hole
If the range for which we are punching a hole covers only part of a page,
we end up updating the inode item but we skip the update of the inode's
iversion, mtime and ctime. Fix that by ensuring we update those properties
of the inode.
A patch for fstests test case generic/059 that tests this as been sent
along with this fix.
Fixes: 2aaa66558172b0 ("Btrfs: add hole punching")
Fixes: e8c1c76e804b18 ("Btrfs: add missing inode update when punching hole")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5370152ea7e3..b455bdf46faa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2711,6 +2711,11 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
+ struct timespec64 now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 49f17c26c123b60fd1c74629eef077740d16ffc2 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Thu, 18 Jul 2019 15:57:31 -0700
Subject: [PATCH] resource: fix locking in find_next_iomem_res()
Since resources can be removed, locking should ensure that the resource
is not removed while accessing it. However, find_next_iomem_res() does
not hold the lock while copying the data of the resource.
Keep holding the lock while the data is copied. While at it, change the
return value to a more informative value. It is disregarded by the
callers.
[akpm(a)linux-foundation.org: fix find_next_iomem_res() documentation]
Link: http://lkml.kernel.org/r/20190613045903.4922-2-namit@vmware.com
Fixes: ff3cc952d3f00 ("resource: Add remove_resource interface")
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: Toshi Kani <toshi.kani(a)hpe.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/kernel/resource.c b/kernel/resource.c
index d22423e85cf8..3ced0cd45bdd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
@@ -365,16 +365,16 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
break;
}
+ if (p) {
+ /* copy data */
+ res->start = max(start, p->start);
+ res->end = min(end, p->end);
+ res->flags = p->flags;
+ res->desc = p->desc;
+ }
+
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 49f17c26c123b60fd1c74629eef077740d16ffc2 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Thu, 18 Jul 2019 15:57:31 -0700
Subject: [PATCH] resource: fix locking in find_next_iomem_res()
Since resources can be removed, locking should ensure that the resource
is not removed while accessing it. However, find_next_iomem_res() does
not hold the lock while copying the data of the resource.
Keep holding the lock while the data is copied. While at it, change the
return value to a more informative value. It is disregarded by the
callers.
[akpm(a)linux-foundation.org: fix find_next_iomem_res() documentation]
Link: http://lkml.kernel.org/r/20190613045903.4922-2-namit@vmware.com
Fixes: ff3cc952d3f00 ("resource: Add remove_resource interface")
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: Toshi Kani <toshi.kani(a)hpe.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/kernel/resource.c b/kernel/resource.c
index d22423e85cf8..3ced0cd45bdd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
@@ -365,16 +365,16 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
break;
}
+ if (p) {
+ /* copy data */
+ res->start = max(start, p->start);
+ res->end = min(end, p->end);
+ res->flags = p->flags;
+ res->desc = p->desc;
+ }
+
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 49f17c26c123b60fd1c74629eef077740d16ffc2 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Thu, 18 Jul 2019 15:57:31 -0700
Subject: [PATCH] resource: fix locking in find_next_iomem_res()
Since resources can be removed, locking should ensure that the resource
is not removed while accessing it. However, find_next_iomem_res() does
not hold the lock while copying the data of the resource.
Keep holding the lock while the data is copied. While at it, change the
return value to a more informative value. It is disregarded by the
callers.
[akpm(a)linux-foundation.org: fix find_next_iomem_res() documentation]
Link: http://lkml.kernel.org/r/20190613045903.4922-2-namit@vmware.com
Fixes: ff3cc952d3f00 ("resource: Add remove_resource interface")
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: Toshi Kani <toshi.kani(a)hpe.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/kernel/resource.c b/kernel/resource.c
index d22423e85cf8..3ced0cd45bdd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
@@ -365,16 +365,16 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
break;
}
+ if (p) {
+ /* copy data */
+ res->start = max(start, p->start);
+ res->end = min(end, p->end);
+ res->flags = p->flags;
+ res->desc = p->desc;
+ }
+
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 89705e92700170888236555fe91b45e4c1bb0985 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg(a)mellanox.com>
Date: Fri, 5 Jul 2019 19:21:57 +0300
Subject: [PATCH] IB/mlx5: Report correctly tag matching rendezvous capability
Userspace expects the IB_TM_CAP_RC bit to indicate that the device
supports RC transport tag matching with rendezvous offload. However the
firmware splits this into two capabilities for eager and rendezvous tag
matching.
Only if the FW supports both modes should userspace be told the tag
matching capability is available.
Cc: <stable(a)vger.kernel.org> # 4.13
Fixes: eb761894351d ("IB/mlx5: Fill XRQ capabilities")
Signed-off-by: Danit Goldberg <danitg(a)mellanox.com>
Reviewed-by: Yishai Hadas <yishaih(a)mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko(a)mellanox.com>
Signed-off-by: Leon Romanovsky <leonro(a)mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 7581571bd9cd..56d4b1e9dd23 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1046,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
if (MLX5_CAP_GEN(mdev, tag_matching)) {
- props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
props->tm_caps.max_num_tags =
(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
- props->tm_caps.flags = IB_TM_CAP_RC;
props->tm_caps.max_ops =
1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
}
+ if (MLX5_CAP_GEN(mdev, tag_matching) &&
+ MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
+ props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
+ props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
+ }
+
if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
props->cq_caps.max_cq_moderation_count =
MLX5_MAX_CQ_COUNT;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 50806bef9f20..4053be51b7fa 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -307,8 +307,8 @@ struct ib_rss_caps {
};
enum ib_tm_cap_flags {
- /* Support tag matching on RC transport */
- IB_TM_CAP_RC = 1 << 0,
+ /* Support tag matching with rendezvous offload for RC transport */
+ IB_TM_CAP_RNDV_RC = 1 << 0,
};
struct ib_tm_caps {
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 803f0f64d17769071d7287d9e3e3b79a3e1ae937 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 19 Jun 2019 13:05:39 +0100
Subject: [PATCH] Btrfs: fix fsync not persisting dentry deletions due to inode
evictions
In order to avoid searches on a log tree when unlinking an inode, we check
if the inode being unlinked was logged in the current transaction, as well
as the inode of its parent directory. When any of the inodes are logged,
we proceed to delete directory items and inode reference items from the
log, to ensure that if a subsequent fsync of only the inode being unlinked
or only of the parent directory when the other is not fsync'ed as well,
does not result in the entry still existing after a power failure.
That check however is not reliable when one of the inodes involved (the
one being unlinked or its parent directory's inode) is evicted, since the
logged_trans field is transient, that is, it is not stored on disk, so it
is lost when the inode is evicted and loaded into memory again (which is
set to zero on load). As a consequence the checks currently being done by
btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() always
return true if the inode was evicted before, regardless of the inode
having been logged or not before (and in the current transaction), this
results in the dentry being unlinked still existing after a log replay
if after the unlink operation only one of the inodes involved is fsync'ed.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir
$ touch /mnt/dir/foo
$ xfs_io -c fsync /mnt/dir/foo
# Keep an open file descriptor on our directory while we evict inodes.
# We just want to evict the file's inode, the directory's inode must not
# be evicted.
$ ( cd /mnt/dir; while true; do :; done ) &
$ pid=$!
# Wait a bit to give time to background process to chdir to our test
# directory.
$ sleep 0.5
# Trigger eviction of the file's inode.
$ echo 2 > /proc/sys/vm/drop_caches
# Unlink our file and fsync the parent directory. After a power failure
# we don't expect to see the file anymore, since we fsync'ed the parent
# directory.
$ rm -f $SCRATCH_MNT/dir/foo
$ xfs_io -c fsync /mnt/dir
<power failure>
$ mount /dev/sdb /mnt
$ ls /mnt/dir
foo
$
--> file still there, unlink not persisted despite explicit fsync on dir
Fix this by checking if the inode has the full_sync bit set in its runtime
flags as well, since that bit is set everytime an inode is loaded from
disk, or for other less common cases such as after a shrinking truncate
or failure to allocate extent maps for holes, and gets cleared after the
first fsync. Also consider the inode as possibly logged only if it was
last modified in the current transaction (besides having the full_fsync
flag set).
Fixes: 3a5f1d458ad161 ("Btrfs: Optimize btree walking while logging inodes")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a04659fded7..6c8297bcfeb7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3322,6 +3322,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
+
/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
@@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 803f0f64d17769071d7287d9e3e3b79a3e1ae937 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 19 Jun 2019 13:05:39 +0100
Subject: [PATCH] Btrfs: fix fsync not persisting dentry deletions due to inode
evictions
In order to avoid searches on a log tree when unlinking an inode, we check
if the inode being unlinked was logged in the current transaction, as well
as the inode of its parent directory. When any of the inodes are logged,
we proceed to delete directory items and inode reference items from the
log, to ensure that if a subsequent fsync of only the inode being unlinked
or only of the parent directory when the other is not fsync'ed as well,
does not result in the entry still existing after a power failure.
That check however is not reliable when one of the inodes involved (the
one being unlinked or its parent directory's inode) is evicted, since the
logged_trans field is transient, that is, it is not stored on disk, so it
is lost when the inode is evicted and loaded into memory again (which is
set to zero on load). As a consequence the checks currently being done by
btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() always
return true if the inode was evicted before, regardless of the inode
having been logged or not before (and in the current transaction), this
results in the dentry being unlinked still existing after a log replay
if after the unlink operation only one of the inodes involved is fsync'ed.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir
$ touch /mnt/dir/foo
$ xfs_io -c fsync /mnt/dir/foo
# Keep an open file descriptor on our directory while we evict inodes.
# We just want to evict the file's inode, the directory's inode must not
# be evicted.
$ ( cd /mnt/dir; while true; do :; done ) &
$ pid=$!
# Wait a bit to give time to background process to chdir to our test
# directory.
$ sleep 0.5
# Trigger eviction of the file's inode.
$ echo 2 > /proc/sys/vm/drop_caches
# Unlink our file and fsync the parent directory. After a power failure
# we don't expect to see the file anymore, since we fsync'ed the parent
# directory.
$ rm -f $SCRATCH_MNT/dir/foo
$ xfs_io -c fsync /mnt/dir
<power failure>
$ mount /dev/sdb /mnt
$ ls /mnt/dir
foo
$
--> file still there, unlink not persisted despite explicit fsync on dir
Fix this by checking if the inode has the full_sync bit set in its runtime
flags as well, since that bit is set everytime an inode is loaded from
disk, or for other less common cases such as after a shrinking truncate
or failure to allocate extent maps for holes, and gets cleared after the
first fsync. Also consider the inode as possibly logged only if it was
last modified in the current transaction (besides having the full_fsync
flag set).
Fixes: 3a5f1d458ad161 ("Btrfs: Optimize btree walking while logging inodes")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a04659fded7..6c8297bcfeb7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3322,6 +3322,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
+
/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
@@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1d832a0b51dd9570429bb4b81b2a6c1759e681a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Fri, 7 Jun 2019 11:25:24 +0100
Subject: [PATCH] Btrfs: fix data loss after inode eviction, renaming it, and
fsync it
When we log an inode, regardless of logging it completely or only that it
exists, we always update it as logged (logged_trans and last_log_commit
fields of the inode are updated). This is generally fine and avoids future
attempts to log it from having to do repeated work that brings no value.
However, if we write data to a file, then evict its inode after all the
dealloc was flushed (and ordered extents completed), rename the file and
fsync it, we end up not logging the new extents, since the rename may
result in logging that the inode exists in case the parent directory was
logged before. The following reproducer shows and explains how this can
happen:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir
$ touch /mnt/dir/foo
$ touch /mnt/dir/bar
# Do a direct IO write instead of a buffered write because with a
# buffered write we would need to make sure dealloc gets flushed and
# complete before we do the inode eviction later, and we can not do that
# from user space with call to things such as sync(2) since that results
# in a transaction commit as well.
$ xfs_io -d -c "pwrite -S 0xd3 0 4K" /mnt/dir/bar
# Keep the directory dir in use while we evict inodes. We want our file
# bar's inode to be evicted but we don't want our directory's inode to
# be evicted (if it were evicted too, we would not be able to reproduce
# the issue since the first fsync below, of file foo, would result in a
# transaction commit.
$ ( cd /mnt/dir; while true; do :; done ) &
$ pid=$!
# Wait a bit to give time for the background process to chdir.
$ sleep 0.1
# Evict all inodes, except the inode for the directory dir because it is
# currently in use by our background process.
$ echo 2 > /proc/sys/vm/drop_caches
# fsync file foo, which ends up persisting information about the parent
# directory because it is a new inode.
$ xfs_io -c fsync /mnt/dir/foo
# Rename bar, this results in logging that this inode exists (inode item,
# names, xattrs) because the parent directory is in the log.
$ mv /mnt/dir/bar /mnt/dir/baz
# Now fsync baz, which ends up doing absolutely nothing because of the
# rename operation which logged that the inode exists only.
$ xfs_io -c fsync /mnt/dir/baz
<power failure>
$ mount /dev/sdb /mnt
$ od -t x1 -A d /mnt/dir/baz
0000000
--> Empty file, data we wrote is missing.
Fix this by not updating last_sub_trans of an inode when we are logging
only that it exists and the inode was not yet logged since it was loaded
from disk (full_sync bit set), this is enough to make btrfs_inode_in_log()
return false for this scenario and make us log the inode. The logged_trans
of the inode is still always setsince that alone is used to track if names
need to be deleted as part of unlink operations.
Fixes: 257c62e1bce03e ("Btrfs: avoid tree log commit when there are no changes")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3fc8d854d7fb..4a04659fded7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5420,9 +5420,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1d832a0b51dd9570429bb4b81b2a6c1759e681a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Fri, 7 Jun 2019 11:25:24 +0100
Subject: [PATCH] Btrfs: fix data loss after inode eviction, renaming it, and
fsync it
When we log an inode, regardless of logging it completely or only that it
exists, we always update it as logged (logged_trans and last_log_commit
fields of the inode are updated). This is generally fine and avoids future
attempts to log it from having to do repeated work that brings no value.
However, if we write data to a file, then evict its inode after all the
dealloc was flushed (and ordered extents completed), rename the file and
fsync it, we end up not logging the new extents, since the rename may
result in logging that the inode exists in case the parent directory was
logged before. The following reproducer shows and explains how this can
happen:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/dir
$ touch /mnt/dir/foo
$ touch /mnt/dir/bar
# Do a direct IO write instead of a buffered write because with a
# buffered write we would need to make sure dealloc gets flushed and
# complete before we do the inode eviction later, and we can not do that
# from user space with call to things such as sync(2) since that results
# in a transaction commit as well.
$ xfs_io -d -c "pwrite -S 0xd3 0 4K" /mnt/dir/bar
# Keep the directory dir in use while we evict inodes. We want our file
# bar's inode to be evicted but we don't want our directory's inode to
# be evicted (if it were evicted too, we would not be able to reproduce
# the issue since the first fsync below, of file foo, would result in a
# transaction commit.
$ ( cd /mnt/dir; while true; do :; done ) &
$ pid=$!
# Wait a bit to give time for the background process to chdir.
$ sleep 0.1
# Evict all inodes, except the inode for the directory dir because it is
# currently in use by our background process.
$ echo 2 > /proc/sys/vm/drop_caches
# fsync file foo, which ends up persisting information about the parent
# directory because it is a new inode.
$ xfs_io -c fsync /mnt/dir/foo
# Rename bar, this results in logging that this inode exists (inode item,
# names, xattrs) because the parent directory is in the log.
$ mv /mnt/dir/bar /mnt/dir/baz
# Now fsync baz, which ends up doing absolutely nothing because of the
# rename operation which logged that the inode exists only.
$ xfs_io -c fsync /mnt/dir/baz
<power failure>
$ mount /dev/sdb /mnt
$ od -t x1 -A d /mnt/dir/baz
0000000
--> Empty file, data we wrote is missing.
Fix this by not updating last_sub_trans of an inode when we are logging
only that it exists and the inode was not yet logged since it was loaded
from disk (full_sync bit set), this is enough to make btrfs_inode_in_log()
return false for this scenario and make us log the inode. The logged_trans
of the inode is still always setsince that alone is used to track if names
need to be deleted as part of unlink operations.
Fixes: 257c62e1bce03e ("Btrfs: avoid tree log commit when there are no changes")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3fc8d854d7fb..4a04659fded7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5420,9 +5420,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 64adde31c8e996a6db6f7a1a4131180e363aa9f2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel(a)linaro.org>
Date: Wed, 29 May 2019 11:43:52 +0200
Subject: [PATCH] PCI: qcom: Ensure that PERST is asserted for at least 100 ms
Currently, there is only a 1 ms sleep after asserting PERST.
Reading the datasheets for different endpoints, some require PERST to be
asserted for 10 ms in order for the endpoint to perform a reset, others
require it to be asserted for 50 ms.
Several SoCs using this driver uses PCIe Mini Card, where we don't know
what endpoint will be plugged in.
The PCI Express Card Electromechanical Specification r2.0, section
2.2, "PERST# Signal" specifies:
"On power up, the deassertion of PERST# is delayed 100 ms (TPVPERL) from
the power rails achieving specified operating limits."
Add a sleep of 100 ms before deasserting PERST, in order to ensure that
we are compliant with the spec.
Fixes: 82a823833f4e ("PCI: qcom: Add Qualcomm PCIe controller driver")
Signed-off-by: Niklas Cassel <niklas.cassel(a)linaro.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Acked-by: Stanimir Varbanov <svarbanov(a)mm-sol.com>
Cc: stable(a)vger.kernel.org # 4.5+
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index da5dd3639a49..7e581748ee9f 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -178,6 +178,8 @@ static void qcom_ep_reset_assert(struct qcom_pcie *pcie)
static void qcom_ep_reset_deassert(struct qcom_pcie *pcie)
{
+ /* Ensure that PERST has been asserted for at least 100 ms */
+ msleep(100);
gpiod_set_value_cansleep(pcie->reset, 0);
usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 64adde31c8e996a6db6f7a1a4131180e363aa9f2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel(a)linaro.org>
Date: Wed, 29 May 2019 11:43:52 +0200
Subject: [PATCH] PCI: qcom: Ensure that PERST is asserted for at least 100 ms
Currently, there is only a 1 ms sleep after asserting PERST.
Reading the datasheets for different endpoints, some require PERST to be
asserted for 10 ms in order for the endpoint to perform a reset, others
require it to be asserted for 50 ms.
Several SoCs using this driver uses PCIe Mini Card, where we don't know
what endpoint will be plugged in.
The PCI Express Card Electromechanical Specification r2.0, section
2.2, "PERST# Signal" specifies:
"On power up, the deassertion of PERST# is delayed 100 ms (TPVPERL) from
the power rails achieving specified operating limits."
Add a sleep of 100 ms before deasserting PERST, in order to ensure that
we are compliant with the spec.
Fixes: 82a823833f4e ("PCI: qcom: Add Qualcomm PCIe controller driver")
Signed-off-by: Niklas Cassel <niklas.cassel(a)linaro.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Acked-by: Stanimir Varbanov <svarbanov(a)mm-sol.com>
Cc: stable(a)vger.kernel.org # 4.5+
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index da5dd3639a49..7e581748ee9f 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -178,6 +178,8 @@ static void qcom_ep_reset_assert(struct qcom_pcie *pcie)
static void qcom_ep_reset_deassert(struct qcom_pcie *pcie)
{
+ /* Ensure that PERST has been asserted for at least 100 ms */
+ msleep(100);
gpiod_set_value_cansleep(pcie->reset, 0);
usleep_range(PERST_DELAY_US, PERST_DELAY_US + 500);
}
The patch below does not apply to the 5.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7608bf40cf2480057ec0da31456cc428791c32ef Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 11 Jun 2019 13:09:51 -0300
Subject: [PATCH] RDMA/odp: Fix missed unlock in non-blocking invalidate_start
If invalidate_start returns with EAGAIN then the umem_rwsem needs to be
unlocked as no invalidate_end will be called.
Cc: <stable(a)vger.kernel.org>
Fixes: ca748c39ea3f ("RDMA/umem: Get rid of per_mm->notifier_count")
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
Reviewed-by: Leon Romanovsky <leonro(a)mellanox.com>
Signed-off-by: Doug Ledford <dledford(a)redhat.com>
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 9001cc10770a..eb9939d52818 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -149,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
{
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
+ int rc;
if (mmu_notifier_range_blockable(range))
down_read(&per_mm->umem_rwsem);
@@ -165,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
return 0;
}
- return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
- range->end,
- invalidate_range_start_trampoline,
- mmu_notifier_range_blockable(range),
- NULL);
+ rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+ range->end,
+ invalidate_range_start_trampoline,
+ mmu_notifier_range_blockable(range),
+ NULL);
+ if (rc)
+ up_read(&per_mm->umem_rwsem);
+ return rc;
}
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7608bf40cf2480057ec0da31456cc428791c32ef Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 11 Jun 2019 13:09:51 -0300
Subject: [PATCH] RDMA/odp: Fix missed unlock in non-blocking invalidate_start
If invalidate_start returns with EAGAIN then the umem_rwsem needs to be
unlocked as no invalidate_end will be called.
Cc: <stable(a)vger.kernel.org>
Fixes: ca748c39ea3f ("RDMA/umem: Get rid of per_mm->notifier_count")
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
Reviewed-by: Leon Romanovsky <leonro(a)mellanox.com>
Signed-off-by: Doug Ledford <dledford(a)redhat.com>
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 9001cc10770a..eb9939d52818 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -149,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
{
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
+ int rc;
if (mmu_notifier_range_blockable(range))
down_read(&per_mm->umem_rwsem);
@@ -165,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
return 0;
}
- return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
- range->end,
- invalidate_range_start_trampoline,
- mmu_notifier_range_blockable(range),
- NULL);
+ rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+ range->end,
+ invalidate_range_start_trampoline,
+ mmu_notifier_range_blockable(range),
+ NULL);
+ if (rc)
+ up_read(&per_mm->umem_rwsem);
+ return rc;
}
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bcef5b7215681250c4bf8961dfe15e9e4fef97d0 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Wed, 29 May 2019 09:38:31 -0700
Subject: [PATCH] RDMA/srp: Accept again source addresses that do not have a
port number
The function srp_parse_in() is used both for parsing source address
specifications and for target address specifications. Target addresses
must have a port number. Having to specify a port number for source
addresses is inconvenient. Make sure that srp_parse_in() supports again
parsing addresses with no port number.
Cc: <stable(a)vger.kernel.org>
Fixes: c62adb7def71 ("IB/srp: Fix IPv6 address parsing")
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index be9ddcad8f28..87848faa7502 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3481,13 +3481,14 @@ static const match_table_t srp_opt_tokens = {
* @net: [in] Network namespace.
* @sa: [out] Address family, IP address and port number.
* @addr_port_str: [in] IP address and port number.
+ * @has_port: [out] Whether or not @addr_port_str includes a port number.
*
* Parse the following address formats:
* - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5.
* - IPv6: \[<ipv6_address>\]:<port>, e.g. [1::2:3%4]:5.
*/
static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
- const char *addr_port_str)
+ const char *addr_port_str, bool *has_port)
{
char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL);
char *port_str;
@@ -3496,9 +3497,12 @@ static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
if (!addr)
return -ENOMEM;
port_str = strrchr(addr, ':');
- if (!port_str)
- return -EINVAL;
- *port_str++ = '\0';
+ if (port_str && strchr(port_str, ']'))
+ port_str = NULL;
+ if (port_str)
+ *port_str++ = '\0';
+ if (has_port)
+ *has_port = port_str != NULL;
ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa);
if (ret && addr[0]) {
addr_end = addr + strlen(addr) - 1;
@@ -3520,6 +3524,7 @@ static int srp_parse_options(struct net *net, const char *buf,
char *p;
substring_t args[MAX_OPT_ARGS];
unsigned long long ull;
+ bool has_port;
int opt_mask = 0;
int token;
int ret = -EINVAL;
@@ -3618,7 +3623,8 @@ static int srp_parse_options(struct net *net, const char *buf,
ret = -ENOMEM;
goto out;
}
- ret = srp_parse_in(net, &target->rdma_cm.src.ss, p);
+ ret = srp_parse_in(net, &target->rdma_cm.src.ss, p,
+ NULL);
if (ret < 0) {
pr_warn("bad source parameter '%s'\n", p);
kfree(p);
@@ -3634,7 +3640,10 @@ static int srp_parse_options(struct net *net, const char *buf,
ret = -ENOMEM;
goto out;
}
- ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p);
+ ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p,
+ &has_port);
+ if (!has_port)
+ ret = -EINVAL;
if (ret < 0) {
pr_warn("bad dest parameter '%s'\n", p);
kfree(p);
The patch below does not apply to the 5.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Mon, 1 Jul 2019 14:09:18 +0900
Subject: [PATCH] block: Limit zone array allocation size
Limit the size of the struct blk_zone array used in
blk_revalidate_disk_zones() to avoid memory allocation failures leading
to disk revalidation failure. Also further reduce the likelyhood of
such failures by using kvcalloc() (that is vmalloc()) instead of
allocating contiguous pages with alloc_pages().
Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 58ced170b424..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,8 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include "blk.h"
@@ -371,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
* Allocate an array of struct blk_zone to get nr_zones zone information.
* The allocated array may be smaller than nr_zones.
*/
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
{
- size_t size = *nr_zones * sizeof(struct blk_zone);
- struct page *page;
- int order;
-
- for (order = get_order(size); order >= 0; order--) {
- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
- if (page) {
- *nr_zones = min_t(unsigned int, *nr_zones,
- (PAGE_SIZE << order) / sizeof(struct blk_zone));
- return page_address(page);
- }
+ struct blk_zone *zones;
+ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+ /*
+ * GFP_KERNEL here is meaningless as the caller task context has
+ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+ * with memalloc_noio_save().
+ */
+ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ *nr_zones = 0;
+ return NULL;
}
- return NULL;
+ *nr_zones = nrz;
+
+ return zones;
}
void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -448,7 +453,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
/* Get zone information and initialize seq_zones_bitmap */
rep_nr_zones = nr_zones;
- zones = blk_alloc_zones(q->node, &rep_nr_zones);
+ zones = blk_alloc_zones(&rep_nr_zones);
if (!zones)
goto out;
@@ -487,8 +492,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
out:
memalloc_noio_restore(noio_flag);
- free_pages((unsigned long)zones,
- get_order(rep_nr_zones * sizeof(struct blk_zone)));
+ kvfree(zones);
kfree(seq_zones_wlock);
kfree(seq_zones_bitmap);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05036e3e3458..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,6 +344,11 @@ struct queue_limits {
#ifdef CONFIG_BLK_DEV_ZONED
+/*
+ * Maximum number of zones to report with a single report zones command.
+ */
+#define BLK_ZONED_REPORT_MAX_ZONES 8192U
+
extern unsigned int blkdev_nr_zones(struct block_device *bdev);
extern int blkdev_report_zones(struct block_device *bdev,
sector_t sector, struct blk_zone *zones,