commit b36e4523d4d5 ("netfilter: nf_conncount: fix garbage collection confirm
race")
An iptable rule like the following on a multicore systems will result in
accepting more connections than set in the rule.
iptables -A INPUT -p tcp -m tcp --syn --dport 7777 -m connlimit \
--connlimit-above 2000 --connlimit-mask 0 -j DROP
In check_hlist function, connections that are found in saved connections
but not in netfilter conntrack are deleted, assuming that those
connections do not exist anymore. But for multi core systems, there exists
a small time window, when a connection has been added to the xt_connlimit
maintained rb-tree but has not yet made to netfilter conntrack table. This
causes concurrent connections to return incorrect counts and go over limit
set in iptable rule.
The fix has been partially backported from the above mentioned upstream
commit. Introduce timestamp and the owning cpu.
Signed-off-by: Alakesh Haloi <alakeshh(a)amazon.com>
Cc: Pablo Neira Ayuso <pablo(a)netfilter.org>
Cc: Jozsef Kadlecsik <kadlec(a)blackhole.kfki.hu>
Cc: Florian Westphal <fw(a)strlen.de>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: stable(a)vger.kernel.org # v4.15 and before
Cc: netdev(a)vger.kernel.org
Cc: Dmitry Andrianov <dmitry.andrianov(a)alertme.com>
Cc: Justin Pettit <jpettit(a)vmware.com>
Cc: Yi-Hung Wei <yihung.wei(a)gmail.com>
---
net/netfilter/xt_connlimit.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index ffa8eec..e7b092b 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -47,6 +47,8 @@ struct xt_connlimit_conn {
struct hlist_node node;
struct nf_conntrack_tuple tuple;
union nf_inet_addr addr;
+ int cpu;
+ u32 jiffies32;
};
struct xt_connlimit_rb {
@@ -126,6 +128,8 @@ static bool add_hlist(struct hlist_head *head,
return false;
conn->tuple = *tuple;
conn->addr = *addr;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
hlist_add_head(&conn->node, head);
return true;
}
@@ -148,8 +152,26 @@ static unsigned int check_hlist(struct net *net,
hlist_for_each_entry_safe(conn, n, head, node) {
found = nf_conntrack_find_get(net, zone, &conn->tuple);
if (found == NULL) {
- hlist_del(&conn->node);
- kmem_cache_free(connlimit_conn_cachep, conn);
+ /* If connection is not found, it may be because
+ * it has not made into conntrack table yet. We
+ * check if it is a recently created connection
+ * on a different core and do not delete it in that
+ * case.
+ */
+
+ unsigned long a, b;
+ int cpu = raw_smp_processor_id();
+ __u32 age;
+
+ b = conn->jiffies;
+ a = (u32)jiffies;
+ age = a - b;
+ if (conn->cpu != cpu && age <= 2) {
+ length++;
+ } else {
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ }
continue;
}
@@ -271,6 +293,8 @@ static void tree_nodes_free(struct rb_root *root,
conn->tuple = *tuple;
conn->addr = *addr;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
rbconn->addr = *addr;
INIT_HLIST_HEAD(&rbconn->hhead);
--
1.8.3.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d57f9da890696af1484f4a47f7f123560197865a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Fri, 30 Nov 2018 15:31:48 +0900
Subject: [PATCH] dm zoned: Fix target BIO completion handling
struct bioctx includes the ref refcount_t to track the number of I/O
fragments used to process a target BIO as well as ensure that the zone
of the BIO is kept in the active state throughout the lifetime of the
BIO. However, since decrementing of this reference count is done in the
target .end_io method, the function bio_endio() must be called multiple
times for read and write target BIOs, which causes problems with the
value of the __bi_remaining struct bio field for chained BIOs (e.g. the
clone BIO passed by dm core is large and splits into fragments by the
block layer), resulting in incorrect values and inconsistencies with the
BIO_CHAIN flag setting. This is turn triggers the BUG_ON() call:
BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
in bio_remaining_done() called from bio_endio().
Fix this ensuring that bio_endio() is called only once for any target
BIO by always using internal clone BIOs for processing any read or
write target BIO. This allows reference counting using the target BIO
context counter to trigger the target BIO completion bio_endio() call
once all data, metadata and other zone work triggered by the BIO
complete.
Overall, this simplifies the code too as the target .end_io becomes
unnecessary and differences between read and write BIO issuing and
completion processing disappear.
Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 981154e59461..6af5babe6837 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -20,7 +20,6 @@ struct dmz_bioctx {
struct dm_zone *zone;
struct bio *bio;
refcount_t ref;
- blk_status_t status;
};
/*
@@ -78,65 +77,66 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
- if (bioctx->status == BLK_STS_OK && status != BLK_STS_OK)
- bioctx->status = status;
- bio_endio(bio);
+ if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
+ bio->bi_status = status;
+
+ if (refcount_dec_and_test(&bioctx->ref)) {
+ struct dm_zone *zone = bioctx->zone;
+
+ if (zone) {
+ if (bio->bi_status != BLK_STS_OK &&
+ bio_op(bio) == REQ_OP_WRITE &&
+ dmz_is_seq(zone))
+ set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
+ dmz_deactivate_zone(zone);
+ }
+ bio_endio(bio);
+ }
}
/*
- * Partial clone read BIO completion callback. This terminates the
+ * Completion callback for an internally cloned target BIO. This terminates the
* target BIO when there are no more references to its context.
*/
-static void dmz_read_bio_end_io(struct bio *bio)
+static void dmz_clone_endio(struct bio *clone)
{
- struct dmz_bioctx *bioctx = bio->bi_private;
- blk_status_t status = bio->bi_status;
+ struct dmz_bioctx *bioctx = clone->bi_private;
+ blk_status_t status = clone->bi_status;
- bio_put(bio);
+ bio_put(clone);
dmz_bio_endio(bioctx->bio, status);
}
/*
- * Issue a BIO to a zone. The BIO may only partially process the
+ * Issue a clone of a target BIO. The clone may only partially process the
* original target BIO.
*/
-static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
- struct bio *bio, sector_t chunk_block,
- unsigned int nr_blocks)
+static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
+ struct bio *bio, sector_t chunk_block,
+ unsigned int nr_blocks)
{
struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
- sector_t sector;
struct bio *clone;
- /* BIO remap sector */
- sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
-
- /* If the read is not partial, there is no need to clone the BIO */
- if (nr_blocks == dmz_bio_blocks(bio)) {
- /* Setup and submit the BIO */
- bio->bi_iter.bi_sector = sector;
- refcount_inc(&bioctx->ref);
- generic_make_request(bio);
- return 0;
- }
-
- /* Partial BIO: we need to clone the BIO */
clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
if (!clone)
return -ENOMEM;
- /* Setup the clone */
- clone->bi_iter.bi_sector = sector;
+ bio_set_dev(clone, dmz->dev->bdev);
+ clone->bi_iter.bi_sector =
+ dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
- clone->bi_end_io = dmz_read_bio_end_io;
+ clone->bi_end_io = dmz_clone_endio;
clone->bi_private = bioctx;
bio_advance(bio, clone->bi_iter.bi_size);
- /* Submit the clone */
refcount_inc(&bioctx->ref);
generic_make_request(clone);
+ if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
+ zone->wp_block += nr_blocks;
+
return 0;
}
@@ -214,7 +214,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
if (nr_blocks) {
/* Valid blocks found: read them */
nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
- ret = dmz_submit_read_bio(dmz, rzone, bio, chunk_block, nr_blocks);
+ ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks);
if (ret)
return ret;
chunk_block += nr_blocks;
@@ -228,25 +228,6 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
return 0;
}
-/*
- * Issue a write BIO to a zone.
- */
-static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone,
- struct bio *bio, sector_t chunk_block,
- unsigned int nr_blocks)
-{
- struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
-
- /* Setup and submit the BIO */
- bio_set_dev(bio, dmz->dev->bdev);
- bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
- refcount_inc(&bioctx->ref);
- generic_make_request(bio);
-
- if (dmz_is_seq(zone))
- zone->wp_block += nr_blocks;
-}
-
/*
* Write blocks directly in a data zone, at the write pointer.
* If a buffer zone is assigned, invalidate the blocks written
@@ -265,7 +246,9 @@ static int dmz_handle_direct_write(struct dmz_target *dmz,
return -EROFS;
/* Submit write */
- dmz_submit_write_bio(dmz, zone, bio, chunk_block, nr_blocks);
+ ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
+ if (ret)
+ return ret;
/*
* Validate the blocks in the data zone and invalidate
@@ -301,7 +284,9 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
return -EROFS;
/* Submit write */
- dmz_submit_write_bio(dmz, bzone, bio, chunk_block, nr_blocks);
+ ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
+ if (ret)
+ return ret;
/*
* Validate the blocks in the buffer zone
@@ -600,7 +585,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bioctx->zone = NULL;
bioctx->bio = bio;
refcount_set(&bioctx->ref, 1);
- bioctx->status = BLK_STS_OK;
/* Set the BIO pending in the flush list */
if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
@@ -623,35 +607,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
-/*
- * Completed target BIO processing.
- */
-static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
-{
- struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
-
- if (bioctx->status == BLK_STS_OK && *error)
- bioctx->status = *error;
-
- if (!refcount_dec_and_test(&bioctx->ref))
- return DM_ENDIO_INCOMPLETE;
-
- /* Done */
- bio->bi_status = bioctx->status;
-
- if (bioctx->zone) {
- struct dm_zone *zone = bioctx->zone;
-
- if (*error && bio_op(bio) == REQ_OP_WRITE) {
- if (dmz_is_seq(zone))
- set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
- }
- dmz_deactivate_zone(zone);
- }
-
- return DM_ENDIO_DONE;
-}
-
/*
* Get zoned device information.
*/
@@ -946,7 +901,6 @@ static struct target_type dmz_type = {
.ctr = dmz_ctr,
.dtr = dmz_dtr,
.map = dmz_map,
- .end_io = dmz_end_io,
.io_hints = dmz_io_hints,
.prepare_ioctl = dmz_prepare_ioctl,
.postsuspend = dmz_suspend,
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 744889b7cbb56a64f957e65ade7cb65fe3f35714 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei(a)redhat.com>
Date: Fri, 12 Oct 2018 15:53:10 +0800
Subject: [PATCH] block: don't deal with discard limit in
blkdev_issue_discard()
blk_queue_split() does respect this limit via bio splitting, so no
need to do that in blkdev_issue_discard(), then we can align to
normal bio submit(bio_add_page() & submit_bio()).
More importantly, this patch fixes one issue introduced in a22c4d7e34402cc
("block: re-add discard_granularity and alignment checks"), in which
zero discard bio may be generated in case of zero alignment.
Fixes: a22c4d7e34402ccdf3 ("block: re-add discard_granularity and alignment checks")
Cc: stable(a)vger.kernel.org
Cc: Ming Lin <ming.l(a)ssi.samsung.com>
Cc: Mike Snitzer <snitzer(a)redhat.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Xiao Ni <xni(a)redhat.com>
Tested-by: Mariusz Dabrowski <mariusz.dabrowski(a)intel.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d1b9dd03da25..bbd44666f2b5 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -29,9 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
- unsigned int granularity;
unsigned int op;
- int alignment;
sector_t bs_mask;
if (!q)
@@ -54,38 +52,16 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
- /* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
while (nr_sects) {
- unsigned int req_sects;
- sector_t end_sect, tmp;
+ unsigned int req_sects = nr_sects;
+ sector_t end_sect;
- /*
- * Issue in chunks of the user defined max discard setting,
- * ensuring that bi_size doesn't overflow
- */
- req_sects = min_t(sector_t, nr_sects,
- q->limits.max_discard_sectors);
if (!req_sects)
goto fail;
if (req_sects > UINT_MAX >> 9)
req_sects = UINT_MAX >> 9;
- /*
- * If splitting a request, and the next starting sector would be
- * misaligned, stop the discard at the previous aligned sector.
- */
end_sect = sector + req_sects;
- tmp = end_sect;
- if (req_sects < nr_sects &&
- sector_div(tmp, granularity) != alignment) {
- end_sect = end_sect - alignment;
- sector_div(end_sect, granularity);
- end_sect = end_sect * granularity + alignment;
- req_sects = end_sect - sector;
- }
bio = next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e4b069e0945fa14c71cf8b5b89f8b1b2aa68dbc2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Wed, 22 Aug 2018 12:45:51 -0400
Subject: [PATCH] dm verity: fix crash on bufio buffer that was allocated with
vmalloc
Since commit d1ac3ff008fb ("dm verity: switch to using asynchronous hash
crypto API") dm-verity uses asynchronous crypto calls for verification,
so that it can use hardware with asynchronous processing of crypto
operations.
These asynchronous calls don't support vmalloc memory, but the buffer data
can be allocated with vmalloc if dm-bufio is short of memory and uses a
reserved buffer that was preallocated in dm_bufio_client_create().
Fix verity_hash_update() so that it deals with vmalloc'd memory
correctly.
Reported-by: "Xiao, Jin" <jin.xiao(a)intel.com>
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Fixes: d1ac3ff008fb ("dm verity: switch to using asynchronous hash crypto API")
Cc: stable(a)vger.kernel.org # 4.12+
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 12decdbd722d..fc65f0dedf7f 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -99,10 +99,26 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
{
struct scatterlist sg;
- sg_init_one(&sg, data, len);
- ahash_request_set_crypt(req, &sg, NULL, len);
-
- return crypto_wait_req(crypto_ahash_update(req), wait);
+ if (likely(!is_vmalloc_addr(data))) {
+ sg_init_one(&sg, data, len);
+ ahash_request_set_crypt(req, &sg, NULL, len);
+ return crypto_wait_req(crypto_ahash_update(req), wait);
+ } else {
+ do {
+ int r;
+ size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data));
+ flush_kernel_vmap_range((void *)data, this_step);
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data));
+ ahash_request_set_crypt(req, &sg, NULL, this_step);
+ r = crypto_wait_req(crypto_ahash_update(req), wait);
+ if (unlikely(r))
+ return r;
+ data += this_step;
+ len -= this_step;
+ } while (len);
+ return 0;
+ }
}
/*
On 1/3/19 5:52 AM, Sasha Levin wrote:
> This commit has been processed because it contains a -stable tag.
> The stable tag indicates that it's relevant for the following trees: all
>
> The bot has tested the following trees: v4.20.0, v4.19.13, v4.14.91, v4.9.148, v4.4.169, v3.18.131,
>
> v4.20.0: Build OK!
> v4.19.13: Build OK!
> v4.14.91: Build OK!
> v4.9.148: Failed to apply! Possible dependencies:
> f50b4878329a ("x86/pkeys/selftests: Fix pkey exhaustion test off-by-one")
Protection keys was merged in 4.8. We can ignore any of the selftests
changes before that.
But, it looks like the 4.9 selftests are a bit behind mainline.
Probably because I didn't cc stable@ on f50b4878329a. I don't have a
strong opinion as to how up-to-date we want to keep the -stable
selftests. Shua, is there a usual way that folks do this?
commit c92a54cfa0257e8ffd66b2a17d49e9c0bd4b769f upstream
This fix appears in 4.20, but dma_direct_supported() was changed in 4.20
such that the original version of the fix will not apply to previous
versions of the kernel. The fix only applies to the 4.19-stable tree and
has been backported for that tree.
The dma_direct_supported() function intends to check the DMA mask against
specific values. However, the phys_to_dma() function includes the SME
encryption mask, which defeats the intended purpose of the check. This
results in drivers that support less than 48-bit DMA (SME encryption mask
is bit 47) from being able to set the DMA mask successfully when SME is
active, which results in the driver failing to initialize.
Change the function used to check the mask from phys_to_dma() to
__phys_to_dma() so that the SME encryption mask is not part of the check.
Fixes: c1d0af1a1d5d ("kernel/dma/direct: take DMA offset into account in dma_direct_supported")
Cc: <stable(a)vger.kernel.org> # 4.19.x
Signed-off-by: Tom Lendacky <thomas.lendacky(a)amd.com>
---
kernel/dma/direct.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index de87b02..1d2f147 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -168,7 +168,12 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
int dma_direct_supported(struct device *dev, u64 mask)
{
#ifdef CONFIG_ZONE_DMA
- if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
+ /*
+ * This check needs to be against the actual bit mask value, so
+ * use __phys_to_dma() here so that the SME encryption mask isn't
+ * part of the check.
+ */
+ if (mask < __phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
return 0;
#else
/*
@@ -176,8 +181,12 @@ int dma_direct_supported(struct device *dev, u64 mask)
* to be able to satisfy them - either by not supporting more physical
* memory, or by providing a ZONE_DMA32. If neither is the case, the
* architecture needs to use an IOMMU instead of the direct mapping.
+ *
+ * This check needs to be against the actual bit mask value, so
+ * use __phys_to_dma() here so that the SME encryption mask isn't
+ * part of the check.
*/
- if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
+ if (mask < __phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
#endif
/*
--
1.9.1
Please apply mainline commit a72b69dc083a931422cc8a5e33841aff7d5312f2
("vhost/vsock: fix uninitialized vhost_vsock->guest_cid") to the v4.9
and v4.14 stable branches.
I believe this is the root cause of an issue uncovered by applying
"vhost/vsock: fix use-after-free in network stack callers" in these
branches. I sometimes see a crash in hash_del_rcu() with vsock in the
call stack, and that call is protected by a newly-added check of
vsock->guest_cid, which was uninitialized before this commit.
v4.4 doesn't have vsock, and v4.19 already has this commit, so they
don't need to be fixed.
Thanks,
-- Daniel
From: Eric Biggers <ebiggers(a)google.com>
Hi Greg, please consider applying this to 4.9-stable and 4.4-stable.
It's a minimal fix for a bug that was fixed incidentally by a large
refactoring in v4.11.
>8------------------------------------------------------8<
In chacha20-simd, clear the MAY_SLEEP flag in the blkcipher_desc to
prevent sleeping with preemption disabled, under kernel_fpu_begin().
This was fixed upstream incidentally by a large refactoring,
commit 9ae433bc79f9 ("crypto: chacha20 - convert generic and x86
versions to skcipher"). But syzkaller easily trips over this when
running on older kernels, as it's easily reachable via AF_ALG.
Therefore, this patch makes the minimal fix for older kernels.
Fixes: c9320b6dcb89 ("crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64")
Cc: linux-crypto(a)vger.kernel.org
Cc: Martin Willi <martin(a)strongswan.org>
Cc: Ard Biesheuvel <ard.biesheuvel(a)linaro.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
arch/x86/crypto/chacha20_glue.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index f910d1d449f00..0a5fedf43bdc8 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -77,6 +77,7 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
--
2.20.1.97.g81188d93c3-goog
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From adcc81f148d733b7e8e641300c5590a2cdc13bf3 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton(a)mips.com>
Date: Thu, 20 Dec 2018 17:45:43 +0000
Subject: [PATCH] MIPS: math-emu: Write-protect delay slot emulation pages
Mapping the delay slot emulation page as both writeable & executable
presents a security risk, in that if an exploit can write to & jump into
the page then it can be used as an easy way to execute arbitrary code.
Prevent this by mapping the page read-only for userland, and using
access_process_vm() with the FOLL_FORCE flag to write to it from
mips_dsemul().
This will likely be less efficient due to copy_to_user_page() performing
cache maintenance on a whole page, rather than a single line as in the
previous use of flush_cache_sigtramp(). However this delay slot
emulation code ought not to be running in any performance critical paths
anyway so this isn't really a problem, and we can probably do better in
copy_to_user_page() anyway in future.
A major advantage of this approach is that the fix is small & simple to
backport to stable kernels.
Reported-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Paul Burton <paul.burton(a)mips.com>
Fixes: 432c6bacbd0c ("MIPS: Use per-mm page to execute branch delay slot instructions")
Cc: stable(a)vger.kernel.org # v4.8+
Cc: linux-mips(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: Rich Felker <dalias(a)libc.org>
Cc: David Daney <david.daney(a)cavium.com>
diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index 48a9c6b90e07..9df3ebdc7b0f 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -126,8 +126,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
/* Map delay slot emulation page */
base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
- VM_READ|VM_WRITE|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ VM_READ | VM_EXEC |
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
0, NULL);
if (IS_ERR_VALUE(base)) {
ret = base;
diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c
index 5450f4d1c920..e2d46cb93ca9 100644
--- a/arch/mips/math-emu/dsemul.c
+++ b/arch/mips/math-emu/dsemul.c
@@ -214,8 +214,9 @@ int mips_dsemul(struct pt_regs *regs, mips_instruction ir,
{
int isa16 = get_isa16_mode(regs->cp0_epc);
mips_instruction break_math;
- struct emuframe __user *fr;
- int err, fr_idx;
+ unsigned long fr_uaddr;
+ struct emuframe fr;
+ int fr_idx, ret;
/* NOP is easy */
if (ir == 0)
@@ -250,27 +251,31 @@ int mips_dsemul(struct pt_regs *regs, mips_instruction ir,
fr_idx = alloc_emuframe();
if (fr_idx == BD_EMUFRAME_NONE)
return SIGBUS;
- fr = &dsemul_page()[fr_idx];
/* Retrieve the appropriately encoded break instruction */
break_math = BREAK_MATH(isa16);
/* Write the instructions to the frame */
if (isa16) {
- err = __put_user(ir >> 16,
- (u16 __user *)(&fr->emul));
- err |= __put_user(ir & 0xffff,
- (u16 __user *)((long)(&fr->emul) + 2));
- err |= __put_user(break_math >> 16,
- (u16 __user *)(&fr->badinst));
- err |= __put_user(break_math & 0xffff,
- (u16 __user *)((long)(&fr->badinst) + 2));
+ union mips_instruction _emul = {
+ .halfword = { ir >> 16, ir }
+ };
+ union mips_instruction _badinst = {
+ .halfword = { break_math >> 16, break_math }
+ };
+
+ fr.emul = _emul.word;
+ fr.badinst = _badinst.word;
} else {
- err = __put_user(ir, &fr->emul);
- err |= __put_user(break_math, &fr->badinst);
+ fr.emul = ir;
+ fr.badinst = break_math;
}
- if (unlikely(err)) {
+ /* Write the frame to user memory */
+ fr_uaddr = (unsigned long)&dsemul_page()[fr_idx];
+ ret = access_process_vm(current, fr_uaddr, &fr, sizeof(fr),
+ FOLL_FORCE | FOLL_WRITE);
+ if (unlikely(ret != sizeof(fr))) {
MIPS_FPU_EMU_INC_STATS(errors);
free_emuframe(fr_idx, current->mm);
return SIGBUS;
@@ -282,10 +287,7 @@ int mips_dsemul(struct pt_regs *regs, mips_instruction ir,
atomic_set(¤t->thread.bd_emu_frame, fr_idx);
/* Change user register context to execute the frame */
- regs->cp0_epc = (unsigned long)&fr->emul | isa16;
-
- /* Ensure the icache observes our newly written frame */
- flush_cache_sigtramp((unsigned long)&fr->emul);
+ regs->cp0_epc = fr_uaddr | isa16;
return 0;
}