irq_remove_generic_chip() can perform several operations on irqs based
on gc->irq_base, depending on the msk parameter value, such as
irq_set_handler(irq, NULL) to remove a handler.
When the generic chip is present in an irq domain (created with a call
to irq_alloc_domain_generic_chips()), gc->irq_base is the base hardware
irq for this chip. It is set to 0 for the first chip in the domain,
0 + n for the next chip (where n is the number of hardware irqs per
chip), and so on.
In that case, the operations done on irqs based on gc->irq_base touch
irqs that are related to neither the chip nor the domain, breaking
unrelated components in the system.
In order to avoid touching these "outside" irqs, take the domain irq
mapping into account and translate the chip hardware irq into a Linux
irq number suitable for the operations performed.
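For context, a quick sketch (illustrative only, not part of the patch) of
the hwirq-to-virq translation the fix relies on, using the existing
irqdomain helpers:

#include <linux/irq.h>
#include <linux/irqdomain.h>

/*
 * For a domain-backed generic chip, gc->irq_base is a hardware irq
 * number and must be mapped to a Linux irq number before calling
 * irq_set_handler() and friends; for non-domain chips it already is
 * a Linux irq number.
 */
static unsigned int gc_hwirq_to_virq(struct irq_chip_generic *gc,
                                     unsigned int offset)
{
        if (!gc->domain)
                return gc->irq_base + offset;

        /* Returns 0 if this hardware irq was never mapped */
        return irq_find_mapping(gc->domain, gc->irq_base + offset);
}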
Fixes: cfefd21e693d ("genirq: Add chip suspend and resume callbacks")
Cc: stable(a)vger.kernel.org
Signed-off-by: Herve Codina <herve.codina(a)bootlin.com>
---
kernel/irq/generic-chip.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c653cd31548d..494584e25ef4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -544,21 +544,28 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int irq;
+ unsigned int i;
raw_spin_lock(&gc_lock);
list_del(&gc->list);
raw_spin_unlock(&gc_lock);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ irq = gc->domain ?
+ irq_find_mapping(gc->domain, gc->irq_base + i) :
+ gc->irq_base + i;
+ if (!irq)
+ continue;
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(irq, NULL);
+ irq_set_chip(irq, &no_irq_chip);
+ irq_set_chip_data(irq, NULL);
+ irq_modify_status(irq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
--
2.41.0
A user can specify a smaller metadata buffer than what the device is
wired to update/access. The kernel makes a copy of the metadata buffer,
into which the device does DMA.
As a result, the device overwrites unrelated kernel memory, causing
random kernel crashes.
The same issue is possible in the extended-LBA case: when the user
specifies a short unaligned buffer, the kernel makes a copy and uses
that for DMA.
Detect these situations and prevent corruption for unprivileged user
passthrough. No change to the status quo for the privileged/root user.
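For illustration, a hypothetical unprivileged reproducer of the bounce
buffer case described above (the device path, namespace geometry and the
metadata size are made-up assumptions); with this patch applied the ioctl
fails with EINVAL instead of corrupting kernel memory:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
        struct nvme_passthru_cmd cmd;
        static char data[8 * 4096];
        char meta[16];                  /* deliberately shorter than 8 * ms */
        int fd = open("/dev/nvme0n1", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&cmd, 0, sizeof(cmd));
        cmd.opcode = 0x02;              /* nvme_cmd_read */
        cmd.nsid = 1;
        cmd.addr = (uintptr_t)data;
        cmd.data_len = sizeof(data);
        cmd.metadata = (uintptr_t)meta;
        cmd.metadata_len = sizeof(meta);
        cmd.cdw12 = 7;                  /* nlb: 8 blocks, 0's based */

        if (ioctl(fd, NVME_IOCTL_IO_CMD, &cmd) < 0)
                perror("NVME_IOCTL_IO_CMD");
        return 0;
}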
Fixes: 63263d60e0f9 ("nvme: Use metadata for passthrough commands")
Cc: stable(a)vger.kernel.org
Reported-by: Vincent Fu <vincent.fu(a)samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k(a)samsung.com>
---
Changes since v3:
- Block only unprivileged user
- Harden the checks by disallowing everything for which the data length
(nlb) cannot be determined
- Separate the bounce buffer checks to a different function
- Factor in CSIs beyond NVM and ZNS
Changes since v2:
- Handle extended-lba case: short unaligned buffer IO
- Reduce the scope of check to only well-known commands
- Do not check early. Move it deeper so that the check is executed less
often
- Combine two patches into one.
Changes since v1:
- Revise the check to exclude PRACT=1 case
drivers/nvme/host/ioctl.c | 116 ++++++++++++++++++++++++++++++++++++++
1 file changed, 116 insertions(+)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..57160ca02e65 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -96,6 +96,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
return (void __user *)ptrval;
}
+static inline bool nvme_nlb_in_cdw12(struct nvme_ns *ns, u8 opcode)
+{
+ u8 csi = ns->head->ids.csi;
+
+ if (csi != NVME_CSI_NVM && csi != NVME_CSI_ZNS)
+ return false;
+
+ switch (opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_write:
+ case nvme_cmd_compare:
+ case nvme_cmd_zone_append:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * NVMe has no separate field to encode the metadata length expected
+ * (except when using SGLs).
+ *
+ * Because of that we can't allow the transfer of arbitrary metadata, as
+ * a metadata buffer that is shorter than what the device expects for
+ * the command will lead to arbitrary kernel (if bounce buffering) or
+ * userspace (if not) memory corruption.
+ *
+ * Check that external metadata is only specified for the few commands
+ * where we know the length based on other fields, and that it fits
+ * the actual data transfer from/to the device.
+ */
+static bool nvme_validate_metadata_len(struct request *req, unsigned meta_len)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ struct nvme_command *c = nvme_req(req)->cmd;
+ u32 len_by_nlb;
+
+ /* Do not guard admin */
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+
+ /* Block commands that do not have nlb in cdw12 */
+ if (!nvme_nlb_in_cdw12(ns, c->common.opcode)) {
+ dev_err(ns->ctrl->device,
+ "unknown metadata command %c\n", c->common.opcode);
+ return false;
+ }
+
+ /* Skip when PI is inserted or stripped and not transferred */
+ if (ns->ms == ns->pi_size &&
+ (c->rw.control & cpu_to_le16(NVME_RW_PRINFO_PRACT)))
+ return true;
+
+ if (ns->features & NVME_NS_EXT_LBAS) {
+ dev_err(ns->ctrl->device,
+ "requires extended LBAs for metadata\n");
+ return false;
+ }
+
+ len_by_nlb = (le16_to_cpu(c->rw.length) + 1) * ns->ms;
+ if (meta_len < len_by_nlb) {
+ dev_err(ns->ctrl->device,
+ "metadata length (%u instad of %u) is too small.\n",
+ meta_len, len_by_nlb);
+ return false;
+ }
+
+ return true;
+}
+
static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
unsigned len, u32 seed)
{
@@ -104,6 +174,9 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
void *buf;
struct bio *bio = req->bio;
+ if (!nvme_validate_metadata_len(req, len))
+ return ERR_PTR(-EINVAL);
+
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
goto out;
@@ -134,6 +207,41 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
return ERR_PTR(ret);
}
+static bool nvme_validate_buffer_len(struct nvme_ns *ns, struct nvme_command *c,
+ unsigned meta_len, unsigned data_len)
+{
+ u32 mlen_by_nlb, dlen_by_nlb;
+
+ /* Do not guard admin */
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+
+ /* Block commands that do not have nlb in cdw12 */
+ if (!nvme_nlb_in_cdw12(ns, c->common.opcode)) {
+ dev_err(ns->ctrl->device,
+ "unknown metadata command %c.\n", c->common.opcode);
+ return false;
+ }
+
+ /* When PI is inserted or stripped and not transferred. */
+ if (ns->ms == ns->pi_size &&
+ (c->rw.control & cpu_to_le16(NVME_RW_PRINFO_PRACT)))
+ mlen_by_nlb = 0;
+ else
+ mlen_by_nlb = (le16_to_cpu(c->rw.length) + 1) * ns->ms;
+
+ dlen_by_nlb = (le16_to_cpu(c->rw.length) + 1) << ns->lba_shift;
+
+ if (data_len < (dlen_by_nlb + mlen_by_nlb)) {
+ dev_err(ns->ctrl->device,
+ "buffer length (%u instad of %u) is too small.\n",
+ data_len, dlen_by_nlb + mlen_by_nlb);
+ return false;
+ }
+
+ return true;
+}
+
static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
void *meta, unsigned len, int ret)
{
@@ -202,6 +310,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
}
*metap = meta;
}
+ /* Guard for a short bounce buffer */
+ if (bio->bi_private) {
+ if (!nvme_validate_buffer_len(ns, nvme_req(req)->cmd,
+ meta_len, bufflen)) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+ }
return ret;
--
2.25.1
When we sync the register cache we do so with the cache bypassed in order
to avoid overhead from writing the synced values back into the cache. If
the regmap has ranges and the selector register for those ranges is itself
cached, this has the unfortunate side effect that the physical and cached
copies of the selector register can be out of sync after a cache sync. The
cache will have whatever the selector was when
the sync started and the hardware will have the selector for the register
that was synced last.
Fix this by rewriting all cached selector registers after every sync,
ensuring that the hardware and cache have the same content. This will
result in extra writes that wouldn't otherwise be needed, but it is
simple and therefore hopefully robust. We don't read from the hardware
since not all
devices have physical read support.
Given that nobody noticed this until now, it is likely that we are rarely,
if ever, hitting this case.
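For context, a minimal sketch (made-up register numbers and names) of the
kind of setup this affects: a windowed range whose selector register is
itself covered by the register cache:

#include <linux/kernel.h>
#include <linux/regmap.h>

static const struct regmap_range_cfg example_ranges[] = {
        {
                .name = "pages",
                .range_min = 0x100,
                .range_max = 0x4ff,
                .selector_reg = 0x00,   /* page select, also cached */
                .selector_mask = 0x03,
                .selector_shift = 0,
                .window_start = 0x10,
                .window_len = 0x100,
        },
};

static const struct regmap_config example_config = {
        .reg_bits = 8,
        .val_bits = 8,
        .max_register = 0x4ff,
        .cache_type = REGCACHE_RBTREE,
        .ranges = example_ranges,
        .num_ranges = ARRAY_SIZE(example_ranges),
};

With the change below, regcache_sync() finishes by rewriting the selector
at 0x00 from the cache, so hardware and cache agree on the current page.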
Reported-by: Hector Martin <marcan(a)marcan.st>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
drivers/base/regmap/regcache.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index c5d151e9c481..92592f944a3d 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -334,6 +334,11 @@ static int regcache_default_sync(struct regmap *map, unsigned int min,
return 0;
}
+static int rbtree_all(const void *key, const struct rb_node *node)
+{
+ return 0;
+}
+
/**
* regcache_sync - Sync the register cache with the hardware.
*
@@ -351,6 +356,7 @@ int regcache_sync(struct regmap *map)
unsigned int i;
const char *name;
bool bypass;
+ struct rb_node *node;
if (WARN_ON(map->cache_type == REGCACHE_NONE))
return -EINVAL;
@@ -392,6 +398,30 @@ int regcache_sync(struct regmap *map)
/* Restore the bypass state */
map->cache_bypass = bypass;
map->no_sync_defaults = false;
+
+ /*
+ * If we did any paging with cache bypassed and a cached
+ * paging register then the register and cache state might
+ * have gone out of sync, force writes of all the paging
+ * registers.
+ */
+ rb_for_each(node, 0, &map->range_tree, rbtree_all) {
+ struct regmap_range_node *this =
+ rb_entry(node, struct regmap_range_node, node);
+
+ /* If there's nothing in the cache there's nothing to sync */
+ ret = regcache_read(map, this->selector_reg, &i);
+ if (ret != 0)
+ continue;
+
+ ret = _regmap_write(map, this->selector_reg, i);
+ if (ret != 0) {
+ dev_err(map->dev, "Failed to write %x = %x: %d\n",
+ this->selector_reg, i, ret);
+ break;
+ }
+ }
+
map->unlock(map->lock_arg);
regmap_async_complete(map);
---
base-commit: 611da07b89fdd53f140d7b33013f255bf0ed8f34
change-id: 20231026-regmap-fix-selector-sync-ad1514fd15df
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Hi,
in kernel 6.1 (maybe 5.x - 6.x) there's an ATACB setting for "Super Top
USB 2.0 SATA Bridge" devices, where the minimum bcdDevice version to
match has been set to 1.60. It's in the file
drivers/usb/storage/unusual_cypress.h:
"""
UNUSUAL_DEV( 0x14cd, 0x6116, 0x0160, 0x0160,
"Super Top",
"USB 2.0 SATA BRIDGE",
USB_SC_CYP_ATACB, USB_PR_DEVICE, NULL, 0),
"""
My old USB HDD with a "Super Top" bridge has bcdDevice version 1.50,
thus the setting won't match and it will not mount.
I'm not sure when this changed (after kernel 4.x?), but it used to work
before. Reading some earlier bug reports, it seems that the max version
used to be 0x9999, which then caused corruption in "Super Top" devices
with version >=2.20. So that's a reason for lowering the maximum value,
but I wonder why the minimum value has also been set to 0x0160.
I created a patch, changing 0x0160 to 0x0150 (though I should've left
the max version as it was...):
"""
UNUSUAL_DEV( 0x14cd, 0x6116, 0x0150, 0x0150,
"""
Built, installed and rebooted; now the USB HDD can be mounted and works
perfectly again. I did some write & read tests, checked with diff, cmp
and md5sum - no corruption, everything OK 👍
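For what it's worth, a variant that only lowers the minimum and keeps the
existing maximum (untested, just what my remark above suggests) would be:
"""
UNUSUAL_DEV( 0x14cd, 0x6116, 0x0150, 0x0160,
"Super Top",
"USB 2.0 SATA BRIDGE",
USB_SC_CYP_ATACB, USB_PR_DEVICE, NULL, 0),
"""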
Best regards,
LihaS
From: Anthony Krowiak <akrowiak(a)linux.ibm.com>
In the vfio_ap_irq_enable function, after the page containing the
notification indicator byte (NIB) is pinned, the function attempts
to register the guest ISC. If registration fails, the function sets the
status response code and returns without unpinning the page containing
the NIB. In order to avoid a memory leak, the NIB should be unpinned before
returning from the vfio_ap_irq_enable function.
Co-developed-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Anthony Krowiak <akrowiak(a)linux.ibm.com>
Reviewed-by: Matthew Rosato <mjrosato(a)linux.ibm.com>
Fixes: 783f0a3ccd79 ("s390/vfio-ap: add s390dbf logging to the vfio_ap_irq_enable function")
Cc: <stable(a)vger.kernel.org>
---
drivers/s390/crypto/vfio_ap_ops.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 4db538a55192..9cb28978c186 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -457,6 +457,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
VFIO_AP_DBF_WARN("%s: gisc registration failed: nisc=%d, isc=%d, apqn=%#04x\n",
__func__, nisc, isc, q->apqn);
+ vfio_unpin_pages(&q->matrix_mdev->vdev, nib, 1);
status.response_code = AP_RESPONSE_INVALID_GISA;
return status;
}
--
2.41.0
The rtnl lock also needs to be held before rndis_filter_device_add(),
which advertises the nvsp_2_vsc_capability / sriov bit and triggers
VF NIC offering and registering. If the VF NIC finishes register_netdev()
earlier, it may cause name-based configuration failures.
To fix this issue, move the call to rtnl_lock() before
rndis_filter_device_add(), so the VF will be registered later than the
netvsc / synthetic NIC and gets a name (ethX) numbered after netvsc.
Also, move register_netdevice_notifier() earlier, so the callback
function is set before probing.
Cc: stable(a)vger.kernel.org
Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()")
Signed-off-by: Haiyang Zhang <haiyangz(a)microsoft.com>
---
drivers/net/hyperv/netvsc_drv.c | 30 +++++++++++++++++++-----------
1 file changed, 19 insertions(+), 11 deletions(-)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 3ba3c8fb28a5..feca1391f756 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2531,15 +2531,6 @@ static int netvsc_probe(struct hv_device *dev,
goto devinfo_failed;
}
- nvdev = rndis_filter_device_add(dev, device_info);
- if (IS_ERR(nvdev)) {
- ret = PTR_ERR(nvdev);
- netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
- goto rndis_failed;
- }
-
- eth_hw_addr_set(net, device_info->mac_adr);
-
/* We must get rtnl lock before scheduling nvdev->subchan_work,
* otherwise netvsc_subchan_work() can get rtnl lock first and wait
* all subchannels to show up, but that may not happen because
@@ -2547,9 +2538,23 @@ static int netvsc_probe(struct hv_device *dev,
* -> ... -> device_add() -> ... -> __device_attach() can't get
* the device lock, so all the subchannels can't be processed --
* finally netvsc_subchan_work() hangs forever.
+ *
+ * The rtnl lock also needs to be held before rndis_filter_device_add()
+ * which advertises nvsp_2_vsc_capability / sriov bit, and triggers
+ * VF NIC offering and registering. If VF NIC finished register_netdev()
+ * earlier it may cause name based config failure.
*/
rtnl_lock();
+ nvdev = rndis_filter_device_add(dev, device_info);
+ if (IS_ERR(nvdev)) {
+ ret = PTR_ERR(nvdev);
+ netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
+ goto rndis_failed;
+ }
+
+ eth_hw_addr_set(net, device_info->mac_adr);
+
if (nvdev->num_chn > 1)
schedule_work(&nvdev->subchan_work);
@@ -2788,11 +2793,14 @@ static int __init netvsc_drv_init(void)
}
netvsc_ring_bytes = ring_size * PAGE_SIZE;
+ register_netdevice_notifier(&netvsc_netdev_notifier);
+
ret = vmbus_driver_register(&netvsc_drv);
- if (ret)
+ if (ret) {
+ unregister_netdevice_notifier(&netvsc_netdev_notifier);
return ret;
+ }
- register_netdevice_notifier(&netvsc_netdev_notifier);
return 0;
}
--
2.25.1
From: Anthony Krowiak <akrowiak(a)linux.ibm.com>
In the vfio_ap_irq_enable function, after the page containing the
notification indicator byte (NIB) is pinned, the function attempts
to register the guest ISC. If registration fails, the function sets the
status response code and returns without unpinning the page containing
the NIB. In order to avoid a memory leak, the NIB should be unpinned before
returning from the vfio_ap_irq_enable function.
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Anthony Krowiak <akrowiak(a)linux.ibm.com>
Reviewed-by: Matthew Rosato <mjrosato(a)linux.ibm.com>
Fixes: 783f0a3ccd79 ("s390/vfio-ap: add s390dbf logging to the vfio_ap_irq_enable function")
Cc: <stable(a)vger.kernel.org>
---
drivers/s390/crypto/vfio_ap_ops.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 4db538a55192..9cb28978c186 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -457,6 +457,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
VFIO_AP_DBF_WARN("%s: gisc registration failed: nisc=%d, isc=%d, apqn=%#04x\n",
__func__, nisc, isc, q->apqn);
+ vfio_unpin_pages(&q->matrix_mdev->vdev, nib, 1);
status.response_code = AP_RESPONSE_INVALID_GISA;
return status;
}
--
2.41.0
A set of issues was brought up in 6.5 by the introduction of
the "laundromat" (which cleans up directory leases). After discussion
and testing with Paulo, Bernd Feige and others, we request that the
following set also be included to address these problems. They are
all in mainline (added in 6.6) and the patches (from mainline since
they apply cleanly) are also attached.
238b351d0935 ("smb3: allow controlling length of time
directory entries are cached with dir leases")
6a50d71d0fff ("smb3: allow controlling maximum number of
cached directories")
2da338ff752a ("smb3: do not start laundromat thread when dir
leases disabled")
3b8bb3171571 ("smb: client: do not start laundromat thread on
nohandlecache")
e95f3f744650 ("smb: client: make laundromat a delayed worker")
81ba10959970 ("smb: client: prevent new fids from being
removed by laundromat")
--
Thanks,
Steve