When I implemented virtio's hash-related features to tun/tap [1], I found tun/tap does not fill the entire region reserved for the virtio header, leaving some uninitialized hole in the middle of the buffer after read()/recvmesg().
This series fills the uninitialized hole. More concretely, the num_buffers field will be initialized with 1, and the other fields will be inialized with 0. Setting the num_buffers field to 1 is mandated by virtio 1.0 [2].
The change to virtio header is preceded by another change that refactors tun and tap to unify their virtio-related code.
[1]: https://lore.kernel.org/r/20241008-rss-v5-0-f3cf68df005d@daynix.com [2]: https://lore.kernel.org/r/20241227084256-mutt-send-email-mst@kernel.org/
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- Changes in v2: - Fixed num_buffers endian. - Link to v1: https://lore.kernel.org/r/20250108-tun-v1-0-67d784b34374@daynix.com
--- Akihiko Odaki (3): tun: Unify vnet implementation tun: Pad virtio header with zero tun: Set num_buffers for virtio 1.0
MAINTAINERS | 1 + drivers/net/Kconfig | 5 ++ drivers/net/Makefile | 1 + drivers/net/tap.c | 174 ++++++---------------------------------- drivers/net/tun.c | 214 +++++++++---------------------------------------- drivers/net/tun_vnet.c | 191 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/tun_vnet.h | 24 ++++++ 7 files changed, 283 insertions(+), 327 deletions(-) --- base-commit: a32e14f8aef69b42826cf0998b068a43d486a9e9 change-id: 20241230-tun-66e10a49b0c7
Best regards,
Both tun and tap exposes the same set of virtio-net-related features. Unify their implementations to ease future changes.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- MAINTAINERS | 1 + drivers/net/Kconfig | 5 ++ drivers/net/Makefile | 1 + drivers/net/tap.c | 172 ++++++---------------------------------- drivers/net/tun.c | 208 ++++++++----------------------------------------- drivers/net/tun_vnet.c | 186 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/tun_vnet.h | 24 ++++++ 7 files changed, 273 insertions(+), 324 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..1be8a452d11f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23903,6 +23903,7 @@ F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ F: drivers/net/tap.c F: drivers/net/tun.c +F: drivers/net/tun_vnet.h
TURBOCHANNEL SUBSYSTEM M: "Maciej W. Rozycki" macro@orcam.me.uk diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1fd5acdc73c6..255c8f9f1d7c 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32 + select TUN_VNET help TUN/TAP provides packet reception and transmission for user space programs. It can be viewed as a simple Point-to-Point or Ethernet @@ -417,10 +418,14 @@ config TUN
config TAP tristate + select TUN_VNET help This option is selected by any driver implementing tap user space interface for a virtual interface to re-use core tap functionality.
+config TUN_VNET + tristate + config TUN_VNET_CROSS_LE bool "Support for cross-endian vnet headers on little-endian kernels" default n diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 13743d0e83b5..bc1f193eccb1 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -30,6 +30,7 @@ obj-y += pcs/ obj-$(CONFIG_RIONET) += rionet.o obj-$(CONFIG_NET_TEAM) += team/ obj-$(CONFIG_TUN) += tun.o +obj-$(CONFIG_TUN_VNET) += tun_vnet.o obj-$(CONFIG_TAP) += tap.o obj-$(CONFIG_VETH) += veth.o obj-$(CONFIG_VIRTIO_NET) += virtio_net.o diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 5aa41d5f7765..60804855510b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -26,74 +26,9 @@ #include <linux/virtio_net.h> #include <linux/skb_array.h>
-#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) - -#define TAP_VNET_LE 0x80000000 -#define TAP_VNET_BE 0x40000000 - -#ifdef CONFIG_TUN_VNET_CROSS_LE -static inline bool tap_legacy_is_little_endian(struct tap_queue *q) -{ - return q->flags & TAP_VNET_BE ? false : - virtio_legacy_is_little_endian(); -} - -static long tap_get_vnet_be(struct tap_queue *q, int __user *sp) -{ - int s = !!(q->flags & TAP_VNET_BE); - - if (put_user(s, sp)) - return -EFAULT; - - return 0; -} - -static long tap_set_vnet_be(struct tap_queue *q, int __user *sp) -{ - int s; - - if (get_user(s, sp)) - return -EFAULT; - - if (s) - q->flags |= TAP_VNET_BE; - else - q->flags &= ~TAP_VNET_BE; - - return 0; -} -#else -static inline bool tap_legacy_is_little_endian(struct tap_queue *q) -{ - return virtio_legacy_is_little_endian(); -} - -static long tap_get_vnet_be(struct tap_queue *q, int __user *argp) -{ - return -EINVAL; -} +#include "tun_vnet.h"
-static long tap_set_vnet_be(struct tap_queue *q, int __user *argp) -{ - return -EINVAL; -} -#endif /* CONFIG_TUN_VNET_CROSS_LE */ - -static inline bool tap_is_little_endian(struct tap_queue *q) -{ - return q->flags & TAP_VNET_LE || - tap_legacy_is_little_endian(q); -} - -static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val) -{ - return __virtio16_to_cpu(tap_is_little_endian(q), val); -} - -static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val) -{ - return __cpu_to_virtio16(tap_is_little_endian(q), val); -} +#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
static struct proto tap_proto = { .name = "tap", @@ -641,10 +576,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); - unsigned long len = total_len; + unsigned long len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; - int vnet_hdr_len = 0; + int hdr_len; int copylen = 0; int depth; bool zerocopy = false; @@ -652,38 +587,20 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, enum skb_drop_reason drop_reason;
if (q->flags & IFF_VNET_HDR) { - vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); - - err = -EINVAL; - if (len < vnet_hdr_len) - goto err; - len -= vnet_hdr_len; - - err = -EFAULT; - if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) - goto err; - iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); - if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - tap16_to_cpu(q, vnet_hdr.csum_start) + - tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 > - tap16_to_cpu(q, vnet_hdr.hdr_len)) - vnet_hdr.hdr_len = cpu_to_tap16(q, - tap16_to_cpu(q, vnet_hdr.csum_start) + - tap16_to_cpu(q, vnet_hdr.csum_offset) + 2); - err = -EINVAL; - if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len) + hdr_len = tun_vnet_hdr_get(READ_ONCE(q->vnet_hdr_sz), q->flags, from, &vnet_hdr); + if (hdr_len < 0) { + err = hdr_len; goto err; + } + } else { + hdr_len = 0; }
- err = -EINVAL; - if (unlikely(len < ETH_HLEN)) - goto err; - + len = iov_iter_count(from); if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i;
- copylen = vnet_hdr.hdr_len ? - tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN; + copylen = hdr_len ? hdr_len : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; else if (copylen < ETH_HLEN) @@ -697,7 +614,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
if (!zerocopy) { copylen = len; - linear = tap16_to_cpu(q, vnet_hdr.hdr_len); + linear = hdr_len; if (linear > good_linear) linear = good_linear; else if (linear < ETH_HLEN) @@ -732,9 +649,8 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, } skb->dev = tap->dev;
- if (vnet_hdr_len) { - err = virtio_net_hdr_to_skb(skb, &vnet_hdr, - tap_is_little_endian(q)); + if (q->flags & IFF_VNET_HDR) { + err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; @@ -797,23 +713,17 @@ static ssize_t tap_put_user(struct tap_queue *q, int total;
if (q->flags & IFF_VNET_HDR) { - int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); - if (iov_iter_count(iter) < vnet_hdr_len) - return -EINVAL; - - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, - tap_is_little_endian(q), true, - vlan_hlen)) - BUG();
- if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != - sizeof(vnet_hdr)) - return -EFAULT; + ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); + if (ret < 0) + goto done;
- iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr)); + ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); + if (ret < 0) + goto done; } total = vnet_hdr_len; total += skb->len; @@ -1072,42 +982,6 @@ static long tap_ioctl(struct file *file, unsigned int cmd, q->sk.sk_sndbuf = s; return 0;
- case TUNGETVNETHDRSZ: - s = q->vnet_hdr_sz; - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETHDRSZ: - if (get_user(s, sp)) - return -EFAULT; - if (s < (int)sizeof(struct virtio_net_hdr)) - return -EINVAL; - - q->vnet_hdr_sz = s; - return 0; - - case TUNGETVNETLE: - s = !!(q->flags & TAP_VNET_LE); - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETLE: - if (get_user(s, sp)) - return -EFAULT; - if (s) - q->flags |= TAP_VNET_LE; - else - q->flags &= ~TAP_VNET_LE; - return 0; - - case TUNGETVNETBE: - return tap_get_vnet_be(q, sp); - - case TUNSETVNETBE: - return tap_set_vnet_be(q, sp); - case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | @@ -1151,7 +1025,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, return ret;
default: - return -EINVAL; + return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp); } }
@@ -1198,7 +1072,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) skb->protocol = eth_hdr(skb)->h_proto;
if (vnet_hdr_len) { - err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); + err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree; } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e816aaba8e5f..dbf0dee92e93 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -83,6 +83,8 @@ #include <linux/uaccess.h> #include <linux/proc_fs.h>
+#include "tun_vnet.h" + static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd);
@@ -94,9 +96,6 @@ static void tun_default_link_ksettings(struct net_device *dev, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE -/* High bits in flags field are unused. */ -#define TUN_VNET_LE 0x80000000 -#define TUN_VNET_BE 0x40000000
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) @@ -298,70 +297,6 @@ static bool tun_napi_frags_enabled(const struct tun_file *tfile) return tfile->napi_frags_enabled; }
-#ifdef CONFIG_TUN_VNET_CROSS_LE -static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) -{ - return tun->flags & TUN_VNET_BE ? false : - virtio_legacy_is_little_endian(); -} - -static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) -{ - int be = !!(tun->flags & TUN_VNET_BE); - - if (put_user(be, argp)) - return -EFAULT; - - return 0; -} - -static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) -{ - int be; - - if (get_user(be, argp)) - return -EFAULT; - - if (be) - tun->flags |= TUN_VNET_BE; - else - tun->flags &= ~TUN_VNET_BE; - - return 0; -} -#else -static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) -{ - return virtio_legacy_is_little_endian(); -} - -static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) -{ - return -EINVAL; -} - -static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) -{ - return -EINVAL; -} -#endif /* CONFIG_TUN_VNET_CROSS_LE */ - -static inline bool tun_is_little_endian(struct tun_struct *tun) -{ - return tun->flags & TUN_VNET_LE || - tun_legacy_is_little_endian(tun); -} - -static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) -{ - return __virtio16_to_cpu(tun_is_little_endian(tun), val); -} - -static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) -{ - return __cpu_to_virtio16(tun_is_little_endian(tun), val); -} - static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; @@ -1752,8 +1687,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); - size_t len = total_len, align = tun->align, linear; + size_t len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; + int hdr_len; int good_linear; int copylen; bool zerocopy = false; @@ -1764,37 +1700,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!(tun->flags & IFF_NO_PI)) { - if (len < sizeof(pi)) + if (iov_iter_count(from) < sizeof(pi)) return -EINVAL; - len -= sizeof(pi);
if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; }
if (tun->flags & IFF_VNET_HDR) { - int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); - - if (len < vnet_hdr_sz) - return -EINVAL; - len -= vnet_hdr_sz; - - if (!copy_from_iter_full(&gso, sizeof(gso), from)) - return -EFAULT; - - if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) - gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); - - if (tun16_to_cpu(tun, gso.hdr_len) > len) - return -EINVAL; - iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); + hdr_len = tun_vnet_hdr_get(READ_ONCE(tun->vnet_hdr_sz), tun->flags, from, &gso); + if (hdr_len < 0) + return hdr_len; + } else { + hdr_len = 0; }
+ len = iov_iter_count(from); if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; - if (unlikely(len < ETH_HLEN || - (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) + if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; }
@@ -1807,7 +1731,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; + copylen = hdr_len ? hdr_len : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; @@ -1830,10 +1754,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } else { if (!zerocopy) { copylen = len; - if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) + if (hdr_len > good_linear) linear = good_linear; else - linear = tun16_to_cpu(tun, gso.hdr_len); + linear = hdr_len; }
if (frags) { @@ -1868,7 +1792,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } }
- if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { + if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; @@ -2061,29 +1985,27 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct xdp_frame *xdp_frame, struct iov_iter *iter) { + int ret; int vnet_hdr_sz = 0; size_t size = xdp_frame->len; - size_t ret; + size_t total;
if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 };
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); - if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) - return -EINVAL; - if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != - sizeof(gso))) - return -EFAULT; - iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); + if (ret < 0) + return ret; }
- ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; + total = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
preempt_disable(); - dev_sw_netstats_tx_add(tun->dev, 1, ret); + dev_sw_netstats_tx_add(tun->dev, 1, total); preempt_enable();
- return ret; + return total; }
/* Put packet to the user space buffer */ @@ -2097,6 +2019,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; + int ret;
if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; @@ -2123,31 +2046,13 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (vnet_hdr_sz) { struct virtio_net_hdr gso;
- if (iov_iter_count(iter) < vnet_hdr_sz) - return -EINVAL; - - if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true, - vlan_hlen)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - if (net_ratelimit()) { - netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); - } - WARN_ON_ONCE(1); - return -EINVAL; - } - - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) - return -EFAULT; + ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); + if (ret < 0) + goto done;
- iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); + if (ret < 0) + goto done; }
if (vlan_hlen) { @@ -2507,7 +2412,7 @@ static int tun_xdp_one(struct tun_struct *tun, skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data);
- if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { + if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; @@ -3091,8 +2996,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int ifindex; int sndbuf; - int vnet_hdr_sz; - int le; int ret; bool do_notify = false;
@@ -3299,50 +3202,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_set_sndbuf(tun); break;
- case TUNGETVNETHDRSZ: - vnet_hdr_sz = tun->vnet_hdr_sz; - if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) - ret = -EFAULT; - break; - - case TUNSETVNETHDRSZ: - if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) { - ret = -EFAULT; - break; - } - if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) { - ret = -EINVAL; - break; - } - - tun->vnet_hdr_sz = vnet_hdr_sz; - break; - - case TUNGETVNETLE: - le = !!(tun->flags & TUN_VNET_LE); - if (put_user(le, (int __user *)argp)) - ret = -EFAULT; - break; - - case TUNSETVNETLE: - if (get_user(le, (int __user *)argp)) { - ret = -EFAULT; - break; - } - if (le) - tun->flags |= TUN_VNET_LE; - else - tun->flags &= ~TUN_VNET_LE; - break; - - case TUNGETVNETBE: - ret = tun_get_vnet_be(tun, argp); - break; - - case TUNSETVNETBE: - ret = tun_set_vnet_be(tun, argp); - break; - case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; @@ -3398,8 +3257,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break;
default: - ret = -EINVAL; - break; + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); }
if (do_notify) diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c new file mode 100644 index 000000000000..fe842df9e9ef --- /dev/null +++ b/drivers/net/tun_vnet.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "tun_vnet.h" + +/* High bits in flags field are unused. */ +#define TUN_VNET_LE 0x80000000 +#define TUN_VNET_BE 0x40000000 + +static bool tun_vnet_legacy_is_little_endian(unsigned int flags) +{ + return !(IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE)) && + virtio_legacy_is_little_endian(); +} + +static long tun_vnet_get_be(int flags, int __user *sp) +{ + int s = !!(flags & TUN_VNET_BE); + + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + + if (put_user(s, sp)) + return -EFAULT; + + return 0; +} + +static long tun_vnet_set_be(int *flags, int __user *sp) +{ + int s; + + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + + if (get_user(s, sp)) + return -EFAULT; + + if (s) + *flags |= TUN_VNET_BE; + else + *flags &= ~TUN_VNET_BE; + + return 0; +} + +static bool tun_vnet_is_little_endian(unsigned int flags) +{ + return flags & TUN_VNET_LE || tun_vnet_legacy_is_little_endian(flags); +} + +static u16 tun_vnet16_to_cpu(unsigned int flags, __virtio16 val) +{ + return __virtio16_to_cpu(tun_vnet_is_little_endian(flags), val); +} + +static __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val) +{ + return __cpu_to_virtio16(tun_vnet_is_little_endian(flags), val); +} + +long tun_vnet_ioctl(int *sz, unsigned int *flags, + unsigned int cmd, int __user *sp) +{ + int s; + + switch (cmd) { + case TUNGETVNETHDRSZ: + s = *sz; + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETHDRSZ: + if (get_user(s, sp)) + return -EFAULT; + if (s < (int)sizeof(struct virtio_net_hdr)) + return -EINVAL; + + *sz = s; + return 0; + + case TUNGETVNETLE: + s = !!(*flags & TUN_VNET_LE); + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETLE: + if (get_user(s, sp)) + return -EFAULT; + if (s) + *flags |= TUN_VNET_LE; + else + *flags &= ~TUN_VNET_LE; + return 0; + + case TUNGETVNETBE: + return tun_vnet_get_be(*flags, sp); + + case TUNSETVNETBE: + return tun_vnet_set_be(flags, sp); + + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(tun_vnet_ioctl); + +int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, + struct virtio_net_hdr *hdr) +{ + if (iov_iter_count(from) < sz) + return -EINVAL; + + if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) + return -EFAULT; + + iov_iter_advance(from, sz - sizeof(*hdr)); + if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + tun_vnet16_to_cpu(flags, hdr->csum_start) + + tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2 > + tun_vnet16_to_cpu(flags, hdr->hdr_len)) + hdr->hdr_len = cpu_to_tun_vnet16(flags, + tun_vnet16_to_cpu(flags, hdr->csum_start) + + tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2); + if (tun_vnet16_to_cpu(flags, hdr->hdr_len) > iov_iter_count(from)) + return -EINVAL; + + return tun_vnet16_to_cpu(flags, hdr->hdr_len); +} +EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); + +int tun_vnet_hdr_put(int sz, struct iov_iter *iter, + const struct virtio_net_hdr *hdr) +{ + if (iov_iter_count(iter) < sz) + return -EINVAL; + + if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) + return -EFAULT; + + iov_iter_advance(iter, sz - sizeof(*hdr)); + + return 0; +} +EXPORT_SYMBOL_GPL(tun_vnet_hdr_put); + +int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); +} +EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); + +int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, + const struct sk_buff *skb, + struct virtio_net_hdr *hdr) +{ + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; + + if (virtio_net_hdr_from_skb(skb, hdr, + tun_vnet_is_little_endian(flags), true, + vlan_hlen)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + if (net_ratelimit()) { + netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), + tun_vnet16_to_cpu(flags, hdr->hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb); + +MODULE_DESCRIPTION("Common library for drivers implementing TUN/TAP's virtio-related features"); +MODULE_AUTHOR("Max Krasnyansky maxk@qualcomm.com"); +MODULE_AUTHOR("Arnd Bergmann arnd@arndb.de"); +MODULE_AUTHOR("Sainath Grandhi sainath.grandhi@intel.com"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h new file mode 100644 index 000000000000..2dfdbe92bb24 --- /dev/null +++ b/drivers/net/tun_vnet.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef TUN_VNET_H +#define TUN_VNET_H + +#include <linux/if_tun.h> +#include <linux/virtio_net.h> + +long tun_vnet_ioctl(int *sz, unsigned int *flags, + unsigned int cmd, int __user *sp); + +int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, + struct virtio_net_hdr *hdr); + +int tun_vnet_hdr_put(int sz, struct iov_iter *iter, + const struct virtio_net_hdr *hdr); + +int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, + const struct virtio_net_hdr *hdr); + +int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, + const struct sk_buff *skb, + struct virtio_net_hdr *hdr); + +#endif /* TUN_VNET_H */
Akihiko Odaki wrote:
Both tun and tap exposes the same set of virtio-net-related features. Unify their implementations to ease future changes.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
MAINTAINERS | 1 + drivers/net/Kconfig | 5 ++ drivers/net/Makefile | 1 + drivers/net/tap.c | 172 ++++++---------------------------------- drivers/net/tun.c | 208 ++++++++----------------------------------------- drivers/net/tun_vnet.c | 186 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/tun_vnet.h | 24 ++++++ 7 files changed, 273 insertions(+), 324 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..1be8a452d11f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23903,6 +23903,7 @@ F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ F: drivers/net/tap.c F: drivers/net/tun.c +F: drivers/net/tun_vnet.h TURBOCHANNEL SUBSYSTEM M: "Maciej W. Rozycki" macro@orcam.me.uk diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1fd5acdc73c6..255c8f9f1d7c 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32
- select TUN_VNET
No need for this new Kconfig
static struct proto tap_proto = { .name = "tap", @@ -641,10 +576,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from);
- unsigned long len = total_len;
- unsigned long len; int err; struct virtio_net_hdr vnet_hdr = { 0 };
- int vnet_hdr_len = 0;
- int hdr_len; int copylen = 0; int depth; bool zerocopy = false;
@@ -652,38 +587,20 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) {
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
err = -EINVAL;
if (len < vnet_hdr_len)
goto err;
len -= vnet_hdr_len;
err = -EFAULT;
if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
goto err;
iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
tap16_to_cpu(q, vnet_hdr.hdr_len))
vnet_hdr.hdr_len = cpu_to_tap16(q,
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
err = -EINVAL;
if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
hdr_len = tun_vnet_hdr_get(READ_ONCE(q->vnet_hdr_sz), q->flags, from, &vnet_hdr);
if (hdr_len < 0) {
err = hdr_len; goto err;
}
- } else {
}hdr_len = 0;
- err = -EINVAL;
- if (unlikely(len < ETH_HLEN))
goto err;
Is this check removal intentional?
- len = iov_iter_count(from); if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i;
copylen = vnet_hdr.hdr_len ?
tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
if (copylen > good_linear) copylen = good_linear; else if (copylen < ETH_HLEN)copylen = hdr_len ? hdr_len : GOODCOPY_LEN;
@@ -697,7 +614,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (!zerocopy) { copylen = len;
linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
if (linear > good_linear) linear = good_linear; else if (linear < ETH_HLEN)linear = hdr_len;
@@ -732,9 +649,8 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, } skb->dev = tap->dev;
- if (vnet_hdr_len) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
tap_is_little_endian(q));
- if (q->flags & IFF_VNET_HDR) {
if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR;err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr);
@@ -797,23 +713,17 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
if (iov_iter_count(iter) < vnet_hdr_len)
return -EINVAL;
if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
tap_is_little_endian(q), true,
vlan_hlen))
BUG();
if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
sizeof(vnet_hdr))
return -EFAULT;
ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
if (ret < 0)
goto done;
iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr);
if (ret < 0)
goto done;
Please split this patch in to a series of smaller patches.
If feasible:
1. one that move the head of tun.c into tun_vnet.[hc]. 2. then one that uses that also in tap.c. 3. then a separate patch for the ioctl changes. 4. then introduce tun_vnet_hdr_from_skb, tun_vnet_hdr_put and friends in (a) follow-up patch(es).
This is subtle code. Please report what tests you ran to ensure that it does not introduce behavioral changes / regressions.
On 2025/01/09 23:06, Willem de Bruijn wrote:
Akihiko Odaki wrote:
Both tun and tap exposes the same set of virtio-net-related features. Unify their implementations to ease future changes.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
MAINTAINERS | 1 + drivers/net/Kconfig | 5 ++ drivers/net/Makefile | 1 + drivers/net/tap.c | 172 ++++++---------------------------------- drivers/net/tun.c | 208 ++++++++----------------------------------------- drivers/net/tun_vnet.c | 186 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/tun_vnet.h | 24 ++++++ 7 files changed, 273 insertions(+), 324 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..1be8a452d11f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23903,6 +23903,7 @@ F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ F: drivers/net/tap.c F: drivers/net/tun.c +F: drivers/net/tun_vnet.h TURBOCHANNEL SUBSYSTEM M: "Maciej W. Rozycki" macro@orcam.me.uk diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1fd5acdc73c6..255c8f9f1d7c 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32
- select TUN_VNET
No need for this new Kconfig
I will merge tun_vnet.c into TAP.
static struct proto tap_proto = { .name = "tap", @@ -641,10 +576,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from);
- unsigned long len = total_len;
- unsigned long len; int err; struct virtio_net_hdr vnet_hdr = { 0 };
- int vnet_hdr_len = 0;
- int hdr_len; int copylen = 0; int depth; bool zerocopy = false;
@@ -652,38 +587,20 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) {
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
err = -EINVAL;
if (len < vnet_hdr_len)
goto err;
len -= vnet_hdr_len;
err = -EFAULT;
if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
goto err;
iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
tap16_to_cpu(q, vnet_hdr.hdr_len))
vnet_hdr.hdr_len = cpu_to_tap16(q,
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
err = -EINVAL;
if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
hdr_len = tun_vnet_hdr_get(READ_ONCE(q->vnet_hdr_sz), q->flags, from, &vnet_hdr);
if (hdr_len < 0) {
err = hdr_len; goto err;
}
- } else {
}hdr_len = 0;
- err = -EINVAL;
- if (unlikely(len < ETH_HLEN))
goto err;
Is this check removal intentional?
No, I'm not sure what this check is for, but it is irrlevant with vnet header and shouldn't be modified with this patch. I'll restore the check with the next version.
- len = iov_iter_count(from); if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i;
copylen = vnet_hdr.hdr_len ?
tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
if (copylen > good_linear) copylen = good_linear; else if (copylen < ETH_HLEN)copylen = hdr_len ? hdr_len : GOODCOPY_LEN;
@@ -697,7 +614,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (!zerocopy) { copylen = len;
linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
if (linear > good_linear) linear = good_linear; else if (linear < ETH_HLEN)linear = hdr_len;
@@ -732,9 +649,8 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, } skb->dev = tap->dev;
- if (vnet_hdr_len) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
tap_is_little_endian(q));
- if (q->flags & IFF_VNET_HDR) {
if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR;err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr);
@@ -797,23 +713,17 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
if (iov_iter_count(iter) < vnet_hdr_len)
return -EINVAL;
if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
tap_is_little_endian(q), true,
vlan_hlen))
BUG();
if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
sizeof(vnet_hdr))
return -EFAULT;
ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
if (ret < 0)
goto done;
iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr);
if (ret < 0)
goto done;
Please split this patch in to a series of smaller patches.
If feasible:
- one that move the head of tun.c into tun_vnet.[hc].
- then one that uses that also in tap.c.
- then a separate patch for the ioctl changes.
- then introduce tun_vnet_hdr_from_skb, tun_vnet_hdr_put
and friends in (a) follow-up patch(es).
I will do so.
This is subtle code. Please report what tests you ran to ensure that it does not introduce behavioral changes / regressions.
I tried: - curl on QEMU with macvtap (vhost=on) - curl on QEMU with macvtap (vhost=off)
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
Both tun and tap exposes the same set of virtio-net-related features. Unify their implementations to ease future changes.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
MAINTAINERS | 1 + drivers/net/Kconfig | 5 ++ drivers/net/Makefile | 1 + drivers/net/tap.c | 172 ++++++---------------------------------- drivers/net/tun.c | 208 ++++++++----------------------------------------- drivers/net/tun_vnet.c | 186 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/tun_vnet.h | 24 ++++++ 7 files changed, 273 insertions(+), 324 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..1be8a452d11f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23903,6 +23903,7 @@ F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ F: drivers/net/tap.c F: drivers/net/tun.c +F: drivers/net/tun_vnet.h
TURBOCHANNEL SUBSYSTEM M: "Maciej W. Rozycki" macro@orcam.me.uk diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 1fd5acdc73c6..255c8f9f1d7c 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32
select TUN_VNET
I don't think we need a dedicated Kconfig option here.
Btw, fixes should come first as it simplifies the backporting.
Thanks
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr)); + if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) + return -EFAULT;
return 0; }
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr));
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return -EFAULT;
return 0; }
-- 2.47.1
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr));
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return -EFAULT;
return 0; }
-- 2.47.1
On Thu, Jan 09, 2025 at 04:41:50PM +0900, Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
It's been like this for years, I'd say we wait until we know there's a problem?
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr));
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return 0; }return -EFAULT;
-- 2.47.1
On 2025/01/09 16:43, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 04:41:50PM +0900, Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
It's been like this for years, I'd say we wait until we know there's a problem?
Perhaps we can just leave it as is. Let me ask filesystem and networking people:
Is it OK to leave some part of buffer uninitialized with read_iter() or recvmsg()?
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr));
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return 0; }return -EFAULT;
-- 2.47.1
On Thu 09-01-25 18:36:52, Akihiko Odaki wrote:
On 2025/01/09 16:43, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 04:41:50PM +0900, Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
It's been like this for years, I'd say we wait until we know there's a problem?
Perhaps we can just leave it as is. Let me ask filesystem and networking people:
Is it OK to leave some part of buffer uninitialized with read_iter() or recvmsg()?
I think that leaving part of the IO buffer within returned IO length uninitialized is a very bad practice and I'm not aware of any place in filesystem area that would do that. It makes life unnecessarily harder for userspace and also it is invitation for subtle information leaks (depending on who allocates the buffer and who then gets to read the results). So I think the patch makes sense.
Honza
Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
Zero is a valid hash value, so cannot be used as an indication that hashing is inactive.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If this is user memory that is ignored by the kernel, just reflected back, then there is no need in general to zero it. There are many such instances, also in msg_control.
If not zeroing leads to ambiguity with the new feature, that would be a reason to add it -- it is always safe to do so.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
On 2025/01/09 21:46, Willem de Bruijn wrote:
Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
Zero is a valid hash value, so cannot be used as an indication that hashing is inactive.
Zeroing will initialize the hash_report field to VIRTIO_NET_HASH_REPORT_NONE, which tells it does not have a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If this is user memory that is ignored by the kernel, just reflected back, then there is no need in general to zero it. There are many such instances, also in msg_control.
More specifically, is there any instance of recvmsg() implementation which returns N and does not fill the complete N bytes of msg_iter?
If not zeroing leads to ambiguity with the new feature, that would be a reason to add it -- it is always safe to do so.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
On Fri, Jan 10, 2025 at 01:38:06PM +0900, Akihiko Odaki wrote:
On 2025/01/09 21:46, Willem de Bruijn wrote:
Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
Zero is a valid hash value, so cannot be used as an indication that hashing is inactive.
Zeroing will initialize the hash_report field to VIRTIO_NET_HASH_REPORT_NONE, which tells it does not have a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If this is user memory that is ignored by the kernel, just reflected back, then there is no need in general to zero it. There are many such instances, also in msg_control.
More specifically, is there any instance of recvmsg() implementation which returns N and does not fill the complete N bytes of msg_iter?
The one in tun. It was a silly idea but it has been here for years now.
If not zeroing leads to ambiguity with the new feature, that would be a reason to add it -- it is always safe to do so.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
On 2025/01/10 17:33, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 01:38:06PM +0900, Akihiko Odaki wrote:
On 2025/01/09 21:46, Willem de Bruijn wrote:
Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
Zero is a valid hash value, so cannot be used as an indication that hashing is inactive.
Zeroing will initialize the hash_report field to VIRTIO_NET_HASH_REPORT_NONE, which tells it does not have a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If this is user memory that is ignored by the kernel, just reflected back, then there is no need in general to zero it. There are many such instances, also in msg_control.
More specifically, is there any instance of recvmsg() implementation which returns N and does not fill the complete N bytes of msg_iter?
The one in tun. It was a silly idea but it has been here for years now.
Except tun. If there is such an example of recvmsg() implementation and it is not accidental and people have agreed to keep it functioning, we can confidently say this construct is safe without fearing pushback from people maintaining filesystem/networking infrastructure. Ultimately I want those people decide if this can be supported for the future or not.
If not zeroing leads to ambiguity with the new feature, that would be a reason to add it -- it is always safe to do so.
If we are really confident that it will not cause problems, this behavior can be opt-in based on a flag or we can just write some documentation warning userspace programmers to initialize the buffer.
Akihiko Odaki wrote:
On 2025/01/10 17:33, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 01:38:06PM +0900, Akihiko Odaki wrote:
On 2025/01/09 21:46, Willem de Bruijn wrote:
Akihiko Odaki wrote:
On 2025/01/09 16:31, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote: > tun used to simply advance iov_iter when it needs to pad virtio header, > which leaves the garbage in the buffer as is. This is especially > problematic when tun starts to allow enabling the hash reporting > feature; even if the feature is enabled, the packet may lack a hash > value and may contain a hole in the virtio header because the packet > arrived before the feature gets enabled or does not contain the > header fields to be hashed. If the hole is not filled with zero, it is > impossible to tell if the packet lacks a hash value.
Zero is a valid hash value, so cannot be used as an indication that hashing is inactive.
Zeroing will initialize the hash_report field to VIRTIO_NET_HASH_REPORT_NONE, which tells it does not have a hash value.
> In theory, a user of tun can fill the buffer with zero before calling > read() to avoid such a problem, but leaving the garbage in the buffer is > awkward anyway so fill the buffer in tun. > > Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
Yes. but that means the user expects some part of buffer is not filled after read() or recvmsg(). I'm a bit worried that not filling the buffer may break assumptions others (especially the filesystem and socket infrastructures in the kernel) may have.
If this is user memory that is ignored by the kernel, just reflected back, then there is no need in general to zero it. There are many such instances, also in msg_control.
More specifically, is there any instance of recvmsg() implementation which returns N and does not fill the complete N bytes of msg_iter?
The one in tun. It was a silly idea but it has been here for years now.
Except tun. If there is such an example of recvmsg() implementation and it is not accidental and people have agreed to keep it functioning, we can confidently say this construct is safe without fearing pushback from people maintaining filesystem/networking infrastructure. Ultimately I want those people decide if this can be supported for the future or not.
It seems preferable to write a value.
Userspace should have not assumption that whatever it writes there will be reflected unmodified. That said, that is the tiny risk of changing this in established code.
If it worked without issue so far, without hashing, then probably the change should only go to net-next.
As said, there are examples in msg_control. I don't immediately have an example where this is the case in msg_data today. A search for iov_iter_advance might show something.
On Thu, Jan 09, 2025 at 02:31:37AM -0500, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:44PM +0900, Akihiko Odaki wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
But if the user did it, you have just overwritten his value, did you not?
To clearify, I mean if user pre-filled buffer with 1, you have now regressed it. Patch 3 fixes it back, but - not pretty.
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
- iov_iter_advance(iter, sz - sizeof(*hdr));
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return -EFAULT;
return 0; }
-- 2.47.1
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
I'm not sure I will get here, could we do this in the series of hash reporting?
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
iov_iter_advance(iter, sz - sizeof(*hdr));
if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return -EFAULT; return 0;
There're various callers of iov_iter_advance(), do we need to fix them all?
Thanks
}
-- 2.47.1
On 2025/01/10 12:27, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
tun used to simply advance iov_iter when it needs to pad virtio header, which leaves the garbage in the buffer as is. This is especially problematic when tun starts to allow enabling the hash reporting feature; even if the feature is enabled, the packet may lack a hash value and may contain a hole in the virtio header because the packet arrived before the feature gets enabled or does not contain the header fields to be hashed. If the hole is not filled with zero, it is impossible to tell if the packet lacks a hash value.
I'm not sure I will get here, could we do this in the series of hash reporting?
I'll create another series dedicated for this and num_buffers change as suggested by Willem.
In theory, a user of tun can fill the buffer with zero before calling read() to avoid such a problem, but leaving the garbage in the buffer is awkward anyway so fill the buffer in tun.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
drivers/net/tun_vnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index fe842df9e9ef..ffb2186facd3 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -138,7 +138,8 @@ int tun_vnet_hdr_put(int sz, struct iov_iter *iter, if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) return -EFAULT;
iov_iter_advance(iter, sz - sizeof(*hdr));
if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
return -EFAULT; return 0;
There're various callers of iov_iter_advance(), do we need to fix them all?
No. For example, there are iov_iter_advance() calls for SOCK_ZEROCOPY in tun_get_user() and tap_get_user(). They are fine as they are not writing buffers after skipping.
The problem is that read_iter() and recvmsg() says it wrote N bytes but it leaves some of this N bytes uninialized. Such an implementation may be created even without iov_iter_advance() (for example just returning a too big number), and it is equally problematic with the current tun_get_user()/tap_get_user().
Regards, Akihiko Odaki
Thanks
}
-- 2.47.1
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total;
if (q->flags & IFF_VNET_HDR) { - struct virtio_net_hdr vnet_hdr; + struct virtio_net_hdr_v1 vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total;
if (tun->flags & IFF_VNET_HDR) { - struct virtio_net_hdr gso = { 0 }; + struct virtio_net_hdr_v1 gso = { + .num_buffers = __virtio16_to_cpu(true, 1) + };
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); @@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, }
if (vnet_hdr_sz) { - struct virtio_net_hdr gso; + struct virtio_net_hdr_v1 gso;
ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0) diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get);
int tun_vnet_hdr_put(int sz, struct iov_iter *iter, - const struct virtio_net_hdr *hdr) + const struct virtio_net_hdr_v1 *hdr) { + int content_sz = MIN(sizeof(*hdr), sz); + if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)) + if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) + if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT;
return 0; @@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb);
int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, - struct virtio_net_hdr *hdr) + struct virtio_net_hdr_v1 *hdr) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
- if (virtio_net_hdr_from_skb(skb, hdr, + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
+ hdr->num_buffers = 1; + return 0; } EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb); diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr);
int tun_vnet_hdr_put(int sz, struct iov_iter *iter, - const struct virtio_net_hdr *hdr); + const struct virtio_net_hdr_v1 *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr);
int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, - struct virtio_net_hdr *hdr); + struct virtio_net_hdr_v1 *hdr);
#endif /* TUN_VNET_H */
On Thu, Jan 09, 2025 at 03:58:45PM +0900, Akihiko Odaki wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
How do we know this is v1 and not v0? Confused.
drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;
struct virtio_net_hdr_v1 vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total; if (tun->flags & IFF_VNET_HDR) {
struct virtio_net_hdr gso = { 0 };
struct virtio_net_hdr_v1 gso = {
.num_buffers = __virtio16_to_cpu(true, 1)
};
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); @@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
struct virtio_net_hdr_v1 gso;
ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0) diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr)
const struct virtio_net_hdr_v1 *hdr)
{
- int content_sz = MIN(sizeof(*hdr), sz);
- if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
- if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
- if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT;
return 0; @@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr)
struct virtio_net_hdr_v1 *hdr)
{ int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
- if (virtio_net_hdr_from_skb(skb, hdr,
- if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
- hdr->num_buffers = 1;
- return 0;
} EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb); diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr);
const struct virtio_net_hdr_v1 *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr);
struct virtio_net_hdr_v1 *hdr);
#endif /* TUN_VNET_H */
-- 2.47.1
On Thu, Jan 09, 2025 at 02:32:25AM -0500, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:45PM +0900, Akihiko Odaki wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
How do we know this is v1 and not v0? Confused.
Ah I got it, you assume userspace will over-write it if VIRTIO_NET_F_MRG_RXBUF is set. If we are leaving this up to userspace, why not let it do it always?
drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;
struct virtio_net_hdr_v1 vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total; if (tun->flags & IFF_VNET_HDR) {
struct virtio_net_hdr gso = { 0 };
struct virtio_net_hdr_v1 gso = {
.num_buffers = __virtio16_to_cpu(true, 1)
};
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); @@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
struct virtio_net_hdr_v1 gso;
ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0) diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr)
const struct virtio_net_hdr_v1 *hdr)
{
- int content_sz = MIN(sizeof(*hdr), sz);
- if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
- if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
- if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT;
return 0; @@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr)
struct virtio_net_hdr_v1 *hdr)
{ int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
- if (virtio_net_hdr_from_skb(skb, hdr,
- if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
- hdr->num_buffers = 1;
- return 0;
} EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb); diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr);
const struct virtio_net_hdr_v1 *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr);
struct virtio_net_hdr_v1 *hdr);
#endif /* TUN_VNET_H */
-- 2.47.1
On 2025/01/09 16:40, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 02:32:25AM -0500, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:45PM +0900, Akihiko Odaki wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
How do we know this is v1 and not v0? Confused.
Ah I got it, you assume userspace will over-write it if VIRTIO_NET_F_MRG_RXBUF is set. If we are leaving this up to userspace, why not let it do it always?
tun may be used with vhost_net, which does not set the field.
drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;
struct virtio_net_hdr_v1 vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total; if (tun->flags & IFF_VNET_HDR) {
struct virtio_net_hdr gso = { 0 };
struct virtio_net_hdr_v1 gso = {
.num_buffers = __virtio16_to_cpu(true, 1)
};
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); @@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
struct virtio_net_hdr_v1 gso;
ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0) diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr)
{const struct virtio_net_hdr_v1 *hdr)
- int content_sz = MIN(sizeof(*hdr), sz);
- if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
- if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
- if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT;
return 0; @@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr)
{ int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;struct virtio_net_hdr_v1 *hdr)
- if (virtio_net_hdr_from_skb(skb, hdr,
- if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
- hdr->num_buffers = 1;
- return 0; } EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb);
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr);
const struct virtio_net_hdr_v1 *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr);
struct virtio_net_hdr_v1 *hdr);
#endif /* TUN_VNET_H */
-- 2.47.1
On Thu, Jan 09, 2025 at 06:38:10PM +0900, Akihiko Odaki wrote:
On 2025/01/09 16:40, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 02:32:25AM -0500, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:45PM +0900, Akihiko Odaki wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
How do we know this is v1 and not v0? Confused.
Ah I got it, you assume userspace will over-write it if VIRTIO_NET_F_MRG_RXBUF is set. If we are leaving this up to userspace, why not let it do it always?
tun may be used with vhost_net, which does not set the field.
I'd fix that in vhost net.
drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);struct virtio_net_hdr_v1 vnet_hdr;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total; if (tun->flags & IFF_VNET_HDR) {
struct virtio_net_hdr gso = { 0 };
struct virtio_net_hdr_v1 gso = {
.num_buffers = __virtio16_to_cpu(true, 1)
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);};
@@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0)struct virtio_net_hdr_v1 gso;
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr)
{const struct virtio_net_hdr_v1 *hdr)
- int content_sz = MIN(sizeof(*hdr), sz);
- if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
- if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
- if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT; return 0;
@@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr)
{ int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;struct virtio_net_hdr_v1 *hdr)
- if (virtio_net_hdr_from_skb(skb, hdr,
- if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
- hdr->num_buffers = 1;
- return 0; } EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb);
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,const struct virtio_net_hdr_v1 *hdr);
struct virtio_net_hdr *hdr);
#endif /* TUN_VNET_H */struct virtio_net_hdr_v1 *hdr);
-- 2.47.1
On 2025/01/09 19:54, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 06:38:10PM +0900, Akihiko Odaki wrote:
On 2025/01/09 16:40, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 02:32:25AM -0500, Michael S. Tsirkin wrote:
On Thu, Jan 09, 2025 at 03:58:45PM +0900, Akihiko Odaki wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
How do we know this is v1 and not v0? Confused.
Ah I got it, you assume userspace will over-write it if VIRTIO_NET_F_MRG_RXBUF is set. If we are leaving this up to userspace, why not let it do it always?
tun may be used with vhost_net, which does not set the field.
I'd fix that in vhost net.
Let's see what filesystem and networking people will say for the earlier patch. We can fix num_buffers for free if the earlier patch is getting merged. We will need to come up with another solution otherwise.
drivers/net/tap.c | 2 +- drivers/net/tun.c | 6 ++++-- drivers/net/tun_vnet.c | 14 +++++++++----- drivers/net/tun_vnet.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 60804855510b..fe9554ee5b8b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -713,7 +713,7 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) {
struct virtio_net_hdr vnet_hdr;
struct virtio_net_hdr_v1 vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dbf0dee92e93..f211d0580887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1991,7 +1991,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, size_t total; if (tun->flags & IFF_VNET_HDR) {
struct virtio_net_hdr gso = { 0 };
struct virtio_net_hdr_v1 gso = {
.num_buffers = __virtio16_to_cpu(true, 1)
}; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
@@ -2044,7 +2046,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
struct virtio_net_hdr_v1 gso; ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret < 0)
diff --git a/drivers/net/tun_vnet.c b/drivers/net/tun_vnet.c index ffb2186facd3..a7a7989fae56 100644 --- a/drivers/net/tun_vnet.c +++ b/drivers/net/tun_vnet.c @@ -130,15 +130,17 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, EXPORT_SYMBOL_GPL(tun_vnet_hdr_get); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr)
{const struct virtio_net_hdr_v1 *hdr)
- int content_sz = MIN(sizeof(*hdr), sz);
- if (iov_iter_count(iter) < sz) return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
- if (copy_to_iter(hdr, content_sz, iter) != content_sz) return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
- if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz) return -EFAULT; return 0;
@@ -154,11 +156,11 @@ EXPORT_SYMBOL_GPL(tun_vnet_hdr_to_skb); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,
struct virtio_net_hdr *hdr)
{ int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;struct virtio_net_hdr_v1 *hdr)
- if (virtio_net_hdr_from_skb(skb, hdr,
- if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -176,6 +178,8 @@ int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, return -EINVAL; }
- hdr->num_buffers = 1;
- return 0; } EXPORT_SYMBOL_GPL(tun_vnet_hdr_from_skb);
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 2dfdbe92bb24..d8fd94094227 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -12,13 +12,13 @@ int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr); int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
const struct virtio_net_hdr *hdr);
int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr); int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb,const struct virtio_net_hdr_v1 *hdr);
struct virtio_net_hdr *hdr);
#endif /* TUN_VNET_H */struct virtio_net_hdr_v1 *hdr);
-- 2.47.1
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
On 2025/01/10 12:27, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
My understanding is that we should fix the kernel and QEMU instead. There may be some driver implementations that assumes num_buffers is 1 so the kernel and QEMU should be fixed to be compatible with such potential implementations.
It is also possible to make future drivers with existing kernels and QEMU by ensuring they will not read num_buffers when VIRTIO_NET_F_MRG_RXBUF has not negotiated, and that's what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does. https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
Regards, Akihiko Odaki
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
Thanks
On 2025/01/13 12:04, Jason Wang wrote:
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
The problem is: 1) On the device side, the num_buffers can be left uninitialized due to bugs 2) On the driver side, the specification allows assuming the num_buffers is set to one.
Relaxing the device requirement will replace "due to bugs" with "according to the specification" in 1). It still contradicts with 2) so does not fix compatibility.
Instead, we should make the driver requirement stricter to change 2). That is what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does: https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
It essentially only requires storing 16 bits. There are details we need to work out, but it should be possible to fix.
Regards, Akihiko Odaki
On Wed, Jan 15, 2025 at 1:07 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/13 12:04, Jason Wang wrote:
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
The specification says the device MUST set num_buffers to 1 if VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
The problem is:
- On the device side, the num_buffers can be left uninitialized due to bugs
- On the driver side, the specification allows assuming the num_buffers
is set to one.
Relaxing the device requirement will replace "due to bugs" with "according to the specification" in 1). It still contradicts with 2) so does not fix compatibility.
Just to clarify I meant we can simply remove the following:
""" The device MUST use only a single descriptor if VIRTIO_NET_F_MRG_RXBUF was not negotiated. Note: This means that num_buffers will always be 1 if VIRTIO_NET_F_MRG_RXBUF is not negotiated. """
And
""" If VIRTIO_NET_F_MRG_RXBUF has not been negotiated, the device MUST set num_buffers to 1. """
This seems easier as it reflects the fact where some devices don't set it. And it eases the transitional device as it doesn't need to have any special care.
Then we don't need any driver normative so I don't see any conflict.
Michael suggests we use "SHOULD", but if this is something that the driver needs to be aware of I don't know how "SHOULD" can help a lot or not.
Instead, we should make the driver requirement stricter to change 2). That is what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does: https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
It essentially only requires storing 16 bits. There are details we need to work out, but it should be possible to fix.
I meant it's not realistic to fix all the hypervisors. Note that modern devices have been implemented for about a decade so we may have too many versions of various hypervisors. (E.g DPDK seems to stick with the same behaviour of the current kernel).
Regards, Akihiko Odaki
Thanks
On 2025/01/16 10:06, Jason Wang wrote:
On Wed, Jan 15, 2025 at 1:07 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/13 12:04, Jason Wang wrote:
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote:
On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote: > > The specification says the device MUST set num_buffers to 1 if > VIRTIO_NET_F_MRG_RXBUF has not been negotiated.
Have we agreed on how to fix the spec or not?
As I replied in the spec patch, if we just remove this "MUST", it looks like we are all fine?
Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
The problem is:
- On the device side, the num_buffers can be left uninitialized due to bugs
- On the driver side, the specification allows assuming the num_buffers
is set to one.
Relaxing the device requirement will replace "due to bugs" with "according to the specification" in 1). It still contradicts with 2) so does not fix compatibility.
Just to clarify I meant we can simply remove the following:
""" The device MUST use only a single descriptor if VIRTIO_NET_F_MRG_RXBUF was not negotiated. Note: This means that num_buffers will always be 1 if VIRTIO_NET_F_MRG_RXBUF is not negotiated. """
And
""" If VIRTIO_NET_F_MRG_RXBUF has not been negotiated, the device MUST set num_buffers to 1. """
This seems easier as it reflects the fact where some devices don't set it. And it eases the transitional device as it doesn't need to have any special care.
That can potentially break existing drivers that are compliant with the current and assumes the num_buffers is set to 1.
Regards, Akihiko Odaki
Then we don't need any driver normative so I don't see any conflict.
Michael suggests we use "SHOULD", but if this is something that the driver needs to be aware of I don't know how "SHOULD" can help a lot or not.
Instead, we should make the driver requirement stricter to change 2). That is what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does: https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
It essentially only requires storing 16 bits. There are details we need to work out, but it should be possible to fix.
I meant it's not realistic to fix all the hypervisors. Note that modern devices have been implemented for about a decade so we may have too many versions of various hypervisors. (E.g DPDK seems to stick with the same behaviour of the current kernel).
Regards, Akihiko Odaki
Thanks
On Thu, Jan 16, 2025 at 1:30 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/16 10:06, Jason Wang wrote:
On Wed, Jan 15, 2025 at 1:07 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/13 12:04, Jason Wang wrote:
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote:
On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote: > On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote: >> >> The specification says the device MUST set num_buffers to 1 if >> VIRTIO_NET_F_MRG_RXBUF has not been negotiated. > > Have we agreed on how to fix the spec or not? > > As I replied in the spec patch, if we just remove this "MUST", it > looks like we are all fine? > > Thanks
We should replace MUST with SHOULD but it is not all fine, ignoring SHOULD is a quality of implementation issue.
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
The problem is:
- On the device side, the num_buffers can be left uninitialized due to bugs
- On the driver side, the specification allows assuming the num_buffers
is set to one.
Relaxing the device requirement will replace "due to bugs" with "according to the specification" in 1). It still contradicts with 2) so does not fix compatibility.
Just to clarify I meant we can simply remove the following:
""" The device MUST use only a single descriptor if VIRTIO_NET_F_MRG_RXBUF was not negotiated. Note: This means that num_buffers will always be 1 if VIRTIO_NET_F_MRG_RXBUF is not negotiated. """
And
""" If VIRTIO_NET_F_MRG_RXBUF has not been negotiated, the device MUST set num_buffers to 1. """
This seems easier as it reflects the fact where some devices don't set it. And it eases the transitional device as it doesn't need to have any special care.
That can potentially break existing drivers that are compliant with the current and assumes the num_buffers is set to 1.
Those drivers are already 'broken'. Aren't they?
Thanks
Regards, Akihiko Odaki
Then we don't need any driver normative so I don't see any conflict.
Michael suggests we use "SHOULD", but if this is something that the driver needs to be aware of I don't know how "SHOULD" can help a lot or not.
Instead, we should make the driver requirement stricter to change 2). That is what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does: https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
It essentially only requires storing 16 bits. There are details we need to work out, but it should be possible to fix.
I meant it's not realistic to fix all the hypervisors. Note that modern devices have been implemented for about a decade so we may have too many versions of various hypervisors. (E.g DPDK seems to stick with the same behaviour of the current kernel).
Regards, Akihiko Odaki
Thanks
On 2025/01/20 9:40, Jason Wang wrote:
On Thu, Jan 16, 2025 at 1:30 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/16 10:06, Jason Wang wrote:
On Wed, Jan 15, 2025 at 1:07 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/13 12:04, Jason Wang wrote:
On Fri, Jan 10, 2025 at 7:12 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2025/01/10 19:23, Michael S. Tsirkin wrote: > On Fri, Jan 10, 2025 at 11:27:13AM +0800, Jason Wang wrote: >> On Thu, Jan 9, 2025 at 2:59 PM Akihiko Odaki akihiko.odaki@daynix.com wrote: >>> >>> The specification says the device MUST set num_buffers to 1 if >>> VIRTIO_NET_F_MRG_RXBUF has not been negotiated. >> >> Have we agreed on how to fix the spec or not? >> >> As I replied in the spec patch, if we just remove this "MUST", it >> looks like we are all fine? >> >> Thanks > > We should replace MUST with SHOULD but it is not all fine, > ignoring SHOULD is a quality of implementation issue. >
So is this something that the driver should notice?
Should we really replace it? It would mean that a driver conformant with the current specification may not be compatible with a device conformant with the future specification.
I don't get this. We are talking about devices and we want to relax so it should compatibile.
The problem is:
- On the device side, the num_buffers can be left uninitialized due to bugs
- On the driver side, the specification allows assuming the num_buffers
is set to one.
Relaxing the device requirement will replace "due to bugs" with "according to the specification" in 1). It still contradicts with 2) so does not fix compatibility.
Just to clarify I meant we can simply remove the following:
""" The device MUST use only a single descriptor if VIRTIO_NET_F_MRG_RXBUF was not negotiated. Note: This means that num_buffers will always be 1 if VIRTIO_NET_F_MRG_RXBUF is not negotiated. """
And
""" If VIRTIO_NET_F_MRG_RXBUF has not been negotiated, the device MUST set num_buffers to 1. """
This seems easier as it reflects the fact where some devices don't set it. And it eases the transitional device as it doesn't need to have any special care.
That can potentially break existing drivers that are compliant with the current and assumes the num_buffers is set to 1.
Those drivers are already 'broken'. Aren't they?
The drivers are not broken, but vhost_net is. The driver works fine as long as it's used with a device compliant with the specification. If we relax the device requirement in the future specification, the drivers may not work with devices compliant with the revised specification.
Regards, Akihiko Odaki
Thanks
Regards, Akihiko Odaki
Then we don't need any driver normative so I don't see any conflict.
Michael suggests we use "SHOULD", but if this is something that the driver needs to be aware of I don't know how "SHOULD" can help a lot or not.
Instead, we should make the driver requirement stricter to change 2). That is what "[PATCH v3] virtio-net: Ignore num_buffers when unused" does: https://lore.kernel.org/r/20250110-reserved-v3-1-2ade0a5d2090@daynix.com
We are going to fix all implementations known to buggy (QEMU and Linux) anyway so I think it's just fine to leave that part of specification as is.
I don't think we can fix it all.
It essentially only requires storing 16 bits. There are details we need to work out, but it should be possible to fix.
I meant it's not realistic to fix all the hypervisors. Note that modern devices have been implemented for about a decade so we may have too many versions of various hypervisors. (E.g DPDK seems to stick with the same behaviour of the current kernel).
Regards, Akihiko Odaki
Thanks
Akihiko Odaki wrote:
When I implemented virtio's hash-related features to tun/tap [1], I found tun/tap does not fill the entire region reserved for the virtio header, leaving some uninitialized hole in the middle of the buffer after read()/recvmesg().
This series fills the uninitialized hole. More concretely, the num_buffers field will be initialized with 1, and the other fields will be inialized with 0. Setting the num_buffers field to 1 is mandated by virtio 1.0 [2].
The change to virtio header is preceded by another change that refactors tun and tap to unify their virtio-related code.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
Changes in v2:
- Fixed num_buffers endian.
- Link to v1: https://lore.kernel.org/r/20250108-tun-v1-0-67d784b34374@daynix.com
Akihiko Odaki (3): tun: Unify vnet implementation tun: Pad virtio header with zero tun: Set num_buffers for virtio 1.0
Patches should explicitly to net or net-next.
In this case if the undefined data would be a bug, that would target net. It sounds as if this is only relevant with the upcoming hash changes, so then it too can target net-next. If needed at all.
The first patch is clearly net-next material.
I would prefer to work on that independent from the rest. I'm in favor of deduplicating logic across tun/tap/pf_packet. Have taken a stab, but haven't gotten to a concrete series. This indeed a valid deduplication effort.
We have to make sure that the code is identical between tun and tap, or where it isn't (due to one of the two having received a change to such code, but the other not) explicitly note that in the commit message. As then it is a behavioral change.
Anyway, let's send the undefined data, hash and dedup changes independently. And preferably one after the other, rather than having concurrent conversations across threads.
linux-kselftest-mirror@lists.linaro.org