Akihiko Odaki wrote:
Allow the guest to reuse the hash value to make receive steering consistent between the host and guest, and to save hash computation.
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Documentation/networking/tuntap.rst | 7 ++ drivers/net/Kconfig | 1 + drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++----- include/uapi/linux/if_tun.h | 44 +++++++++++ 4 files changed, 180 insertions(+), 18 deletions(-)
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst index 4d7087f727be..86b4ae8caa8a 100644 --- a/Documentation/networking/tuntap.rst +++ b/Documentation/networking/tuntap.rst @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } +3.4 Reference +-------------
+``linux/if_tun.h`` defines the interface described below:
+.. kernel-doc:: include/uapi/linux/if_tun.h
Universal TUN/TAP device driver Frequently Asked Question
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 9920b3a68ed1..e2a7bd703550 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -395,6 +395,7 @@ config TUN tristate "Universal TUN/TAP device driver support" depends on INET select CRC32
- select SKB_EXTENSIONS help TUN/TAP provides packet reception and transmission for user space programs. It can be viewed as a simple Point-to-Point or Ethernet
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9d93ab9ee58f..b8fcd71becac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -173,6 +173,10 @@ struct tun_prog { struct bpf_prog *prog; }; +struct tun_vnet_hash_container {
- struct tun_vnet_hash common;
+};
/* Since the socket were moved to tun_file, to preserve the behavior of persist
- device, socket filter, sndbuf and vnet header size were restore when the
- file were attached to a persist device.
@@ -210,6 +214,7 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog;
- struct tun_vnet_hash_container __rcu *vnet_hash;
This is just
+struct tun_vnet_hash { + u32 value; + u16 report; +};
These can just be fields in the struct directly.
Also, only one bit is really used for report, so it can probably be condensed further.
struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; @@ -221,6 +226,11 @@ struct veth { __be16 h_vlan_TCI; }; +static const struct tun_vnet_hash tun_vnet_hash_cap = {
- .flags = TUN_VNET_HASH_REPORT,
- .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
+};
static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) if (get_user(be, argp)) return -EFAULT;
- if (be)
- if (be) {
struct tun_vnet_hash_container *vnet_hash = rtnl_dereference(tun->vnet_hash);
if (!(tun->flags & TUN_VNET_LE) &&
vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT))
return -EBUSY;
Doesn't 'be' here imply !(tun->flags & TUN_VNET_LE)? Same again below.
tun->flags |= TUN_VNET_BE;
- else
- } else { tun->flags &= ~TUN_VNET_BE;
- }
return 0; } @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
- the userspace application move between processors, we may get a
- different rxq no. here.
*/ -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb,
const struct tun_vnet_hash_container *vnet_hash)
{
- struct tun_vnet_hash_ext *ext;
- struct flow_keys keys; struct tun_flow_entry *e; u32 txq, numqueues;
numqueues = READ_ONCE(tun->numqueues);
- txq = __skb_get_hash_symmetric(skb);
- memset(&keys, 0, sizeof(keys));
- skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
- txq = flow_hash_from_keys(&keys); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq);
@@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) txq = reciprocal_scale(txq, numqueues); }
- if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) {
ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
if (ext) {
u32 types = vnet_hash->common.types;
ext->report = virtio_net_hash_report(types, keys.basic);
ext->value = skb->l4_hash ? skb->hash : txq;
}
- }
- return txq;
} @@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, u16 ret; rcu_read_lock();
- if (rcu_dereference(tun->steering_prog))
- if (rcu_dereference(tun->steering_prog)) { ret = tun_ebpf_select_queue(tun, skb);
- else
ret = tun_automq_select_queue(tun, skb);
- } else {
struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
ret = tun_automq_select_queue(tun, skb, vnet_hash);
tun is already being passed, so there is no need to pass tun->vnet_hash separately.
- } rcu_read_unlock();
return ret; @@ -2120,33 +2156,63 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) {
struct virtio_net_hdr gso;
struct tun_vnet_hash_ext *ext;
size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
union {
struct virtio_net_hdr hdr;
struct virtio_net_hdr_v1_hash hdr_v1_hash;
} vnet_hdr;
int ret;
if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL;
if (virtio_net_hdr_from_skb(skb, &gso,
tun_is_little_endian(tun), true,
vlan_hlen)) {
ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ?
NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
if (ext) {
struct virtio_net_hash hash = {
.value = ext->value,
.report = ext->report,
};
vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash);
ret = virtio_net_hdr_v1_hash_from_skb(skb,
&vnet_hdr.hdr_v1_hash,
true,
vlan_hlen,
&hash);
} else {
vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
ret = virtio_net_hdr_from_skb(skb,
&vnet_hdr.hdr,
tun_is_little_endian(tun),
true,
vlan_hlen);
}
This is why just setting the fields directly, rather than adding virtio_net_hdr_v1_hash_from_skb(), is actually simpler.
if (ret) { struct skb_shared_info *sinfo = skb_shinfo(skb);
if (net_ratelimit()) { netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
tun16_to_cpu(tun, gso.hdr_len));
sinfo->gso_type,
tun16_to_cpu(tun, vnet_hdr.hdr.gso_size),
tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head,
min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64),
}true); } WARN_ON_ONCE(1); return -EINVAL;
if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) return -EFAULT;
iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter);
}iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter);
if (vlan_hlen) { @@ -3094,6 +3160,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int le; int ret; bool do_notify = false;
- struct tun_vnet_hash vnet_hash_common;
- struct tun_vnet_hash_container *vnet_hash;
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { @@ -3115,6 +3183,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns);
- } else if (cmd == TUNGETVNETHASHCAP) {
return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ?
}-EFAULT : 0;
rtnl_lock(); @@ -3314,6 +3385,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; }
vnet_hash = rtnl_dereference(tun->vnet_hash);
if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) &&
vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) {
ret = -EBUSY;
break;
}
- tun->vnet_hdr_sz = vnet_hdr_sz; break;
@@ -3328,10 +3406,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; break; }
if (le)
if (le) { tun->flags |= TUN_VNET_LE;
else
} else {
vnet_hash = rtnl_dereference(tun->vnet_hash);
if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) &&
!tun_legacy_is_little_endian(tun)) {
ret = -EBUSY;
break;
}
tun->flags &= ~TUN_VNET_LE;
break;}
case TUNGETVNETBE: @@ -3396,6 +3482,30 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = open_related_ns(&net->ns, get_net_ns); break;
- case TUNSETVNETHASH:
if (copy_from_user(&vnet_hash_common, argp, sizeof(vnet_hash_common))) {
ret = -EFAULT;
break;
}
argp = (struct tun_vnet_hash __user *)argp + 1;
if ((vnet_hash_common.flags & TUN_VNET_HASH_REPORT) &&
(tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) ||
!tun_is_little_endian(tun))) {
ret = -EBUSY;
break;
}
vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
if (!vnet_hash) {
ret = -ENOMEM;
break;
}
vnet_hash->common = vnet_hash_common;
kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(tun->vnet_hash, vnet_hash));
break;
- default: ret = -EINVAL; break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 287cdc81c939..1561e8ce0a0a 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -62,6 +62,30 @@ #define TUNSETCARRIER _IOW('T', 226, int) #define TUNGETDEVNETNS _IO('T', 227) +/**
- define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
- The argument is a pointer to &struct tun_vnet_hash which will store the
- maximal virtio_net hashing configuration.
- */
+#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
+/**
- define TUNSETVNETHASH - ioctl to configure virtio_net hashing
- The argument is a pointer to &struct tun_vnet_hash.
- %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
- the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
- %TUN_VNET_HASH_REPORT.
- The virtio_net header must be configured as little-endian before calling this
- ioctl with %TUN_VNET_HASH_REPORT.
- This ioctl currently has no effect on XDP packets.
- */
+#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
/* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 @@ -115,4 +139,24 @@ struct tun_filter { __u8 addr[][ETH_ALEN]; }; +/**
- define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
- */
+#define TUN_VNET_HASH_REPORT 0x0001
+/**
- struct tun_vnet_hash - virtio_net hashing configuration
- @flags:
Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
- @pad:
Should be filled with zero before passing to %TUNSETVNETHASH
- @types:
Bitmask of allowed hash types
- */
+struct tun_vnet_hash {
- __u16 flags;
- __u8 pad[2];
- __u32 types;
+};
The values for flags and types should probably be defined here.
#endif /* _UAPI__IF_TUN_H */
-- 2.46.0