virtio-net has two uses for hashes: one is RSS and the other is hash reporting. Conventionally, the hash calculation was done by the VMM. However, computing the hash after the queue has been chosen defeats the purpose of RSS.
Another approach is to use an eBPF steering program. This approach has a different downside: it cannot report the calculated hash due to the restrictive nature of eBPF.
Extend the steering program feature by introducing a dedicated program type: BPF_PROG_TYPE_VNET_HASH. This program type is capable of reporting the hash value and the queue to use at the same time.
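For illustration, a program of this type could look roughly like the following sketch. This is not code taken from the series; the hash computation is a placeholder, the VIRTIO_NET_HASH_REPORT_IPv4 value is copied from the virtio specification, and a real program (such as QEMU's) would compute the Toeplitz hash over the flow fields selected by the configured hash types:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define VIRTIO_NET_HASH_REPORT_IPv4 1	/* from the virtio specification */

SEC("vnet_hash")
int steer(struct __sk_buff *skb)
{
	__u32 hash = 0xdeadbeef;	/* placeholder for the Toeplitz hash */

	skb->vnet_hash_value = hash;	/* reported in the virtio-net header */
	skb->vnet_hash_report = VIRTIO_NET_HASH_REPORT_IPv4;
	skb->vnet_rss_queue = hash % 2;	/* queue to deliver the packet to */
	return 0;
}

char _license[] SEC("license") = "GPL";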
This is a rewrite of an RFC patch series submitted by Yuri Benditovich that incorporates feedback on that series and on v1 of this series: https://lore.kernel.org/lkml/20210112194143.1494-1-yuri.benditovich@daynix.c...
QEMU patched to use this new feature is available at: https://github.com/daynix/qemu/tree/akihikodaki/bpf
The QEMU patches will soon be submitted upstream as an RFC too.
V1 -> V2: Changed to introduce a new BPF program type.
Akihiko Odaki (7):
  bpf: Introduce BPF_PROG_TYPE_VNET_HASH
  bpf: Add vnet_hash members to __sk_buff
  skbuff: Introduce SKB_EXT_TUN_VNET_HASH
  virtio_net: Add virtio_net_hdr_v1_hash_from_skb()
  tun: Support BPF_PROG_TYPE_VNET_HASH
  selftests/bpf: Test BPF_PROG_TYPE_VNET_HASH
  vhost_net: Support VIRTIO_NET_F_HASH_REPORT
 Documentation/bpf/bpf_prog_run.rst            |   1 +
 Documentation/bpf/libbpf/program_types.rst    |   2 +
 drivers/net/tun.c                             | 158 +++++--
 drivers/vhost/net.c                           |  16 +-
 include/linux/bpf_types.h                     |   2 +
 include/linux/filter.h                        |   7 +
 include/linux/skbuff.h                        |  10 +
 include/linux/virtio_net.h                    |  22 +
 include/uapi/linux/bpf.h                      |   5 +
 kernel/bpf/verifier.c                         |   6 +
 net/core/filter.c                             |  86 +++-
 net/core/skbuff.c                             |   3 +
 tools/include/uapi/linux/bpf.h                |   5 +
 tools/lib/bpf/libbpf.c                        |   2 +
 tools/testing/selftests/bpf/config            |   1 +
 tools/testing/selftests/bpf/config.aarch64    |   1 -
 .../selftests/bpf/prog_tests/vnet_hash.c      | 385 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/vnet_hash.c |  16 +
 18 files changed, 681 insertions(+), 47 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/vnet_hash.c
 create mode 100644 tools/testing/selftests/bpf/progs/vnet_hash.c
This new program type will be used by tun to determine the queue to deliver each packet to and the hash value and type reported with its virtio-net header.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com
---
 Documentation/bpf/bpf_prog_run.rst         |  1 +
 Documentation/bpf/libbpf/program_types.rst |  2 ++
 include/linux/bpf_types.h                  |  2 ++
 include/uapi/linux/bpf.h                   |  5 +++++
 kernel/bpf/verifier.c                      |  6 ++++++
 net/core/filter.c                          | 11 +++++++++++
 tools/include/uapi/linux/bpf.h             |  1 +
 tools/lib/bpf/libbpf.c                     |  2 ++
 8 files changed, 30 insertions(+)
diff --git a/Documentation/bpf/bpf_prog_run.rst b/Documentation/bpf/bpf_prog_run.rst
index 4868c909df5c..0d108d867c03 100644
--- a/Documentation/bpf/bpf_prog_run.rst
+++ b/Documentation/bpf/bpf_prog_run.rst
@@ -39,6 +39,7 @@ following types:
 - ``BPF_PROG_TYPE_STRUCT_OPS``
 - ``BPF_PROG_TYPE_RAW_TRACEPOINT``
 - ``BPF_PROG_TYPE_SYSCALL``
+- ``BPF_PROG_TYPE_VNET_HASH``
 
 When using the ``BPF_PROG_RUN`` command, userspace supplies an input context
 object and (for program types operating on network packets) a buffer containing
diff --git a/Documentation/bpf/libbpf/program_types.rst b/Documentation/bpf/libbpf/program_types.rst
index ad4d4d5eecb0..6be53201f91b 100644
--- a/Documentation/bpf/libbpf/program_types.rst
+++ b/Documentation/bpf/libbpf/program_types.rst
@@ -171,6 +171,8 @@ described in more detail in the footnotes.
 +                                           +----------------------------------------+----------------------------------+-----------+
 |                                           | ``BPF_TRACE_RAW_TP``                   | ``tp_btf+`` [#fentry]_           |           |
 +-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_VNET_HASH``               |                                        | ``vnet_hash``                    |           |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
 | ``BPF_PROG_TYPE_XDP``                     | ``BPF_XDP_CPUMAP``                     | ``xdp.frags/cpumap``             |           |
 +                                           +                                        +----------------------------------+-----------+
 |                                           |                                        | ``xdp/cpumap``                   |           |
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..dec83d495e82 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -34,6 +34,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg,
 	      struct sk_msg_md, struct sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector,
 	      struct __sk_buff, struct bpf_flow_dissector)
+BPF_PROG_TYPE(BPF_PROG_TYPE_VNET_HASH, vnet_hash,
+	      struct __sk_buff, struct sk_buff)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0448700890f7..298634556fab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_VNET_HASH,
 };
 
 enum bpf_attach_type {
@@ -6111,6 +6112,10 @@ struct __sk_buff {
 	__u8 tstamp_type;
 	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
+
+	__u32 vnet_hash_value;
+	__u16 vnet_hash_report;
+	__u16 vnet_rss_queue;
 };
 
 struct bpf_tunnel_key {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..fd6d842635d2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14373,6 +14373,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_VNET_HASH:
 		return true;
 	default:
 		return false;
@@ -16973,6 +16974,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
+		if (prog_type == BPF_PROG_TYPE_VNET_HASH) {
+			verbose(env, "vnet hash progs cannot use bpf_spin_lock yet\n");
+			return -EINVAL;
+		}
+
 		if (is_tracing_prog_type(prog_type)) {
 			verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index a094694899c9..867edbc628de 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10967,6 +10967,17 @@ const struct bpf_prog_ops flow_dissector_prog_ops = {
 	.test_run = bpf_prog_test_run_flow_dissector,
 };
 
+const struct bpf_verifier_ops vnet_hash_verifier_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sk_filter_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
+};
+
+const struct bpf_prog_ops vnet_hash_prog_ops = {
+	.test_run = bpf_prog_test_run_skb,
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0448700890f7..60976fe86247 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_VNET_HASH,
 };
 
 enum bpf_attach_type {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 96ff1aa4bf6a..e74d136eae07 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -209,6 +209,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
 	[BPF_PROG_TYPE_SYSCALL]			= "syscall",
 	[BPF_PROG_TYPE_NETFILTER]		= "netfilter",
+	[BPF_PROG_TYPE_VNET_HASH]		= "vnet_hash",
 };
 
 static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -8858,6 +8859,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 	SEC_DEF("netfilter",		NETFILTER, BPF_NETFILTER, SEC_NONE),
+	SEC_DEF("vnet_hash",		VNET_HASH, 0, SEC_NONE),
 };
 
int libbpf_register_prog_handler(const char *sec,
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0448700890f7..298634556fab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_VNET_HASH,
Sorry, we do not add new stable program types anymore.
@@ -6111,6 +6112,10 @@ struct __sk_buff {
 	__u8 tstamp_type;
 	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
+
+	__u32 vnet_hash_value;
+	__u16 vnet_hash_report;
+	__u16 vnet_rss_queue;
 };
we also do not add anything to uapi __sk_buff.
+const struct bpf_verifier_ops vnet_hash_verifier_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sk_filter_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
+};
and we don't do ctx rewrites like this either.
Please see how hid-bpf and cgroup rstat are hooking up bpf in _unstable_ way.
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
Please see how hid-bpf and cgroup rstat are hooking up bpf in _unstable_ way.
Can you describe what "stable" and "unstable" mean here? I'm new to BPF, and I'm worried that it may refer to interface stability.

Let me describe the context. QEMU bundles an eBPF program that is used for the "eBPF steering program" feature of tun. Now I'm proposing to extend the feature to allow returning some values to the userspace and vhost_net. As such, the extension needs to be done in a way that ensures interface stability.
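For reference, this is roughly how the existing feature is used from the userspace. The sketch assumes prog_fd holds an already-loaded steering program (currently of type BPF_PROG_TYPE_SOCKET_FILTER), and keeps error handling minimal:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/if_tun.h>

static int attach_steering_prog(int tun_fd, int prog_fd)
{
	/* tun then calls the program for each packet to pick a queue */
	if (ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd) < 0) {
		perror("TUNSETSTEERINGEBPF");
		return -1;
	}
	return 0;
}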
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
Can you describe what "stable" and "unstable" mean here? I'm new to BPF, and I'm worried that it may refer to interface stability.

Let me describe the context. QEMU bundles an eBPF program that is used for the "eBPF steering program" feature of tun. Now I'm proposing to extend the feature to allow returning some values to the userspace and vhost_net. As such, the extension needs to be done in a way that ensures interface stability.
bpf is not an option then. we do not add stable bpf program types or hooks any more. If a kernel subsystem wants to use bpf it needs to accept the fact that such bpf extensibility will be unstable and subsystem maintainers can decide to remove such bpf support in the future.
On Mon, Oct 16, 2023 at 7:53 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
bpf is not an option then. we do not add stable bpf program types or hooks any more. If a kernel subsystem wants to use bpf it needs to accept the fact that such bpf extensibility will be unstable and subsystem maintainers can decide to remove such bpf support in the future.
Based on hooks for tracepoints and kfuncs, correct?
Perhaps the existing stable flow dissector program type is extensible to optionally compute the hash and report the hash and hash type. Otherwise, we should probably revisit the previous version of this series.
On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
bpf is not an option then. we do not add stable bpf program types or hooks any more.
Does this mean eBPF could not be used for any new use cases other than the existing ones?
If a kernel subsystem wants to use bpf it needs to accept the fact that such bpf extensibility will be unstable and subsystem maintainers can decide to remove such bpf support in the future.
I don't see how it is different from the existing ones.
Thanks
On Mon, Oct 16, 2023 at 7:38 PM Jason Wang jasowang@redhat.com wrote:
On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
bpf is not an option then. we do not add stable bpf program types or hooks any more.
Does this mean eBPF could not be used for any new use cases other than the existing ones?
It means that any new use of bpf has to be unstable for the time being.
If a kernel subsystem wants to use bpf it needs to accept the fact that such bpf extensibility will be unstable and subsystem maintainers can decide to remove such bpf support in the future.
I don't see how it is different from the existing ones.
Can we remove BPF_CGROUP_RUN_PROG_INET_INGRESS hook along with BPF_PROG_TYPE_CGROUP_SKB program type? Obviously not. We can refactor it. We can move it around, but not remove. That's the difference in stable vs unstable.
On 2023/10/18 4:03, Alexei Starovoitov wrote:
On Mon, Oct 16, 2023 at 7:38 PM Jason Wang jasowang@redhat.com wrote:
On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
bpf is not an option then. we do not add stable bpf program types or hooks any more.
Does this mean eBPF could not be used for any new use cases other than the existing ones?
It means that any new use of bpf has to be unstable for the time being.
Can you elaborate more on making new use unstable "for the time being"? Is it a temporary situation? What is the rationale for that? Such information will help devise a solution that is best for both the BPF and network subsystems.

I would also appreciate it if you have some documentation or a link to relevant discussions on the mailing list. That will avoid repeating discussions you may have already had in the past.
On 2023/10/18 4:19, Akihiko Odaki wrote:
On 2023/10/18 4:03, Alexei Starovoitov wrote:
On Mon, Oct 16, 2023 at 7:38 PM Jason Wang jasowang@redhat.com wrote:
On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/16 1:07, Alexei Starovoitov wrote:
[...]
Hi,
The discussion has been stuck for a month, but I'd still like to continue figuring out the best way for the whole kernel to implement this feature. Let me summarize the current situation and the question that needs to be answered before pushing this forward:

The goal of this RFC is to allow reporting hash values calculated with the eBPF steering program. It's essentially just to report 4 bytes from the kernel to the userspace.

Unfortunately, however, it is not acceptable for the BPF subsystem because the "stable" BPF is completely fixed these days. The "unstable/kfunc" BPF is an alternative, but the eBPF program will be shipped with a portable userspace program (QEMU)[1], so the lack of interface stability is not tolerable.
Another option is to hardcode the algorithm that was conventionally implemented with the eBPF steering program in the kernel[2]. It is possible because the algorithm strictly follows the virtio-net specification[3]. However, there are proposals to add different algorithms to the specification[4], and hardcoding the algorithm in the kernel will require adding more UAPIs and code each time such a specification change happens, which is not good for tuntap.
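(For readers unfamiliar with the algorithm: the queue selection specified in [3] boils down to an indirection table lookup, roughly as sketched below. The field names follow struct virtio_net_rss_config in the specification; this is only an illustration, not the code proposed for the kernel.)

#include <stdbool.h>
#include <stdint.h>

/* hash: Toeplitz hash of the packet; classified: whether the packet
 * matched one of the enabled hash types. */
static uint16_t rss_queue(uint32_t hash, bool classified,
			  const uint16_t *indirection_table,
			  uint16_t indirection_table_mask,
			  uint16_t unclassified_queue)
{
	if (!classified)
		return unclassified_queue;
	return indirection_table[hash & indirection_table_mask];
}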
In short, the proposed feature requires making one of two compromises:
1. Compromise on the BPF side: Relax the "stable" BPF feature freeze once and allow the eBPF steering program to report 4 more bytes to the kernel.
2. Compromise on the tuntap side: Implement the algorithm in the kernel, and abandon the capability to update the algorithm without changing the kernel.
IMHO, I think it's better to make a compromise on the BPF side (option 1). We should minimize the total UAPI changes in the whole kernel, and option 1 is much superior in that sense.
Yet I have to note that such a compromise on the BPF side can make the "stable" BPF feature freeze fragile and let other people complain like: "you allowed changing stable BPF for this, why do you reject [some other request to change stable BPF]?" That is bad for the BPF maintainers. (I can imagine that introducing and maintaining widely different BPF interfaces is too much of a burden.) And, of course, this requires approval from the BPF maintainers.
So I'd like to ask you which of these compromises you think is worse. Please also tell me if you have another idea.
Regards, Akihiko Odaki
[1] https://qemu.readthedocs.io/en/v8.1.0/devel/ebpf_rss.html [2] https://lore.kernel.org/all/20231008052101.144422-1-akihiko.odaki@daynix.com... [3] https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#... [4] https://lore.kernel.org/all/CACGkMEuBbGKssxNv5AfpaPpWQfk2BHR83rM5AHXN-YVMf2N...
Hi,
A few rookie questions below.
On Sat, Nov 18, 2023 at 2:39 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/18 4:19, Akihiko Odaki wrote:
On 2023/10/18 4:03, Alexei Starovoitov wrote:
[...]
The goal of this RFC is to allow reporting hash values calculated with the eBPF steering program. It's essentially just to report 4 bytes from the kernel to the userspace.
AFAICT, the proposed design is to have BPF generate some data (namely a hash, but it could be anything afaict) and consume it from user space. Instead of updating __sk_buff, can we have the user space fetch the data/hash from a bpf map? If this is an option, I guess we can implement the same feature with BPF tracing programs?
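Something like the following is what I have in mind; the pin path and map layout are made up for illustration, and per-packet synchronization is the open problem:

#include <stdio.h>
#include <bpf/bpf.h>
#include <linux/types.h>

int read_hash(void)
{
	__u32 key = 0, hash;
	/* hypothetical pin path for a map the program updates per packet */
	int map_fd = bpf_obj_get("/sys/fs/bpf/vnet_hash_map");

	if (map_fd < 0 || bpf_map_lookup_elem(map_fd, &key, &hash))
		return -1;
	printf("hash: 0x%x\n", hash);
	return 0;
}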
Unfortunately, however, it is not acceptable for the BPF subsystem because the "stable" BPF is completely fixed these days. The "unstable/kfunc" BPF is an alternative, but the eBPF program will be shipped with a portable userspace program (QEMU)[1] so the lack of interface stability is not tolerable.
bpf kfuncs are as stable as exported symbols. Is exported-symbol-like stability enough for the use case? (I would assume yes.)
Another option is to hardcode the algorithm that was conventionally implemented with the eBPF steering program in the kernel[2]. It is possible because the algorithm strictly follows the virtio-net specification[3]. However, there are proposals to add different algorithms to the specification[4], and hardcoding the algorithm in the kernel will require adding more UAPIs and code each time such a specification change happens, which is not good for tuntap.
The requirement looks similar to hid-bpf. Could you explain why that model is not enough? HID also requires some stability AFAICT.
Thanks, Song
On 2023/11/19 1:08, Song Liu wrote:
Hi,
A few rookie questions below.
Thanks for the questions.
On Sat, Nov 18, 2023 at 2:39 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/10/18 4:19, Akihiko Odaki wrote:
On 2023/10/18 4:03, Alexei Starovoitov wrote:
[...]
AFAICT, the proposed design is to have BPF generate some data (namely a hash, but it could be anything afaict) and consume it from user space. Instead of updating __sk_buff, can we have the user space fetch the data/hash from a bpf map? If this is an option, I guess we can implement the same feature with BPF tracing programs?
Unfortunately no. The communication with the userspace can be done with two different means:
- usual socket read/write
- vhost for direct interaction with a KVM guest
The BPF map may be a valid option for socket read/write, but it is not for vhost. In-kernel vhost may fetch the hash from the BPF map, but I guess that is not a standard way to have an interaction between the kernel code and a BPF program.
[...]
The requirement looks similar to hid-bpf. Could you explain why that model is not enough? HID also requires some stability AFAICT.
I have little knowledge of hid-bpf, but I assume it is more like a "safe" kernel module; in my understanding, it affects the system state and is intended to be loaded by some kind of system daemon. It is fine to have the same lifecycle as the kernel for such a BPF program; whenever the kernel is updated, the distributor can recompile the BPF program with the new kernel headers and ship it along with the kernel, just like a kernel module.

In contrast, our intended use case is more like a normal application. So, for example, a user may download a container and run QEMU (including the BPF program) installed in the container. As such, it is nice if the ABI is stable across kernel releases, but that is not guaranteed for kfuncs. Such a use case is already covered with the eBPF steering program, so I want to maintain it if possible.
Regards, Akihiko Odaki
On Sun, Nov 19, 2023 at 12:03 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
Unfortunately no. The communication with the userspace can be done with two different means:
- usual socket read/write
- vhost for direct interaction with a KVM guest
The BPF map may be a valid option for socket read/write, but it is not for vhost. In-kernel vhost may fetch the hash from the BPF map, but I guess that is not a standard way to have an interaction between the kernel code and a BPF program.
I am very new to areas like vhost and KVM. So I don't really follow. Does this mean we have the guest kernel reading data from host eBPF programs (loaded by Qemu)?
[...]
I have little knowledge of hid-bpf, but I assume it is more like a "safe" kernel module; in my understanding, it affects the system state and is intended to be loaded by some kind of system daemon. It is fine to have the same lifecycle as the kernel for such a BPF program; whenever the kernel is updated, the distributor can recompile the BPF program with the new kernel headers and ship it along with the kernel, just like a kernel module.

In contrast, our intended use case is more like a normal application. So, for example, a user may download a container and run QEMU (including the BPF program) installed in the container. As such, it is nice if the ABI is stable across kernel releases, but that is not guaranteed for kfuncs. Such a use case is already covered with the eBPF steering program, so I want to maintain it if possible.
TBH, I don't think stability should be a concern for kfuncs used by QEMU. Many core BPF APIs are now implemented as kfuncs: bpf_dynptr_*, bpf_rcu_*, etc. As long as there are valid use cases, these kfuncs will be supported.
Thanks, Song
On 2023/11/20 6:02, Song Liu wrote:
On Sun, Nov 19, 2023 at 12:03 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
I am very new to areas like vhost and KVM. So I don't really follow. Does this mean we have the guest kernel reading data from host eBPF programs (loaded by Qemu)?
Yes, the guest will read hashes calculated by the host, and the interface is strictly defined by the virtio-net specification.
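(Specifically, the hash is carried in the extended virtio-net header defined by the specification; abridged from include/uapi/linux/virtio_net.h:)

struct virtio_net_hdr_v1_hash {
	struct virtio_net_hdr_v1 hdr;
	__le32 hash_value;	/* filled by the host, read by the guest */
	__le16 hash_report;
	__le16 padding;
};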
[...]
TBH, I don't think stability should be a concern for kfuncs used by QEMU. Many core BPF APIs are now implemented as kfuncs: bpf_dynptr_*, bpf_rcu_*, etc. As long as there are valid use cases, these kfuncs will be supported.
Documentation/bpf/kfuncs.rst still says:
kfuncs provide a kernel <-> kernel API, and thus are not bound by any of the strict stability restrictions associated with kernel <-> user UAPIs.
Is it possible to change the statement to something like the following: "Most kfuncs provide a kernel <-> kernel API, and thus are not bound by any of the strict stability restrictions associated with kernel <-> user UAPIs. kfuncs that have the same stability restrictions associated with UAPIs are exceptional, and must be carefully reviewed by subsystem (and BPF?) maintainers as any other UAPIs are."
Regards, Akihiko Odaki
On Mon, Nov 20, 2023 at 12:05 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/11/20 6:02, Song Liu wrote:
[...]
Documentation/bpf/kfuncs.rst still says:
kfuncs provide a kernel <-> kernel API, and thus are not bound by any of the strict stability restrictions associated with kernel <-> user UAPIs.
Is it possible to change the statement to something like the following: "Most kfuncs provide a kernel <-> kernel API, and thus are not bound by any of the strict stability restrictions associated with kernel <-> user UAPIs. kfuncs that have the same stability restrictions associated with UAPIs are exceptional, and must be carefully reviewed by subsystem (and BPF?) maintainers as any other UAPIs are."
I am afraid this is against the intention to not guarantee UAPI-level stability for kfuncs.
Thanks, Song
On 2023/11/22 14:25, Song Liu wrote:
On Mon, Nov 20, 2023 at 12:05 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/11/20 6:02, Song Liu wrote:
[...]
I am afraid this is against the intention to not guarantee UAPI-level stability for kfuncs.
Is it possible to ensure that a QEMU binary with the eBPF program included works on different kernel versions without UAPI-level stability, then? Otherwise, I think we need to think of the minimal UAPI addition that exposes the feature I propose, and the two options I presented first are the candidates for that: the stable BPF change or the tuntap interface change.
Regards, Akihiko Odaki
On 2023/11/22 14:36, Akihiko Odaki wrote:
On 2023/11/22 14:25, Song Liu wrote:
On Mon, Nov 20, 2023 at 12:05 AM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/11/20 6:02, Song Liu wrote:
[...]
Now the discussion is stale again, so let me summarize it:
A tuntap device can have an eBPF steering program to let the userspace decide which tuntap queue should be used for each packet. QEMU uses this feature to implement the RSS algorithm for virtio-net emulation. Now, the virtio specification has a new feature to report hash values calculated with the RSS algorithm. The goal of this RFC is to report such hash values from the eBPF steering program to the userspace.
There are currently three ideas to implement the proposal:
1. Abandon eBPF steering program and implement RSS in the kernel.
It is possible to implement the RSS algorithm in the kernel as it's strictly defined in the specification. However, there are proposals for relevant virtio specification changes, and abandoning the eBPF steering program will lose the ability to implement those changes in the userspace. There are concerns that this will lead to more UAPI changes in the end.
2. Add BPF kfuncs.
Adding BPF kfuncs is *the* standard way to add BPF interfaces. hid-bpf is a good reference for this.
The problem with BPF kfuncs is that kfuncs are not considered as stable as UAPI. In my understanding, it is not problematic for things like hid-bpf because programs using those kfuncs affect the entire system state and are expected to be centrally managed. Such BPF programs can be updated along with the kernel in a manner similar to kernel modules.
The use case of tuntap steering/hash reporting is somewhat different though; the eBPF program is more like a part of an application (QEMU or potentially another VMM) and thus needs to be portable. For example, a user may expect a Debian container with QEMU installed to work on Fedora.
BPF kfuncs do still provide some level of stability, but there is no documentation that tells how stable they are. The worst case scenario I can imagine is that a future legitimate BPF change breaks QEMU, letting the "no regressions" rule force the change to be reverted. Some assurance that that kind of scenario will not happen is necessary in my opinion. (A rough sketch of what such a kfunc could look like follows at the end of this message.)
3. Add BPF program type derived from the conventional steering program type
In principle, it's just adding a feature to report four more bytes to the conventional steering program. However, BPF program types are frozen for feature additions, and the proposed change will break the feature freeze.
So what's next? I'm inclined to option 3 due to its minimal ABI/API change, but I'm also fine with option 2 if it is possible to guarantee the ABI/API stability necessary to run pre-built QEMUs on future kernel versions by e.g., explicitly stating the stability of kfuncs. If no objection arises, I'll resend this series with the RFC prefix dropped for upstream inclusion. If it's decided to go for option 1 or 2, I'll post a new version of the series implementing the idea.
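For concreteness, option 2 would presumably look something like the sketch below. The function name, signature, and semantics are entirely hypothetical and only meant to anchor the discussion; nothing like this exists in the kernel or in this series:

#include <linux/skbuff.h>
#include <linux/btf.h>

/* Hypothetical tun kfunc that a steering program would call to report the
 * hash for the packet it is currently steering. */
__bpf_kfunc void bpf_tun_report_vnet_hash(struct sk_buff *skb, u32 hash_value,
					  u16 hash_report)
{
	/* e.g. stash the values in an skb extension so that tun/vhost_net
	 * can later fill them into the virtio-net header */
}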
Regards, Akihiko Odaki
On Sat, Dec 9, 2023 at 11:03 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/11/22 14:36, Akihiko Odaki wrote:
On 2023/11/22 14:25, Song Liu wrote:
[...]
BPF kfuncs do still provide some level of stability, but there is no documentation that tells how stable they are. The worst case scenario I can imagine is that a future legitimate BPF change breaks QEMU, letting the "no regressions" rule force the change to be reverted. Some assurance that that kind of scenario will not happen is necessary in my opinion.
I don't think we can provide stability guarantees before seeing something being used in the field. How do we know it will be useful forever? If a couple years later, there is only one person using it somewhere in the world, why should we keep supporting it? If there are millions of virtual machines using it, why would you worry about it being removed?
[...]
So what's next? I'm inclined to option 3 due to its minimal ABI/API change, but I'm also fine with option 2 if it is possible to guarantee the ABI/API stability necessary to run pre-built QEMUs on future kernel versions by e.g., explicitly stating the stability of kfuncs. If no objection arises, I'll resend this series with the RFC prefix dropped for upstream inclusion. If it's decided to go for option 1 or 2, I'll post a new version of the series implementing the idea.
Probably a dumb question, but does this RFC fall into option 3? If that's the case, I seriously don't think it's gonna happen.
I would recommend you give option 2 a try and share the code. This is probably the best way to move the discussion forward.
Thanks, Song
On 2023/12/11 10:40, Song Liu wrote:
On Sat, Dec 9, 2023 at 11:03 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
On 2023/11/22 14:36, Akihiko Odaki wrote:
On 2023/11/22 14:25, Song Liu wrote:
[...]
I don't think we can provide stability guarantees before seeing something being used in the field. How do we know it will be useful forever? If a couple years later, there is only one person using it somewhere in the world, why should we keep supporting it? If there are millions of virtual machines using it, why would you worry about it being removed?
I have a different opinion about providing stability guarantees; I believe it is safe to provide such a guarantee without actual use in the field. We develop features expecting there are real uses, and if it turns out otherwise, we can break the stated guarantee since there are no real use cases anyway. It is fine even to break UAPIs in such a case, which is stated in Documentation/admin-guide/reporting-regressions.rst.

So I rather feel easy about guaranteeing UAPI stability; we can just guarantee UAPI-level stability for a particular kfunc and use it from QEMU expecting that stability. If the feature is found not useful, QEMU and the kernel can just remove it.

I'm more concerned about the other case, which means that there will be wide uses of this feature. A kernel developer may assume the stability of the interface is like that of kernel internal APIs (Documentation/bpf/kfuncs.rst says kfuncs are like EXPORT_SYMBOL_GPL) and decide to change it, breaking old QEMU binaries, and that's something I would like to avoid.

Regarding the breakage scenario, I think we can avoid kfunc removal just by saying "we won't remove them". I'm more worried about the case where a change in the BPF kfunc infrastructure requires recompiling the binary.
So, in short, I don't think we can say "kfuncs are like EXPORT_SYMBOL_GPL" and "you can freely use kfuncs in a normal userspace application like QEMU" at the same time.
[...]
Probably a dumb question, but does this RFC fall into option 3? If that's the case, I seriously don't think it's gonna happen.
Yes, it's option 3.
I would recommend you give option 2 a try and share the code. This is probably the best way to move the discussion forward.
In that case, I'd like to add a documentation change saying the added kfuncs are exceptional cases that are not like EXPORT_SYMBOL_GPL. Will that work?
Regards, Akihiko Odaki
On Sun, Dec 10, 2023 at 9:04 PM Akihiko Odaki akihiko.odaki@daynix.com wrote:
[...]
I don't think we can provide stability guarantees before seeing something being used in the field. How do we know it will be useful forever? If a couple years later, there is only one person using it somewhere in the world, why should we keep supporting it? If there are millions of virtual machines using it, why would you worry about it being removed?
I have a different opinion about providing stability guarantees; I believe it is safe to provide such a guarantee without actual use in the field. We develop features expecting there are real uses, and if it turns out otherwise, we can break the stated guarantee since there are no real use cases anyway. Even breaking UAPIs is fine in such a case, as stated in Documentation/admin-guide/reporting-regressions.rst.
So I feel rather relaxed about guaranteeing UAPI stability; we can just guarantee UAPI-level stability for a particular kfunc and use it from QEMU expecting that stability. If the feature is found not useful, QEMU and the kernel can simply remove it.
It appears that we more or less agree that this feature may not be something we will support forever.
I'm more concerned about the other case, where this feature sees wide use. A kernel developer may assume the stability of the interface is like that of kernel-internal APIs (Documentation/bpf/kfuncs.rst says kfuncs are like EXPORT_SYMBOL_GPL) and decide to change it, breaking old QEMU binaries; that is something I would like to avoid.
Regarding the breakage scenario, I think we can avoid the removal of kfuncs just by saying "we won't remove them". I'm more worried about the case where a change in the BPF kfunc infrastructure requires recompiling the binary.
So, in short, I don't think we can say "kfuncs are like EXPORT_SYMBOL_GPL" and "you can freely use kfuncs in a normal userspace application like QEMU" at the same time.
[...]
I would recommend you give option 2 a try and share the code. This is probably the best way to move the discussion forward.
In that case, I'd like to add a documentation change saying the added kfuncs are exceptional cases that are not like EXPORT_SYMBOL_GPL. Will that work?
It will not.
The BPF community had a lot of discussions about the stability of BPF APIs, for example in [1]. Therefore, this is not a light decision.
AFAICT, what is being proposed in this RFC is way less stable than many kfuncs we added recently. We are not changing the stability guarantee for this. Let's invest our time wisely and work on more meaningful things, for example, a prototype that may actually get accepted.
Thanks, Song
[1] https://lore.kernel.org/bpf/20221207205537.860248-1-joannelkoong@gmail.com/
They will be used only by BPF_PROG_TYPE_VNET_HASH programs to tell tun the queues to deliver packets to and the hash values and types to report with virtio-net headers.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- include/linux/filter.h | 7 ++++ net/core/filter.c | 77 +++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 4 ++ 3 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index bf7ad887943c..d10afe92ee45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,6 +643,13 @@ struct bpf_skb_data_end { void *data_end; };
+struct bpf_skb_vnet_hash_end { + struct qdisc_skb_cb qdisc_cb; + u32 hash_value; + u16 hash_report; + u16 rss_queue; +}; + struct bpf_nh_params { u32 nh_family; union { diff --git a/net/core/filter.c b/net/core/filter.c index 867edbc628de..35bc60b71722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8435,9 +8435,15 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): if (size != size_default) return false; break; + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): + if (size != sizeof(__u16)) + return false; + break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): @@ -8473,7 +8479,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return true; }
-static bool sk_filter_is_valid_access(int off, int size, +static bool vnet_hash_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -8493,6 +8499,9 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): break; default: return false; @@ -8502,6 +8511,21 @@ static bool sk_filter_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); }
+static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): + return false; + } + + return vnet_hash_is_valid_access(off, size, type, prog, info); +} + static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -8511,6 +8535,9 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -8558,6 +8585,9 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): return false; }
@@ -8799,6 +8829,10 @@ static bool tc_cls_act_is_valid_access(int off, int size, }
switch (off) { + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): + return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -9117,6 +9151,9 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): + case bpf_ctx_range(struct __sk_buff, vnet_hash_value): + case bpf_ctx_range(struct __sk_buff, vnet_hash_report): + case bpf_ctx_range(struct __sk_buff, vnet_rss_queue): return false; }
@@ -9727,6 +9764,42 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, hwtstamps, 8, target_size)); break; + + case offsetof(struct __sk_buff, vnet_hash_value): + BUILD_BUG_ON(sizeof_field(struct bpf_skb_vnet_hash_end, hash_value) != 4); + + off = offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_vnet_hash_end, hash_value); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, vnet_hash_report): + BUILD_BUG_ON(sizeof_field(struct bpf_skb_vnet_hash_end, hash_report) != 2); + + off = offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_vnet_hash_end, hash_report); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, vnet_rss_queue): + BUILD_BUG_ON(sizeof_field(struct bpf_skb_vnet_hash_end, rss_queue) != 2); + + off = offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_vnet_hash_end, rss_queue); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); + break; }
return insn - insn_buf; @@ -10969,7 +11042,7 @@ const struct bpf_prog_ops flow_dissector_prog_ops = {
const struct bpf_verifier_ops vnet_hash_verifier_ops = { .get_func_proto = sk_filter_func_proto, - .is_valid_access = sk_filter_is_valid_access, + .is_valid_access = vnet_hash_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 60976fe86247..298634556fab 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6112,6 +6112,10 @@ struct __sk_buff { __u8 tstamp_type; __u32 :24; /* Padding, future use. */ __u64 hwtstamp; + + __u32 vnet_hash_value; + __u16 vnet_hash_report; + __u16 vnet_rss_queue; };
struct bpf_tunnel_key {
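For illustration, a minimal program of this type might look as follows. This is an untested sketch: it assumes skb->hash is already populated, takes the VIRTIO_NET_HASH_REPORT_TCPv4 constant (1) from the virtio specification, and hard-codes two queues; a real VMM would compute a Toeplitz hash over the negotiated hash types instead.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define VIRTIO_NET_HASH_REPORT_TCPv4 1 /* per the virtio spec */

SEC("vnet_hash")
int steer(struct __sk_buff *skb)
{
	/* Report the flow hash and steer by its low bit (two queues). */
	skb->vnet_hash_value = skb->hash;
	skb->vnet_hash_report = VIRTIO_NET_HASH_REPORT_TCPv4;
	skb->vnet_rss_queue = skb->hash & 1;

	return 0;
}

char _license[] SEC("license") = "GPL";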
This new extension will be used by tun to carry the hash values and types to report with virtio-net headers.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- include/linux/skbuff.h | 10 ++++++++++ net/core/skbuff.c | 3 +++ 2 files changed, 13 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4174c4b82d13..1f2e5d350810 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -333,6 +333,13 @@ struct tc_skb_ext { }; #endif
+#if IS_ENABLED(CONFIG_TUN) +struct tun_vnet_hash { + u32 value; + u16 report; +}; +#endif + struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, @@ -4631,6 +4638,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_TUN) + SKB_EXT_TUN_VNET_HASH, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4eaf7ed0d1f4..774c2b26bf25 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4793,6 +4793,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_TUN) + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct tun_vnet_hash), +#endif };
static __always_inline unsigned int skb_ext_total_length(void)
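As a usage sketch (not part of this patch; the function names are hypothetical), a producer pairs skb_ext_add() with a consumer's skb_ext_find():

static void tun_vnet_hash_stash(struct sk_buff *skb, u32 value, u16 report)
{
	struct tun_vnet_hash *ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);

	/* skb_ext_add() may fail to allocate; the hash is then simply
	 * not reported.
	 */
	if (ext) {
		ext->value = value;
		ext->report = report;
	}
}

static u16 tun_vnet_hash_fetch(const struct sk_buff *skb, u32 *value)
{
	const struct tun_vnet_hash *ext =
		skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);

	if (!ext)
		return 0; /* VIRTIO_NET_HASH_REPORT_NONE */

	*value = ext->value;
	return ext->report;
}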
It is identical to virtio_net_hdr_from_skb() except that it implements hash reporting.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- include/linux/virtio_net.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 7b4dd69555e4..01e594b4586b 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -216,4 +216,26 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, return 0; }
+static inline int virtio_net_hdr_v1_hash_from_skb(const struct sk_buff *skb, + struct virtio_net_hdr_v1_hash *hdr, + bool little_endian, + bool has_data_valid, + int vlan_hlen, + u32 hash_value, + u16 hash_report) +{ + int ret; + + memset(hdr, 0, sizeof(*hdr)); + + ret = virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr, + little_endian, has_data_valid, vlan_hlen); + if (!ret) { + hdr->hash_value = cpu_to_le32(hash_value); + hdr->hash_report = cpu_to_le16(hash_report); + } + + return ret; +} + #endif /* _LINUX_VIRTIO_NET_H */
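A hypothetical caller (the tun patch below contains the real one) would fill the extended header like this, with hash_value and hash_report carrying the BPF program's output:

	struct virtio_net_hdr_v1_hash hdr;

	/* little_endian per VIRTIO_F_VERSION_1; no DATA_VALID flag and
	 * no VLAN header adjustment in this sketch
	 */
	if (virtio_net_hdr_v1_hash_from_skb(skb, &hdr, true, false, 0,
					    hash_value, hash_report))
		return -EINVAL;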
Support BPF_PROG_TYPE_VNET_HASH with TUNSETSTEERINGEBPF ioctl to make it possible to report hash values and types when steering packets.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- drivers/net/tun.c | 158 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 117 insertions(+), 41 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 89ab9efe522c..e0b453572a64 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -543,19 +543,37 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { + struct bpf_skb_vnet_hash_end *cb = (struct bpf_skb_vnet_hash_end *)skb->cb; + struct tun_vnet_hash *ext; struct tun_prog *prog; u32 numqueues; - u16 ret = 0; + u16 queue = 0; + + BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb));
numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0;
prog = rcu_dereference(tun->steering_prog); - if (prog) - ret = bpf_prog_run_clear_cb(prog->prog, skb); + if (prog) { + if (prog->prog->type == BPF_PROG_TYPE_VNET_HASH) { + memset(skb->cb, 0, sizeof(*cb) - sizeof(struct qdisc_skb_cb)); + bpf_prog_run_clear_cb(prog->prog, skb); + + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); + if (ext) { + ext->value = cb->hash_value; + ext->report = cb->hash_report; + }
- return ret % numqueues; + queue = cb->rss_queue; + } else { + queue = bpf_prog_run_clear_cb(prog->prog, skb); + } + } + + return queue % numqueues; }
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, @@ -2116,31 +2134,74 @@ static ssize_t tun_put_user(struct tun_struct *tun, }
if (vnet_hdr_sz) { - struct virtio_net_hdr gso; + struct bpf_skb_vnet_hash_end *cb = (struct bpf_skb_vnet_hash_end *)skb->cb; + struct tun_prog *prog; + struct tun_vnet_hash *vnet_hash_p; + struct tun_vnet_hash vnet_hash; + size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr); + union { + struct virtio_net_hdr hdr; + struct virtio_net_hdr_v1_hash hdr_v1_hash; + } vnet_hdr; + int ret;
if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL;
- if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true, - vlan_hlen)) { + if (vnet_hdr_sz >= sizeof(struct virtio_net_hdr_v1_hash)) { + vnet_hash_p = skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH); + if (vnet_hash_p) { + vnet_hash = *vnet_hash_p; + vnet_hdr_content_sz = sizeof(struct virtio_net_hdr_v1_hash); + } else { + rcu_read_lock(); + prog = rcu_dereference(tun->steering_prog); + if (prog && prog->prog->type == BPF_PROG_TYPE_VNET_HASH) { + memset(skb->cb, 0, + sizeof(*cb) - sizeof(struct qdisc_skb_cb)); + bpf_prog_run_clear_cb(prog->prog, skb); + vnet_hash.value = cb->hash_value; + vnet_hash.report = cb->hash_report; + vnet_hdr_content_sz = + sizeof(struct virtio_net_hdr_v1_hash); + } + rcu_read_unlock(); + } + } + + switch (vnet_hdr_content_sz) { + case sizeof(struct virtio_net_hdr): + ret = virtio_net_hdr_from_skb(skb, &vnet_hdr.hdr, + tun_is_little_endian(tun), true, + vlan_hlen); + break; + + case sizeof(struct virtio_net_hdr_v1_hash): + ret = virtio_net_hdr_v1_hash_from_skb(skb, &vnet_hdr.hdr_v1_hash, + tun_is_little_endian(tun), true, + vlan_hlen, + vnet_hash.value, vnet_hash.report); + break; + } + + if (ret) { struct skb_shared_info *sinfo = skb_shinfo(skb); pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); + sinfo->gso_type, tun16_to_cpu(tun, vnet_hdr.hdr.gso_size), + tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + min((int)tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64), true); WARN_ON_ONCE(1); return -EINVAL; }
- if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) + if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) return -EFAULT;
- iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + iov_iter_advance(iter, vnet_hdr_sz - vnet_hdr_content_sz); }
if (vlan_hlen) { @@ -2276,13 +2337,13 @@ static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
- bpf_prog_destroy(prog->prog); + bpf_prog_put(prog->prog); kfree(prog); }
-static int __tun_set_ebpf(struct tun_struct *tun, - struct tun_prog __rcu **prog_p, - struct bpf_prog *prog) +static int tun_set_ebpf(struct tun_struct *tun, + struct tun_prog __rcu **prog_p, + struct bpf_prog *prog) { struct tun_prog *old, *new = NULL;
@@ -2314,8 +2375,8 @@ static void tun_free_netdev(struct net_device *dev) free_percpu(dev->tstats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); - __tun_set_ebpf(tun, &tun->steering_prog, NULL); - __tun_set_ebpf(tun, &tun->filter_prog, NULL); + tun_set_ebpf(tun, &tun->steering_prog, NULL); + tun_set_ebpf(tun, &tun->filter_prog, NULL); }
static void tun_setup(struct net_device *dev) @@ -3007,26 +3068,6 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) return ret; }
-static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, - void __user *data) -{ - struct bpf_prog *prog; - int fd; - - if (copy_from_user(&fd, data, sizeof(fd))) - return -EFAULT; - - if (fd == -1) { - prog = NULL; - } else { - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } - - return __tun_set_ebpf(tun, prog_p, prog); -} - /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { @@ -3077,6 +3118,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, struct ifreq ifr; kuid_t owner; kgid_t group; + struct bpf_prog *prog; + int fd; int sndbuf; int vnet_hdr_sz; int le; @@ -3360,11 +3403,44 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break;
case TUNSETSTEERINGEBPF: - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); + if (copy_from_user(&fd, argp, sizeof(fd))) { + ret = -EFAULT; + break; + } + + if (fd == -1) { + prog = NULL; + } else { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_VNET_HASH); + if (IS_ERR(prog)) { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + break; + } + } + } + + ret = tun_set_ebpf(tun, &tun->steering_prog, prog); break;
case TUNSETFILTEREBPF: - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); + if (copy_from_user(&fd, argp, sizeof(fd))) { + ret = -EFAULT; + break; + } + + if (fd == -1) { + prog = NULL; + } else { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + break; + } + } + + ret = tun_set_ebpf(tun, &tun->filter_prog, prog); break;
case TUNSETCARRIER:
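On the userspace side, attaching a program of the new type reuses the existing TUNSETSTEERINGEBPF ioctl. A rough sketch with libbpf (object path, program name, and error handling are illustrative; assumes libbpf 1.x):

#include <sys/ioctl.h>
#include <linux/if_tun.h>
#include <bpf/libbpf.h>

static int attach_vnet_hash(int tun_fd, const char *obj_path)
{
	struct bpf_object *obj = bpf_object__open_file(obj_path, NULL);
	struct bpf_program *prog;
	int prog_fd;

	if (!obj || bpf_object__load(obj))
		return -1;

	prog = bpf_object__find_program_by_name(obj, "prog");
	if (!prog)
		return -1;

	prog_fd = bpf_program__fd(prog);

	/* tun first tries BPF_PROG_TYPE_VNET_HASH and falls back to
	 * BPF_PROG_TYPE_SOCKET_FILTER, so old steering programs keep
	 * working unchanged.
	 */
	return ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd);
}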
The added tests will ensure that the new relevant members of struct __sk_buff are initialized to 0, that the members are properly interpreted by tun, and that tun checks the virtio-net header size before reporting the hash values and types the BPF program computed.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/config.aarch64 | 1 - .../selftests/bpf/prog_tests/vnet_hash.c | 385 ++++++++++++++++++ tools/testing/selftests/bpf/progs/vnet_hash.c | 16 + 4 files changed, 402 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/vnet_hash.c create mode 100644 tools/testing/selftests/bpf/progs/vnet_hash.c
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index e41eb33b2704..c05defa83b44 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -10,6 +10,7 @@ CONFIG_BPF_LSM=y CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_SYSCALL=y # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set +CONFIG_BRIDGE=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index 253821494884..1bf6375ac7f3 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -17,7 +17,6 @@ CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_PRELOAD_UMD=y CONFIG_BPF_PRELOAD=y -CONFIG_BRIDGE=m CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_FREEZER=y diff --git a/tools/testing/selftests/bpf/prog_tests/vnet_hash.c b/tools/testing/selftests/bpf/prog_tests/vnet_hash.c new file mode 100644 index 000000000000..4d71d7b5adc6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/vnet_hash.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include <net/if.h> +#include <sched.h> + +#include "test_progs.h" +#include "vnet_hash.skel.h" + +#include <linux/if_arp.h> +#include <linux/if_tun.h> +#include <linux/sockios.h> +#include <linux/virtio_net.h> + +#define TUN_HWADDR_SOURCE { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00 } +#define TUN_HWADDR_DEST { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 } + +#define TUN_IPADDR_SOURCE htonl((172 << 24) | (17 << 16) | 0) +#define TUN_IPADDR_DEST htonl((172 << 24) | (17 << 16) | 1) + +struct payload { + struct ethhdr ethhdr; + struct arphdr arphdr; + unsigned char sender_hwaddr[6]; + uint32_t sender_ipaddr; + unsigned char target_hwaddr[6]; + uint32_t target_ipaddr; +} __packed; + +static bool bpf_setup(struct vnet_hash **skel) +{ + *skel = vnet_hash__open(); + if (!ASSERT_OK_PTR(*skel, __func__)) + return false; + + if (!ASSERT_OK(vnet_hash__load(*skel), __func__)) { + vnet_hash__destroy(*skel); + return false; + } + + return true; +} + +static void bpf_teardown(struct vnet_hash *skel) +{ + vnet_hash__destroy(skel); +} + +static bool local_setup(int *fd) +{ + *fd = socket(AF_LOCAL, SOCK_STREAM, 0); + return ASSERT_GE(*fd, 0, __func__); +} + +static bool local_set_flags(int fd, const char *name, short flags) +{ + struct ifreq ifreq = { .ifr_flags = flags }; + + strcpy(ifreq.ifr_name, name); + + return ASSERT_OK(ioctl(fd, SIOCSIFFLAGS, &ifreq), __func__); +} + +static void local_teardown(int fd) +{ + ASSERT_OK(close(fd), __func__); +} + +static bool bridge_setup(int local_fd) +{ + if (!ASSERT_OK(ioctl(local_fd, SIOCBRADDBR, "xbridge"), __func__)) + return false; + + return local_set_flags(local_fd, "xbridge", IFF_UP); +} + +static bool bridge_add_if(int local_fd, const char *name) +{ + struct ifreq ifreq = { + .ifr_name = "xbridge", + .ifr_ifindex = if_nametoindex(name) + }; + + if (!ASSERT_NEQ(ifreq.ifr_ifindex, 0, __func__)) + return false; + + return ASSERT_OK(ioctl(local_fd, SIOCBRADDIF, &ifreq), __func__); +} + +static void bridge_teardown(int local_fd) +{ + if (!local_set_flags(local_fd, "xbridge", 0)) + return; + + ASSERT_OK(ioctl(local_fd, SIOCBRDELBR, "xbridge"), __func__); +} + +static bool tun_open(int *fd, char *ifname, short flags) +{ + struct ifreq ifr; + + *fd = open("/dev/net/tun", O_RDWR); + if (!ASSERT_GE(*fd, 0, __func__)) + return false; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, ifname); + ifr.ifr_flags = flags | 
IFF_TAP | IFF_NAPI | IFF_NO_PI | + IFF_MULTI_QUEUE; + + if (!ASSERT_OK(ioctl(*fd, TUNSETIFF, (void *) &ifr), __func__)) { + ASSERT_OK(close(*fd), __func__); + return false; + } + + strcpy(ifname, ifr.ifr_name); + + return true; +} + +static bool tun_source_setup(int local_fd, int *fd) +{ + char ifname[IFNAMSIZ]; + + ifname[0] = 0; + if (!tun_open(fd, ifname, 0)) + return false; + + if (!bridge_add_if(local_fd, ifname)) { + ASSERT_OK(close(*fd), __func__); + return false; + } + + if (!local_set_flags(local_fd, ifname, IFF_UP)) { + ASSERT_OK(close(*fd), __func__); + return false; + } + + return true; +} + +static void tun_source_teardown(int fd) +{ + ASSERT_OK(close(fd), __func__); +} + +static bool tun_dest_setup(int local_fd, struct vnet_hash *bpf, + int *fd, char *ifname) +{ + struct { + struct virtio_net_hdr vnet_hdr; + struct payload payload; + } __packed packet = { + .payload = { + .ethhdr = { + .h_source = TUN_HWADDR_DEST, + .h_dest = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .h_proto = htons(ETH_P_ARP) + }, + .arphdr = { + .ar_hrd = htons(ARPHRD_ETHER), + .ar_pro = htons(ETH_P_IP), + .ar_hln = ETH_ALEN, + .ar_pln = 4, + .ar_op = htons(ARPOP_REQUEST) + }, + .sender_hwaddr = TUN_HWADDR_DEST, + .sender_ipaddr = TUN_IPADDR_DEST, + .target_ipaddr = TUN_IPADDR_DEST + } + }; + + int bpf_fd = bpf_program__fd(bpf->progs.prog); + + ifname[0] = 0; + if (!tun_open(fd, ifname, IFF_VNET_HDR)) + return false; + + if (!ASSERT_OK(ioctl(*fd, TUNSETSTEERINGEBPF, &bpf_fd), __func__)) + goto fail; + + if (!bridge_add_if(local_fd, ifname)) + goto fail; + + if (!local_set_flags(local_fd, ifname, IFF_UP)) + goto fail; + + if (!ASSERT_EQ(write(*fd, &packet, sizeof(packet)), sizeof(packet), __func__)) + goto fail; + + return true; + +fail: + ASSERT_OK(close(*fd), __func__); + return false; +} + +static void tun_dest_teardown(int fd) +{ + ASSERT_OK(close(fd), __func__); +} + +static bool tun_dest_queue_setup(char *ifname, int *fd) +{ + return tun_open(fd, ifname, IFF_VNET_HDR); +} + +static void tun_dest_queue_teardown(int fd) +{ + ASSERT_OK(close(fd), __func__); +} + +static void *test_vnet_hash_thread(void *arg) +{ + struct payload sent = { + .ethhdr = { + .h_source = TUN_HWADDR_SOURCE, + .h_dest = TUN_HWADDR_DEST, + .h_proto = htons(ETH_P_ARP) + }, + .arphdr = { + .ar_hrd = htons(ARPHRD_ETHER), + .ar_pro = htons(ETH_P_IP), + .ar_hln = ETH_ALEN, + .ar_pln = 4, + .ar_op = htons(ARPOP_REPLY) + }, + .sender_hwaddr = TUN_HWADDR_SOURCE, + .sender_ipaddr = TUN_IPADDR_SOURCE, + .target_hwaddr = TUN_HWADDR_DEST, + .target_ipaddr = TUN_IPADDR_DEST + }; + union { + struct virtio_net_hdr_v1_hash virtio_net_hdr; + uint8_t bytes[sizeof(struct virtio_net_hdr_v1_hash) + sizeof(struct payload)]; + } received; + struct vnet_hash *bpf; + int local_fd; + int source_fd; + int dest_fds[2]; + char dest_ifname[IFNAMSIZ]; + int vnet_hdr_sz; + + if (!ASSERT_OK(unshare(CLONE_NEWNET), "unshare")) + return NULL; + + if (!bpf_setup(&bpf)) + return NULL; + + if (!local_setup(&local_fd)) + goto fail_local; + + if (!bridge_setup(local_fd)) + goto fail_bridge; + + if (!tun_source_setup(local_fd, &source_fd)) + goto fail_tun_source; + + if (!tun_dest_setup(local_fd, bpf, dest_fds, dest_ifname)) + goto fail_tun_dest; + + if (!ASSERT_EQ(write(source_fd, &sent, sizeof(sent)), sizeof(sent), "write")) + goto fail_tests_single_queue; + + if (!ASSERT_EQ(read(dest_fds[0], &received, sizeof(received)), + sizeof(struct virtio_net_hdr) + sizeof(struct payload), + "read")) + goto fail_tests_single_queue; + + ASSERT_EQ(received.virtio_net_hdr.hdr.flags, 
0, + "virtio_net_hdr.hdr.flags"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_type, VIRTIO_NET_HDR_GSO_NONE, + "virtio_net_hdr.hdr.gso_type"); + ASSERT_EQ(received.virtio_net_hdr.hdr.hdr_len, 0, + "virtio_net_hdr.hdr.hdr_len"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_size, 0, + "virtio_net_hdr.hdr.gso_size"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_start, 0, + "virtio_net_hdr.hdr.csum_start"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_offset, 0, + "virtio_net_hdr.hdr.csum_offset"); + ASSERT_EQ(memcmp(received.bytes + sizeof(struct virtio_net_hdr), &sent, sizeof(sent)), 0, + "payload"); + + vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1_hash); + if (!ASSERT_OK(ioctl(dest_fds[0], TUNSETVNETHDRSZ, &vnet_hdr_sz), "TUNSETVNETHDRSZ")) + goto fail_tests_single_queue; + + if (!ASSERT_EQ(write(source_fd, &sent, sizeof(sent)), sizeof(sent), + "hash: write")) + goto fail_tests_single_queue; + + if (!ASSERT_EQ(read(dest_fds[0], &received, sizeof(received)), + sizeof(struct virtio_net_hdr_v1_hash) + sizeof(struct payload), + "hash: read")) + goto fail_tests_single_queue; + + ASSERT_EQ(received.virtio_net_hdr.hdr.flags, 0, + "hash: virtio_net_hdr.hdr.flags"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_type, VIRTIO_NET_HDR_GSO_NONE, + "hash: virtio_net_hdr.hdr.gso_type"); + ASSERT_EQ(received.virtio_net_hdr.hdr.hdr_len, 0, + "hash: virtio_net_hdr.hdr.hdr_len"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_size, 0, + "hash: virtio_net_hdr.hdr.gso_size"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_start, 0, + "hash: virtio_net_hdr.hdr.csum_start"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_offset, 0, + "hash: virtio_net_hdr.hdr.csum_offset"); + ASSERT_EQ(received.virtio_net_hdr.hdr.num_buffers, 0, + "hash: virtio_net_hdr.hdr.num_buffers"); + ASSERT_EQ(received.virtio_net_hdr.hash_value, htole32(3), + "hash: virtio_net_hdr.hash_value"); + ASSERT_EQ(received.virtio_net_hdr.hash_report, htole16(2), + "hash: virtio_net_hdr.hash_report"); + ASSERT_EQ(received.virtio_net_hdr.padding, 0, + "hash: virtio_net_hdr.padding"); + ASSERT_EQ(memcmp(received.bytes + sizeof(struct virtio_net_hdr_v1_hash), &sent, + sizeof(sent)), + 0, + "hash: payload"); + + if (!tun_dest_queue_setup(dest_ifname, dest_fds + 1)) + goto fail_tests_single_queue; + + if (!ASSERT_EQ(write(source_fd, &sent, sizeof(sent)), sizeof(sent), + "hash, multi queue: write")) + goto fail_tests_multi_queue; + + if (!ASSERT_EQ(read(dest_fds[1], &received, sizeof(received)), + sizeof(struct virtio_net_hdr_v1_hash) + sizeof(struct payload), + "hash, multi queue: read")) + goto fail_tests_multi_queue; + + ASSERT_EQ(received.virtio_net_hdr.hdr.flags, 0, + "hash, multi queue: virtio_net_hdr.hdr.flags"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_type, VIRTIO_NET_HDR_GSO_NONE, + "hash, multi queue: virtio_net_hdr.hdr.gso_type"); + ASSERT_EQ(received.virtio_net_hdr.hdr.hdr_len, 0, + "hash, multi queue: virtio_net_hdr.hdr.hdr_len"); + ASSERT_EQ(received.virtio_net_hdr.hdr.gso_size, 0, + "hash, multi queue: virtio_net_hdr.hdr.gso_size"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_start, 0, + "hash, multi queue: virtio_net_hdr.hdr.csum_start"); + ASSERT_EQ(received.virtio_net_hdr.hdr.csum_offset, 0, + "hash, multi queue: virtio_net_hdr.hdr.csum_offset"); + ASSERT_EQ(received.virtio_net_hdr.hdr.num_buffers, 0, + "hash, multi queue: virtio_net_hdr.hdr.num_buffers"); + ASSERT_EQ(received.virtio_net_hdr.hash_value, htole32(3), + "hash, multi queue: virtio_net_hdr.hash_value"); + ASSERT_EQ(received.virtio_net_hdr.hash_report, htole16(2), + "hash, multi 
queue: virtio_net_hdr.hash_report"); + ASSERT_EQ(received.virtio_net_hdr.padding, 0, + "hash, multi queue: virtio_net_hdr.padding"); + ASSERT_EQ(memcmp(received.bytes + sizeof(struct virtio_net_hdr_v1_hash), &sent, + sizeof(sent)), + 0, + "hash, multi queue: payload"); + +fail_tests_multi_queue: + tun_dest_queue_teardown(dest_fds[1]); +fail_tests_single_queue: + tun_dest_teardown(dest_fds[0]); +fail_tun_dest: + tun_source_teardown(source_fd); +fail_tun_source: + bridge_teardown(local_fd); +fail_bridge: + local_teardown(local_fd); +fail_local: + bpf_teardown(bpf); + + return NULL; +} + +void test_vnet_hash(void) +{ + pthread_t thread; + int err; + + err = pthread_create(&thread, NULL, &test_vnet_hash_thread, NULL); + if (ASSERT_OK(err, "pthread_create")) + ASSERT_OK(pthread_join(thread, NULL), "pthread_join"); +} diff --git a/tools/testing/selftests/bpf/progs/vnet_hash.c b/tools/testing/selftests/bpf/progs/vnet_hash.c new file mode 100644 index 000000000000..0451bab65647 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/vnet_hash.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("vnet_hash") +int prog(struct __sk_buff *skb) +{ + skb->vnet_hash_value ^= 3; + skb->vnet_hash_report ^= 2; + skb->vnet_rss_queue ^= 1; + + return BPF_OK; +} + +char _license[] SEC("license") = "GPL";
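Note that the XOR in the program above doubles as a zero-initialization check: the values the test expects only hold if the context fields start at 0. A sketch of the arithmetic (not part of the patch):

	u32 hash_value  = 0 ^ 3; /* read back as 3 in virtio_net_hdr_v1_hash */
	u16 hash_report = 0 ^ 2; /* read back as 2 */
	u16 rss_queue   = 0 ^ 1; /* 1 % numqueues: queue 0 while a single
				  * queue is open, dest_fds[1] once the
				  * second queue is attached
				  */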
VIRTIO_NET_F_HASH_REPORT allows reporting hash values calculated on the host. When VHOST_NET_F_VIRTIO_NET_HDR is employed, no hash values are reported (i.e., the hash_report member is always set to VIRTIO_NET_HASH_REPORT_NONE). Otherwise, the values provided by the underlying socket are reported.
VIRTIO_NET_F_HASH_REPORT requires VIRTIO_F_VERSION_1.
Signed-off-by: Akihiko Odaki akihiko.odaki@daynix.com --- drivers/vhost/net.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f2ed7167c848..6a31d450fae2 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -73,6 +73,7 @@ enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_HASH_REPORT) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) }; @@ -1634,10 +1635,13 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) size_t vhost_hlen, sock_hlen, hdr_len; int i;
- hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_VERSION_1))) ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : - sizeof(struct virtio_net_hdr); + if (features & (1ULL << VIRTIO_NET_F_HASH_REPORT)) + hdr_len = sizeof(struct virtio_net_hdr_v1_hash); + else if (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_VERSION_1))) + hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hdr_len = sizeof(struct virtio_net_hdr); if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; @@ -1718,6 +1722,10 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, return -EFAULT; if (features & ~VHOST_NET_FEATURES) return -EOPNOTSUPP; + if ((features & ((1ULL << VIRTIO_F_VERSION_1) | + (1ULL << VIRTIO_NET_F_HASH_REPORT))) == + (1ULL << VIRTIO_NET_F_HASH_REPORT)) + return -EINVAL; return vhost_net_set_features(n, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES;
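From the VMM side, negotiation might look as follows (sketch; the constants come from the UAPI headers, and a real VMM would keep the other negotiated bits instead of masking them away):

#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_config.h>
#include <linux/virtio_net.h>

static int negotiate_hash_report(int vhost_fd)
{
	__u64 features;
	__u64 wanted = (1ULL << VIRTIO_F_VERSION_1) |
		       (1ULL << VIRTIO_NET_F_HASH_REPORT);

	if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0)
		return -1;

	/* VIRTIO_NET_F_HASH_REPORT now requires VIRTIO_F_VERSION_1;
	 * setting it alone fails with -EINVAL.
	 */
	if ((features & wanted) != wanted)
		return -1;

	return ioctl(vhost_fd, VHOST_SET_FEATURES, &wanted);
}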