PACKET socket can retain its fanout membership through link down and up and leave a fanout while closed regardless of link state. However, socket was forbidden from joining a fanout while it was not RUNNING.
This patch allows PACKET socket to join a fanout while not RUNNING. Selftest psock_fanout is extended to test this scenario. This is the only test that was performed.
This scenario was identified while studying DPDK pmd_af_packet_drv. Since sockets are only created during initialization, there is no reason to fail the initialization if a single link is temporarily down.
I hope it is not considered as breaking user space and that applications are not designed to expect this failure.
Changes:
V02: * psock_fanout: use explicit loopback up/down instead of toggle. * psock_fanout: don't try to restore loopback state on failure. * Rephrase commit message about "leaving a fanout".
V01: https://lore.kernel.org/netdev/cover.1728303615.git.gur.stavi@huawei.com/
Gur Stavi (2): af_packet: allow fanout_add when socket is not RUNNING selftests: net/psock_fanout: socket joins fanout when link is down
net/packet/af_packet.c | 10 +++--- tools/testing/selftests/net/psock_fanout.c | 42 ++++++++++++++++++++-- 2 files changed, 44 insertions(+), 8 deletions(-)
base-commit: f95b4725e796b12e5f347a0d161e1d3843142aa8
PACKET socket can retain its fanout membership through link down and up and leave a fanout while closed regardless of link state. However, socket was forbidden from joining a fanout while it was not RUNNING.
This patch allows PACKET socket to join fanout while not RUNNING.
Signed-off-by: Gur Stavi gur.stavi@huawei.com --- net/packet/af_packet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8942062f776..fb2cca73d953 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL;
spin_lock(&po->bind_lock); - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && - match->type == type && + if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { - __dev_remove_pack(&po->prot_hook); - /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match);
po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); - __fanout_link(sk, po); + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { + __dev_remove_pack(&po->prot_hook); + __fanout_link(sk, po); + } err = 0; } }
Gur Stavi wrote:
PACKET socket can retain its fanout membership through link down and up and leave a fanout while closed regardless of link state. However, socket was forbidden from joining a fanout while it was not RUNNING.
This patch allows PACKET socket to join fanout while not RUNNING.
Signed-off-by: Gur Stavi gur.stavi@huawei.com
net/packet/af_packet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8942062f776..fb2cca73d953 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL; spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) {
__dev_remove_pack(&po->prot_hook);
/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match);
po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
__fanout_link(sk, po);
if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
__dev_remove_pack(&po->prot_hook);
__fanout_link(sk, po);
} }} err = 0;
-- 2.45.2
@@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL;
spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test that po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind" would be to test po->ifindex != -1, as ifindex is commented as "bound device". However, at the moment ifindex is not initialized to -1, I can add such initialization, but perhaps I do not fully understand all the logic.
Any preferences?
err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) {
__dev_remove_pack(&po->prot_hook);
/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
__fanout_link(sk, po);
if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
__dev_remove_pack(&po->prot_hook);
__fanout_link(sk, po);
} }} err = 0;
--
Gur Stavi wrote:
@@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL;
spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test that po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind" would be to test po->ifindex != -1, as ifindex is commented as "bound device". However, at the moment ifindex is not initialized to -1, I can add such initialization, but perhaps I do not fully understand all the logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is only modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind to a specific device or change ptype.
Gur Stavi wrote:
@@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk,
struct fanout_args *args)
err = -EINVAL;
spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test that po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind" would be to test po->ifindex != -1, as ifindex is commented as "bound device". However, at the moment ifindex is not initialized to -1, I can add such initialization, but perhaps I do not fully understand all the logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is only modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind to a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do you mean SHOULD succeed or MAY SUCCEED by mistake if "something" happens ???
Do you refer to the following scenario: socket is created with non-zero protocol and becomes RUNNING "without bind" for all devices. In that case it can be added to FANOUT without bind. Is that considered a bug or does the bind requirement for fanout only apply for all-protocol (0) sockets?
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
@@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk,
struct fanout_args *args)
err = -EINVAL;
spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test that po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind" would be to test po->ifindex != -1, as ifindex is commented as "bound device". However, at the moment ifindex is not initialized to -1, I can add such initialization, but perhaps I do not fully understand all the logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is only modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind to a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do you mean SHOULD succeed or MAY SUCCEED by mistake if "something" happens ???
I mean it succeeds currently. Which behavior must then be maintained.
Do you refer to the following scenario: socket is created with non-zero protocol and becomes RUNNING "without bind" for all devices. In that case it can be added to FANOUT without bind. Is that considered a bug or does the bind requirement for fanout only apply for all-protocol (0) sockets?
I'm beginning to think that this bind requirement is not needed.
All type and dev are valid, even if an ETH_P_NONE fanout group would be fairly useless.
The type and dev must match that of the fanout group, and once added to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members accounting. Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet socket whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
@@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk,
struct fanout_args *args)
err = -EINVAL;
spin_lock(&po->bind_lock);
- if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
- if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test
that
po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind"
would be
to test po->ifindex != -1, as ifindex is commented as "bound
device".
However, at the moment ifindex is not initialized to -1, I can add
such
initialization, but perhaps I do not fully understand all the
logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is only modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind to a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do you mean SHOULD succeed or MAY SUCCEED by mistake if "something" happens
???
I mean it succeeds currently. Which behavior must then be maintained.
Do you refer to the following scenario: socket is created with non-zero protocol and becomes RUNNING "without bind" for all devices. In that
case
it can be added to FANOUT without bind. Is that considered a bug or
does
the bind requirement for fanout only apply for all-protocol (0)
sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that socket becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group would be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket that will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
I realized another possible problem. We should consider adding ifindex Field to struct packet_fanout to be used for lookup of an existing match. There is little sense to bind sockets to different interfaces and then put them in the same fanout group. If you agree, I can prepare a separate patch for that.
The type and dev must match that of the fanout group, and once added to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members accounting. Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet socket whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote:
> @@ -1846,21 +1846,21 @@ static int fanout_add(struct sock *sk,
struct fanout_args *args)
> err = -EINVAL; > > spin_lock(&po->bind_lock); > - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && > - match->type == type && > + if (match->type == type && > match->prot_hook.type == po->prot_hook.type && > match->prot_hook.dev == po->prot_hook.dev) {
Remaining unaddressed issue is that the socket can now be added before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to test
that
po->prot_hook.dev != NULL, since we are under a bind_lock anyway. But perhaps a more readable and direct approach to test "bind"
would be
to test po->ifindex != -1, as ifindex is commented as "bound
device".
However, at the moment ifindex is not initialized to -1, I can add
such
initialization, but perhaps I do not fully understand all the
logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is only modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind to a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do you mean SHOULD succeed or MAY SUCCEED by mistake if "something" happens
???
I mean it succeeds currently. Which behavior must then be maintained.
Do you refer to the following scenario: socket is created with non-zero protocol and becomes RUNNING "without bind" for all devices. In that
case
it can be added to FANOUT without bind. Is that considered a bug or
does
the bind requirement for fanout only apply for all-protocol (0)
sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that socket becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group would be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket that will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
Fanout is not limited to sockets bound to a specific interface. This will break existing users.
Binding to ETH_P_NONE is useless, but we're not going to slow down legitimate users with branches for cases that are harmless.
I realized another possible problem. We should consider adding ifindex Field to struct packet_fanout to be used for lookup of an existing match. There is little sense to bind sockets to different interfaces and then put them in the same fanout group. If you agree, I can prepare a separate patch for that.
The type and dev must match that of the fanout group, and once added to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members accounting. Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet socket whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote:
>> @@ -1846,21 +1846,21 @@ static int fanout_add(struct sock
*sk,
struct fanout_args *args)
>> err = -EINVAL; >> >> spin_lock(&po->bind_lock); >> - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && >> - match->type == type && >> + if (match->type == type && >> match->prot_hook.type == po->prot_hook.type && >> match->prot_hook.dev == po->prot_hook.dev) { > > Remaining unaddressed issue is that the socket can now be
added
> before being bound. See comment in v1.
I extended the psock_fanout test with unbound fanout test.
As far as I understand, the easiest way to verify bind is to
test
that
po->prot_hook.dev != NULL, since we are under a bind_lock
anyway.
But perhaps a more readable and direct approach to test "bind"
would be
to test po->ifindex != -1, as ifindex is commented as "bound
device".
However, at the moment ifindex is not initialized to -1, I can
add
such
initialization, but perhaps I do not fully understand all the
logic.
Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is
only
modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind
to
a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do
you
mean SHOULD succeed or MAY SUCCEED by mistake if "something"
happens
???
I mean it succeeds currently. Which behavior must then be maintained.
Do you refer to the following scenario: socket is created with non-
zero
protocol and becomes RUNNING "without bind" for all devices. In
that
case
it can be added to FANOUT without bind. Is that considered a bug or
does
the bind requirement for fanout only apply for all-protocol (0)
sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that socket becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group would be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket that will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
Fanout is not limited to sockets bound to a specific interface. This will break existing users.
For specific interface ifindex >= 1 For "any interface" ifindex == 0 ifindex is -1 only if the socket was created unbound with proto == 0 or for the rare race case that during re-bind the new dev became unlisted. For both of these cases fanout should fail.
Binding to ETH_P_NONE is useless, but we're not going to slow down legitimate users with branches for cases that are harmless.
With "branch", do you refer to performance or something else? As I said in other mail, ETH_P_NONE could not be used in a fanout before as well because socket cannot become RUNNING with proto == 0. For performance, we removed the RUNNING condition and added this. It is not like we need to perform 5M fanout registrations/sec. It is a syscall after all.
I realized another possible problem. We should consider adding ifindex Field to struct packet_fanout to be used for lookup of an existing
match.
There is little sense to bind sockets to different interfaces and then put them in the same fanout group. If you agree, I can prepare a separate patch for that.
The type and dev must match that of the fanout group, and once added to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members accounting. Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet socket whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote: > >> @@ -1846,21 +1846,21 @@ static int fanout_add(struct sock
*sk,
struct fanout_args *args) > >> err = -EINVAL; > >> > >> spin_lock(&po->bind_lock); > >> - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && > >> - match->type == type && > >> + if (match->type == type && > >> match->prot_hook.type == po->prot_hook.type && > >> match->prot_hook.dev == po->prot_hook.dev) { > > > > Remaining unaddressed issue is that the socket can now be
added
> > before being bound. See comment in v1. > > I extended the psock_fanout test with unbound fanout test. > > As far as I understand, the easiest way to verify bind is to
test
that
> po->prot_hook.dev != NULL, since we are under a bind_lock
anyway.
> But perhaps a more readable and direct approach to test "bind"
would be
> to test po->ifindex != -1, as ifindex is commented as "bound
device".
> However, at the moment ifindex is not initialized to -1, I can
add
such
> initialization, but perhaps I do not fully understand all the
logic.
> > Any preferences?
prot_hook.dev is not necessarily set if a packet socket is bound. It may be bound to any device. See dev_add_pack and ptype_head.
prot_hook.type, on the other hand, must be set if bound and is
only
modified with the bind_lock held too.
Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD also succeeds in case bind() was not called explicitly first to bind
to
a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds" do
you
mean SHOULD succeed or MAY SUCCEED by mistake if "something"
happens
???
I mean it succeeds currently. Which behavior must then be maintained.
Do you refer to the following scenario: socket is created with non-
zero
protocol and becomes RUNNING "without bind" for all devices. In
that
case
it can be added to FANOUT without bind. Is that considered a bug or
does
the bind requirement for fanout only apply for all-protocol (0)
sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that socket becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group would be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket that will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
Fanout is not limited to sockets bound to a specific interface. This will break existing users.
For specific interface ifindex >= 1 For "any interface" ifindex == 0 ifindex is -1 only if the socket was created unbound with proto == 0 or for the rare race case that during re-bind the new dev became unlisted. For both of these cases fanout should fail.
The only case where packet_create does not call __register_prot_hook is if proto == 0. If proto is anything else, the socket will be bound, whether to a device hook, or ptype_all. I don't think we need this extra ifindex condition.
Binding to ETH_P_NONE is useless, but we're not going to slow down legitimate users with branches for cases that are harmless.
With "branch", do you refer to performance or something else? As I said in other mail, ETH_P_NONE could not be used in a fanout before as well because socket cannot become RUNNING with proto == 0.
Good point.
For performance, we removed the RUNNING condition and added this. It is not like we need to perform 5M fanout registrations/sec. It is a syscall after all.
It's as much about code complexity as performance. Both the patch and resulting code should be as small and self-evident as possible.
Patch v3 introduces a lot of code churn.
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I realized another possible problem. We should consider adding ifindex Field to struct packet_fanout to be used for lookup of an existing
match.
There is little sense to bind sockets to different interfaces and then put them in the same fanout group. If you agree, I can prepare a separate patch for that.
The type and dev must match that of the fanout group, and once added to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members accounting. Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet socket whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets it to device id or 0?
psock_fanout, should probably be extended with scenarios that test "all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote:
> Gur Stavi wrote: > > >> @@ -1846,21 +1846,21 @@ static int fanout_add(struct
sock
*sk,
> struct fanout_args *args) > > >> err = -EINVAL; > > >> > > >> spin_lock(&po->bind_lock); > > >> - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && > > >> - match->type == type && > > >> + if (match->type == type && > > >> match->prot_hook.type == po->prot_hook.type && > > >> match->prot_hook.dev == po->prot_hook.dev) { > > > > > > Remaining unaddressed issue is that the socket can now be
added
> > > before being bound. See comment in v1. > > > > I extended the psock_fanout test with unbound fanout test. > > > > As far as I understand, the easiest way to verify bind is
to
test
that
> > po->prot_hook.dev != NULL, since we are under a bind_lock
anyway.
> > But perhaps a more readable and direct approach to test
"bind"
would be
> > to test po->ifindex != -1, as ifindex is commented as
"bound
device".
> > However, at the moment ifindex is not initialized to -1, I
can
add
such
> > initialization, but perhaps I do not fully understand all
the
logic.
> > > > Any preferences? > > prot_hook.dev is not necessarily set if a packet socket is
bound.
> It may be bound to any device. See dev_add_pack and
ptype_head.
> > prot_hook.type, on the other hand, must be set if bound and
is
only
> modified with the bind_lock held too. > > Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD
also
> succeeds in case bind() was not called explicitly first to
bind
to
> a specific device or change ptype.
Please clarify the last paragraph? When you say "also succeeds"
do
you
mean SHOULD succeed or MAY SUCCEED by mistake if "something"
happens
???
I mean it succeeds currently. Which behavior must then be
maintained.
Do you refer to the following scenario: socket is created with
non-
zero
protocol and becomes RUNNING "without bind" for all devices. In
that
case
it can be added to FANOUT without bind. Is that considered a
bug or
does
the bind requirement for fanout only apply for all-protocol (0)
sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that
socket
becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group
would
be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket
that
will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
Fanout is not limited to sockets bound to a specific interface. This will break existing users.
For specific interface ifindex >= 1 For "any interface" ifindex == 0 ifindex is -1 only if the socket was created unbound with proto == 0 or for the rare race case that during re-bind the new dev became
unlisted.
For both of these cases fanout should fail.
The only case where packet_create does not call __register_prot_hook is if proto == 0. If proto is anything else, the socket will be bound, whether to a device hook, or ptype_all. I don't think we need this extra ifindex condition.
Even though "unbound" is an unlikely state for such a socket the code Should still address this state consistently. If do_bind sets ifindex to -1 on the unlikely unlisted scenario so should packet_create on the more likely proto == 0 scenario.
Binding to ETH_P_NONE is useless, but we're not going to slow down legitimate users with branches for cases that are harmless.
With "branch", do you refer to performance or something else? As I said in other mail, ETH_P_NONE could not be used in a fanout before as well because socket cannot become RUNNING with proto == 0.
Good point.
For performance, we removed the RUNNING condition and added this. It is not like we need to perform 5M fanout registrations/sec. It is a syscall after all.
It's as much about code complexity as performance. Both the patch and resulting code should be as small and self-evident as possible.
Patch v3 introduces a lot of code churn.
Did you look at a side by side comparison? There is really very little extra code.
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I am not the one to decide if opening it is a good idea but it will be ironic if a patch with the intention to remove the only-RUNNING restriction will end up allowing never-RUNNING sockets into a fanout group.
I realized another possible problem. We should consider adding
ifindex
Field to struct packet_fanout to be used for lookup of an existing
match.
There is little sense to bind sockets to different interfaces and
then
put them in the same fanout group. If you agree, I can prepare a separate patch for that.
The type and dev must match that of the fanout group, and once
added
to a fanout group can no longer be changed (bind will fail).
I briefy considered the reason might be max_num_members
accounting.
Since f->num_members counts running sockets. But that is not used when tracking membership of the group, sk_ref is. Every packet
socket
whose po->rollover is increased increases this refcount.
What about using ifindex to detect bind? Initialize it to -1 in packet_create and ensure that packet_do_bind, on success, sets
it
to device id or 0?
psock_fanout, should probably be extended with scenarios that
test
"all devices" and all/specific protocols. Any specific scenario suggestions?
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote:
Gur Stavi wrote: > > Gur Stavi wrote: > > > >> @@ -1846,21 +1846,21 @@ static int fanout_add(struct
sock
*sk,
> > struct fanout_args *args) > > > >> err = -EINVAL; > > > >> > > > >> spin_lock(&po->bind_lock); > > > >> - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && > > > >> - match->type == type && > > > >> + if (match->type == type && > > > >> match->prot_hook.type == po->prot_hook.type && > > > >> match->prot_hook.dev == po->prot_hook.dev) { > > > > > > > > Remaining unaddressed issue is that the socket can now be
added
> > > > before being bound. See comment in v1. > > > > > > I extended the psock_fanout test with unbound fanout test. > > > > > > As far as I understand, the easiest way to verify bind is
to
test
that > > > po->prot_hook.dev != NULL, since we are under a bind_lock
anyway.
> > > But perhaps a more readable and direct approach to test
"bind"
would be > > > to test po->ifindex != -1, as ifindex is commented as
"bound
device". > > > However, at the moment ifindex is not initialized to -1, I
can
add
such > > > initialization, but perhaps I do not fully understand all
the
logic. > > > > > > Any preferences? > > > > prot_hook.dev is not necessarily set if a packet socket is
bound.
> > It may be bound to any device. See dev_add_pack and
ptype_head.
> > > > prot_hook.type, on the other hand, must be set if bound and
is
only
> > modified with the bind_lock held too. > > > > Well, and in packet_create. But setsockopt PACKET_FANOUT_ADD
also
> > succeeds in case bind() was not called explicitly first to
bind
to
> > a specific device or change ptype. > > Please clarify the last paragraph? When you say "also succeeds"
do
you
> mean SHOULD succeed or MAY SUCCEED by mistake if "something"
happens
???
I mean it succeeds currently. Which behavior must then be
maintained.
> Do you refer to the following scenario: socket is created with
non-
zero
> protocol and becomes RUNNING "without bind" for all devices. In
that
case > it can be added to FANOUT without bind. Is that considered a
bug or
does > the bind requirement for fanout only apply for all-protocol (0) sockets?
I'm beginning to think that this bind requirement is not needed.
I agree with that. I think that is an historical mistake that
socket
becomes implicitly bound to all interfaces if a protocol is defined during create. Without this bind requirement would make sense.
All type and dev are valid, even if an ETH_P_NONE fanout group
would
be fairly useless.
Fanout is all about RX, I think that refusing fanout for socket
that
will not receive any packet is OK. The condition can be: if (po->ifindex == -1 || !po->num)
Fanout is not limited to sockets bound to a specific interface. This will break existing users.
For specific interface ifindex >= 1 For "any interface" ifindex == 0 ifindex is -1 only if the socket was created unbound with proto == 0 or for the rare race case that during re-bind the new dev became
unlisted.
For both of these cases fanout should fail.
The only case where packet_create does not call __register_prot_hook is if proto == 0. If proto is anything else, the socket will be bound, whether to a device hook, or ptype_all. I don't think we need this extra ifindex condition.
Even though "unbound" is an unlikely state for such a socket the code Should still address this state consistently. If do_bind sets ifindex to -1 on the unlikely unlisted scenario so should packet_create on the more likely proto == 0 scenario.
do_bind sets it to -1 for unlisted probably to copy existing behavior in packet_notifier on NETDEV_DOWN.
But as far as I can tell nothing that uses po->ifindex differentiates between 0 and -1. It just means that no actual device ifindex matches.
Binding to ETH_P_NONE is useless, but we're not going to slow down legitimate users with branches for cases that are harmless.
With "branch", do you refer to performance or something else? As I said in other mail, ETH_P_NONE could not be used in a fanout before as well because socket cannot become RUNNING with proto == 0.
Good point.
For performance, we removed the RUNNING condition and added this. It is not like we need to perform 5M fanout registrations/sec. It is a syscall after all.
It's as much about code complexity as performance. Both the patch and resulting code should be as small and self-evident as possible.
Patch v3 introduces a lot of code churn.
Did you look at a side by side comparison?
Obviously
There is really very little extra code.
1 file changed, 21 insertions(+), 14 deletions(-)
vs
1 file changed, 5 insertions(+), 5 deletions(-)
for v2.
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I am not the one to decide if opening it is a good idea but it will be ironic if a patch with the intention to remove the only-RUNNING restriction will end up allowing never-RUNNING sockets into a fanout group.
It's fine to catch that, seem to just be this change?
spin_lock(&po->bind_lock); - if (po->running && + if (po->num != ETH_P_NONE && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) {
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I don't think that allowing ETH_P_NONE is relevant. In my opinion the 2 options that should be considered to fail fanout_add are: 1. Testing proto == 0 2. Testing proto == 0 || ifindex == -1
The only corner case that is caught by [2] and missed by [1] is the "unlisted" case during do_bind. It is such a rare case that probably no one will ever encounter bind "unlisted" followed by FANOUT_ADD. And this is not a dangerous corner case that leads to system crash.
However, being a purist, I see the major goal of code review to promote correctness by identifying corner cases while improving style is a secondary priority. Since we did identify this corner case in our discussion I think we should still use [2]. I don't consider the code complex. In fact, to me, the ifindex clause is a more understandable direct reason for failure than the proto which is indirect. Having the ifindex clause helps figuring out the proto clause.
Gur Stavi wrote:
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I don't think that allowing ETH_P_NONE is relevant. In my opinion the 2 options that should be considered to fail fanout_add are:
- Testing proto == 0
- Testing proto == 0 || ifindex == -1
The only corner case that is caught by [2] and missed by [1] is the "unlisted" case during do_bind. It is such a rare case that probably no one will ever encounter bind "unlisted" followed by FANOUT_ADD. And this is not a dangerous corner case that leads to system crash.
However, being a purist, I see the major goal of code review to promote correctness by identifying corner cases while improving style is a secondary priority. Since we did identify this corner case in our discussion I think we should still use [2]. I don't consider the code complex. In fact, to me, the ifindex clause is a more understandable direct reason for failure than the proto which is indirect. Having the ifindex clause helps figuring out the proto clause.
It's interesting that the unlisted fix does not return ENODEV, but returns success and leaves the socket in an unbound state, equivalent to binding to ETH_P_NONE and ifindex 0. This seems surprising behavior to the caller.
On rereading that, I still do not see a purpose of special ifindex -1.
Gur Stavi wrote:
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I don't think that allowing ETH_P_NONE is relevant. In my opinion the 2 options that should be considered to fail fanout_add are:
- Testing proto == 0
- Testing proto == 0 || ifindex == -1
The only corner case that is caught by [2] and missed by [1] is the "unlisted" case during do_bind. It is such a rare case that probably no one will ever encounter bind "unlisted" followed by FANOUT_ADD. And this is not a dangerous corner case that leads to system crash.
However, being a purist, I see the major goal of code review to promote correctness by identifying corner cases while improving style is a secondary priority. Since we did identify this corner case in our discussion I think we should still use [2]. I don't consider the code complex. In fact, to me, the ifindex clause is a more understandable direct reason for failure than the proto which is indirect. Having the ifindex clause helps figuring out the proto clause.
It's interesting that the unlisted fix does not return ENODEV, but returns success and leaves the socket in an unbound state, equivalent to binding to ETH_P_NONE and ifindex 0. This seems surprising behavior to the caller.
On rereading that, I still do not see a purpose of special ifindex -1.
Can this code be relevant?
case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break;
Perhaps, although the socket failed to (re) find the device, the device is still aware of the socket and we need the ifindex condition to fail.
Gur Stavi wrote:
Gur Stavi wrote:
If we don't care about opening up fanout groups to ETH_P_NONE, then patch v2 seems sufficient. If explicitly blocking this, the ENXIO return can be added, but ideally without touching the other lines.
I don't think that allowing ETH_P_NONE is relevant. In my opinion the 2 options that should be considered to fail fanout_add are:
- Testing proto == 0
- Testing proto == 0 || ifindex == -1
The only corner case that is caught by [2] and missed by [1] is the "unlisted" case during do_bind. It is such a rare case that probably no one will ever encounter bind "unlisted" followed by FANOUT_ADD. And this is not a dangerous corner case that leads to system crash.
However, being a purist, I see the major goal of code review to promote correctness by identifying corner cases while improving style is a secondary priority. Since we did identify this corner case in our discussion I think we should still use [2]. I don't consider the code complex. In fact, to me, the ifindex clause is a more understandable direct reason for failure than the proto which is indirect. Having the ifindex clause helps figuring out the proto clause.
It's interesting that the unlisted fix does not return ENODEV, but returns success and leaves the socket in an unbound state, equivalent to binding to ETH_P_NONE and ifindex 0. This seems surprising behavior to the caller.
On rereading that, I still do not see a purpose of special ifindex -1.
Can this code be relevant?
case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break;
Perhaps, although the socket failed to (re) find the device, the device is still aware of the socket and we need the ifindex condition to fail.
But the behavior is the same for ifindex -1 and 0. Devices always have an ifindex >= 1.
I realized another possible problem. We should consider adding ifindex Field to struct packet_fanout to be used for lookup of an existing match. There is little sense to bind sockets to different interfaces and then put them in the same fanout group. If you agree, I can prepare a separate patch for that.
My mistake: testing match->prot_hook.dev takes care of that.
Modify test_control_group to have toggle parameter. When toggle is non-zero, loopback device will be set down for the initialization of fd[1] which is still expected to successfully join the fanout.
Signed-off-by: Gur Stavi gur.stavi@huawei.com --- tools/testing/selftests/net/psock_fanout.c | 42 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 4f31e92ebd96..acdfae8f8a9a 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -48,6 +48,7 @@ #include <string.h> #include <sys/mman.h> #include <sys/socket.h> +#include <sys/ioctl.h> #include <sys/stat.h> #include <sys/types.h> #include <unistd.h> @@ -59,6 +60,33 @@
static uint32_t cfg_max_num_members;
+static void loopback_set_up_down(int state_up) +{ + struct ifreq ifreq = {}; + int fd, err; + + fd = socket(AF_PACKET, SOCK_RAW, 0); + if (fd < 0) { + perror("socket loopback"); + exit(1); + } + strcpy(ifreq.ifr_name, "lo"); + err = ioctl(fd, SIOCGIFFLAGS, &ifreq); + if (err) { + perror("SIOCGIFFLAGS"); + exit(1); + } + if (state_up != !!(ifreq.ifr_flags & IFF_UP)) { + ifreq.ifr_flags ^= IFF_UP; + err = ioctl(fd, SIOCSIFFLAGS, &ifreq); + if (err) { + perror("SIOCSIFFLAGS"); + exit(1); + } + } + close(fd); +} + /* Open a socket in a given fanout mode. * @return -1 if mode is bad, a valid socket otherwise */ static int sock_fanout_open(uint16_t typeflags, uint16_t group_id) @@ -264,17 +292,22 @@ static void test_control_single(void) }
/* Test illegal group with different modes or flags */ -static void test_control_group(void) +static void test_control_group(int toggle) { int fds[2];
- fprintf(stderr, "test: control multiple sockets\n"); + if (toggle) + fprintf(stderr, "test: control multiple sockets with link down toggle\n"); + else + fprintf(stderr, "test: control multiple sockets\n");
fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 0); if (fds[0] == -1) { fprintf(stderr, "ERROR: failed to open HASH socket\n"); exit(1); } + if (toggle) + loopback_set_up_down(0); if (sock_fanout_open(PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG, 0) != -1) { fprintf(stderr, "ERROR: joined group with wrong flag defrag\n"); @@ -294,6 +327,8 @@ static void test_control_group(void) fprintf(stderr, "ERROR: failed to join group\n"); exit(1); } + if (toggle) + loopback_set_up_down(1); if (close(fds[1]) || close(fds[0])) { fprintf(stderr, "ERROR: closing sockets\n"); exit(1); @@ -489,7 +524,8 @@ int main(int argc, char **argv) int port_off = 2, tries = 20, ret;
test_control_single(); - test_control_group(); + test_control_group(0); + test_control_group(1); test_control_group_max_num_members(); test_unique_fanout_group_ids();
linux-kselftest-mirror@lists.linaro.org