It is possible to create cycles using bpf_redirect_peer which lead to an infinite loop inside __netif_receive_skb_core. The simplest way to illustrate this is by attaching a TC program to the ingress hook on both sides of a veth or netkit device pair, where each program redirects to its own peer; other cycles are possible as well. This patch places an upper limit on the number of iterations allowed inside __netif_receive_skb_core to prevent this.
Signed-off-by: Jordan Rife <jrife@google.com>
Fixes: 9aa1206e8f48 ("bpf: Add redirect_peer helper")
Cc: stable@vger.kernel.org
---
 net/core/dev.c                                     | 11 +++-
 net/core/dev.h                                     |  1 +
 .../selftests/bpf/prog_tests/tc_redirect.c         | 51 +++++++++++++++++++
 .../selftests/bpf/progs/test_tc_peer.c             | 13 +++++
 4 files changed, 75 insertions(+), 1 deletion(-)
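For illustration (not part of the patch itself), the misconfiguration described above boils down to attaching an ingress TC program like the sketch below to both devices of a veth or netkit pair. It mirrors the tc_src_self/tc_src_fwd_self programs added in the selftest further down; PEER_IFINDEX and the program name are placeholders, not identifiers from the patch.

	// Illustrative sketch only; PEER_IFINDEX is a placeholder for the
	// ifindex of the attaching device's own peer.
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	volatile const __u32 PEER_IFINDEX;

	SEC("tc")
	int redirect_to_own_peer(struct __sk_buff *skb)
	{
		/* Bounce every packet arriving on this device's ingress straight
		 * to its peer. With the same program attached on the peer's
		 * ingress, the packet cycles between the two devices forever.
		 */
		return bpf_redirect_peer(PEER_IFINDEX, 0);
	}

	char _license[] SEC("license") = "GPL";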
diff --git a/net/core/dev.c b/net/core/dev.c
index cd479f5f22f6..753f8d27f47c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5455,6 +5455,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
+	int loops = 0;
 	__be16 type;
 
 	net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
@@ -5521,8 +5522,16 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 		nf_skip_egress(skb, true);
 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 					 &another);
-		if (another)
+		if (another) {
+			loops++;
+			if (unlikely(loops == RX_LOOP_LIMIT)) {
+				ret = NET_RX_DROP;
+				net_crit_ratelimited("bpf: loop limit reached on datapath, buggy bpf program?\n");
+				goto out;
+			}
+
 			goto another_round;
+		}
 		if (!skb)
 			goto out;
diff --git a/net/core/dev.h b/net/core/dev.h
index 5654325c5b71..28d1cf2f9069 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -150,6 +150,7 @@ struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
 #define XMIT_RECURSION_LIMIT	8
+#define RX_LOOP_LIMIT		8
 
 #ifndef CONFIG_PREEMPT_RT
 static inline bool dev_xmit_recursion(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index c85798966aec..db1b36090d6c 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -1081,6 +1081,55 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
 	close_netns(nstoken);
 }
 
+static void test_tc_redirect_peer_loop(struct netns_setup_result *setup_result)
+{
+	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
+	LIBBPF_OPTS(bpf_tc_hook, qdisc_src);
+	struct test_tc_peer *skel;
+	struct nstoken *nstoken;
+	int err;
+
+	/* Set up an infinite redirect loop using bpf_redirect_peer with ingress
+	 * hooks on either side of a veth/netkit pair redirecting to its own
+	 * peer. This should not lock up the kernel.
+	 */
+	nstoken = open_netns(NS_SRC);
+	if (!ASSERT_OK_PTR(nstoken, "setns src"))
+		return;
+
+	skel = test_tc_peer__open();
+	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
+		goto done;
+
+	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src;
+	skel->rodata->IFINDEX_SRC_FWD = setup_result->ifindex_src_fwd;
+
+	err = test_tc_peer__load(skel);
+	if (!ASSERT_OK(err, "test_tc_peer__load"))
+		goto done;
+
+	QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src);
+	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.tc_src_self, 0);
+	close_netns(nstoken);
+
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
+		return;
+	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
+	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, skel->progs.tc_src_fwd_self, 0);
+
+	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
+		goto done;
+
+	SYS_NOFAIL("ip netns exec " NS_SRC " %s -c 1 -W 1 -q %s > /dev/null",
+		   ping_command(AF_INET), IP4_DST);
+fail:
+done:
+	if (skel)
+		test_tc_peer__destroy(skel);
+	close_netns(nstoken);
+}
+
 static int tun_open(char *name)
 {
 	struct ifreq ifr;
@@ -1280,6 +1329,8 @@ static void *test_tc_redirect_run_tests(void *arg)
 	RUN_TEST(tc_redirect_peer, MODE_VETH);
 	RUN_TEST(tc_redirect_peer, MODE_NETKIT);
 	RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
+	RUN_TEST(tc_redirect_peer_loop, MODE_VETH);
+	RUN_TEST(tc_redirect_peer_loop, MODE_NETKIT);
 	RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
 	RUN_TEST(tc_redirect_neigh, MODE_VETH);
 	RUN_TEST(tc_redirect_neigh_fib, MODE_VETH);
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
index 365eacb5dc34..9b8a00ccad42 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_peer.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c
@@ -10,6 +10,7 @@
 #include <bpf/bpf_helpers.h>
 
+volatile const __u32 IFINDEX_SRC_FWD;
 volatile const __u32 IFINDEX_SRC;
 volatile const __u32 IFINDEX_DST;
@@ -34,6 +35,18 @@ int tc_src(struct __sk_buff *skb)
 	return bpf_redirect_peer(IFINDEX_DST, 0);
 }
 
+SEC("tc")
+int tc_src_self(struct __sk_buff *skb)
+{
+	return bpf_redirect_peer(IFINDEX_SRC, 0);
+}
+
+SEC("tc")
+int tc_src_fwd_self(struct __sk_buff *skb)
+{
+	return bpf_redirect_peer(IFINDEX_SRC_FWD, 0);
+}
+
 SEC("tc")
 int tc_dst_l3(struct __sk_buff *skb)
 {
Hi Jordan,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf-next/master]
[also build test WARNING on bpf/master net/main net-next/main linus/master v6.12-rc1 next-20240927]
[cannot apply to horms-ipvs/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url:    https://github.com/intel-lab-lkp/linux/commits/Jordan-Rife/bpf-Prevent-infin...
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20240929170219.1881536-1-jrife%40google.com
patch subject: [PATCH v1] bpf: Prevent infinite loops with bpf_redirect_peer
config: openrisc-defconfig (https://download.01.org/0day-ci/archive/20240930/202409301255.h6vAvBWG-lkp@i...)
compiler: or1k-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240930/202409301255.h6vAvBWG-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409301255.h6vAvBWG-lkp@intel.com/
All warnings (new ones prefixed by >>):
net/core/dev.c: In function '__netif_receive_skb_core':
>> net/core/dev.c:5458:13: warning: unused variable 'loops' [-Wunused-variable]
    5458 |         int loops = 0;
         |             ^~~~~
vim +/loops +5458 net/core/dev.c
  5448	
  5449	static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
  5450					    struct packet_type **ppt_prev)
  5451	{
  5452		struct packet_type *ptype, *pt_prev;
  5453		rx_handler_func_t *rx_handler;
  5454		struct sk_buff *skb = *pskb;
  5455		struct net_device *orig_dev;
  5456		bool deliver_exact = false;
  5457		int ret = NET_RX_DROP;
> 5458		int loops = 0;
  5459		__be16 type;
  5460	
  5461		net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
  5462	
  5463		trace_netif_receive_skb(skb);
  5464	
  5465		orig_dev = skb->dev;
  5466	
  5467		skb_reset_network_header(skb);
  5468		if (!skb_transport_header_was_set(skb))
  5469			skb_reset_transport_header(skb);
  5470		skb_reset_mac_len(skb);
  5471	
  5472		pt_prev = NULL;
  5473	
  5474	another_round:
  5475		skb->skb_iif = skb->dev->ifindex;
  5476	
  5477		__this_cpu_inc(softnet_data.processed);
  5478	
  5479		if (static_branch_unlikely(&generic_xdp_needed_key)) {
  5480			int ret2;
  5481	
  5482			migrate_disable();
  5483			ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
  5484					      &skb);
  5485			migrate_enable();
  5486	
  5487			if (ret2 != XDP_PASS) {
  5488				ret = NET_RX_DROP;
  5489				goto out;
  5490			}
  5491		}
  5492	
  5493		if (eth_type_vlan(skb->protocol)) {
  5494			skb = skb_vlan_untag(skb);
  5495			if (unlikely(!skb))
  5496				goto out;
  5497		}
  5498	
  5499		if (skb_skip_tc_classify(skb))
  5500			goto skip_classify;
  5501	
  5502		if (pfmemalloc)
  5503			goto skip_taps;
  5504	
  5505		list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
  5506			if (pt_prev)
  5507				ret = deliver_skb(skb, pt_prev, orig_dev);
  5508			pt_prev = ptype;
  5509		}
  5510	
  5511		list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
  5512			if (pt_prev)
  5513				ret = deliver_skb(skb, pt_prev, orig_dev);
  5514			pt_prev = ptype;
  5515		}
  5516	
  5517	skip_taps:
  5518	#ifdef CONFIG_NET_INGRESS
  5519		if (static_branch_unlikely(&ingress_needed_key)) {
  5520			bool another = false;
  5521	
  5522			nf_skip_egress(skb, true);
  5523			skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
  5524						 &another);
  5525			if (another) {
  5526				loops++;
  5527				if (unlikely(loops == RX_LOOP_LIMIT)) {
  5528					ret = NET_RX_DROP;
  5529					net_crit_ratelimited("bpf: loop limit reached on datapath, buggy bpf program?\n");
  5530					goto out;
  5531				}
  5532	
  5533				goto another_round;
  5534			}
  5535			if (!skb)
  5536				goto out;
  5537	
  5538			nf_skip_egress(skb, false);
  5539			if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
  5540				goto out;
  5541		}
  5542	#endif
  5543		skb_reset_redirect(skb);
  5544	skip_classify:
  5545		if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
  5546			goto drop;
  5547	
  5548		if (skb_vlan_tag_present(skb)) {
  5549			if (pt_prev) {
  5550				ret = deliver_skb(skb, pt_prev, orig_dev);
  5551				pt_prev = NULL;
  5552			}
  5553			if (vlan_do_receive(&skb))
  5554				goto another_round;
  5555			else if (unlikely(!skb))
  5556				goto out;
  5557		}
  5558	
  5559		rx_handler = rcu_dereference(skb->dev->rx_handler);
  5560		if (rx_handler) {
  5561			if (pt_prev) {
  5562				ret = deliver_skb(skb, pt_prev, orig_dev);
  5563				pt_prev = NULL;
  5564			}
  5565			switch (rx_handler(&skb)) {
  5566			case RX_HANDLER_CONSUMED:
  5567				ret = NET_RX_SUCCESS;
  5568				goto out;
  5569			case RX_HANDLER_ANOTHER:
  5570				goto another_round;
  5571			case RX_HANDLER_EXACT:
  5572				deliver_exact = true;
  5573				break;
  5574			case RX_HANDLER_PASS:
  5575				break;
  5576			default:
  5577				BUG();
  5578			}
  5579		}
  5580	
  5581		if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
  5582	check_vlan_id:
  5583			if (skb_vlan_tag_get_id(skb)) {
  5584				/* Vlan id is non 0 and vlan_do_receive() above couldn't
  5585				 * find vlan device.
  5586				 */
  5587				skb->pkt_type = PACKET_OTHERHOST;
  5588			} else if (eth_type_vlan(skb->protocol)) {
  5589				/* Outer header is 802.1P with vlan 0, inner header is
  5590				 * 802.1Q or 802.1AD and vlan_do_receive() above could
  5591				 * not find vlan dev for vlan id 0.
  5592				 */
  5593				__vlan_hwaccel_clear_tag(skb);
  5594				skb = skb_vlan_untag(skb);
  5595				if (unlikely(!skb))
  5596					goto out;
  5597				if (vlan_do_receive(&skb))
  5598					/* After stripping off 802.1P header with vlan 0
  5599					 * vlan dev is found for inner header.
  5600					 */
  5601					goto another_round;
  5602				else if (unlikely(!skb))
  5603					goto out;
  5604				else
  5605					/* We have stripped outer 802.1P vlan 0 header.
  5606					 * But could not find vlan dev.
  5607					 * check again for vlan id to set OTHERHOST.
  5608					 */
  5609					goto check_vlan_id;
  5610			}
  5611			/* Note: we might in the future use prio bits
  5612			 * and set skb->priority like in vlan_do_receive()
  5613			 * For the time being, just ignore Priority Code Point
  5614			 */
  5615			__vlan_hwaccel_clear_tag(skb);
  5616		}
  5617	
  5618		type = skb->protocol;
  5619	
  5620		/* deliver only exact match when indicated */
  5621		if (likely(!deliver_exact)) {
  5622			deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
  5623					       &ptype_base[ntohs(type) &
  5624							   PTYPE_HASH_MASK]);
  5625		}
  5626	
  5627		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
  5628				       &orig_dev->ptype_specific);
  5629	
  5630		if (unlikely(skb->dev != orig_dev)) {
  5631			deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
  5632					       &skb->dev->ptype_specific);
  5633		}
  5634	
  5635		if (pt_prev) {
  5636			if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
  5637				goto drop;
  5638			*ppt_prev = pt_prev;
  5639		} else {
  5640	drop:
  5641			if (!deliver_exact)
  5642				dev_core_stats_rx_dropped_inc(skb->dev);
  5643			else
  5644				dev_core_stats_rx_nohandler_inc(skb->dev);
  5645			kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
  5646			/* Jamal, now you will not able to escape explaining
  5647			 * me how you were going to use this. :-)
  5648			 */
  5649			ret = NET_RX_DROP;
  5650		}
  5651	
  5652	out:
  5653		/* The invariant here is that if *ppt_prev is not NULL
  5654		 * then skb should also be non-NULL.
  5655		 *
  5656		 * Apparently *ppt_prev assignment above holds this invariant due to
  5657		 * skb dereferencing near it.
  5658		 */
  5659		*pskb = skb;
  5660		return ret;
  5661	}
  5662	
On 9/29/24 7:02 PM, Jordan Rife wrote:
> It is possible to create cycles using bpf_redirect_peer which lead to an
> infinite loop inside __netif_receive_skb_core. The simplest way to
> illustrate this is by attaching a TC program to the ingress hook on both
> sides of a veth or netkit device pair, where each program redirects to
> its own peer; other cycles are possible as well. This patch places an
> upper limit on the number of iterations allowed inside
> __netif_receive_skb_core to prevent this.
>
> Signed-off-by: Jordan Rife <jrife@google.com>
> Fixes: 9aa1206e8f48 ("bpf: Add redirect_peer helper")
> Cc: stable@vger.kernel.org
[...]
No, as you mentioned, there are plenty of other misconfiguration possibilities in and outside bpf where something can loop in the stack (or where you can lock yourself out e.g. drop-all).
> No, as you mentioned, there are plenty of other misconfiguration
> possibilities in and outside bpf where something can loop in the stack
> (or where you can lock yourself out e.g. drop-all).
I wasn't sure if it should be possible to lock up the kernel with such a combination of BPF programs. If this is the view generally, then fair enough I suppose. Maybe this is my ignorance showing, but my understanding is that with BPF generally we go to great lengths to make sure things don't block (e.g. making sure a BPF program terminates eventually) to avoid locking up the kernel. By extension, should it not also be impossible to block indefinitely in __netif_receive_skb_core with a combination of two BPF programs that create a cycle with bpf_redirect_peer? It seems like there are other provisions in place to avoid misconfiguration or buggy combinations of programs from breaking things too badly such as the dev_xmit_recursion() check in __bpf_tx_skb().
	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}
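For reference, that check sits in __bpf_tx_skb() in net/core/filter.c. The sketch below is a paraphrase rather than the exact upstream source (the skb setup between the check and the transmit is elided, and details vary across kernel versions); it shows how the per-CPU/per-task recursion counter brackets the actual transmit, which is the same kind of bound the proposed RX_LOOP_LIMIT counter puts on the receive side:

	/* Paraphrased sketch of net/core/filter.c:__bpf_tx_skb(); not the exact upstream code. */
	static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
	{
		int ret;

		/* Refuse to transmit once XMIT_RECURSION_LIMIT nested levels are reached. */
		if (dev_xmit_recursion()) {
			net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
			kfree_skb(skb);
			return -ENETDOWN;
		}

		skb->dev = dev;
		/* ... skb setup elided ... */

		/* Count this nesting level only for the duration of the transmit. */
		dev_xmit_recursion_inc();
		ret = dev_queue_xmit(skb);
		dev_xmit_recursion_dec();

		return ret;
	}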
-Jordan