On 9/7/19 2:40 PM, Alan Maguire wrote:
Packet capture is especially valuable in tracing contexts, so extend the bpf_pcap helper to take a tracing-derived skb pointer as an argument.
In the case of tracing programs, the starting protocol (corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for IP, etc.) needs to be specified and should reflect the protocol type pointed to by the skb->data pointer, i.e. the start of the packet. This can be derived in a limited set of cases, but should be specified where possible. For skb and xdp programs this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
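For orientation, a rough sketch of the implied mapping to libpcap DLT_* values (the 1 and 12 values are from the text above; the UNSET value is assumed for illustration):

enum {
        BPF_PCAP_TYPE_UNSET     = 0,    /* assumed: derive protocol if possible */
        BPF_PCAP_TYPE_ETH       = 1,    /* DLT_EN10MB: capture starts at MAC header */
        BPF_PCAP_TYPE_IP        = 12,   /* DLT_RAW: capture starts at IP header */
};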
Example usage for a tracing program, where we use a struct bpf_pcap_hdr array map to pass in preferences for protocol and max len:
struct bpf_map_def SEC("maps") pcap_conf_map = {
        .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(struct bpf_pcap_hdr),
        .max_entries = 1,
};

struct bpf_map_def SEC("maps") pcap_map = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 1024,
};

SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
        struct bpf_pcap_hdr *conf;
        int key = 0;

        conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
        if (!conf)
                return 0;

        bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
                 conf->protocol, BPF_F_CURRENT_CPU);

        return 0;
}
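The conf map entry would be populated from user space; a minimal sketch using libbpf, assuming struct bpf_pcap_hdr exposes the cap_len and protocol fields used above and that pcap_conf_fd is the fd of pcap_conf_map obtained at load time:

        struct bpf_pcap_hdr conf = {};
        int key = 0;

        conf.cap_len = 128;     /* snapshot at most 128 bytes per packet */
        conf.protocol = 1;      /* BPF_PCAP_TYPE_ETH (DLT_EN10MB) */

        /* pcap_conf_fd: assumed obtained from the loaded object */
        if (bpf_map_update_elem(pcap_conf_fd, &key, &conf, BPF_ANY))
                /* handle error */;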
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
[...]
@@ -2977,6 +2992,8 @@ enum bpf_func_id {

 /* BPF_FUNC_pcap flags */
 #define BPF_F_PCAP_ID_MASK      0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX  (1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE  (1ULL << 56)

 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..311883b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,8 @@
 #include <linux/kprobes.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>

 #include <asm/tlb.h>

@@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
        return __bpf_perf_event_output(regs, map, flags, sd);
 }

+/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
+static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
+                                        unsigned long offset,
+                                        unsigned long len)
+{
+        const struct sk_buff *frag_iterp, *skb = from;
+        struct skb_shared_info *shinfop, shinfo;
+        struct sk_buff frag_iter;
+        unsigned long copy, start;
+        void *to = tobuf;
+        int i, ret;
+
+        start = skb_headlen(skb);
+
+        copy = start - offset;
+        if (copy > 0) {
+                if (copy > len)
+                        copy = len;
+                ret = probe_kernel_read(to, skb->data, copy);
+                if (unlikely(ret < 0))
+                        goto out;
+                len -= copy;
+                if (len == 0)
+                        return 0;
+                offset += copy;
+                to += copy;
+        }
+
+        if (skb->data_len == 0)
+                goto out;
+
+        shinfop = skb_shinfo(skb);
+
+        ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
+        if (unlikely(ret < 0))
+                goto out;
+
+        if (shinfo.nr_frags > MAX_SKB_FRAGS) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        for (i = 0; i < shinfo.nr_frags; i++) {
+                skb_frag_t *f = &shinfo.frags[i];
+                int end;
+
+                if (start > offset + len) {
+                        ret = -E2BIG;
+                        goto out;
+                }
+
+                end = start + skb_frag_size(f);
+                copy = end - offset;
+                if (copy > 0) {
+                        u32 poff, p_len, copied;
+                        struct page *p;
+                        u8 *vaddr;
+
+                        if (copy > len)
+                                copy = len;
+
+                        skb_frag_foreach_page(f,
+                                              skb_frag_off(f) + offset - start,
+                                              copy, p, poff, p_len, copied) {
+                                vaddr = kmap_atomic(p);
+                                ret = probe_kernel_read(to + copied,
+                                                        vaddr + poff, p_len);
+                                kunmap_atomic(vaddr);
+                                if (unlikely(ret < 0))
+                                        goto out;
+                        }
+                        len -= copy;
+                        if (len == 0)
+                                return 0;
+                        offset += copy;
+                        to += copy;
+                }
+                start = end;
+        }
+
+        for (frag_iterp = shinfo.frag_list; frag_iterp;
+             frag_iterp = frag_iter.next) {
+                int end;
+
+                if (start > offset + len) {
+                        ret = -E2BIG;
+                        goto out;
+                }
+
+                ret = probe_kernel_read(&frag_iter, frag_iterp,
+                                        sizeof(frag_iter));
+                if (ret)
+                        goto out;
+
+                end = start + frag_iter.len;
+                copy = end - offset;
+                if (copy > 0) {
+                        if (copy > len)
+                                copy = len;
+                        ret = bpf_trace_skb_copy(to, &frag_iter,
+                                                 offset - start,
+                                                 copy);
+                        if (ret)
+                                goto out;
+                        len -= copy;
+                        if (len == 0)
+                                return 0;
+                        offset += copy;
+                        to += copy;
+                }
+                start = end;
+        }
+out:
+        if (ret)
+                memset(tobuf, 0, len);
+
+        return ret;
+}
For the net side bpf_perf_event_output, we have

static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *ptr = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!ptr))
                return len;
        if (ptr != dst_buff)
                memcpy(dst_buff, ptr, len);

        return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
        if (unlikely(skb_size > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
                                bpf_skb_copy);
}
It does not really output all the frags. I understand that to capture truly all of the packet data, frags should be considered, but it seems we did not do that before? I am wondering whether we need to do it here.

If we indeed do not need to handle frags here, then I think maybe bpf_probe_read() in an existing bpf kprobe program should be enough, and we would not need this helper?
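Roughly what I have in mind, as an untested sketch that captures only the linear skb data with bpf_probe_read() plus the existing perf event output helper (reusing pcap_map from the example above; the 128-byte snap length is arbitrary):

SEC("kprobe/kfree_skb")
int probe_kfree_skb_nohelper(struct pt_regs *ctx)
{
        struct sk_buff skb;
        char buf[128];                  /* arbitrary snap length */
        u32 len;

        /* copy the skb header itself so we can find data/len */
        if (bpf_probe_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)))
                return 0;

        len = skb.len - skb.data_len;   /* linear data only */
        if (!len)
                return 0;
        if (len > sizeof(buf))
                len = sizeof(buf);

        if (bpf_probe_read(buf, len, skb.data))
                return 0;

        bpf_perf_event_output(ctx, &pcap_map, BPF_F_CURRENT_CPU, buf, len);
        return 0;
}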
+/* Derive protocol for some of the easier cases. For tracing, a probe point
+ * may be dealing with packets in various states. Common cases are IP
+ * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
+ * (_PCAP_TYPE_ETH). For other cases the caller must specify the
+ * protocol they expect. Other heuristics for packet identification
+ * should be added here as needed, since determining the packet type
+ * ensures we do not capture packets that fail to match the desired
+ * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
+ */
+static inline int bpf_skb_protocol_get(struct sk_buff *skb)
+{
+        switch (htons(skb->protocol)) {
+        case ETH_P_IP:
+        case ETH_P_IPV6:
+                if (skb_network_header(skb) == skb->data)
+                        return BPF_PCAP_TYPE_IP;
+                else
+                        return BPF_PCAP_TYPE_ETH;
+        default:
+                return BPF_PCAP_TYPE_UNSET;
+        }
+}
+BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
+           int, protocol_wanted, u64, flags)
Up to now, for helpers, the verifier has had a way to verify that a helper is used properly with respect to the context. For example, for the xdp version of perf_event_output, with the helper prototype

BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)

and

        .arg1_type      = ARG_PTR_TO_CTX,

the verifier is able to guarantee that the first parameter has the correct type xdp_buff, not something from a type cast.

For this helper, below we have

        .arg1_type      = ARG_ANYTHING,

so the type is not really enforced. Bringing in BTF can help, but type-name matching is typically fragile.
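To make the concern concrete, with ARG_ANYTHING nothing stops a program from passing an arbitrary value (hypothetical example):

        /* The verifier accepts this; only the helper's probe_kernel_read()
         * and sanity checks stand between us and garbage.
         */
        bpf_pcap((void *)0xdeadbeef, 64, &pcap_map, BPF_PCAP_TYPE_ETH,
                 BPF_F_CURRENT_CPU);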
+{
+        struct bpf_pcap_hdr pcap;
+        struct sk_buff skb;
+        int protocol;
+        int ret;
+
+        if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
+                               BPF_F_PCAP_STRICT_TYPE)))
+                return -EINVAL;
+
+        ret = probe_kernel_read(&skb, data, sizeof(skb));
+        if (unlikely(ret < 0))
+                return ret;
+
+        /* Sanity check skb len in case we get bogus data. */
+        if (unlikely(!skb.len))
+                return -ENOENT;
+        if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
+                return -E2BIG;
+
+        protocol = bpf_skb_protocol_get(&skb);
+
+        if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
+                /* If we cannot determine protocol type, bail. */
+                if (protocol == BPF_PCAP_TYPE_UNSET)
+                        return -EPROTO;
+        } else {
+                /* if we determine protocol type, and it's not what we asked
+                 * for _and_ we are in strict mode, bail. Otherwise we assume
+                 * the packet is the requested protocol type and drive on.
+                 */
+                if (flags & BPF_F_PCAP_STRICT_TYPE &&
+                    protocol != BPF_PCAP_TYPE_UNSET &&
+                    protocol != protocol_wanted)
+                        return -EPROTO;
+                protocol = protocol_wanted;
+        }
+
+        /* If we specified a matching incoming ifindex, bail if not a match. */
+        if (flags & BPF_F_PCAP_ID_IIFINDEX) {
+                int iif = flags & BPF_F_PCAP_ID_MASK;
+
+                if (iif && skb.skb_iif != iif)
+                        return -ENOENT;
+        }
+
+        ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
+        if (ret)
+                return ret;
+
+        return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+                                &skb, pcap.cap_len, bpf_trace_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_trace_pcap_proto = {
+        .func           = bpf_trace_pcap,
+        .gpl_only       = true,
+        .ret_type       = RET_INTEGER,
+        .arg1_type      = ARG_ANYTHING,
+        .arg2_type      = ARG_ANYTHING,
+        .arg3_type      = ARG_CONST_MAP_PTR,
+        .arg4_type      = ARG_ANYTHING,
+        .arg5_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_0(bpf_get_current_task)
 {
        return (long) current;
@@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
 #endif
        case BPF_FUNC_send_signal:
                return &bpf_send_signal_proto;
+       case BPF_FUNC_pcap:
+               return &bpf_trace_pcap_proto;
        default:
                return NULL;
        }