On 9/7/19 2:40 PM, Alan Maguire wrote:
Packet capture is especially valuable in tracing contexts, so extend the bpf_pcap helper to take a tracing-derived skb pointer as an argument.
In the case of tracing programs, the starting protocol (corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for IP, etc.) needs to be specified and should reflect the protocol type pointed to by the skb->data pointer, i.e. the start of the packet. This can be derived in a limited set of cases, but should be specified where possible. For skb and xdp programs this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
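For orientation, a rough sketch of the implied mapping to libpcap DLT_* values (the 1 and 12 values are from the text above; the UNSET value is assumed for illustration):

enum {
        BPF_PCAP_TYPE_UNSET     = 0,    /* assumed: derive protocol if possible */
        BPF_PCAP_TYPE_ETH       = 1,    /* DLT_EN10MB: capture starts at MAC header */
        BPF_PCAP_TYPE_IP        = 12,   /* DLT_RAW: capture starts at IP header */
};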
Example usage for a tracing program, where we use a struct bpf_pcap_hdr array map to pass in preferences for protocol and max len:
struct bpf_map_def SEC("maps") pcap_conf_map = {
        .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(struct bpf_pcap_hdr),
        .max_entries = 1,
};

struct bpf_map_def SEC("maps") pcap_map = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 1024,
};

SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
        struct bpf_pcap_hdr *conf;
        int key = 0;

        conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
        if (!conf)
                return 0;

        bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
                 conf->protocol, BPF_F_CURRENT_CPU);

        return 0;
}
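The conf map entry would be populated from user space; a minimal sketch using libbpf, assuming struct bpf_pcap_hdr exposes the cap_len and protocol fields used above and that pcap_conf_fd is the fd of pcap_conf_map obtained at load time:

        struct bpf_pcap_hdr conf = {};
        int key = 0;

        conf.cap_len = 128;     /* snapshot at most 128 bytes per packet */
        conf.protocol = 1;      /* BPF_PCAP_TYPE_ETH (DLT_EN10MB) */

        /* pcap_conf_fd: assumed obtained from the loaded object */
        if (bpf_map_update_elem(pcap_conf_fd, &key, &conf, BPF_ANY))
                /* handle error */;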
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
[...]
@@ -2977,6 +2992,8 @@ enum bpf_func_id {

 /* BPF_FUNC_pcap flags */
 #define BPF_F_PCAP_ID_MASK      0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX  (1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE  (1ULL << 56)

 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..311883b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,8 @@
 #include <linux/kprobes.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>

 #include <asm/tlb.h>

@@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
        return __bpf_perf_event_output(regs, map, flags, sd);
 }

+/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
+static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
+                                        unsigned long offset,
+                                        unsigned long len)
+{
+        const struct sk_buff *frag_iterp, *skb = from;
+        struct skb_shared_info *shinfop, shinfo;
+        struct sk_buff frag_iter;
+        unsigned long copy, start;
+        void *to = tobuf;
+        int i, ret;
+
+        start = skb_headlen(skb);
+
+        copy = start - offset;
+        if (copy > 0) {
+                if (copy > len)
+                        copy = len;
+                ret = probe_kernel_read(to, skb->data, copy);
+                if (unlikely(ret < 0))
+                        goto out;
+                len -= copy;
+                if (len == 0)
+                        return 0;
+                offset += copy;
+                to += copy;
+        }
+
+        if (skb->data_len == 0)
+                goto out;
+
+        shinfop = skb_shinfo(skb);
+
+        ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
+        if (unlikely(ret < 0))
+                goto out;
+
+        if (shinfo.nr_frags > MAX_SKB_FRAGS) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        for (i = 0; i < shinfo.nr_frags; i++) {
+                skb_frag_t *f = &shinfo.frags[i];
+                int end;
+
+                if (start > offset + len) {
+                        ret = -E2BIG;
+                        goto out;
+                }
+
+                end = start + skb_frag_size(f);
+                copy = end - offset;
+                if (copy > 0) {
+                        u32 poff, p_len, copied;
+                        struct page *p;
+                        u8 *vaddr;
+
+                        if (copy > len)
+                                copy = len;
+
+                        skb_frag_foreach_page(f,
+                                              skb_frag_off(f) + offset - start,
+                                              copy, p, poff, p_len, copied) {
+                                vaddr = kmap_atomic(p);
+                                ret = probe_kernel_read(to + copied,
+                                                        vaddr + poff, p_len);
+                                kunmap_atomic(vaddr);
+                                if (unlikely(ret < 0))
+                                        goto out;
+                        }
+                        len -= copy;
+                        if (len == 0)
+                                return 0;
+                        offset += copy;
+                        to += copy;
+                }
+                start = end;
+        }
+
+        for (frag_iterp = shinfo.frag_list; frag_iterp;
+             frag_iterp = frag_iter.next) {
+                int end;
+
+                if (start > offset + len) {
+                        ret = -E2BIG;
+                        goto out;
+                }
+
+                ret = probe_kernel_read(&frag_iter, frag_iterp,
+                                        sizeof(frag_iter));
+                if (ret)
+                        goto out;
+
+                end = start + frag_iter.len;
+                copy = end - offset;
+                if (copy > 0) {
+                        if (copy > len)
+                                copy = len;
+                        ret = bpf_trace_skb_copy(to, &frag_iter,
+                                                 offset - start,
+                                                 copy);
+                        if (ret)
+                                goto out;
+                        len -= copy;
+                        if (len == 0)
+                                return 0;
+                        offset += copy;
+                        to += copy;
+                }
+                start = end;
+        }
+out:
+        if (ret)
+                memset(tobuf, 0, len);
+
+        return ret;
+}
For the net side bpf_perf_event_output, we have

static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *ptr = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!ptr))
                return len;
        if (ptr != dst_buff)
                memcpy(dst_buff, ptr, len);

        return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
        if (unlikely(skb_size > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
                                bpf_skb_copy);
}
It does not really output all the frags. I understand that to capture truly all of the packet data, frags should be considered, but it seems we did not do that before? I am wondering whether we need to do it here.

If we indeed do not need to handle frags here, then I think maybe bpf_probe_read() in an existing bpf kprobe program should be enough, and we would not need this helper?
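Roughly what I have in mind, as an untested sketch that captures only the linear skb data with bpf_probe_read() plus the existing perf event output helper (reusing pcap_map from the example above; the 128-byte snap length is arbitrary):

SEC("kprobe/kfree_skb")
int probe_kfree_skb_nohelper(struct pt_regs *ctx)
{
        struct sk_buff skb;
        char buf[128];                  /* arbitrary snap length */
        u32 len;

        /* copy the skb header itself so we can find data/len */
        if (bpf_probe_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)))
                return 0;

        len = skb.len - skb.data_len;   /* linear data only */
        if (!len)
                return 0;
        if (len > sizeof(buf))
                len = sizeof(buf);

        if (bpf_probe_read(buf, len, skb.data))
                return 0;

        bpf_perf_event_output(ctx, &pcap_map, BPF_F_CURRENT_CPU, buf, len);
        return 0;
}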
+/* Derive protocol for some of the easier cases. For tracing, a probe point
+ * may be dealing with packets in various states. Common cases are IP
+ * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
+ * (_PCAP_TYPE_ETH). For other cases the caller must specify the
+ * protocol they expect. Other heuristics for packet identification
+ * should be added here as needed, since determining the packet type
+ * ensures we do not capture packets that fail to match the desired
+ * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
+ */
+static inline int bpf_skb_protocol_get(struct sk_buff *skb)
+{
+        switch (htons(skb->protocol)) {
+        case ETH_P_IP:
+        case ETH_P_IPV6:
+                if (skb_network_header(skb) == skb->data)
+                        return BPF_PCAP_TYPE_IP;
+                else
+                        return BPF_PCAP_TYPE_ETH;
+        default:
+                return BPF_PCAP_TYPE_UNSET;
+        }
+}
+BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
+           int, protocol_wanted, u64, flags)
Up to now, for helpers, the verifier has had a way to verify that a helper is used properly with respect to the context. For example, for the xdp version of perf_event_output, with the helper prototype

BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)

and

        .arg1_type      = ARG_PTR_TO_CTX,

the verifier is able to guarantee that the first parameter has the correct type xdp_buff, not something from a type cast.

For this helper, below we have

        .arg1_type      = ARG_ANYTHING,

so the type is not really enforced. Bringing in BTF can help, but type-name matching is typically fragile.
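To make the concern concrete, with ARG_ANYTHING nothing stops a program from passing an arbitrary value (hypothetical example):

        /* The verifier accepts this; only the helper's probe_kernel_read()
         * and sanity checks stand between us and garbage.
         */
        bpf_pcap((void *)0xdeadbeef, 64, &pcap_map, BPF_PCAP_TYPE_ETH,
                 BPF_F_CURRENT_CPU);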
+{
+        struct bpf_pcap_hdr pcap;
+        struct sk_buff skb;
+        int protocol;
+        int ret;
+
+        if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
+                               BPF_F_PCAP_STRICT_TYPE)))
+                return -EINVAL;
+
+        ret = probe_kernel_read(&skb, data, sizeof(skb));
+        if (unlikely(ret < 0))
+                return ret;
+
+        /* Sanity check skb len in case we get bogus data. */
+        if (unlikely(!skb.len))
+                return -ENOENT;
+        if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
+                return -E2BIG;
+
+        protocol = bpf_skb_protocol_get(&skb);
+
+        if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
+                /* If we cannot determine protocol type, bail. */
+                if (protocol == BPF_PCAP_TYPE_UNSET)
+                        return -EPROTO;
+        } else {
+                /* if we determine protocol type, and it's not what we asked
+                 * for _and_ we are in strict mode, bail. Otherwise we assume
+                 * the packet is the requested protocol type and drive on.
+                 */
+                if (flags & BPF_F_PCAP_STRICT_TYPE &&
+                    protocol != BPF_PCAP_TYPE_UNSET &&
+                    protocol != protocol_wanted)
+                        return -EPROTO;
+                protocol = protocol_wanted;
+        }
+
+        /* If we specified a matching incoming ifindex, bail if not a match. */
+        if (flags & BPF_F_PCAP_ID_IIFINDEX) {
+                int iif = flags & BPF_F_PCAP_ID_MASK;
+
+                if (iif && skb.skb_iif != iif)
+                        return -ENOENT;
+        }
+
+        ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
+        if (ret)
+                return ret;
+
+        return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+                                &skb, pcap.cap_len, bpf_trace_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_trace_pcap_proto = {
+        .func           = bpf_trace_pcap,
+        .gpl_only       = true,
+        .ret_type       = RET_INTEGER,
+        .arg1_type      = ARG_ANYTHING,
+        .arg2_type      = ARG_ANYTHING,
+        .arg3_type      = ARG_CONST_MAP_PTR,
+        .arg4_type      = ARG_ANYTHING,
+        .arg5_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_0(bpf_get_current_task)
 {
        return (long) current;
@@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
 #endif
        case BPF_FUNC_send_signal:
                return &bpf_send_signal_proto;
+       case BPF_FUNC_pcap:
+               return &bpf_trace_pcap_proto;
        default:
                return NULL;
        }