On 3/28/18 10:27 AM, Shrijeet Mukherjee wrote: > diff --git a/include/net/mpls.h b/include/net/mpls.h > index 2583dbc689b8..3a5b8c00823d 100644 > --- a/include/net/mpls.h > +++ b/include/net/mpls.h > @@ -24,14 +24,50 @@ struct mpls_shim_hdr { > __be32 label_stack_entry; > }; > > +struct mpls_entry_decoded { > + u32 label; > + u8 ttl; > + u8 tc; > + u8 bos; > +}; > + > static inline bool eth_p_mpls(__be16 eth_type) > { > return eth_type == htons(ETH_P_MPLS_UC) || > eth_type == htons(ETH_P_MPLS_MC); > } > > -static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) > +static inline struct mpls_shim_hdr *skb_mpls_hdr(const struct sk_buff *skb) > { > return (struct mpls_shim_hdr *)skb_network_header(skb); > } > + > +static inline struct mpls_shim_hdr > +mpls_entry_encode(u32 label, unsigned int ttl, unsigned int tc, bool bos) > +{ > + struct mpls_shim_hdr result; > + > + result.label_stack_entry = > + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) > + | (tc << MPLS_LS_TC_SHIFT) > + | (bos ? (1 << MPLS_LS_S_SHIFT) : 0) > + | (ttl << MPLS_LS_TTL_SHIFT)); > + > + return result; > +} > + > +static inline > +struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) > +{ > + struct mpls_entry_decoded result; > + unsigned int entry = be32_to_cpu(hdr->label_stack_entry); > + > + result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; > + result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; > + result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; > + result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; > + > + return result; > +} > + > #endif
The above should be in patch 1, though without the mpls_hdr rename. > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 18b7c510c511..2278548e1f8f 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -439,6 +439,12 @@ union bpf_attr { > * int bpf_skb_vlan_pop(skb) > * Return: 0 on success or negative error > * > + * int bpf_skb_mpls_push(skb, num_lbls, lbls[]) > + * Return 0 on success or negative error > + * > + * int bpf_skb_mpls_pop(skb, num_lbls) > + * Return number of popped labels, 0 is no-op, deliver packet to current > dst > + * > * int bpf_skb_get_tunnel_key(skb, key, size, flags) > * int bpf_skb_set_tunnel_key(skb, key, size, flags) > * retrieve or populate tunnel metadata > @@ -794,7 +800,10 @@ union bpf_attr { > FN(msg_redirect_map), \ > FN(msg_apply_bytes), \ > FN(msg_cork_bytes), \ > - FN(msg_pull_data), > + FN(msg_pull_data), \ > + FN(skb_mpls_push), \ > + FN(skb_mpls_pop), > + > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > diff --git a/net/core/filter.c b/net/core/filter.c > index 00f62fafc788..c96ae8ef423d 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2522,7 +2522,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 > len_diff) > } > > BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, > - u32, mode, u64, flags) > + u32, mode, u64, flags) drop the above whitespace change > { > if (unlikely(flags)) > return -EINVAL; > @@ -2542,6 +2542,125 @@ static const struct bpf_func_proto > bpf_skb_adjust_room_proto = { > .arg4_type = ARG_ANYTHING, > }; > > +static int bpf_skb_mpls_net_grow(struct sk_buff *skb, int len_diff) > +{ > + u32 off = skb_mac_header_len(skb); /*LL_RESERVED_SPACE ?? */ > + int ret; > + > + ret = skb_cow(skb, len_diff); > + if (unlikely(ret < 0)) > + return ret; > + > + skb_set_inner_protocol(skb, skb->protocol); > + skb_reset_inner_network_header(skb); > + > + ret = bpf_skb_generic_push(skb, off, len_diff); > + if (unlikely(ret < 0)) > + return ret; > + > + skb_reset_mac_header(skb); > + skb_set_network_header(skb, ETH_HLEN); > + skb->protocol = eth_hdr(skb)->h_proto = htons(ETH_P_MPLS_UC); > + > + if (skb_is_gso(skb)) { > +/* Due to header grow, MSS needs to be downgraded. */ > + skb_shinfo(skb)->gso_size -= len_diff; > +/* Header must be checked, and gso_segs recomputed. */ > + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; > + skb_shinfo(skb)->gso_segs = 0; > + } > + > + bpf_compute_data_pointers(skb); > + return 0; > +} The only thing MPLS specific is the protocol setting. Seems to me you could leverage the existing bpf_skb_net_grow or even bpf_skb_adjust_net and then set the protocols and headers in the caller of this function. You already do something similar in bpf_skb_mpls_pop. > + > +BPF_CALL_2(bpf_skb_mpls_pop, struct sk_buff*, skb, u32, num_lbls) > +{ > + u32 i = 0; > + struct mpls_shim_hdr *hdr; > + unsigned char *cursor; > + struct mpls_entry_decoded dec; > + > + if (num_lbls == 0) > + return 0; > + > + cursor = skb_network_header(skb); > + do { > + hdr = (struct mpls_shim_hdr *)cursor; > + dec = mpls_entry_decode(hdr); > + i++; cursor = cursor + sizeof(struct mpls_shim_hdr); > + } while (dec.bos != 1 && i < num_lbls); > + > + bpf_push_mac_rcsum(skb); > + skb_pull(skb, i * sizeof(struct mpls_shim_hdr)); > + skb_reset_network_header(skb); > + > + skb->protocol = eth_hdr(skb)->h_proto = htons(ETH_P_MPLS_UC); If I pop 1 label and there are more of them, then the protocol does not need to be adjusted -- it was already set to MPLS. If I pop all of the labels, then the protocol is not MPLS, but whatever the inner packet is -- IPv4 or IPv6 or other. > + bpf_pull_mac_rcsum(skb); > + bpf_compute_data_pointers(skb); > + > + return i; > +} > + > +const struct bpf_func_proto bpf_skb_mpls_pop_proto = { > + .func = bpf_skb_mpls_pop, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_ANYTHING, > +}; > +EXPORT_SYMBOL_GPL(bpf_skb_mpls_pop_proto); > + > +BPF_CALL_3(bpf_skb_mpls_push, struct sk_buff*, skb, > + __be32*, lbls, u32, num_lbls) > +{ > + int ret, i; > + unsigned int new_header_size = num_lbls * sizeof(__be32); > + unsigned int ttl = 255; > + struct dst_entry *dst = skb_dst(skb); dst is not always set. e.g., any program on ingress prior to the layer 3 will not have it. > + struct net_device *out_dev = dst->dev; > + struct mpls_shim_hdr *hdr; > + bool bos; > + > + /* Ensure there is enough space for the headers in the skb */ > + ret = bpf_skb_mpls_net_grow(skb, new_header_size); > + if (ret < 0) { > + trace_printk("COW was killed\n"); > + bpf_compute_data_pointers(skb); > + return -ENOMEM; > + } > + > + skb->dev = out_dev; > +/* XXX this may need finesse to integrate with > + * global TTL values for MPLS > + */ > + if (dst->ops->family == AF_INET) > + ttl = ip_hdr(skb)->ttl; > + else if (dst->ops->family == AF_INET6) > + ttl = ipv6_hdr(skb)->hop_limit; > + > + /* Push the new labels */ > + hdr = skb_mpls_hdr(skb); > + bos = true; > + for (i = num_lbls - 1; i >= 0; i--) { > + hdr[i] = mpls_entry_encode(lbls[i], ttl, 0, bos); > + bos = false; > + } > + > + bpf_compute_data_pointers(skb); > + return 0; > +} > + > +const struct bpf_func_proto bpf_skb_mpls_push_proto = { > + .func = bpf_skb_mpls_push, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > +}; > +EXPORT_SYMBOL_GPL(bpf_skb_mpls_push_proto); > + > static u32 __bpf_skb_min_len(const struct sk_buff *skb) > { > u32 min_len = skb_network_offset(skb); > @@ -3019,6 +3138,8 @@ bool bpf_helper_changes_pkt_data(void *func) > { > if (func == bpf_skb_vlan_push || > func == bpf_skb_vlan_pop || > + func == bpf_skb_mpls_push || > + func == bpf_skb_mpls_pop || > func == bpf_skb_store_bytes || > func == bpf_skb_change_proto || > func == bpf_skb_change_head || > @@ -3682,6 +3803,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) > return &bpf_skb_vlan_push_proto; > case BPF_FUNC_skb_vlan_pop: > return &bpf_skb_vlan_pop_proto; > + case BPF_FUNC_skb_mpls_push: > + return &bpf_skb_mpls_push_proto; > + case BPF_FUNC_skb_mpls_pop: > + return &bpf_skb_mpls_pop_proto; > case BPF_FUNC_skb_change_proto: > return &bpf_skb_change_proto_proto; > case BPF_FUNC_skb_change_type: >