This adds an extension infrastructure for sk_buff:
1. extension memory is released when the sk_buff is freed.
2. data is shared after cloning an skb.

This is also how xfrm and bridge netfilter skb-associated data
(skb->sp and skb->nf_bridge) are handled.

Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions
   have been allocated for the skb.
2. extension pointer, located at the end of the sk_buff.
   If active_extensions is 0, its content is undefined.

The 'nf_bridge' pointer is removed in a followup patch, i.e. the
sk_buff size remains the same.

This adds extra code to skb clone and free paths (to deal with
refcount/free of extension area); once nf_bridge is converted, it
replaces the existing code that deals with skb->nf_bridge.

This patch only adds the basic infrastructure, the nf_bridge conversion
is done in the next patch.

Conversion of skb->sp (ipsec/xfrm secpath) to an skb extension is planned
as a followup.

Signed-off-by: Florian Westphal <f...@strlen.de>
---
 include/linux/skbuff.h | 124 +++++++++++++++++++++++++++++++++++++-
 net/Kconfig            |   3 +
 net/core/skbuff.c      | 131 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_output.c   |   1 +
 net/ipv6/ip6_output.c  |   1 +
 5 files changed, 259 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73902acf2b71..832904d71a85 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -245,6 +245,7 @@ struct iov_iter;
 struct napi_struct;
 struct bpf_prog;
 union bpf_attr;
+struct skb_ext;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -633,6 +634,7 @@ typedef unsigned char *sk_buff_data_t;
  *     @queue_mapping: Queue mapping for multiqueue devices
  *     @xmit_more: More SKBs are pending for this queue
  *     @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ *     @active_extensions: active extensions (skb_ext_id types)
  *     @ndisc_nodetype: router type (from link layer)
  *     @ooo_okay: allow the mapping of a socket to a queue to be changed
  *     @l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -662,6 +664,7 @@ typedef unsigned char *sk_buff_data_t;
  *     @data: Data head pointer
  *     @truesize: Buffer size
  *     @users: User count - see {datagram,tcp}.c
+ *     @extensions: allocated extensions, valid if active_extensions is nonzero
  */
 
 struct sk_buff {
@@ -744,7 +747,9 @@ struct sk_buff {
                                head_frag:1,
                                xmit_more:1,
                                pfmemalloc:1;
-
+#ifdef CONFIG_SKB_EXTENSIONS
+       __u8                    active_extensions;
+#endif
        /* fields enclosed in headers_start/headers_end are copied
         * using a single memcpy() in __copy_skb_header()
         */
@@ -866,6 +871,11 @@ struct sk_buff {
                                *data;
        unsigned int            truesize;
        refcount_t              users;
+
+#ifdef CONFIG_SKB_EXTENSIONS
+       /* only useable after checking ->active_extensions != 0 */
+       struct skb_ext          *extensions;
+#endif
 };
 
 #ifdef __KERNEL__
@@ -3889,6 +3899,118 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
                atomic_inc(&nfct->use);
 }
 #endif
+
+#ifdef CONFIG_SKB_EXTENSIONS
+enum skb_ext_id {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+       SKB_EXT_BRIDGE_NF,
+#endif
+       SKB_EXT_NUM, /* must be last */
+};
+
+/* each extension aligned to this value */
+#define SKB_EXT_ALIGN  8
+/* offsets/len: left-shift needed to translate offset to bytes */
+#define SKB_EXT_ALIGN_SHIFT 3
+
+/**
+ *     struct skb_ext - sk_buff extensions
+ *     @refcnt: 1 on allocation, deallocated on 0
+ *     @offset: offset to add to the struct address to obtain extension address
+ *     @len: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
+ *     @data: start of extension data, variable sized
+ *
+ *     Note: offsets and len are stored in chunks of 8 bytes, which allows
+ *     using 'u8' types while supporting up to 2kb worth of extension data.
+ */
+struct skb_ext {
+       refcount_t refcnt;
+       u8 offset[SKB_EXT_NUM]; /* chunks of 8 bytes */
+       u8 len;                 /* same, i.e. size == len << 3 */
+       char data[0] __aligned(SKB_EXT_ALIGN);
+};
+
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_free(struct skb_ext *ext);
+
+static inline void __skb_ext_put(struct skb_ext *ext)
+{
+       if (ext && refcount_dec_and_test(&ext->refcnt))
+               __skb_ext_free(ext);
+}
+
+static inline void skb_ext_put(struct sk_buff *skb)
+{
+       if (skb->active_extensions)
+               __skb_ext_put(skb->extensions);
+}
+
+static inline void skb_ext_get(struct sk_buff *skb)
+{
+       if (skb->active_extensions) {
+               struct skb_ext *ext = skb->extensions;
+
+               if (ext)
+                       refcount_inc(&ext->refcnt);
+       }
+}
+
+static inline void __skb_ext_copy(struct sk_buff *dst,
+                                 const struct sk_buff *src)
+{
+       dst->active_extensions = src->active_extensions;
+
+       if (src->active_extensions) {
+               struct skb_ext *ext = src->extensions;
+
+               if (ext)
+                       refcount_inc(&ext->refcnt);
+               dst->extensions = ext;
+       }
+}
+
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
+{
+       skb_ext_put(dst);
+       __skb_ext_copy(dst, src);
+}
+
+static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
+{
+       return !!ext->offset[i];
+}
+
+static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
+{
+       return skb->active_extensions & (1 << id);
+}
+
+static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+       if (skb_ext_exist(skb, id))
+               __skb_ext_del(skb, id);
+}
+
+static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
+{
+       if (skb_ext_exist(skb, id)) {
+               struct skb_ext *ext = skb->extensions;
+
+               if (ext && __skb_ext_exist(ext, id))
+                       return (void *)ext + (ext->offset[id] << 3);
+       }
+
+       return NULL;
+}
+#else
+static inline void skb_ext_put(struct sk_buff *skb) {}
+static inline void skb_ext_get(struct sk_buff *skb) {}
+static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
+static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
+#endif /* CONFIG_SKB_EXTENSIONS */
+
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 {
diff --git a/net/Kconfig b/net/Kconfig
index f235edb593ba..93b291292860 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -51,6 +51,9 @@ config NET_INGRESS
 config NET_EGRESS
        bool
 
+config SKB_EXTENSIONS
+       bool
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 02cd7ae3d0fb..e29016030633 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -617,6 +617,7 @@ void skb_release_head_state(struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        nf_bridge_put(skb->nf_bridge);
 #endif
+       skb_ext_put(skb);
 }
 
 /* Free everything but the sk_buff shell. */
@@ -796,6 +797,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
        new->dev                = old->dev;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        skb_dst_copy(new, old);
+       __skb_ext_copy(new, old);
 #ifdef CONFIG_XFRM
        new->sp                 = secpath_get(old->sp);
 #endif
@@ -5531,3 +5533,132 @@ void skb_condense(struct sk_buff *skb)
         */
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 }
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+       [SKB_EXT_BRIDGE_NF] = sizeof(struct nf_bridge_info),
+#endif
+};
+
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+       return (void *)ext + (ext->offset[id] << SKB_EXT_ALIGN_SHIFT);
+}
+
+static struct skb_ext *skb_ext_cow(unsigned int len,
+                                  struct skb_ext *old)
+{
+       struct skb_ext *new = kmalloc(len, GFP_ATOMIC);
+
+       if (!new)
+               return NULL;
+
+       if (!old) {
+               memset(new->offset, 0, sizeof(new->offset));
+               refcount_set(&new->refcnt, 1);
+               return new;
+       }
+
+       memcpy(new, old, old->len << SKB_EXT_ALIGN_SHIFT);
+       refcount_set(&new->refcnt, 1);
+       __skb_ext_put(old);
+       return new;
+}
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+       return 0 +
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+               skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+               0;
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of cloned buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+       unsigned int newlen, newoff, oldlen;
+       struct skb_ext *new, *old = NULL;
+       bool cow_needed = true;
+
+       BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+       BUILD_BUG_ON(skb_ext_total_length() > (255 << 3));
+
+       if (skb->active_extensions) {
+               old = skb->extensions;
+
+               cow_needed = refcount_read(&old->refcnt) > 1;
+
+               if (__skb_ext_exist(old, id)) {
+                       if (!cow_needed) {
+                               new = old;
+                               goto set_active;
+                       }
+
+                       /* extension was allocated previously and it
+                        * might be used by a cloned skb. COW needed.
+                        */
+                       new = skb_ext_cow(old->len << SKB_EXT_ALIGN_SHIFT, old);
+                       if (!new)
+                               return NULL;
+
+                       skb->extensions = new;
+                       goto set_active;
+               }
+               oldlen = old->len << SKB_EXT_ALIGN_SHIFT;
+       } else {
+               oldlen = sizeof(*new);
+       }
+
+       newoff = ALIGN(oldlen, SKB_EXT_ALIGN);
+       newlen = newoff + skb_ext_type_len[id];
+
+       if (cow_needed)
+               new = skb_ext_cow(newlen, old);
+       else
+               new = krealloc(old, newlen, GFP_ATOMIC);
+       if (!new)
+               return NULL;
+
+       new->offset[id] = newoff >> SKB_EXT_ALIGN_SHIFT;
+       new->len = newlen >> SKB_EXT_ALIGN_SHIFT;
+       skb->extensions = new;
+set_active:
+       skb->active_extensions |= 1 << id;
+       return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+       struct skb_ext *ext;
+
+       skb->active_extensions &= ~(1 << id);
+       if (skb->active_extensions == 0) {
+               ext = skb->extensions;
+               skb->extensions = NULL;
+               __skb_ext_put(ext);
+       }
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_free(struct skb_ext *ext)
+{
+       kfree(ext);
+}
+EXPORT_SYMBOL(__skb_ext_free);
+#endif /* CONFIG_SKB_EXTENSIONS */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..a12e12f983d5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        to->tc_index = from->tc_index;
 #endif
        nf_copy(to, from);
+       skb_ext_copy(to, from);
 #if IS_ENABLED(CONFIG_IP_VS)
        to->ipvs_property = from->ipvs_property;
 #endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89e0d5118afe..7eeb0f24be87 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -574,6 +574,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        to->tc_index = from->tc_index;
 #endif
        nf_copy(to, from);
+       skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
 }
 
-- 
2.18.1

Reply via email to