Add a cgroup to associate tasks with L3 networking domains. AF_INET{6}
sockets opened by tasks associated with an l3mdev cgroup are bound to
the associated master device when the socket is created. This allows a
user to run a command (and its children) within an L3 networking context.

The master-device for an l3mdev cgroup must be an L3 master device
(e.g., VRF), and it must be set before attaching tasks to the cgroup. Once
set the master-device can not change. Nested l3mdev cgroups are not
supported. The root (aka default) l3mdev cgroup can not be bound to a
master device.

Example:
    ip link add vrf-red type vrf table vrf-red
    ip link set dev vrf-red up
    ip link set dev eth1 master vrf-red

    cgcreate -g l3mdev:vrf-red
    cgset -r l3mdev.master-device=vrf-red vrf-red
    cgexec -g l3mdev:vrf-red bash

At this point the current shell and its child processes are attached to
the vrf-red L3 domain. Any AF_INET and AF_INET6 sockets opened by the
tasks are bound to the vrf-red device.

TO-DO:
- how to auto-create the cgroup when a VRF device is created and auto-delete
  it when the VRF device is destroyed

Signed-off-by: David Ahern <d...@cumulusnetworks.com>
---
 include/linux/cgroup_subsys.h |   3 +
 include/net/l3mdev_cgroup.h   |  27 ++++++
 net/core/sock.c               |   2 +
 net/l3mdev/Kconfig            |  12 +++
 net/l3mdev/Makefile           |   1 +
 net/l3mdev/l3mdev_cgroup.c    | 195 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 240 insertions(+)
 create mode 100644 include/net/l3mdev_cgroup.h
 create mode 100644 net/l3mdev/l3mdev_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1a96fdaa33d5..507df40f11de 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -58,6 +58,9 @@ SUBSYS(net_prio)
 SUBSYS(hugetlb)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_L3MDEV)
+SUBSYS(l3mdev)
+#endif
 /*
  * Subsystems that implement the can_fork() family of callbacks.
  */
diff --git a/include/net/l3mdev_cgroup.h b/include/net/l3mdev_cgroup.h
new file mode 100644
index 000000000000..c20fbb0a7f46
--- /dev/null
+++ b/include/net/l3mdev_cgroup.h
@@ -0,0 +1,27 @@
+/*
+ * l3mdev_cgroup.h             Control Group for L3 Master Device
+ *
+ * Copyright (c) 2015 Cumulus Networks. All rights reserved.
+ * Copyright (c) 2015 David Ahern <d...@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _L3MDEV_CGROUP_H
+#define _L3MDEV_CGROUP_H
+
+#if IS_ENABLED(CONFIG_CGROUP_L3MDEV)
+/* Bind a new AF_INET/AF_INET6 socket to the caller's l3mdev cgroup device */
+void sock_update_l3mdev(struct sock *sk);
+
+#else /* !CONFIG_CGROUP_L3MDEV */
+/* l3mdev cgroup support compiled out: sockets are left untouched */
+static inline void sock_update_l3mdev(struct sock *sk)
+{
+}
+
+#endif /* CONFIG_CGROUP_L3MDEV */
+#endif /* _L3MDEV_CGROUP_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 565bab7baca9..19ce06674dd9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -131,6 +131,7 @@
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
+#include <net/l3mdev_cgroup.h>
 #include <linux/sock_diag.h>
 
 #include <linux/filter.h>
@@ -1424,6 +1425,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 
                sock_update_classid(&sk->sk_cgrp_data);
                sock_update_netprioidx(&sk->sk_cgrp_data);
+               sock_update_l3mdev(sk);
        }
 
        return sk;
diff --git a/net/l3mdev/Kconfig b/net/l3mdev/Kconfig
index 5d47325037bc..3142d810e222 100644
--- a/net/l3mdev/Kconfig
+++ b/net/l3mdev/Kconfig
@@ -8,3 +8,15 @@ config NET_L3_MASTER_DEV
        ---help---
          This module provides glue between core networking code and device
          drivers to support L3 master devices like VRF.
+
+config CGROUP_L3MDEV
+       bool "L3 Master Device cgroup"
+       depends on CGROUPS
+       depends on NET_L3_MASTER_DEV
+       ---help---
+         Cgroup subsystem for assigning processes to an L3 domain.
+         When a process is assigned to an l3mdev domain all AF_INET and
+         AF_INET6 sockets opened by the process are bound to the L3 master
+         device.
+
+
diff --git a/net/l3mdev/Makefile b/net/l3mdev/Makefile
index 84a53a6f609a..ae74ebad8db7 100644
--- a/net/l3mdev/Makefile
+++ b/net/l3mdev/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o
+obj-$(CONFIG_CGROUP_L3MDEV) += l3mdev_cgroup.o
diff --git a/net/l3mdev/l3mdev_cgroup.c b/net/l3mdev/l3mdev_cgroup.c
new file mode 100644
index 000000000000..0326c06bfe02
--- /dev/null
+++ b/net/l3mdev/l3mdev_cgroup.c
@@ -0,0 +1,195 @@
+/*
+ * net/l3mdev/l3mdev_cgroup.c  Control Group for L3 Master Devices
+ *
+ * Copyright (c) 2015 Cumulus Networks. All rights reserved.
+ * Copyright (c) 2015 David Ahern <d...@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/cgroup.h>
+#include <net/sock.h>
+#include <net/l3mdev_cgroup.h>
+
+struct l3mdev_cgroup {
+       struct cgroup_subsys_state      css;    /* embedded cgroup state */
+       struct net                      *net;   /* netns the device was looked up in */
+       int                             dev_idx; /* master device ifindex; 0 = not set */
+};
+/* NULL-safe: map a css to the l3mdev_cgroup that embeds it */
+static inline struct l3mdev_cgroup *css_l3mdev(struct cgroup_subsys_state *css)
+{
+       return css ? container_of(css, struct l3mdev_cgroup, css) : NULL;
+}
+
+/* Bind @sk to the master device of the current task's l3mdev cgroup. An
+ * ifindex is only valid in its own netns, so skip sockets from other ones.
+ */
+static void l3mdev_set_bound_dev(struct sock *sk)
+{
+       struct l3mdev_cgroup *cg;
+
+       rcu_read_lock();
+       cg = css_l3mdev(task_css(current, l3mdev_cgrp_id));
+       if (cg && cg->dev_idx && net_eq(sock_net(sk), cg->net))
+               sk->sk_bound_dev_if = cg->dev_idx;
+       rcu_read_unlock();
+}
+
+/* Called for newly allocated sockets; only IPv4/IPv6 ones get bound. */
+void sock_update_l3mdev(struct sock *sk)
+{
+       const int family = sk->sk_family;
+
+       /* every other address family is left untouched */
+       if (family == AF_INET || family == AF_INET6)
+               l3mdev_set_bound_dev(sk);
+}
+
+static bool is_root_cgroup(struct cgroup_subsys_state *css)
+{
+       return !(css && css->parent);   /* NULL or parentless css is root */
+}
+
+/* Allocate zeroed per-cgroup state. Only the root and its direct children
+ * may exist, since nested l3mdev domains are not supported.
+ */
+static struct cgroup_subsys_state *
+l3mdev_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+       struct l3mdev_cgroup *cg;
+
+       if (!is_root_cgroup(parent_css))
+               return ERR_PTR(-EINVAL);
+
+       cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+
+       return cg ? &cg->css : ERR_PTR(-ENOMEM);
+}
+
+static int l3mdev_css_online(struct cgroup_subsys_state *css)
+{
+       return 0;       /* no per-cgroup setup is done here; always succeeds */
+}
+
+static void l3mdev_css_free(struct cgroup_subsys_state *css)
+{
+       kfree(css_l3mdev(css)); /* release the l3mdev_cgroup embedding @css */
+}
+
+/* seq_show handler for "master-device": prints the recorded netns inode
+ * number, the device index and the device's current name ("<none>" if the
+ * index no longer resolves). Prints nothing while no device is set.
+ */
+static int l3mdev_read(struct seq_file *sf, void *v)
+{
+       struct l3mdev_cgroup *cg = css_l3mdev(seq_css(sf));
+       struct net_device *dev;
+
+       if (!cg)
+               return -EINVAL;
+       if (!cg->net)
+               return 0;
+
+       /* hold a reference so dev->name stays valid while printing */
+       dev = dev_get_by_index(cg->net, cg->dev_idx);
+       seq_printf(sf, "net[%u]: device index %d ==> %s\n",
+                  cg->net->ns.inum, cg->dev_idx, dev ? dev->name : "<none>");
+       if (dev)
+               dev_put(dev);
+       return 0;
+}
+
+/* write handler for "master-device": bind this cgroup to the L3 master
+ * device named in @buf, looked up in the writer's network namespace.
+ * Returns @nbytes on success, -ENODEV if no such device exists, and
+ * -EINVAL for the root cgroup, an already-set device, empty input or a
+ * device that is not an L3 master.
+ */
+static ssize_t l3mdev_write(struct kernfs_open_file *of,
+                           char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup_subsys_state *css = of_css(of);
+       struct l3mdev_cgroup *l3mdev_cgrp = css_l3mdev(css);
+       struct net *net = current->nsproxy->net_ns;
+       char name[IFNAMSIZ + 1];        /* "%16s" stores 16 chars + NUL */
+       struct net_device *dev;
+       ssize_t rc = -EINVAL;
+
+       /* the root cgroup never binds, and a master device that is
+        * already set can only be changed by recreating the cgroup
+        */
+       if (is_root_cgroup(css) || l3mdev_cgrp->dev_idx)
+               return -EINVAL;
+
+       if (sscanf(buf, "%" __stringify(IFNAMSIZ) "s", name) != 1)
+               return -EINVAL;
+
+       dev = dev_get_by_name(net, name);
+       if (!dev)
+               return -ENODEV;
+
+       if (netif_is_l3_master(dev)) {
+               l3mdev_cgrp->net = net;
+               l3mdev_cgrp->dev_idx = dev->ifindex;
+               rc = nbytes;
+       }
+       dev_put(dev);
+
+       return rc;
+}
+
+/* Refuse the attach with -ENODEV if any destination is a non-root cgroup
+ * whose master device has not been set yet.
+ */
+static int l3mdev_can_attach(struct cgroup_taskset *tset)
+{
+       struct cgroup_subsys_state *dst_css;
+       struct task_struct *tsk;
+
+       cgroup_taskset_for_each(tsk, dst_css, tset) {
+               /* the root cgroup is never bound, so it always accepts */
+               if (is_root_cgroup(dst_css))
+                       continue;
+
+               if (!css_l3mdev(dst_css)->dev_idx)
+                       return -ENODEV;
+       }
+
+       return 0;
+}
+
+static struct cftype ss_files[] = {
+       {
+               .name     = "master-device",    /* l3mdev.master-device */
+               .seq_show = l3mdev_read,        /* show the bound device */
+               .write    = l3mdev_write,       /* set it by name, once */
+       },
+       { }     /* terminate */
+};
+
+struct cgroup_subsys l3mdev_cgrp_subsys = {
+       .css_alloc      = l3mdev_css_alloc,     /* rejects nested cgroups */
+       .css_online     = l3mdev_css_online,    /* no-op */
+       .css_free       = l3mdev_css_free,
+       .can_attach     = l3mdev_can_attach,    /* needs master device set */
+       .legacy_cftypes = ss_files,             /* the master-device file */
+};
+
+static int __init init_cgroup_l3mdev(void)
+{
+       return 0;       /* this initcall performs no setup; always succeeds */
+}
+
+subsys_initcall(init_cgroup_l3mdev);
+MODULE_AUTHOR("David Ahern");
+MODULE_DESCRIPTION("Control Group for L3 Networking Domains");
+MODULE_LICENSE("GPL");
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to