On Mon, Oct 22, 2018 at 02:29:19PM +0200, Pierre-Elliott Bécue wrote:
> Cc-ing Christian as he proposed the patch.
> 
> Le jeudi 18 octobre 2018 à 14:32:12+0200, Matthias Heinz a écrit :
> > Hi,
> > 
> > I've tried applying the patch to the 2.0.9 source. Unfortunately there have 
> > been more changes to conf.c since the release of lxc 2.0.9 and this patch 
> > wont 
> > apply without applying these changes as well.

So, let me make your life easier by appending a patch on top of 2.0.9.
:)

Christian

> 
> What amount of patches would it represent? Would it be relevant to consider
> updating lxc in buster to a newer release?
> 
> Cheers,
> 
> -- 
> Pierre-Elliott Bécue
> GPG: 9AE0 4D98 6400 E3B6 7528  F493 0D44 2664 1949 74E2
> It's far easier to fight for one's principles than to live up to them.
>From 8991fdfa9401da4803fb932786b449d3f32ca07c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brau...@ubuntu.com>
Date: Mon, 22 Oct 2018 16:30:49 +0200
Subject: [PATCH] autodev: adapt to changes in Linux 4.18

Starting with commit
55956b59df33 ("vfs: Allow userns root to call mknod on owned filesystems.")
Linux will allow mknod() in user namespaces for userns root if CAP_MKNOD is
available.
However, these device nodes are useless since

static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
        /* <snip> */

        if (s->s_user_ns != &init_user_ns)
                s->s_iflags |= SB_I_NODEV;

        /* <snip> */
}

will set the SB_I_NODEV flag on the filesystem. When a device node created in
non-init userns is open()ed the call chain will hit:

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

which will cause an EPERM because the device node is located on an fs
owned by non-init-userns and thus doesn't grant access to device nodes due to
SB_I_NODEV.

The solution is straightforward. Unless you're real root you should bind-mount
device nodes.

Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
---
 src/lxc/conf.c | 127 +++++++++++++++++++++++++++++++------------------
 1 file changed, 81 insertions(+), 46 deletions(-)

diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 91816beb..384138ec 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -1130,32 +1130,41 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
 	return 0;
 }
 
-struct lxc_devs {
+struct lxc_device_node {
 	const char *name;
-	mode_t mode;
-	int maj;
-	int min;
+	const mode_t mode;
+	const int maj;
+	const int min;
 };
 
-static const struct lxc_devs lxc_devs[] = {
-	{ "null",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
-	{ "zero",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
+static const struct lxc_device_node lxc_devices[] = {
 	{ "full",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
-	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
+	{ "null",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
 	{ "random",  S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
 	{ "tty",     S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
+	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
+	{ "zero",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
+};
+
+
+
+enum {
+	LXC_DEVNODE_BIND,
+	LXC_DEVNODE_MKNOD,
+	LXC_DEVNODE_PARTIAL,
+	LXC_DEVNODE_OPEN,
 };
 
 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
 {
-	int ret;
-	char path[MAXPATHLEN];
-	int i;
+	int i, ret;
+	char path[PATH_MAX];
 	mode_t cmask;
+	int use_mknod = LXC_DEVNODE_MKNOD;
 
-	ret = snprintf(path, MAXPATHLEN, "%s/dev",
+	ret = snprintf(path, PATH_MAX, "%s/dev",
 		       rootfs->path ? rootfs->mount : "");
-	if (ret < 0 || ret >= MAXPATHLEN)
+	if (ret < 0 || ret >= PATH_MAX)
 		return -1;
 
 	/* ignore, just don't try to fill in */
@@ -1165,53 +1174,79 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
 	INFO("Populating \"/dev\"");
 
 	cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
-	for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
-		const struct lxc_devs *d = &lxc_devs[i];
+	for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
+		char hostpath[PATH_MAX];
+		const struct lxc_device_node *device = &lxc_devices[i];
 
-		ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
-			       rootfs->path ? rootfs->mount : "", d->name);
-		if (ret < 0 || ret >= MAXPATHLEN)
+		ret = snprintf(path, PATH_MAX, "%s/dev/%s",
+			       rootfs->path ? rootfs->mount : "", device->name);
+		if (ret < 0 || ret >= PATH_MAX)
 			return -1;
 
-		ret = mknod(path, d->mode, makedev(d->maj, d->min));
-		if (ret < 0) {
-			FILE *pathfile;
-			char hostpath[MAXPATHLEN];
+		if (use_mknod >= LXC_DEVNODE_MKNOD) {
+			ret = mknod(path, device->mode, makedev(device->maj, device->min));
+			if (ret == 0 || (ret < 0 && errno == EEXIST)) {
+				DEBUG("Created device node \"%s\"", path);
+			} else if (ret < 0) {
+				if (errno != EPERM) {
+					SYSERROR("Failed to create device node \"%s\"", path);
+					return -1;
+				}
+
+				use_mknod = LXC_DEVNODE_BIND;
+			}
 
-			if (errno == EEXIST) {
-				DEBUG("\"%s\" device already existed", path);
+			/* Device nodes are fully useable. */
+			if (use_mknod == LXC_DEVNODE_OPEN)
 				continue;
+
+			if (use_mknod == LXC_DEVNODE_MKNOD) {
+				/* See
+				 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
+				 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
+				 */
+				ret = open(path, O_RDONLY | O_CLOEXEC);
+				if (ret >= 0) {
+					close(ret);
+					/* Device nodes are fully useable. */
+					use_mknod = LXC_DEVNODE_OPEN;
+					continue;
+				}
+
+				TRACE("Failed to open \"%s\" device", path);
+				/* Device nodes are only partially useable. */
+				use_mknod = LXC_DEVNODE_PARTIAL;
 			}
+		}
 
-			/* Unprivileged containers cannot create devices, so
-			 * bind mount the device from the host.
+		if (use_mknod != LXC_DEVNODE_PARTIAL) {
+			/* If we are dealing with partially functional device
+			 * nodes the prio mknod() call will have created the
+			 * device node so we can use it as a bind-mount target.
 			 */
-			ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
-			if (ret < 0 || ret >= MAXPATHLEN)
-				return -1;
-
-			pathfile = fopen(path, "wb");
-			if (!pathfile) {
+			ret = mknod(path, S_IFREG | 0000, 0);
+			if (ret < 0 && errno != EEXIST) {
 				SYSERROR("Failed to create file \"%s\"", path);
 				return -1;
 			}
-			fclose(pathfile);
+		}
 
-			ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
-					 rootfs->path ? rootfs->mount : NULL);
-			if (ret < 0) {
-				SYSERROR("Failed to bind mount \"%s\" from "
-					 "host into container",
-					 d->name);
-				return -1;
-			}
-			DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
-			      path);
-		} else {
-			DEBUG("Created device node \"%s\"", path);
+		/* Fallback to bind-mounting the device from the host. */
+		ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
+		if (ret < 0 || ret >= PATH_MAX)
+			return -1;
+
+		ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
+				 rootfs->path ? rootfs->mount : NULL);
+		if (ret < 0) {
+			SYSERROR("Failed to bind mount host device node \"%s\" "
+				 "onto \"%s\"", hostpath, path);
+			return -1;
 		}
+		DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
+		      hostpath, path);
 	}
-	umask(cmask);
+	(void)umask(cmask);
 
 	INFO("Populated \"/dev\"");
 	return 0;
-- 
2.19.1

Reply via email to