On Tue, Nov 25, 2025 at 11:38:59AM +0100, Moritz Haase wrote: > In order for one to use QEMU user mode emulation under a chroot, it is > required to use binfmt_misc. This can be avoided by QEMU never doing a raw > execve() to the host system, which is especially useful in environments > where binfmt_misc can't be used. > > Introduce a new option, -execve, that uses the current QEMU interpreter to > intercept execve(). In addition, execve mode can also be en- and disabled > using the 'QEMU_EXECVE' env var. > > qemu_execve() will prepend the interpreter path, similar to what binfmt_misc > would do, and then pass the modified execve() to the host. > > It is necessary to parse hashbang scripts in that function otherwise the > kernel will try to run the interpreter of a script without QEMU and get an > invalid exec format error. > > Note that a previous incarnation of this patch was submitted a few years ago > (see [0]) by Petros Angelatos as the original author who confirmed that it's > OK to resubmit it.
Did this patch code directly evolve from Petros' patch linked at [0], or was the latter merely inspiration and this a fresh implementation? If the former, best practice is to keep Petros's Signed-off-by, and add a short 1-2 line note of key changes, and then add your own Signed-off-by, e.g. something like Signed-off-by: Petros Angelatos <[email protected]> [MH: changed x, y and z] Signed-off-by: Moritz Haase <[email protected]> > > CC: [email protected] > CC: [email protected] > CC: [email protected] > CC: [email protected] > > Signed-off-by: Moritz Haase <[email protected]> > > --- > > We've been using this feature internally for at least five years by now. > Prior to submission, the code was updated to (hopefully) conform to the > current QEMU coding style. > > I'd be happy to add test cases for this feature, but I'd need some pointers > given that I'm a first-time contributor. Thanks! > > [0]: > https://patchwork.kernel.org/project/qemu-devel/patch/[email protected]/ > --- > linux-user/linuxload.c | 119 ++++++++++++++++++++++++++++++++++-- > linux-user/loader.h | 1 + > linux-user/main.c | 54 ++++++++++++++++ > linux-user/syscall.c | 94 ++++++++++++++++++++++++---- > linux-user/user-internals.h | 1 + > 5 files changed, 252 insertions(+), 17 deletions(-) > > diff --git a/linux-user/linuxload.c b/linux-user/linuxload.c > index 85d700953e..eb1fdf3f85 100644 > --- a/linux-user/linuxload.c > +++ b/linux-user/linuxload.c > @@ -138,15 +138,124 @@ abi_ulong loader_build_argptr(int envc, int argc, > abi_ulong sp, > return sp; > } > > +int load_script_file(const char *filename, struct linux_binprm *bprm) > +{ > + int retval, fd; > + char *i_arg = NULL, *i_name = NULL; > + char **new_argv; > + char *cp; > + char buf[BPRM_BUF_SIZE]; > + > + /* Check if it is a script */ > + fd = open(filename, O_RDONLY); > + if (fd == -1) { > + return fd; > + } > + > + retval = read(fd, buf, BPRM_BUF_SIZE); > + if (retval == -1) { > + close(fd); > + return retval; > + } > + > + /* if we have 
less than 2 bytes, we can guess it is not executable */ > + if (retval < 2) { > + close(fd); > + return -ENOEXEC; > + } > + > + close(fd); > + /* > + * adapted from the kernel > + * > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c > + */ > + if ((buf[0] == '#') && (buf[1] == '!')) { > + buf[BPRM_BUF_SIZE - 1] = '\0'; > + cp = strchr(buf, '\n'); > + if (cp == NULL) { > + cp = buf + BPRM_BUF_SIZE - 1; > + } > + *cp = '\0'; > + while (cp > buf) { > + cp--; > + if ((*cp == ' ') || (*cp == '\t')) { > + *cp = '\0'; > + } else { > + break; > + } > + } > + for (cp = buf + 2; (*cp == ' ') || (*cp == '\t'); cp++) { > + /* nothing */ ; > + } > + if (*cp == '\0') { > + return -ENOEXEC; /* No interpreter name found */ > + } > + i_name = cp; > + i_arg = NULL; > + for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) { > + /* nothing */ ; > + } > + while ((*cp == ' ') || (*cp == '\t')) { > + *cp++ = '\0'; > + } > + > + new_argv = NULL; > + if (*cp) { > + i_arg = cp; > + } > + > + if (i_arg) { > + new_argv = g_alloca(sizeof(void *)); > + new_argv[0] = i_arg; FYI, use of g_alloca() is no longer permitted in QEMU code. I thought we blocked that with -Walloca, but it seems we do not. We ought to fix that gap. 
> + } > + bprm->argv = new_argv; > + bprm->filename = i_name; > + } else { > + return 1; > + } > + return 0; > +} > + > int loader_exec(int fdexec, const char *filename, char **argv, char **envp, > struct image_info *infop, struct linux_binprm *bprm) > { > - int retval; > + int retval, fd, offset = 1, argc = count(argv); > + char **new_argv; > + > + retval = load_script_file(filename, bprm); > + if (retval == 0) { > + if (bprm->argv != NULL) { > + offset = 2; > + } > + new_argv = g_alloca((argc + offset + 1) * sizeof(void *)); > + > + new_argv[0] = (char *)filename; > + if (bprm->argv != NULL) { > + new_argv[1] = bprm->argv[0]; > + } > + /* Copy the original arguments with offset */ > + for (int i = 0; i < argc; i++) { > + new_argv[i + offset] = argv[i]; > + } > + new_argv[argc + offset] = NULL; > + > + bprm->argc = count(new_argv); > + bprm->argv = new_argv; > + fd = open(bprm->filename, O_RDONLY); > + if (fd < 0) { > + printf("Error while loading %s: %s\n", > + bprm->filename, > + strerror(errno)); > + _exit(EXIT_FAILURE); > + } > + bprm->src.fd = fd; > + } else { > + bprm->filename = (char *)filename; > + bprm->argc = count(argv); > + bprm->argv = argv; > + bprm->src.fd = fdexec; If load_script_file() failed to open or read the filename, is it correct to continue executing with this fallback? Is there a scenario where QEMU would be unable to open/read, but it would nonetheless be possible for it to work in this fallback? 
> + } > > - bprm->src.fd = fdexec; > - bprm->filename = (char *)filename; > - bprm->argc = count(argv); > - bprm->argv = argv; > bprm->envc = count(envp); > bprm->envp = envp; > > @@ -367,6 +368,56 @@ static void handle_arg_guest_base(const char *arg) > have_guest_base = true; > } > > +static void handle_arg_execve(const char *arg) > +{ > + const char *execfn; > + char buf[PATH_MAX]; > + char *ret; > + int len; > + > + /* > + * Since the 'execve' command line option has no argument ('has_arg' is > + * 'false'), this function will always receive NULL for 'arg' during > + * argument parsing. If 'arg' is non-NULL, we are being called during env > + * var handling, because QEMU_EXECVE is set. > + */ > + if (arg != NULL) { > + /* > + * If the env var is set, check whether its value is '0'. In this > case, > + * we don't want to enable 'execve' mode and thus bail out. Please > note > + * that an empty value will NOT disable 'execve' mode. > + */ > + if (!strcmp(arg, "0")) { > + return; > + } > + } > + > + /* try getauxval() */ > + execfn = (const char *)qemu_getauxval(AT_EXECFN); > + > + if (execfn != 0) { > + ret = realpath(execfn, buf); > + > + if (ret != NULL) { > + qemu_execve_path = g_strdup(buf); > + return; > + } > + } > + > + /* try /proc/self/exe */ > + len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); > + > + if (len != -1) { > + buf[len] = '\0'; > + qemu_execve_path = g_strdup(buf); > + return; > + } > + > + fprintf(stderr, "qemu_execve: unable to determine interpreter's path\n"); > + exit(EXIT_FAILURE); > +} > + > + > static void handle_arg_reserved_va(const char *arg) > { > char *p; > @@ -497,6 +548,9 @@ static const struct qemu_argument arg_table[] = { > "uname", "set qemu uname release string to 'uname'"}, > {"B", "QEMU_GUEST_BASE", true, handle_arg_guest_base, > "address", "set guest_base address to 'address'"}, > + {"execve", "QEMU_EXECVE", false, handle_arg_execve, > + "", "use this interpreter when a process calls execve() " > + "(disabled if env 
var is '0', enabled for all other values / when > empty)"}, > {"R", "QEMU_RESERVED_VA", true, handle_arg_reserved_va, > "size", "reserve 'size' bytes for guest virtual address space"}, > {"t", "QEMU_RTSIG_MAP", true, handle_arg_rtsig_map, > diff --git a/linux-user/syscall.c b/linux-user/syscall.c > index 2060e561a2..bf9e084975 100644 > --- a/linux-user/syscall.c > +++ b/linux-user/syscall.c > @@ -127,6 +127,7 @@ > #include <libdrm/drm.h> > #include <libdrm/i915_drm.h> > #endif > +#include <linux/binfmts.h> > #include "linux_loop.h" > #include "uname.h" > > @@ -8726,6 +8727,86 @@ ssize_t do_guest_readlink(const char *pathname, char > *buf, size_t bufsiz) > return ret; > } > > +static int qemu_execve(const char *filename, char *argv[], > + char *envp[]) > +{ > + char **new_argv; > + const char *new_filename; > + int argc, ret, i, offset = 3; > + struct linux_binprm *bprm; > + > + /* normal execve case */ > + if (qemu_execve_path == NULL || *qemu_execve_path == 0) { > + new_filename = filename; > + new_argv = argv; > + } else { > + new_filename = qemu_execve_path; > + > + for (argc = 0; argv[argc] != NULL; argc++) { > + /* nothing */ ; > + } > + > + bprm = g_alloca(sizeof(struct linux_binprm)); > + ret = load_script_file(filename, bprm); > + > + if (ret < 0) { > + if (ret == -1) { > + return get_errno(ret); > + } else { > + return -host_to_target_errno(ENOEXEC); > + } > + } > + > + if (ret == 0) { > + if (bprm->argv != NULL) { > + offset = 5; > + } else { > + offset = 4; > + } > + } > + > + /* Need to store execve argument */ > + offset++; > + > + new_argv = g_alloca((argc + offset + 1) * sizeof(void *)); > + > + /* Copy the original arguments with offset */ > + for (i = 0; i < argc; i++) { > + new_argv[i + offset] = argv[i]; > + } > + > + new_argv[0] = g_strdup(qemu_execve_path); > + new_argv[1] = g_strdup("-execve"); /* Add execve argument */ > + new_argv[2] = g_strdup("-0"); > + new_argv[offset] = g_strdup(filename); > + new_argv[argc + offset] = NULL; > + > + if 
(ret == 0) { > + new_argv[3] = bprm->filename; > + new_argv[4] = bprm->filename; > + > + if (bprm->argv != NULL) { > + new_argv[5] = bprm->argv[0]; > + } > + } else { > + new_argv[3] = argv[0]; > + } > + } > + > + /* > + * Although execve() is not an interruptible syscall it is > + * a special case where we must use the safe_syscall wrapper: > + * if we allow a signal to happen before we make the host > + * syscall then we will 'lose' it, because at the point of > + * execve the process leaves QEMU's control. So we use the > + * safe syscall wrapper to ensure that we either take the > + * signal as a guest signal, or else it does not happen > + * before the execve completes and makes it the other > + * program's problem. > + */ > + return safe_execve(new_filename, new_argv, envp); > +} With regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
