ujail: fix signal forwarding
[project/procd.git] / jail / jail.c
index 3b5587a..9d7483c 100644 (file)
  */
 
 #define _GNU_SOURCE
-#include <sys/syscall.h>
-#include <sys/mman.h>
-#include <sys/utsname.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/wait.h>
 
 #include <stdlib.h>
 #include <unistd.h>
-#include <values.h>
 #include <errno.h>
-#include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <fcntl.h>
-#include <syslog.h>
 #include <libgen.h>
-#include <glob.h>
-#include <elf.h>
 #include <sched.h>
+#include <linux/limits.h>
+#include <signal.h>
 
+#include "capabilities.h"
 #include "elf.h"
+#include "fs.h"
+#include "jail.h"
+#include "log.h"
 
-#include <libubox/utils.h>
-#include <libubox/list.h>
 #include <libubox/uloop.h>
 
 #define STACK_SIZE     (1024 * 1024)
-#define OPT_ARGS       "P:S:n:r:w:psuldo"
-
-struct extra {
-       struct list_head list;
-
-       const char *path;
-       const char *name;
-       int readonly;
-};
-
-static LIST_HEAD(extras);
+#define OPT_ARGS       "S:C:n:h:r:w:d:psuloc"
+
+static struct {
+       char *name;
+       char *hostname;
+       char **jail_argv;
+       char *seccomp;
+       char *capabilities;
+       int no_new_privs;
+       int namespace;
+       int procfs;
+       int ronly;
+       int sysfs;
+} opts;
 
 extern int pivot_root(const char *new_root, const char *put_old);
 
@@ -81,140 +77,120 @@ static int mkdir_p(char *dir, mode_t mask)
                return 0;
 
        if (ret)
-               ERROR("mkdir failed on %s: %s\n", dir, strerror(errno));
+               ERROR("mkdir(%s, %d) failed: %s\n", dir, mask, strerror(errno));
 
        return ret;
 }
 
-static int mount_bind(const char *root, const char *path, const char *name, int readonly, int error)
+int mount_bind(const char *root, const char *path, int readonly, int error)
 {
-       const char *p = path;
        struct stat s;
-       char old[256];
-       char new[256];
+       char new[PATH_MAX];
        int fd;
 
-       if (strstr(p, "local"))
-               p = "/lib";
-
-       snprintf(old, sizeof(old), "%s/%s", path, name);
-       snprintf(new, sizeof(new), "%s%s", root, p);
-
-       mkdir_p(new, 0755);
-
-       snprintf(new, sizeof(new), "%s%s/%s", root, p, name);
-
-       if (stat(old, &s)) {
-               ERROR("%s does not exist\n", old);
+       if (stat(path, &s)) {
+               ERROR("stat(%s) failed: %s\n", path, strerror(errno));
                return error;
        }
 
+       snprintf(new, sizeof(new), "%s%s", root, path);
        if (S_ISDIR(s.st_mode)) {
                mkdir_p(new, 0755);
        } else {
+               mkdir_p(dirname(new), 0755);
+               snprintf(new, sizeof(new), "%s%s", root, path);
                fd = creat(new, 0644);
                if (fd == -1) {
-                       ERROR("failed to create %s: %s\n", new, strerror(errno));
+                       ERROR("creat(%s) failed: %s\n", new, strerror(errno));
                        return -1;
                }
                close(fd);
        }
 
-       if (mount(old, new, NULL, MS_BIND, NULL)) {
-               ERROR("failed to mount -B %s %s: %s\n", old, new, strerror(errno));
+       if (mount(path, new, NULL, MS_BIND, NULL)) {
+               ERROR("failed to mount -B %s %s: %s\n", path, new, strerror(errno));
                return -1;
        }
 
-       if (readonly && mount(old, new, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL)) {
+       if (readonly && mount(NULL, new, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL)) {
                ERROR("failed to remount ro %s: %s\n", new, strerror(errno));
                return -1;
        }
 
-       DEBUG("mount -B %s %s\n", old, new);
+       DEBUG("mount -B %s %s (%s)\n", path, new, readonly?"ro":"rw");
 
        return 0;
 }
 
-static int build_jail(const char *path)
+static int build_jail_fs(void)
 {
-       struct library *l;
-       struct extra *m;
-       int ret = 0;
+       char jail_root[] = "/tmp/ujail-XXXXXX";
+       if (mkdtemp(jail_root) == NULL) {
+               ERROR("mkdtemp(%s) failed: %s\n", jail_root, strerror(errno));
+               return -1;
+       }
 
-       mkdir(path, 0755);
+       /* oldroot can't be MS_SHARED else pivot_root() fails */
+       if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
+               ERROR("private mount failed %s\n", strerror(errno));
+               return -1;
+       }
 
-       if (mount("tmpfs", path, "tmpfs", MS_NOATIME, "mode=0744")) {
+       if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
                ERROR("tmpfs mount failed %s\n", strerror(errno));
                return -1;
        }
 
-       avl_for_each_element(&libraries, l, avl)
-               if (mount_bind(path, l->path, l->name, 1, -1))
-                       return -1;
-
-       list_for_each_entry(m, &extras, list)
-               if (mount_bind(path, m->path, m->name, m->readonly, 0))
-                       return -1;
-
-       return ret;
-}
-
-static void _umount(const char *root, const char *path)
-{
-       char *buf = NULL;
-
-       if (asprintf(&buf, "%s%s", root, path) < 0) {
-               ERROR("failed to alloc umount buffer: %s\n", strerror(errno));
-       } else {
-               DEBUG("umount %s\n", buf);
-               umount(buf);
-               free(buf);
+       if (chdir(jail_root)) {
+               ERROR("chdir(%s) (jail_root) failed: %s\n", jail_root, strerror(errno));
+               return -1;
        }
-}
-
-static int stop_jail(const char *root)
-{
-       struct library *l;
-       struct extra *m;
-
-       avl_for_each_element(&libraries, l, avl) {
-               char path[256];
-               char *p = l->path;
 
-               if (strstr(p, "local"))
-                       p = "/lib";
-
-               snprintf(path, sizeof(path), "%s%s/%s", root, p, l->name);
-               DEBUG("umount %s\n", path);
-               umount(path);
+       if (mount_all(jail_root)) {
+               ERROR("mount_all() failed\n");
+               return -1;
        }
 
-       list_for_each_entry(m, &extras, list) {
-               char path[256];
+       char dirbuf[sizeof(jail_root) + 4];
+       snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
+       mkdir(dirbuf, 0755);
 
-               snprintf(path, sizeof(path), "%s%s/%s", root, m->path, m->name);
-               DEBUG("umount %s\n", path);
-               umount(path);
+       if (pivot_root(jail_root, dirbuf) == -1) {
+               ERROR("pivot_root(%s, %s) failed: %s\n", jail_root, dirbuf, strerror(errno));
+               return -1;
+       }
+       if (chdir("/")) {
+               ERROR("chdir(/) (after pivot_root) failed: %s\n", strerror(errno));
+               return -1;
        }
 
-       _umount(root, "/proc");
-       _umount(root, "/sys");
+       snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
+       rmdir(dirbuf);
+       umount2("/old", MNT_DETACH);
+       rmdir("/old");
 
-       DEBUG("umount %s\n", root);
-       umount(root);
-       rmdir(root);
+       if (opts.procfs) {
+               mkdir("/proc", 0755);
+               mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+       }
+       if (opts.sysfs) {
+               mkdir("/sys", 0755);
+               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+       }
+       if (opts.ronly)
+               mount(NULL, "/", NULL, MS_RDONLY | MS_REMOUNT, 0);
 
        return 0;
 }
 
 #define MAX_ENVP       8
-static char** build_envp(const char *seccomp, int debug)
+static char** build_envp(const char *seccomp)
 {
        static char *envp[MAX_ENVP];
-       static char preload_var[64];
-       static char seccomp_var[64];
+       static char preload_var[PATH_MAX];
+       static char seccomp_var[PATH_MAX];
        static char debug_var[] = "LD_DEBUG=all";
-       char *preload_lib = find_lib("libpreload-seccomp.so");
+       const char *preload_lib = find_lib("libpreload-seccomp.so");
        int count = 0;
 
        if (seccomp && !preload_lib) {
@@ -227,271 +203,253 @@ static char** build_envp(const char *seccomp, int debug)
                snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
                envp[count++] = preload_var;
        }
-       if (debug)
+       if (debug > 1)
                envp[count++] = debug_var;
 
        return envp;
 }
 
-static int spawn(const char *path, char **argv, const char *seccomp)
-{
-       pid_t pid = fork();
-
-       if (pid < 0) {
-               ERROR("failed to spawn %s: %s\n", *argv, strerror(errno));
-               return -1;
-       } else if (!pid) {
-               char **envp = build_envp(seccomp, 0);
-
-               INFO("spawning %s\n", *argv);
-               execve(*argv, argv, envp);
-               ERROR("failed to spawn child %s: %s\n", *argv, strerror(errno));
-               exit(-1);
-       }
-
-       return pid;
-}
-
-static int usage(void)
+static void usage(void)
 {
-       fprintf(stderr, "jail <options> -D <binary> <params ...>\n");
-       fprintf(stderr, "  -P <path>\tpath where the jail will be staged\n");
-       fprintf(stderr, "  -S <file>\tseccomp filter\n");
+       fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
+       fprintf(stderr, "  -d <num>\tshow debug log (increase num to increase verbosity)\n");
+       fprintf(stderr, "  -S <file>\tseccomp filter config\n");
+       fprintf(stderr, "  -C <file>\tcapabilities drop config\n");
+       fprintf(stderr, "  -c\t\tset PR_SET_NO_NEW_PRIVS\n");
        fprintf(stderr, "  -n <name>\tthe name of the jail\n");
+       fprintf(stderr, "namespace jail options:\n");
+       fprintf(stderr, "  -h <hostname>\tchange the hostname of the jail\n");
        fprintf(stderr, "  -r <file>\treadonly files that should be staged\n");
        fprintf(stderr, "  -w <file>\twriteable files that should be staged\n");
-       fprintf(stderr, "  -p\t\tjail has /proc\t\n");
-       fprintf(stderr, "  -s\t\tjail has /sys\t\n");
-       fprintf(stderr, "  -l\t\tjail has /dev/log\t\n");
-       fprintf(stderr, "  -u\t\tjail has a ubus socket\t\n");
-
-       return -1;
+       fprintf(stderr, "  -p\t\tjail has /proc\n");
+       fprintf(stderr, "  -s\t\tjail has /sys\n");
+       fprintf(stderr, "  -l\t\tjail has /dev/log\n");
+       fprintf(stderr, "  -u\t\tjail has a ubus socket\n");
+       fprintf(stderr, "  -o\t\tremount jail root (/) read only\n");
+       fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
+and he has the same powers as root outside the jail,\n\
+thus he can escape the jail and/or break stuff.\n\
+Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\
+If you use none of the namespace jail options,\n\
+ujail will not use namespace/build a jail,\n\
+and will only drop capabilities/apply seccomp filter.\n\n");
 }
 
-static int child_running = 1;
-
-static void child_process_handler(struct uloop_process *c, int ret)
+static int exec_jail(void *_notused)
 {
-       INFO("child (%d) exited: %d\n", c->pid, ret);
-       uloop_end();
-       child_running = 0;
-}
-
-struct uloop_process child_process = {
-       .cb = child_process_handler,
-};
+       if (opts.capabilities && drop_capabilities(opts.capabilities))
+               exit(EXIT_FAILURE);
 
-static int spawn_child(void *arg)
-{
-       char *path = get_current_dir_name();
-       int procfs = 0, sysfs = 0;
-       char *seccomp = NULL;
-       char **argv = arg;
-       int argc = 0, ch;
-       char *mpoint;
-       int ronly = 0;
-
-       while (argv[argc])
-               argc++;
-
-       optind = 0;
-       while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
-               switch (ch) {
-               case 'd':
-                       debug = 1;
-                       break;
-               case 'S':
-                       seccomp = optarg;
-                       break;
-               case 'p':
-                       procfs = 1;
-                       break;
-               case 'o':
-                       ronly = 1;
-                       break;
-               case 's':
-                       sysfs = 1;
-                       break;
-               case 'n':
-                       sethostname(optarg, strlen(optarg));
-                       break;
-               }
+       if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+                ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %s\n", strerror(errno));
+               exit(EXIT_FAILURE);
        }
 
-       asprintf(&mpoint, "%s/old", path);
-       mkdir_p(mpoint, 0755);
-       if (pivot_root(path, mpoint) == -1) {
-               ERROR("pivot_root failed:%s\n", strerror(errno));
-               return -1;
-       }
-       free(mpoint);
-       umount2("/old", MNT_DETACH);
-       rmdir("/old");
-       if (procfs) {
-               mkdir("/proc", 0755);
-               mount("proc", "/proc", "proc", MS_NOATIME, 0);
-       }
-       if (sysfs) {
-               mkdir("/sys", 0755);
-               mount("sysfs", "/sys", "sysfs", MS_NOATIME, 0);
+       if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
+                       && sethostname(opts.hostname, strlen(opts.hostname))) {
+               ERROR("sethostname(%s) failed: %s\n", opts.hostname, strerror(errno));
+               exit(EXIT_FAILURE);
        }
-       if (ronly)
-               mount(NULL, "/", NULL, MS_RDONLY | MS_REMOUNT, 0);
 
-       uloop_init();
-
-       child_process.pid = spawn(path, &argv[optind], seccomp);
-       uloop_process_add(&child_process);
-       uloop_run();
-       uloop_done();
-       if (child_running) {
-               kill(child_process.pid, SIGTERM);
-               waitpid(child_process.pid, NULL, 0);
+       if (opts.namespace && build_jail_fs()) {
+               ERROR("failed to build jail fs\n");
+               exit(EXIT_FAILURE);
        }
 
-       return 0;
+       char **envp = build_envp(opts.seccomp);
+       if (!envp)
+               exit(EXIT_FAILURE);
+
+       INFO("exec-ing %s\n", *opts.jail_argv);
+       execve(*opts.jail_argv, opts.jail_argv, envp);
+       /* we get here only if execve fails */
+       ERROR("failed to execve %s: %s\n", *opts.jail_argv, strerror(errno));
+       exit(EXIT_FAILURE);
 }
 
-static int namespace_running = 1;
+static int jail_running = 1;
+static int jail_return_code = 0;
 
-static void namespace_process_handler(struct uloop_process *c, int ret)
+static void jail_process_timeout_cb(struct uloop_timeout *t);
+static struct uloop_timeout jail_process_timeout = {
+       .cb = jail_process_timeout_cb,
+};
+
+static void jail_process_handler(struct uloop_process *c, int ret)
 {
-       INFO("namespace (%d) exited: %d\n", c->pid, ret);
+       uloop_timeout_cancel(&jail_process_timeout);
+       if (WIFEXITED(ret)) {
+               jail_return_code = WEXITSTATUS(ret);
+               INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
+       } else {
+               jail_return_code = WTERMSIG(ret);
+               INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
+       }
+       jail_running = 0;
        uloop_end();
-       namespace_running = 0;
 }
 
-struct uloop_process namespace_process = {
-       .cb = namespace_process_handler,
+static struct uloop_process jail_process = {
+       .cb = jail_process_handler,
 };
 
-static void spawn_namespace(const char *path, int argc, char **argv)
+static void jail_process_timeout_cb(struct uloop_timeout *t)
 {
-       char *dir = get_current_dir_name();
-
-       uloop_init();
-       chdir(path);
-       namespace_process.pid = clone(spawn_child,
-                       child_stack + STACK_SIZE,
-                       CLONE_NEWUTS | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD, argv);
-
-       if (namespace_process.pid != -1) {
-               chdir(dir);
-               free(dir);
-               uloop_process_add(&namespace_process);
-               uloop_run();
-               uloop_done();
-               if (namespace_running) {
-                       kill(namespace_process.pid, SIGTERM);
-                       waitpid(namespace_process.pid, NULL, 0);
-               }
-       } else {
-               ERROR("failed to spawn namespace: %s\n", strerror(errno));
-       }
+       DEBUG("jail process failed to stop, sending SIGKILL\n");
+       kill(jail_process.pid, SIGKILL);
 }
 
-static void add_extra(char *name, int readonly)
+static void jail_handle_signal(int signo)
 {
-       struct extra *f;
-
-       if (*name != '/') {
-               ERROR("%s is not an absolute path\n", name);
-               return;
-       }
-
-       f = calloc(1, sizeof(struct extra));
-
-       f->name = basename(name);
-       f->path = dirname(strdup(name));
-       f->readonly = readonly;
-
-       list_add_tail(&f->list, &extras);
+       DEBUG("forwarding signal %d to the jailed process\n", signo);
+       kill(jail_process.pid, signo);
 }
 
 int main(int argc, char **argv)
 {
+       sigset_t sigmask;
        uid_t uid = getuid();
-       const char *name = NULL;
-       char *path = NULL;
-       struct stat s;
-       int ch, ret;
        char log[] = "/dev/log";
        char ubus[] = "/var/run/ubus.sock";
+       int ch, i;
 
        if (uid) {
                ERROR("not root, aborting: %s\n", strerror(errno));
-               return -1;
+               return EXIT_FAILURE;
        }
 
        umask(022);
+       mount_list_init();
+       init_library_search();
 
        while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
                switch (ch) {
                case 'd':
-                       debug = 1;
+                       debug = atoi(optarg);
                        break;
-               case 'P':
-                       path = optarg;
+               case 'p':
+                       opts.namespace = 1;
+                       opts.procfs = 1;
                        break;
-               case 'n':
-                       name = optarg;
+               case 'o':
+                       opts.namespace = 1;
+                       opts.ronly = 1;
+                       break;
+               case 's':
+                       opts.namespace = 1;
+                       opts.sysfs = 1;
                        break;
                case 'S':
+                       opts.seccomp = optarg;
+                       add_mount(optarg, 1, -1);
+                       break;
+               case 'C':
+                       opts.capabilities = optarg;
+                       break;
+               case 'c':
+                       opts.no_new_privs = 1;
+                       break;
+               case 'n':
+                       opts.name = optarg;
+                       break;
+               case 'h':
+                       opts.hostname = optarg;
+                       break;
                case 'r':
-                       add_extra(optarg, 1);
+                       opts.namespace = 1;
+                       add_path_and_deps(optarg, 1, 0, 0);
                        break;
                case 'w':
-                       add_extra(optarg, 0);
+                       opts.namespace = 1;
+                       add_path_and_deps(optarg, 0, 0, 0);
                        break;
                case 'u':
-                       add_extra(ubus, 0);
+                       opts.namespace = 1;
+                       add_mount(ubus, 0, -1);
                        break;
                case 'l':
-                       add_extra(log, 0);
+                       opts.namespace = 1;
+                       add_mount(log, 0, -1);
                        break;
                }
        }
 
-       if (argc - optind < 1)
-               return usage();
-
-       if (!path && asprintf(&path, "/tmp/%s", basename(argv[optind])) == -1) {
-               ERROR("failed to set root path\n: %s", strerror(errno));
-               return -1;
+       /* no <binary> param found */
+       if (argc - optind < 1) {
+               usage();
+               return EXIT_FAILURE;
        }
-
-       if (!stat(path, &s)) {
-               ERROR("%s already exists: %s\n", path, strerror(errno));
-               return -1;
+       if (!(opts.namespace||opts.capabilities||opts.seccomp)) {
+               ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
+               usage();
+               return EXIT_FAILURE;
        }
+       DEBUG("Using namespaces(%d), capabilities(%d), seccomp(%d)\n",
+               opts.namespace,
+               opts.capabilities != 0,
+               opts.seccomp != 0);
 
-       if (name)
-               prctl(PR_SET_NAME, name, NULL, NULL, NULL);
-
-       avl_init(&libraries, avl_strcmp, false, NULL);
-       alloc_library_path("/lib64");
-       alloc_library_path("/lib");
-       alloc_library_path("/usr/lib");
-       load_ldso_conf("/etc/ld.so.conf");
+       opts.jail_argv = &argv[optind];
 
-       if (elf_load_deps(argv[optind])) {
+       if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
                ERROR("failed to load dependencies\n");
                return -1;
        }
 
-       if (elf_load_deps("libpreload-seccomp.so")) {
+       if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
                ERROR("failed to load libpreload-seccomp.so\n");
                return -1;
        }
 
-       ret = build_jail(path);
+       if (opts.name)
+               prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
 
-       if (!ret)
-               spawn_namespace(path, argc, argv);
-       else
-               ERROR("failed to build jail\n");
+       uloop_init();
 
-       stop_jail(path);
+       sigfillset(&sigmask);
+       for (i = 0; i < _NSIG; i++) {
+               struct sigaction s = { 0 };
 
-       return ret;
+               if (!sigismember(&sigmask, i))
+                       continue;
+               if ((i == SIGCHLD) || (i == SIGPIPE))
+                       continue;
+
+               s.sa_handler = jail_handle_signal;
+               sigaction(i, &s, NULL);
+       }
+
+       if (opts.namespace) {
+               add_mount("/dev/full", 0, -1);
+               add_mount("/dev/null", 0, -1);
+               add_mount("/dev/urandom", 0, -1);
+               add_mount("/dev/zero", 0, -1);
+
+               int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | SIGCHLD;
+               if (opts.hostname)
+                       flags |= CLONE_NEWUTS;
+               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, flags, NULL);
+       } else {
+               jail_process.pid = fork();
+       }
+
+       if (jail_process.pid > 0) {
+               /* parent process */
+               uloop_process_add(&jail_process);
+               uloop_run();
+               if (jail_running) {
+                       DEBUG("uloop interrupted, killing jail process\n");
+                       kill(jail_process.pid, SIGTERM);
+                       uloop_timeout_set(&jail_process_timeout, 1000);
+                       uloop_run();
+               }
+               uloop_done();
+               return jail_return_code;
+       } else if (jail_process.pid == 0) {
+               /* fork child process */
+               return exec_jail(NULL);
+       } else {
+               ERROR("failed to clone/fork: %s\n", strerror(errno));
+               return EXIT_FAILURE;
+       }
 }