Repositories / jai.git

jai.cc

Clone (read-only): git clone http://git.guha-anderson.com/git/jai.git

Branch
38059 bytes · c0138ef94e1a
#include "jai.h" #include "print_compat.h" #include <cassert> #include <csignal> #include <cstdlib> #include <cstring> #include <filesystem> #include <linux/prctl.h> #include <acl/libacl.h> #include <poll.h> #include <pwd.h> #include <ranges> #include <sys/prctl.h> #include <sys/signalfd.h> #include <sys/types.h> #include <sys/wait.h> #include <termios.h> path prog; void Config::parse_config_fd(int fd, Options *opts) { auto ld = fdpath(fd, true); if (auto [_it, ok] = config_loop_detect_.insert(ld); !ok) err<Options::Error>("configuration loop"); Defer _clear([this, ld, pcf = parsing_config_file_] { config_loop_detect_.erase(ld); parsing_config_file_ = pcf; }); parsing_config_file_ = true; auto go = [&](Options *o) { o->parse_file(read_file(fd), ld); }; go(opts ? opts : opt_parser().get()); } bool Config::parse_config_file(path file, Options *opts) { bool slash = std::ranges::distance(file.begin(), file.end()) > 1; bool fromcwd = slash && !parsing_config_file_; if (struct stat sb; !slash && file.extension() != ".conf" && fstatat(home_jai(), file.c_str(), &sb, 0) && errno == ENOENT && !fstatat(home_jai(), cat(file, ".conf").c_str(), &sb, 0) && S_ISREG(sb.st_mode)) file += ".conf"; Fd fd = openat(fromcwd ? AT_FDCWD : home_jai(), file.c_str(), O_RDONLY); if (!fd) { if (errno == ENOENT) return false; syserr("{}", file.c_str()); } parse_config_fd(*fd, opts); return true; } static std::expected<Fd, Defer> lock_or_validate_file(int dfd, const path &file, int flags, auto &&validate, path lockfile = {}) requires requires { { validate(1) } -> std::convertible_to<bool>; } { assert(!file.empty()); if (lockfile.empty()) lockfile = cat(file, ".lock"); flags |= O_NOFOLLOW | O_CLOEXEC; return lock_or_validate(dfd, lockfile, [&] -> Fd { if (Fd fd = openat(dfd, file.c_str(), flags); fd && validate(*fd)) return fd; else if (!fd && errno != ENOENT) syserr(R"(openat("{}", "{}", {}))", fdpath(dfd), file.string(), open_flags_to_string(flags)); return {}; }); } void Config::init_credentials() { auto realuid = getuid(); const char *envuser{}; if (!user_.empty()) envuser = user_.c_str(); else if (const char *u = getenv("SUDO_USER")) envuser = u; else if (const char *u = getenv("USER")) envuser = u; PwEnt pw; if (realuid == 0 && envuser) { if (!(pw = PwEnt::get_nam(envuser))) err("cannot find password entry for user {}", envuser); } else if (!(pw = PwEnt::get_id(realuid))) err("cannot find password entry for uid {}", realuid); user_ = pw->pw_name; homepath_ = pw->pw_dir; shell_ = pw->pw_shell; untrusted_cred_ = user_cred_ = Credentials::get_user(pw); setenv("JAI_USER", user_.c_str(), 1); // HOME may incorrectly be root's when using su/sudo if (realuid == 0 && pw->pw_uid != 0) setenv("HOME", pw->pw_dir, 1); if (PwEnt u = PwEnt::get_nam(kUntrustedUser)) { if (u->pw_uid && !strcmp(u->pw_gecos, kUntrustedGecos) && !strcmp(u->pw_dir, "/")) untrusted_cred_ = Credentials::get_user(u); else warn(R"(Ignoring user {} because uid is 0, home dir is not "/", or GECOS field is not "{}")", kUntrustedUser, kUntrustedGecos); } const char *jcd = getenv("JAI_CONFIG_DIR"); homejaipath_ = homepath_ / (jcd ? jcd : ".jai"); // Paranoia about ptrace, because we will drop privileges to access // the file system as the user. if (prctl(PR_SET_DUMPABLE, 0) == -1) syserr("prctl(PR_SET_DUMPABLE, 0)"); old_umask_ = umask(0); } Defer Config::asuser(const Credentials *crp) { if (!crp->uid_) // target is root, nothing to do return {}; auto old = Credentials::get_effective(); if (old.uid_) { if (old.uid_ == crp->uid_) // we are already the target user return {}; err("Config::asuser: want uid {} but already uid {}", crp->uid_, old.uid_); } crp->make_effective(); return Defer{[old = std::move(old)] { old.make_effective(); }}; } void Config::check_user(const struct stat &sb, std::string p, bool untrusted_ok) { if (sb.st_uid != user_cred_.uid_) { if (!untrusted_ok) err("{}: owned by {} should be owned by {}", p, sb.st_uid, user_cred_.uid_); else if (sb.st_uid != untrusted_cred_.uid_) err("{}: owned by {} should be owned by {} or {}", p, sb.st_uid, user_cred_.uid_, untrusted_cred_.uid_); } } int Config::home_jai(bool create) { if (!home_jai_fd_) { if (create) home_jai_fd_ = ensure_udir(home(), homejaipath_); else if (Fd fd = openat(home(), homejaipath_.c_str(), O_RDONLY | O_DIRECTORY | O_CLOEXEC)) { check_user(*fd); home_jai_fd_ = std::move(fd); } else if (errno == ENOENT) { err("{} does not exist; run {} --init to create it", fdpath(home(), homejaipath_), prog.filename().string()); } else syserr("{}", fdpath(home(), homejaipath_)); } return *home_jai_fd_; } int Config::storage() { if (storage_fd_) return *storage_fd_; auto restore = asuser(); if (storagedir_.empty()) storage_fd_ = xdup(home_jai()); else storage_fd_ = ensure_udir(AT_FDCWD, storagedir_); path fullpath = fdpath(*storage_fd_, true); if (fullpath.is_relative()) err("cannot find full pathname for {}", storagedir_.string()); if (!is_fd_at_path(*storage_fd_, -1, fullpath)) err("{} is no longer at {}", storagedir_.string(), fullpath.string()); storagedir_ = fullpath; return *storage_fd_; } int Config::run_jai() { if (run_jai_fd_) return *run_jai_fd_; auto r = lock_or_validate_file(-1, kRunRoot, O_RDONLY | O_DIRECTORY, [](int fd) { return is_mountpoint(fd) && (xfstat(fd).st_mode & 0777); }); if (r) return *(run_jai_fd_ = std::move(*r)); // Get rid of any partially set up directories recursive_umount(kRunRoot); xmnt_move(*make_tmpfs("run-jai", "size", "64M", "mode", "0", "gid", "0"), *ensure_dir(-1, kRunRoot, 0755, kFollow)); Fd dirfd = xopenat(-1, kRunRoot, O_RDONLY | O_DIRECTORY | O_CLOEXEC); xmnt_propagate(*dirfd, MS_PRIVATE); fchmod(*dirfd, 0755); return *(run_jai_fd_ = std::move(dirfd)); } int Config::run_jai_user() { if (run_jai_user_fd_) return *run_jai_user_fd_; Fd dirfd = ensure_dir(run_jai(), user_, 0750, kNoFollow); RaiiHelper<acl_free, acl_t> acl = acl_get_fd(*dirfd); if (!acl) syserr("acl_get_fd"); if (int r = acl_equiv_mode(acl, nullptr); r < 0) syserr("acl_equiv_mode"); else if (r == 0) { auto text = std::format("u::rwx,g::---,o::---,u:{}:r-x,m::r-x", user_cred_.uid_); set_fd_acl(*dirfd, text.c_str(), kAclAccess); } return *(run_jai_user_fd_ = std::move(dirfd)); } int Config::home() { if (!home_fd_) { auto cleanup = asuser(); Fd fd; if (!(fd = open(homepath_.c_str(), O_PATH | O_CLOEXEC))) syserr("{}", homepath_.string()); check_user(*fd); home_fd_ = std::move(fd); } return *home_fd_; } Fd Config::make_blacklist(int dfd, path name) { Fd blacklistfd = ensure_dir(dfd, name.c_str(), 0700, kFollow); check_user(*blacklistfd); if (is_mountpoint(*blacklistfd)) err("{}: directory must not be a mountpoint", fdpath(*blacklistfd)); for (path p : mask_files_) { try { make_whiteout(*blacklistfd, p); } catch (const std::exception &e) { warn("{}", e.what()); } } return blacklistfd; } Fd Config::make_home_overlay() { path sb = cat(sandbox_name_, ".home"); auto r = lock_or_validate_file( run_jai_user(), sb, O_RDONLY | O_DIRECTORY, [](int fd) { return is_mountpoint(fd); }, ".lock"); if (r) { mask_warn(); return std::move(*r); } Fd sandboxed_home = ensure_dir(run_jai_user(), sb, 0755, kFollow, true); if (is_mountpoint(*sandboxed_home)) return sandboxed_home; auto restore = asuser(); auto chgpath = cat(sandbox_name_, ".changes"); Fd changes = make_blacklist(storage(), chgpath); Fd work = ensure_udir(*changes, ".." / cat(sandbox_name_, ".work")); restore.reset(); Fd fsfd = xfsopen("overlay", cat("jai-", sb).c_str()); auto xsetfd = [&](const char *param, int fd) { if (fsconfig(*fsfd, FSCONFIG_SET_FD, param, nullptr, fd)) syserr("fsconfig(FSCONFIG_SET_FD, \"{}\")", param); }; xsetfd("lowerdir+", home()); xsetfd("upperdir", *changes); xsetfd("workdir", *work); Fd mnt = make_mount(*fsfd); xmnt_move(*mnt, *sandboxed_home); restore = asuser(); return xopenat(run_jai_user(), sb, O_RDONLY | O_CLOEXEC | O_DIRECTORY); } Fd Config::make_private_tmproot() { auto r = lock_or_validate_file( run_jai_user(), "tmp", O_RDONLY | O_DIRECTORY, [](int fd) { return is_mountpoint(fd); }, ".lock"); if (r) return std::move(*r); Fd tmp = ensure_dir(run_jai_user(), "tmp", 0755, kFollow); if (!is_mountpoint(*tmp)) { xmnt_move(*make_tmpfs("jai-tmp", "gid", "0", "mode", "0755", "size", "40%", "huge", "within_size"), *tmp); } return xopenat(run_jai_user(), "tmp", O_RDONLY | O_NOFOLLOW); } Fd Config::make_private_tmp(path subdir, bool userowned) { Fd fd = make_private_tmproot(); if (!subdir.empty()) { assert(subdir.is_relative()); fd = ensure_dir(*fd, subdir, 0755, kNoFollow, false); } if (userowned) { fd = ensure_dir(*fd, sandbox_name_, 0700, kNoFollow, true, [this](int fd) { if (fchown(fd, user_cred_.uid_, user_cred_.gid_)) syserr("{}: fchown", fdpath(fd)); }); check_user(*fd); return fd; } else return ensure_dir(*fd, sandbox_name_, 01777, kNoFollow); } Fd Config::make_private_passwd() { if (Fd fd = openat(run_jai_user(), "passwd", O_RDONLY | O_CLOEXEC)) return fd; if (errno != ENOENT) syserr("{}", fdpath(run_jai_user(), "passwd")); RaiiHelper<fclose> r, w; Fd wfd = xopenat(run_jai_user(), ".", O_RDWR | O_TMPFILE | O_CLOEXEC, 0444); if (!(w = fdopen(*wfd, "w"))) syserr("fdopen({})", fdpath(*wfd)); wfd.release(); auto restore = asuser(); r = fopen("/etc/passwd", "r"); if (!r) syserr("/etc/passwd"); fcntl(fileno(r), F_SETFD, 1); restore.reset(); while (auto pw = PwEnt::find(fgetpwent_r, *r)) { if (!strcmp(pw->pw_name, kUntrustedUser)) { pw.get()->pw_dir = const_cast<char *>(homepath_.c_str()); pw.get()->pw_shell = const_cast<char *>(shell_.c_str()); } if (putpwent(pw.get(), *w)) syserr("putpwent"); } if (fflush(*w)) syserr("fflush"); if (linkat(fileno(*w), "", run_jai_user(), "passwd", AT_EMPTY_PATH) && errno != EEXIST) syserr("linkat({})", fdpath(run_jai_user(), "passwd")); r.reset(); w.reset(); return xopenat(run_jai_user(), "passwd", O_RDONLY); } Fd Config::make_idmap_ns() { pid_t pid{-1}; Defer _reap([&pid] { if (pid > 0) { kill(pid, SIGKILL); while (waitpid(pid, nullptr, 0) == -1 && errno == EINTR) ; } }); if (!(pid = xfork(CLONE_NEWUSER))) { pause(); _exit(0); } path child = std::format("/proc/{}", pid); Fd newns = xopenat(-1, child / "ns/user", O_RDONLY | O_CLOEXEC); Fd mapctl = xopenat(-1, child / "gid_map", O_WRONLY | O_CLOEXEC); auto map = make_id_map(user_cred_.gid_, untrusted_cred_.gid_); if (write(*mapctl, map.data(), map.size()) == -1) syserr("write(gid_map)"); mapctl = xopenat(-1, child / "uid_map", O_WRONLY | O_CLOEXEC); map = make_id_map(user_cred_.uid_, untrusted_cred_.uid_); if (write(*mapctl, map.data(), map.size()) == -1) syserr("write(uid_map)"); mapctl.reset(); return newns; } Fd Config::make_mnt_ns() { Fd oldns = xopenat(-1, "/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); Defer _restore_ns{[fd = *oldns] { xsetns(fd, CLONE_NEWNS); }}; bool strict_ok = untrusted_cred_ != user_cred_; if (mode_ == kStrict && !strict_ok) err("Cannot use strict mode: invalid user {}", kUntrustedUser); assert(!sandbox_name_.empty()); mount_attr attr{ .attr_set = MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV, .propagation = MS_PRIVATE, }; Fd tmp = clone_tree(*mp_holder_.emplace_back(make_private_tmp())); Fd passwd; if (mode_ == kStrict) passwd = clone_tree(*make_private_passwd()); path xdgrun = std::format("/run/user/{}", user_cred_.uid_); Fd rundir = grant_directories_.contains(xdgrun) ? Fd{} : clone_tree(*make_private_tmp(".run", true)); Fd shmdir; if (struct stat sb; !stat("/dev/shm", &sb) && S_ISDIR(sb.st_mode)) shmdir = clone_tree(*make_private_tmp(".shm")); Fd home; Fd mapns; Credentials *sbcred = &user_cred_; if (mode_ == kCasual) home = clone_tree(*mp_holder_.emplace_back(make_home_overlay())); else { if (mode_ == kStrict) { sbcred = &untrusted_cred_; mapns = make_idmap_ns(); attr.attr_set |= MOUNT_ATTR_IDMAP; attr.userns_fd = *mapns; } home = clone_tree(*ensure_udir(storage(), cat(sandbox_name_, ".home"))); } for (int dfd : {*tmp, *home, *passwd, *rundir, *shmdir}) if (dfd != -1) xmnt_setattr(dfd, attr); if (unshare(CLONE_NEWNS)) syserr("unshare(CLONE_NEWNS)"); Fd newns = xopenat(-1, "/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); xmnt_setattr(-1, "/", mount_attr{ .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID, .propagation = MS_PRIVATE, }); if (umount2(kRunRoot, MNT_DETACH)) syserr("umount2({}, MNT_DETACH)", kRunRoot); umount2("/tmp", MNT_DETACH); // ignore error umount2("/var/tmp", MNT_DETACH); // ignore error xmnt_move(*tmp, -1, "/tmp"); xmnt_move(*clone_tree(-1, "/tmp"), -1, "/var/tmp", 0); xmnt_move(*home, -1, homepath_); if (passwd) xmnt_move(*passwd, -1, "/etc/passwd"); if (rundir) xmnt_move(*rundir, -1, xdgrun); if (shmdir) { umount2("/dev/shm", MNT_DETACH); xmnt_move(*shmdir, -1, "/dev/shm"); } if (grant_cwd_) { if (!grant_directories_.contains(cwd())) { if (cwd() == homepath_) { std::string name = prog.filename().string(); warn( R"(Refusing to grant your entire home directory to jailed code. {1:>{2}} Run "jai -D" to avoid granting the current working directory.)", name, "", name.size()); exit(1); } grant_directories_.emplace(cwd()); } } for (auto d : grant_directories_) { if (d.is_relative()) d = "/" / d; if (contains(homejaipath_, d)) err("{}: cannot export a directory within {}", d.string(), homejaipath_.string()); if (contains(storagedir_, d)) err("{}: cannot export a directory within {}", d.string(), storagedir_.string()); xsetns(*oldns, CLONE_NEWNS); auto restore_root = asuser(); Fd src = xopenat(-1, d, O_DIRECTORY | O_PATH | O_CLOEXEC); check_user(*src, d); restore_root.reset(); src = clone_tree(*src); // Should it be recursive? xmnt_setattr(*src, attr); xsetns(*newns, CLONE_NEWNS); restore_root = asuser(); Fd dst = openat(-1, d.c_str(), O_DIRECTORY | O_PATH | O_CLOEXEC); if (!dst) { if (errno != EACCES && errno != ENOENT) syserr("{}", d.string()); restore_root.reset(); restore_root = asuser(sbcred); dst = ensure_dir(-1, d, 0755, kNoFollow, true); } check_user(*dst, d, true); restore_root.reset(); xmnt_move(*src, *dst); } xsetns(*newns, CLONE_NEWNS); auto blockdir = [this, &oldns, &newns, &sbcred](const path &p) { assert(p.is_absolute()); auto restore_root = asuser(sbcred); Fd target = openat(AT_FDCWD, p.c_str(), O_DIRECTORY | O_RDONLY); if (!target) return; restore_root.reset(); if (mode_ != kCasual) { struct stat sbold, sbnew = xfstat(*target); xsetns(*oldns, CLONE_NEWNS); int staterr = stat(p.c_str(), &sbold); xsetns(*newns, CLONE_NEWNS); if (staterr || sbold.st_ino != sbnew.st_ino || sbold.st_dev != sbnew.st_dev) return; } check_user(*target, p, true); Fd empty = xopenat(-1, kRunRoot, O_RDONLY); if (!is_dir_empty(*empty)) err("{} should be empty in jail", kRunRoot); Fd source = clone_tree(*empty); xmnt_setattr(*source, mount_attr{ .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV, .propagation = MS_PRIVATE, }); xmnt_move(*source, *target); }; blockdir(storagedir_); if (homejaipath_ != storagedir_) blockdir(homejaipath_); return newns; } void Config::unmount() { Fd lock; while (!(lock = open_lockfile(run_jai_user(), ".lock"))) ; auto runuser = path(kRunRoot) / user_; auto mp = runuser / cat(sandbox_name_, ".home"); umount2(mp.c_str(), UMOUNT_NOFOLLOW); unlinkat(run_jai_user(), mp.filename().c_str(), AT_REMOVEDIR); unlinkat(run_jai_user(), ".lock", 0); lock.reset(); unlinkat(run_jai(), user_.c_str(), AT_REMOVEDIR); } static void clean_root_owned_dir(int dfd, path file) { Fd target = openat(dfd, file.c_str(), O_RDONLY | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (!target) { if (errno != ENOENT) warn("{}: {}", fdpath(dfd, file), strerror(errno)); return; } if (!is_fd_at_path(dfd, *target, "..", kNoFollow)) { warn("{}: ignored (possible TOCTTOU problem)", fdpath(dfd, file)); return; } auto d = xopendir(*target); while (auto de = readdir(d)) { struct stat sb; if (fstatat(*target, d_name(de), &sb, AT_SYMLINK_NOFOLLOW)) { warn("fstatat {}: {}", fdpath(*target, d_name(de)), strerror(errno)); continue; } else if (!S_ISDIR(sb.st_mode) && (sb.st_size == 0 || !S_ISREG(sb.st_mode))) { if (unlinkat(*target, d_name(de), 0)) warn("unlinkat {}: {}", fdpath(*target, d_name(de)), strerror(errno)); else warn("deleted {}", fdpath(*target, d_name(de))); } } } bool Config::unmountall() { Fd lock; while (!(lock = open_lockfile(run_jai_user(), ".lock"))) ; bool unmount_ok = recursive_umount(path(kRunRoot) / user_, false); auto dir = xopendir(run_jai_user()); while (auto de = readdir(dir)) if (unlinkat(run_jai_user(), de->d_name, AT_REMOVEDIR) && errno == ENOTDIR) unlinkat(run_jai_user(), de->d_name, 0); // Get rid of any stale files the user can't delete if (unmount_ok) try { auto restore = asuser(); auto jd = xopendir(storage()); while (auto de = readdir(jd)) { path name = d_name(de); if (name.extension() == ".changes") try { path workpath = name / ".." / cat(name.stem(), ".work"); Fd work = xopenat(storage(), workpath.c_str(), O_RDONLY | O_DIRECTORY | O_CLOEXEC); check_user(*work); restore.reset(); Defer _unrestore([&restore, this] { restore = asuser(); }); clean_root_owned_dir(*work, "work"); clean_root_owned_dir(*work, "index"); } catch (const std::exception &e) { warn("{}", e.what()); } } } catch (const std::exception &e) { warn("{}", e.what()); } unlinkat(run_jai_user(), ".lock", 0); lock.reset(); unlinkat(run_jai(), user_.c_str(), AT_REMOVEDIR); return unmount_ok; } std::vector<const char *> Config::make_env() { std::vector<std::string_view> filter_patterns; std::set<std::string_view, std::less<>> filter_vars; for (const auto &v : env_filter_) if (v.find('*') == v.npos) filter_vars.insert(v); else filter_patterns.push_back(v); for (char **e = environ; *e; ++e) { std::string_view sv(*e); if (auto eq = sv.find('='); eq != sv.npos) sv = sv.substr(0, eq); else continue; if (filter_vars.contains(sv) || std::ranges::any_of(filter_patterns, [sv](auto pat) { return glob(pat, sv); })) continue; setenv_.try_emplace(std::string(sv), *e); } std::vector<const char *> ret; ret.reserve(setenv_.size() + 1); for (auto &kv : setenv_ | std::views::values) ret.push_back(kv.c_str()); ret.push_back(nullptr); return ret; } static pid_t main_pid = getpid(); // Return stop signal if status indicates a child stopped, exit or // kill ourselves if the child terminated on a signal, and return 0 // otherwise. static int propagate_termination_status(int status) { if (WIFSTOPPED(status)) { if (int sig = WSTOPSIG(status); sig == SIGSTOP || sig == SIGTSTP || sig == SIGTTIN || sig == SIGTTOU) return sig; // Unlikely to reach here, but maybe another process attached to // our child with the debugger and it got SIGTRAP or something? return 0; } void (*do_exit)(int) = getpid() == main_pid ? exit : _exit; if (WIFEXITED(status)) do_exit(WEXITSTATUS(status)); if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); signal(sig, SIG_DFL); auto ss = sigsingleton(WTERMSIG(status)); sigprocmask(SIG_UNBLOCK, &ss, nullptr); raise(sig); do_exit(-1); } return 0; // Continued? } static int wait_propagate(int pid) { assert(pid > 0); int status; for (;;) { if (auto r = waitpid(-1, &status, WUNTRACED); r == -1) { if (errno != EINTR) { if (getpid() == main_pid) syserr("waitpid"); warn("waitpid: {}", strerror(errno)); _exit(1); } } else if (r == pid) if (auto sig = propagate_termination_status(status); sig > 0) return sig; } } void Config::fix_proc() try { xmnt_propagate(-1, "/", MS_PRIVATE); recursive_umount("/proc"); xmnt_move(*make_mount(*xfsopen("proc", "proc"), MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | MOUNT_ATTR_NOEXEC), -1, "/proc"); } catch (const std::exception &e) { warn("{}", e.what()); _exit(1); } void Config::exec(int nsfd, char **argv) { // This function is a bit annoying because the existing jai process // cannot move to a new PID namespace, so we have to fork once. But // the forked process will have PID 1 and behave strangely (such as // not receiving signals from within the PID namespace), so needs to // fork again to run the actual jailed program with a normal PID. // That means PID 1 has to propagate termination events of process 2 // to its own parent, which must then propagate them to the process // than ran jai. // // Further complicating matters, PID 1 cannot stop itself (since it // cannot receive a SIGSTOP from within the PID namespace). Hence, // if the jailed program stops, it uses a pipe to request that the // original jai process stop it (from outside the PID namespace). auto stop_me = xpipe(); if (auto pid = xfork(CLONE_NEWPID | CLONE_NEWIPC | CLONE_NEWNS)) { // This is the last process in the old PID namespace close(nsfd); stop_me[1].reset(); // discard first write, whose only purpose is to test the parent // didn't die before the child set PR_SET_PDEATSIG. char c; read(*stop_me[0], &c, 1); parent_loop(pid, *stop_me[0]); } stop_me[0].reset(); xsetns(nsfd, CLONE_NEWNS); fix_proc(); pid1(std::move(stop_me[1])); pid2(argv); } void Config::parent_loop(pid_t pid, int stop_requests) { auto ss = sigsingleton(SIGCHLD); if (sigprocmask(SIG_BLOCK, &ss, nullptr)) syserr("sigprocmask SIG_BLOCK SIGCHLD"); Fd sigfd = signalfd(-1, &ss, SFD_CLOEXEC); if (!sigfd) syserr("signalfd"); if (int n = fcntl(stop_requests, F_GETFL); n == -1) syserr("F_GETFL"); else if (fcntl(stop_requests, F_SETFL, n | O_NONBLOCK) == -1) syserr("F_SETFL O_NONBLOCK"); // Put stop_requests in static to call drain_pipe from signal handler static int rqfd; rqfd = stop_requests; // Flush pipe returning latest signal request or 0 if none constexpr auto drain_pipe = +[]() { int ret = 0, n; unsigned char buf[8]; while ((n = read(rqfd, buf, sizeof(buf))) > 0) ret = buf[n - 1]; if (n == -1 && errno != EAGAIN && errno != EINTR) syserr("read from stop_requests pipe"); return ret; }; // If we've been resumed, discard any previous stop requests static volatile sig_atomic_t continued = 0; struct sigaction sa{}; sa.sa_handler = +[](int) { drain_pipe(); continued = 1; }; sigemptyset(&sa.sa_mask); if (sigaction(SIGCONT, &sa, nullptr)) syserr("sigaction(SIGCONT)"); std::array<pollfd, 2> pollfds{pollfd{.fd = stop_requests, .events = POLLIN}, pollfd{.fd = *sigfd, .events = POLLIN}}; for (int my_next_stop_sig = 0;;) { if (poll(pollfds.data(), pollfds.size(), -1) < 0) { if (errno == EINTR) continue; syserr("poll"); } for (;;) { int status; if (auto r = waitpid(pid, &status, WNOHANG | WUNTRACED); r == 0) break; else if (r == -1) { if (errno != EINTR) syserr("waitpid"); } else if (auto sig = propagate_termination_status(status); sig > 0) { if (my_next_stop_sig > 0) sig = std::exchange(my_next_stop_sig, 0); continued = 0; raise(sig); if (continued == 0) // If we are in an orphaned process group, the default // action of SIGTSTP, SIGTTIN, and SIGTTOU is ignore rather // than stop. While we'd like to propagate the exact stop // signal of the jailed process when possible, in this case // the only way to stop ourselves is with SIGSTOP. raise(SIGSTOP); } } if ((my_next_stop_sig = drain_pipe()) > 0) kill(pid, SIGSTOP); } } // Implement PID 1 in the new namespace. Only returns for PID 2. void Config::pid1(Fd stop_me) try { // Kill entire sandbox if parent jai process terminates prctl(PR_SET_PDEATHSIG, SIGKILL); // On the off chance that our parent exited and we got reparented to // init (outside the sandbox) before setting PDEATHSIG, try writing // one byte to the pipe so that a SIGPIPE kills us. if (write(*stop_me, "", 1) != 1) err("parent killed before PR_SET_PDEATHSIG"); // Return in pid 2, continue in pid 1 auto pid = xfork(); if (!pid) return; prctl(PR_SET_NAME, "jai-init"); // Note: getpgid is technically not async-signal-safe, but // disassembling glibc shows it doesn't do anything problematic // other than maybe change errno (which we save/restore in the // handler). Call getpgid at least once before setting the signal // handler to avoid any lazy dynamic linking in the signal handler. static pid_t my_pgid, main_child_pid; my_pgid = getpgid(0); main_child_pid = pid; static Fd tty; tty = open("/dev/tty", O_RDWR | O_CLOEXEC); struct sigaction sa{}; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGTTOU); sa.sa_handler = +[](int sig) { int saved_errno = errno; if (auto pg = getpgid(main_child_pid); pg != my_pgid) { if (tty) tcsetpgrp(*tty, pg); killpg(pg, sig); } errno = saved_errno; }; if (sigaction(SIGCONT, &sa, nullptr)) syserr("sigaction(SIGCONT)"); for (;;) { unsigned char sig = wait_propagate(pid); write(*stop_me, &sig, 1); } } catch (const std::exception &e) { warn("{}", e.what()); _exit(1); } void Config::pid2(char **argv) try { if (mode_ == kCasual || mode_ == kBare) user_cred_.make_real(); else untrusted_cred_.make_real(); if (chdir(cwd().c_str())) { if (mode_ == kCasual || grant_cwd_ || errno != ENOENT) syserr("chdir({})", cwd().string()); if (chdir(homepath_.c_str())) syserr("chdir({})", homepath_.string()); warn("No \"{}\" in jail, changed to home directory", cwd().string()); } umask(old_umask_); const char *argv0 = argv[0]; std::vector<const char *> bashcmd; if (!shellcmd_.empty()) { argv0 = PATH_BASH; bashcmd.push_back("init"); bashcmd.push_back("-c"); bashcmd.push_back(shellcmd_.c_str()); while (*argv) bashcmd.push_back(*(argv++)); bashcmd.push_back(nullptr); argv = const_cast<char **>(bashcmd.data()); } setenv("JAI_JAIL", sandbox_name_.c_str(), 1); setenv("JAI_MODE", std::format("{}", mode_).c_str(), 1); auto env = make_env(); execvpe(argv0, argv, const_cast<char **>(env.data())); perror(argv0); _exit(1); } catch (const std::exception &e) { warn("{}", e.what()); _exit(1); } std::unique_ptr<Options> Config::opt_parser(bool dotjail) { auto ret = std::make_unique<Options>(); Options &opts = *ret; opts( "-m", "--mode", [this](std::string_view m) { static const std::map<std::string, Mode, std::less<>> modemap{ {"casual", kCasual}, {"bare", kBare}, {"strict", kStrict}}; if (auto it = modemap.find(m); it != modemap.end()) mode_ = it->second; else err<Options::Error>(R"(invalid mode {})", m); }, std::format(R"(Set execution mode to one of the following: casual - run as invoking UID with overlay home directory bare - run as invoking UID with bare home directory strict - run as UID {} with bare home directory)", kUntrustedUser), "casual|bare|strict"); opts( "-d", "--dir", [this](std::string_view arg) { path d(expand(arg)); grant_directories_.emplace( canonical(parsing_config_file_ ? homepath_ / d : d)); }, "Grant full access to DIR.", "DIR"); opts( "-x", "--xdir", [this](std::string_view arg) { path d(expand(arg)); grant_directories_.erase( canonical(parsing_config_file_ ? homepath_ / d : d)); }, "undo the effects of a previous --dir option", "DIR"); opts( "-D", "--nocwd", [this] { grant_cwd_ = false; }, "Do not grant access to the current working directory"); if (!dotjail) opts( "-j", "--jail", [this](path sb) { if (!name_ok(sb)) err<Options::Error>("{}: invalid sandbox name", sb.string()); sandbox_name_ = sb; }, "Use private or overlay home directory named NAME", "NAME"); else opts("-j", "--jail", [](path) { err<Options::Error>("cannot set name from a .jail file or include"); }); opts("--conf", [this, opts = ret.get()](std::string_view arg) { path file(expand(arg)); if (!parse_config_file(file, opts)) err<Options::Error>("{}: configuration file not found", file.string()); }); opts( "--mask", [this](std::string_view arg) { path p(expand(arg)); if (p.is_absolute()) err<Options::Error>("{}: cannot mask an absolute path", p.string()); mask_files_.emplace(std::move(p)); }, "Erase $HOME/FILE when first creating overlay home", "FILE"); opts( "--unmask", [this](std::string_view arg) { path p(expand(arg)); mask_files_.erase(p); }, "Undo the effects of a previous --mask option", "FILE"); opts( "--unsetenv", [this](std::string_view var) { erase_if(setenv_, [var](const auto &it) { return glob(var, it.first); }); env_filter_.emplace(var); }, "Remove VAR (wich may contain wildcard '*') from the environment", "VAR"); opts( "--setenv", [this](std::string var) { if (auto pos = var.find('='); pos != var.npos) { auto var_eq_val = std::format("{}{}", var.substr(0, pos + 1), expand(var.substr(pos + 1))); setenv_.insert_or_assign(var.substr(0, pos), var_eq_val); } else if (auto it = env_filter_.find(var); it != env_filter_.end()) env_filter_.erase(it); else if (var.contains(' ')) // space almost certainly an error since it didn't match err<Options::Error>( R"(Environment variable "{}" contains space, did you mean '='?)", var); else if (const char *p = getenv(var.c_str()); p && std::ranges::any_of(env_filter_, [&var](const auto &pat) { return glob(pat, var); })) setenv_.insert_or_assign(var, std::format("{}={}", var, p)); }, "Undo the effects of --unsetenv=VAR, or set VAR=VALUE", "VAR[=VALUE]"); opts( "--command", [this](std::string cmd) { shellcmd_ = std::move(cmd); }, R"(Bash command line to execute program (default: "$0" "$@"))", "CMD"); opts( "--storage", [this](std::string_view s) { auto sd = expand(s); if (parsing_config_file_) storagedir_ = homepath_ / sd; else storagedir_ = cwd() / sd; }, R"(Store overlay and private home directories in DIR (default: $JAI_CONFIG_DIR or $HOME/.jai))", "DIR"); return ret; } std::string option_help; [[noreturn]] static void usage(int status) { if (status) print_compat::println(stderr, "Try {} --help for more information.", prog.filename().string()); else print_compat::print(stdout, "usage: {0} [OPTIONS] [CMD [ARG...]]\n{1}", prog.filename().string(), option_help); exit(status); } [[noreturn]] static void version() { print_compat::println(R"({} {} Untrusted user for strict mode: {} Copyright (C) 2026 David Mazieres This program comes with NO WARRANTY, to the extent permitted by law. You may redistribute it under the terms of the GNU General Public License version 3 or later; see the file named COPYING for details.)", PACKAGE_STRING, PACKAGE_URL, kUntrustedUser); exit(0); } int do_main(int argc, char **argv) { Config conf; conf.init_credentials(); auto restore = conf.asuser(); conf.cwd(); // compute and cache while privileges lowered bool opt_u{}; std::vector<path> opt_d; path opt_C = ""; bool opt_init{}; auto opts = conf.opt_parser(); // A few options not available in config files (*opts)("-u", [&] { opt_u = true; }, "Unmount sandboxed file systems"); (*opts)( "--init", [&] { opt_init = true; }, "Create initial configuration files and exit"); // Override inline conf to make CLI idempotent (*opts)( "-C", "--conf", [&](path p) { opt_C = p; }, R"(Use FILE as configuration file. A file FILE with no '/' is relative to $JAI_CONFIG_DIR if set, otherwise to ~/.jai. The default is CMD.conf if it exists, otherwise default.conf)", "FILE"); (*opts)("--help", [] { usage(0); }); (*opts)("--version", version, "Print copyright and version then exit"); (*opts)( "--print-defaults", [] { write(1, jai_defaults.data(), jai_defaults.size()); exit(0); }, "Show default contents of $JAI_CONFIG_DIR/.defaults"); option_help = opts->help(); if (argc > 2 && !strcmp(argv[1], "--complete")) return conf.complete(opts->complete_args(2, argc, argv)); std::vector<char *> cmd; try { auto args = opts->parse_argv(argc, argv); cmd.assign(args.begin(), args.end()); } catch (Options::Error &e) { warn("{}", e.what()); usage(2); } if (!conf.mask_files_.empty()) conf.mask_warn_ = true; // true instead of opt_init, just so it works by default. ensure_file(conf.home_jai(true), ".defaults", jai_defaults, 0600, create_warn); ensure_file(conf.home_jai(), "default.conf", default_conf, 0600, create_warn); if (opt_init) { ensure_file(conf.storage(), "default.jail", default_jail, 0600, create_warn); print_compat::println( "You can edit the configuration defaults in {}/.defaults.", conf.homejaipath_.string()); print_compat::println( "Run {} --print-defaults to see the original contents of that file.", prog.filename().string()); return 0; } if (opt_u) { if (!conf.grant_cwd_ || !conf.grant_directories_.empty() || !cmd.empty()) { print_compat::println(stderr, "-u is not compatible with -d, -D, or a command"); usage(2); } restore.reset(); return conf.unmountall() ? 0 : 1; } if (!opt_C.empty()) { if (!conf.parse_config_file(opt_C)) err("{}: no such configuration file", opt_C.string()); } else if ((cmd.empty() || !conf.name_ok(cmd[0]) || !conf.parse_config_file(std::format("{}.conf", cmd[0]))) && !conf.parse_config_file("default.conf")) conf.parse_config_file("default.conf"); // Re-parse command line to override files opts->parse_argv(argc, argv); bool createwarn = false; if (conf.sandbox_name_.empty()) conf.sandbox_name_ = "default"; Fd dotjail = ensure_file(conf.storage(), cat(conf.sandbox_name_, ".jail"), conf.sandbox_name_ == "default" ? default_jail : std::format("mode {}\n", conf.mode_), 0600, create_warn); conf.parse_config_fd(*dotjail, conf.opt_parser(true).get()); // Re-parse command line to override files opts->parse_argv(argc, argv); restore.reset(); if (geteuid() && !getenv("JAI_TRY_NONROOT")) err("{} requires root. Please run it with sudo or make it setuid root", prog.filename().string()); if (cmd.empty()) { const char *shell = conf.shell_.empty() ? "/bin/sh" : conf.shell_.c_str(); cmd.push_back(const_cast<char *>(shell)); } auto fd = conf.make_mnt_ns(); cmd.push_back(nullptr); conf.exec(*fd, cmd.data()); return 0; } int main(int argc, char **argv) { if (argc > 0) prog = argv[0]; else prog = PACKAGE_TARNAME; #if 1 using ToCatch = std::exception; #else struct ToCatch { auto what() const { return ""; } }; #endif try { exit(do_main(argc, argv)); } catch (const ToCatch &e) { warn("{}", e.what()); } return 1; }