From d5bffccb2086ad3c889acd04db652ea0036adf83 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:09:22 +0000 Subject: [PATCH 01/34] feat(suidhelper): mix suidhelper.install wrapping cargo xtask install --- lib/mix/tasks/suidhelper.install.ex | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 lib/mix/tasks/suidhelper.install.ex diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex new file mode 100644 index 0000000..de27fbc --- /dev/null +++ b/lib/mix/tasks/suidhelper.install.ex @@ -0,0 +1,40 @@ +defmodule Mix.Tasks.Suidhelper.Install do + @shortdoc "Build, stamp, and install the setuid helper (wraps `cargo xtask install`)" + @moduledoc """ + Builds, stamps, and installs the Rust setuid helper by wrapping + `cargo xtask install` in `native/suidhelper`. + + mix suidhelper.install + + The xtask first stamps the release binary (BLAKE3 self-checksum into + `.note.sum`, the same step the `:suidhelper_stamp` compiler runs) and then + installs it setuid-root to `/usr/local/bin/hyper-suidhelper` via `sudo + install`. `sudo` may prompt for a password on the controlling terminal. + + This is the privileged counterpart to `mix suidhelper.stamp`: that one only + rebuilds, stamps, and re-captures the embedded build identity; this one also + places the binary on `PATH` setuid-root. `cargo` and the helper's toolchain + (see `native/suidhelper/rust-toolchain.toml`) must be installed. + """ + + use Mix.Task + + @helper_dir "native/suidhelper" + + @impl Mix.Task + def run(argv) do + {_, 0} = + System.cmd("cargo", ["xtask", "install" | argv], + cd: @helper_dir, + into: IO.stream(:stdio, :line) + ) + rescue + MatchError -> + Mix.raise(""" + `cargo xtask install` failed installing the suidhelper. + + Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml) + are installed, and that `sudo` is available for the setuid install step. + """) + end +end From 9201730a0f02f8bddba6cfb29c4c8dea28d60063 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:16:17 +0000 Subject: [PATCH 02/34] docs: document full node requirements (postgres, dm targets, kvm, cgroup v2, suidhelper) --- docs/cookbook/intro.md | 59 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index a0030d3..2c69380 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -19,13 +19,58 @@ The absolute best way to get started with `Hyper` is to play with it. ### Requirements -Hyper requires the following software be installed on each node running it: - - - [`skopeo`](https://github.com/containers/skopeo) - - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - -Hyper has more runtime dependencies, but they are automatically redistributed -by Hyper. +#### External services + +Hyper needs a **PostgreSQL** server reachable from every node — it is the image +database and the only stateful external dependency. + +#### System binaries + +The following must be on each node's `PATH` (the bracketed override is the +`config :hyper` key you can set if the binary lives elsewhere): + + - [`skopeo`](https://github.com/containers/skopeo) — pulls OCI images + (`skopeo_path`) + - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) — provides `mke2fs`, which + builds the ext4 rootfs (`mke2fs_path`) + - `losetup`, `blockdev` (from **util-linux**) — loop-device setup + (`losetup_path`, `blockdev_path`) + - `dmsetup` (from **lvm2** / device-mapper) — dm-snapshot and thin-pool + layering (`dmsetup_path`). Frequently *not* installed by default — check + this one first. + - `du`, `getent` (from **coreutils** and **glibc**) — rootfs sizing and user + resolution. Present on essentially every distro. + +#### Kernel features + +The host kernel must provide: + + - **KVM** — `/dev/kvm` must exist and be accessible to the per-VM users (see + the `uid_gid_range` configuration). + - **cgroup v2** — the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only + hosts are not supported. + - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` — load the + `dm_snapshot` and `dm_thin_pool` modules (`modprobe dm_snapshot + dm_thin_pool`). Hyper refuses to start its device helper without them. + - **loop devices** — the `loop` module, used to attach layer images as block + devices. + +#### Privileged setup + + - The **setuid-root device helper** (`hyper-suidhelper`) must be installed. + Run `mix suidhelper.install`, which builds, stamps, and places it + setuid-root on `PATH`. Every privileged operation (losetup, dmsetup, mknod, + chroot jails) routes through it; the BEAM itself runs unprivileged. + - A **parent cgroup** named by `cgroup_parent` (default `hyper`) must exist + under `/sys/fs/cgroup`; Hyper creates each VM's cgroup beneath it. + - The host UID/GID range given by `uid_gid_range` must be free for Hyper to + allocate per-VM users from. + +#### Auto-redistributed + +The remaining runtime dependencies — `firecracker`, `jailer`, `umoci`, and the +guest `vmlinux` kernels — are downloaded, checksum-verified, and managed by +Hyper itself; you do not install them. ### Installation From b68dd5e7ebb72d650229001369d08e366addf5bd Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:16:47 +0000 Subject: [PATCH 03/34] docs: replace em-dashes with ASCII hyphens in intro.md --- docs/cookbook/intro.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index 2c69380..526c7e9 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -21,7 +21,7 @@ The absolute best way to get started with `Hyper` is to play with it. #### External services -Hyper needs a **PostgreSQL** server reachable from every node — it is the image +Hyper needs a **PostgreSQL** server reachable from every node - it is the image database and the only stateful external dependency. #### System binaries @@ -29,30 +29,30 @@ database and the only stateful external dependency. The following must be on each node's `PATH` (the bracketed override is the `config :hyper` key you can set if the binary lives elsewhere): - - [`skopeo`](https://github.com/containers/skopeo) — pulls OCI images + - [`skopeo`](https://github.com/containers/skopeo) - pulls OCI images (`skopeo_path`) - - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) — provides `mke2fs`, which + - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - provides `mke2fs`, which builds the ext4 rootfs (`mke2fs_path`) - - `losetup`, `blockdev` (from **util-linux**) — loop-device setup + - `losetup`, `blockdev` (from **util-linux**) - loop-device setup (`losetup_path`, `blockdev_path`) - - `dmsetup` (from **lvm2** / device-mapper) — dm-snapshot and thin-pool - layering (`dmsetup_path`). Frequently *not* installed by default — check + - `dmsetup` (from **lvm2** / device-mapper) - dm-snapshot and thin-pool + layering (`dmsetup_path`). Frequently *not* installed by default - check this one first. - - `du`, `getent` (from **coreutils** and **glibc**) — rootfs sizing and user + - `du`, `getent` (from **coreutils** and **glibc**) - rootfs sizing and user resolution. Present on essentially every distro. #### Kernel features The host kernel must provide: - - **KVM** — `/dev/kvm` must exist and be accessible to the per-VM users (see + - **KVM** - `/dev/kvm` must exist and be accessible to the per-VM users (see the `uid_gid_range` configuration). - - **cgroup v2** — the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only + - **cgroup v2** - the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only hosts are not supported. - - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` — load the + - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` - load the `dm_snapshot` and `dm_thin_pool` modules (`modprobe dm_snapshot dm_thin_pool`). Hyper refuses to start its device helper without them. - - **loop devices** — the `loop` module, used to attach layer images as block + - **loop devices** - the `loop` module, used to attach layer images as block devices. #### Privileged setup @@ -68,8 +68,8 @@ The host kernel must provide: #### Auto-redistributed -The remaining runtime dependencies — `firecracker`, `jailer`, `umoci`, and the -guest `vmlinux` kernels — are downloaded, checksum-verified, and managed by +The remaining runtime dependencies - `firecracker`, `jailer`, `umoci`, and the +guest `vmlinux` kernels - are downloaded, checksum-verified, and managed by Hyper itself; you do not install them. ### Installation From e2253f61146e677a9432e12c3aec863e89325fb0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:51:04 +0000 Subject: [PATCH 04/34] fix(suidhelper): make mix suidhelper.install tty-safe Mix spawns subprocesses in their own session with no controlling terminal (erl_child_setup calls setsid), so the nested `sudo install` could never prompt for a password and always failed with "a terminal is required". Split the task: `cargo xtask stamp` builds + stamps unprivileged, then the privileged copy runs via sudo only when it is already non-interactive (`sudo -n` succeeds); otherwise it prints the exact command to run by hand. --- lib/mix/tasks/suidhelper.install.ex | 101 +++++++++++++++++++++------- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex index de27fbc..2085de3 100644 --- a/lib/mix/tasks/suidhelper.install.ex +++ b/lib/mix/tasks/suidhelper.install.ex @@ -1,40 +1,93 @@ defmodule Mix.Tasks.Suidhelper.Install do - @shortdoc "Build, stamp, and install the setuid helper (wraps `cargo xtask install`)" + @shortdoc "Build, stamp, and install the setuid helper" @moduledoc """ - Builds, stamps, and installs the Rust setuid helper by wrapping - `cargo xtask install` in `native/suidhelper`. + Builds, stamps, and installs the Rust setuid helper. mix suidhelper.install - The xtask first stamps the release binary (BLAKE3 self-checksum into - `.note.sum`, the same step the `:suidhelper_stamp` compiler runs) and then - installs it setuid-root to `/usr/local/bin/hyper-suidhelper` via `sudo - install`. `sudo` may prompt for a password on the controlling terminal. + Two steps: - This is the privileged counterpart to `mix suidhelper.stamp`: that one only - rebuilds, stamps, and re-captures the embedded build identity; this one also - places the binary on `PATH` setuid-root. `cargo` and the helper's toolchain - (see `native/suidhelper/rust-toolchain.toml`) must be installed. + 1. `cargo xtask stamp` in `native/suidhelper` builds the release binary and + writes its BLAKE3 self-checksum into `.note.sum` (the same step the + `:suidhelper_stamp` compiler runs). + 2. The stamped binary is copied setuid-root (mode `4755`) to + `/usr/local/bin/hyper-suidhelper`. + + The copy needs root, but Mix runs every subprocess in its own session with no + controlling terminal (`erl_child_setup` calls `setsid`), so a nested `sudo` + cannot open `/dev/tty` to prompt for a password. This task therefore only runs + `sudo` itself when it is already non-interactive (`sudo -n` succeeds, e.g. + `NOPASSWD` or a usable cached credential). Otherwise it prints the exact + privileged command for you to run in your own terminal. + + This is the privileged counterpart to `mix suidhelper.stamp`, which stamps + only. `cargo` and the helper's toolchain (see + `native/suidhelper/rust-toolchain.toml`) must be installed. """ use Mix.Task @helper_dir "native/suidhelper" + @source Path.join(@helper_dir, "target/release/hyper-suidhelper") + # Must match `Hyper.Config`'s default `suid_helper` path and the xtask's + # `INSTALL_PATH`: a `PATH` location the unprivileged node can exec. + @install_path "/usr/local/bin/hyper-suidhelper" @impl Mix.Task def run(argv) do - {_, 0} = - System.cmd("cargo", ["xtask", "install" | argv], - cd: @helper_dir, - into: IO.stream(:stdio, :line) - ) - rescue - MatchError -> - Mix.raise(""" - `cargo xtask install` failed installing the suidhelper. - - Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml) - are installed, and that `sudo` is available for the setuid install step. - """) + stamp!(argv) + install_privileged() + end + + defp stamp!(argv) do + case System.cmd("cargo", ["xtask", "stamp" | argv], + cd: @helper_dir, + into: IO.stream(:stdio, :line) + ) do + {_, 0} -> + :ok + + {_, _} -> + Mix.raise(""" + `cargo xtask stamp` failed building the suidhelper. + + Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml) + are installed. + """) + end + end + + defp install_privileged do + if passwordless_sudo?() do + Mix.shell().info("Installing #{@source} -> #{@install_path} (setuid root)") + + case System.cmd("sudo", install_argv(), into: IO.stream(:stdio, :line)) do + {_, 0} -> Mix.shell().info("installed #{@install_path} (setuid root)") + {_, _} -> Mix.raise(manual_instructions()) + end + else + Mix.shell().info(manual_instructions()) + end + end + + # `sudo -n true` exits 0 only when sudo can run without prompting. With no + # controlling terminal a cached `tty_tickets` credential is invisible, so this + # is true essentially only under `NOPASSWD` -- exactly the case where the + # nested `sudo install` below can succeed. + defp passwordless_sudo? do + match?({_, 0}, System.cmd("sudo", ["-n", "true"], stderr_to_stdout: true)) + end + + defp install_argv, + do: ["install", "-o", "root", "-g", "root", "-m", "4755", @source, @install_path] + + defp manual_instructions do + """ + + The binary is built and stamped, but installing it setuid-root needs a + password and `sudo` has no terminal to prompt on here. Run the copy yourself: + + sudo #{Enum.join(install_argv(), " ")} + """ end end From 4ddcd4d5a18f29eb27464394d8805e77b979f7ab Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:51:14 +0000 Subject: [PATCH 05/34] feat(suidhelper): source device binaries from config, drop caller --bin The unprivileged node used to pass `--bin ` to the setuid helper for every losetup/dmsetup/blockdev op. Letting the caller name the binary the helper escalates to run is a needless trust hole, even with SafeBin checks. Move the paths into the helper-owned config (/etc/hyper/config.toml) with sane defaults (/usr/sbin/{losetup,dmsetup,blockdev}); the helper validates each as a SafeBin (absolute, root-owned, non-writable, exact basename) at dispatch, as the real uid, before acquiring root. The `--bin` argument is gone from both sides. Also: - Add a `dmsetup targets` op so the dm-target readiness probe runs through the helper (it opens /dev/mapper/control, which needs root) instead of shelling dmsetup directly as the BEAM user. - An absent config file now falls back to the built-in defaults (trusted, compiled into the root-owned binary); a present-but-untrusted file stays fatal. Drop the Elixir-side *_path config and per-tool presence checks. - Add :mix to the dialyzer PLT so the Mix tasks resolve Mix.raise/shell. --- lib/hyper/config.ex | 27 +++--- lib/hyper/suid_helper.ex | 18 ++-- lib/hyper/suid_helper/blockdev.ex | 11 +-- lib/hyper/suid_helper/dmsetup.ex | 43 +++------- lib/hyper/suid_helper/losetup.ex | 22 +---- mix.exs | 4 + native/suidhelper/src/config.rs | 76 ++++++++++++++++- native/suidhelper/src/tools/dmsetup/mod.rs | 10 +++ native/suidhelper/src/tools/mod.rs | 47 ++++++----- native/suidhelper/src/util/safe_bin.rs | 60 ++++++------- native/suidhelper/tests/e2e/argv.rs | 97 ++++++++++++++-------- native/suidhelper/tests/e2e/config.rs | 25 ++++-- native/suidhelper/tests/util/safe_bin.rs | 4 +- 13 files changed, 265 insertions(+), 179 deletions(-) diff --git a/lib/hyper/config.ex b/lib/hyper/config.ex index 1332d9e..e3c056d 100644 --- a/lib/hyper/config.ex +++ b/lib/hyper/config.ex @@ -16,9 +16,6 @@ defmodule Hyper.Config do @parent_cgroup Application.compile_env(:hyper, :cgroup_parent, "hyper") @uid_gid_range Application.compile_env!(:hyper, :uid_gid_range) @layer_dir Application.compile_env!(:hyper, :layer_dir) - @losetup_path Application.compile_env(:hyper, :losetup_path, "losetup") - @dmsetup_path Application.compile_env(:hyper, :dmsetup_path, "dmsetup") - @blockdev_path Application.compile_env(:hyper, :blockdev_path, "blockdev") @skopeo_path Application.compile_env(:hyper, :skopeo_path, "skopeo") @umoci_path Application.compile_env(:hyper, :umoci_path, nil) @mke2fs_path Application.compile_env(:hyper, :mke2fs_path, "mke2fs") @@ -111,15 +108,6 @@ defmodule Hyper.Config do @spec layer_dir :: Path.t() def layer_dir, do: @layer_dir - @doc "Path to the losetup binary." - def losetup_path, do: @losetup_path - - @doc "Path to the dmsetup binary." - def dmsetup_path, do: @dmsetup_path - - @doc "Path to the blockdev binary." - def blockdev_path, do: @blockdev_path - @doc "Path to the skopeo binary (used by `Hyper.Img.OciLoader` to pull OCI images)." def skopeo_path, do: @skopeo_path @@ -132,15 +120,20 @@ defmodule Hyper.Config do @doc "Path to the mke2fs binary (used by `Hyper.Img.OciLoader` to build the ext4 rootfs)." def mke2fs_path, do: @mke2fs_path + # Where `cargo xtask install` (via `mix suidhelper.install`) drops the helper. + @default_suid_helper "/usr/local/bin/hyper-suidhelper" + @doc """ - Path to the setuid-root device helper (`hyper-suidhelper`). Required: the node - runs unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation - through it. + Path to the setuid-root device helper (`hyper-suidhelper`). The node runs + unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation through + it. - Runtime config (host-specific), so it can be set per node without recompiling. + Defaults to `#{@default_suid_helper}`, the install path used by `mix + suidhelper.install`. Runtime config (host-specific), so an operator who + installs it elsewhere can override per node without recompiling. """ @spec suid_helper :: Path.t() - def suid_helper, do: Application.fetch_env!(:hyper, :suid_helper) + def suid_helper, do: Application.get_env(:hyper, :suid_helper, @default_suid_helper) @doc """ Directory for per-VM scratch (writable-layer COW) files. Must be node-local and diff --git a/lib/hyper/suid_helper.ex b/lib/hyper/suid_helper.ex index 690f004..6d4ff56 100644 --- a/lib/hyper/suid_helper.ex +++ b/lib/hyper/suid_helper.ex @@ -16,7 +16,7 @@ defmodule Hyper.SuidHelper do self-test and reports the base path it was compiled against. """ - alias Hyper.SuidHelper.{Blockdev, Dmsetup, Expected, Losetup} + alias Hyper.SuidHelper.{Dmsetup, Expected} use OpenTelemetryDecorator @@ -51,18 +51,20 @@ defmodule Hyper.SuidHelper do end @doc """ - Check that the setuid helper and every tool it execs are usable on this - machine: the helper binary is present, is the build this release expects - (`verify_version/0`), then each tool submodule's own check. + Check that the setuid helper is usable on this machine: the helper binary is + present, is the build this release expects (`verify_version/0`), and the kernel + exposes the device-mapper targets we need (`Dmsetup.test_system/0`, which also + exercises the helper's configured `dmsetup` binary). + + The `losetup`/`blockdev` binaries are validated by the helper the first time + each is used; their paths live in the helper's own config, not here. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.test_system") def test_system do with :ok <- helper_present(), - :ok <- verify_version(), - :ok <- Losetup.test_system(), - :ok <- Dmsetup.test_system() do - Blockdev.test_system() + :ok <- verify_version() do + Dmsetup.test_system() end end diff --git a/lib/hyper/suid_helper/blockdev.ex b/lib/hyper/suid_helper/blockdev.ex index 9f675ea..760fae8 100644 --- a/lib/hyper/suid_helper/blockdev.ex +++ b/lib/hyper/suid_helper/blockdev.ex @@ -11,18 +11,9 @@ defmodule Hyper.SuidHelper.Blockdev do @spec device_sectors(Path.t()) :: {:ok, pos_integer()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Blockdev.device_sectors", include: [:path]) def device_sectors(path) do - case SuidHelper.exec(["blockdev", "--bin", Hyper.Config.blockdev_path(), "--getsz", path]) do + case SuidHelper.exec(["blockdev", "--getsz", path]) do {:ok, %{"sectors" => n}} -> {:ok, n} {:error, _} = err -> err end end - - @doc "Check the blockdev binary is present." - @spec test_system() :: :ok | {:error, :blockdev_not_found} - @decorate with_span("Hyper.SuidHelper.Blockdev.test_system") - def test_system do - if System.find_executable(Hyper.Config.blockdev_path()), - do: :ok, - else: {:error, :blockdev_not_found} - end end diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex index ef67063..e32ec84 100644 --- a/lib/hyper/suid_helper/dmsetup.ex +++ b/lib/hyper/suid_helper/dmsetup.ex @@ -59,14 +59,7 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec remove(String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.remove", include: [:name]) def remove(name) do - case SuidHelper.exec([ - "dmsetup", - "--bin", - Hyper.Config.dmsetup_path(), - "remove", - "--retry", - name - ]) do + case SuidHelper.exec(["dmsetup", "remove", "--retry", name]) do {:ok, _} -> :ok {:error, _} = err -> err end @@ -76,39 +69,31 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec message(String.t(), String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message]) def message(name, message) do - argv = - ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "message", name, "--message", message] - - case SuidHelper.exec(argv) do + case SuidHelper.exec(["dmsetup", "message", name, "--message", message]) do {:ok, _} -> :ok {:error, _} = err -> err end end @doc """ - Check the dmsetup binary is present and the kernel exposes the dm targets we - use (snapshot, thin, thin-pool). + Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool). + + Routes through the setuid helper: `dmsetup targets` opens `/dev/mapper/control`, + which needs root, and the BEAM runs unprivileged. The helper validates its + configured `dmsetup` binary before running it, so a missing or unsafe binary + surfaces here too. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.Dmsetup.test_system") def test_system do - if System.find_executable(Hyper.Config.dmsetup_path()), - do: test_targets(), - else: {:error, :dmsetup_not_found} - end - - @doc "Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool)." - @spec test_targets() :: :ok | {:error, term()} - @decorate with_span("Hyper.SuidHelper.Dmsetup.test_targets") - def test_targets do - case System.cmd(Hyper.Config.dmsetup_path(), ["targets"], stderr_to_stdout: true) do - {out, 0} -> + case SuidHelper.exec(["dmsetup", "targets"]) do + {:ok, %{"output" => out}} -> have = parse_targets(out) missing = Enum.reject(@required_targets, &MapSet.member?(have, &1)) if missing == [], do: :ok, else: {:error, {:missing_dm_targets, missing}} - {out, code} -> - {:error, {:dmsetup_targets_failed, code, String.trim(out)}} + {:error, {code, msg}} -> + {:error, {:dmsetup_targets_failed, code, msg}} end end @@ -145,9 +130,7 @@ defmodule Hyper.SuidHelper.Dmsetup do # create flags (e.g. `--readonly`). Returns the `/dev/mapper/` path. @spec create(String.t(), String.t(), [String.t()]) :: {:ok, Path.t()} | {:error, err()} defp create(name, table, flags) do - argv = - ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "create", name] ++ - flags ++ ["--table", table] + argv = ["dmsetup", "create", name] ++ flags ++ ["--table", table] case SuidHelper.exec(argv) do {:ok, %{"device" => dev}} -> {:ok, dev} diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex index d825b73..405c5ca 100644 --- a/lib/hyper/suid_helper/losetup.ex +++ b/lib/hyper/suid_helper/losetup.ex @@ -11,7 +11,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_ro(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_ro", include: [:path]) def attach_ro(path) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "attach", path]) do + case SuidHelper.exec(["losetup", "attach", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -21,14 +21,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_rw(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_rw", include: [:path]) def attach_rw(path) do - case SuidHelper.exec([ - "losetup", - "--bin", - Hyper.Config.losetup_path(), - "attach", - "--rw", - path - ]) do + case SuidHelper.exec(["losetup", "attach", "--rw", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -38,18 +31,9 @@ defmodule Hyper.SuidHelper.Losetup do @spec detach(Path.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.detach", include: [:dev]) def detach(dev) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "detach", dev]) do + case SuidHelper.exec(["losetup", "detach", dev]) do {:ok, _} -> :ok {:error, _} = err -> err end end - - @doc "Check the losetup binary is present." - @spec test_system() :: :ok | {:error, :losetup_not_found} - @decorate with_span("Hyper.SuidHelper.Losetup.test_system") - def test_system do - if System.find_executable(Hyper.Config.losetup_path()), - do: :ok, - else: {:error, :losetup_not_found} - end end diff --git a/mix.exs b/mix.exs index 7906c6a..5c620b8 100644 --- a/mix.exs +++ b/mix.exs @@ -24,6 +24,10 @@ defmodule Hyper.MixProject do # Cache the PLTs in a stable, gitignored dir so CI can cache them. plt_local_path: "priv/plts", plt_core_path: "priv/plts", + # `:mix` is needed so the Mix tasks under `lib/mix/tasks` (which call + # `Mix.raise/1`, `Mix.shell/0`, and implement the `Mix.Task` behaviour) + # resolve instead of tripping `unknown_function`. + plt_add_apps: [:mix], # Verify @specs against actual returns, and flag ignored return values. flags: [:unmatched_returns, :extra_return, :missing_return] ] diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 425ba38..2c94ba0 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: AGPL-3.0-only //! Runtime host configuration, read from a single root-owned TOML file. +use crate::util::safe_bin::{self, SafeBin}; use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile}; use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; use nix::fcntl::OFlag; @@ -41,16 +42,57 @@ fn config_path() -> PathBuf { } /// Hyper's /etc/hyper/config.toml file format. +/// +/// The device-tool paths are read from here (never from the unprivileged +/// caller, which is why there is no `--bin` argument): the helper alone decides +/// which binary it escalates to run. Each defaults to its usual location and is +/// validated as a [`SafeBin`] before use. #[derive(Debug, Clone, Deserialize)] pub struct Config { work_dir: PathBuf, + #[serde(default = "default_dmsetup")] + dmsetup: PathBuf, + #[serde(default = "default_losetup")] + losetup: PathBuf, + #[serde(default = "default_blockdev")] + blockdev: PathBuf, +} + +// The default data root. Must match the Elixir node's `@dev_work_dir`, which it +// uses when the same config file is absent, so both sides agree (see +// `Hyper.Node.check_helper_base`). +fn default_work_dir() -> PathBuf { + PathBuf::from("/srv/hyper") +} + +fn default_dmsetup() -> PathBuf { + PathBuf::from("/usr/sbin/dmsetup") +} + +fn default_losetup() -> PathBuf { + PathBuf::from("/usr/sbin/losetup") +} + +fn default_blockdev() -> PathBuf { + PathBuf::from("/usr/sbin/blockdev") +} + +impl Default for Config { + fn default() -> Self { + Self { + work_dir: default_work_dir(), + dmsetup: default_dmsetup(), + losetup: default_losetup(), + blockdev: default_blockdev(), + } + } } impl Config { /// The process-wide config, loaded once (and forced unprivileged by - /// [`Config::init`]). A load failure is fatal: the helper cannot safely - /// operate without a trusted data root, so it prints the error and exits - /// rather than guessing a default. + /// [`Config::init`]). An absent file yields the built-in defaults; a + /// *present but untrusted* file (wrong owner/mode, malformed) is fatal - + /// the helper prints the error and exits rather than trusting it. pub fn get() -> &'static Config { LazyLock::force(&CONFIG) } @@ -74,6 +116,21 @@ impl Config { self.work_dir.join("jails") } + /// The validated `dmsetup` binary the helper will run. + pub fn dmsetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.dmsetup) + } + + /// The validated `losetup` binary the helper will run. + pub fn losetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.losetup) + } + + /// The validated `blockdev` binary the helper will run. + pub fn blockdev(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.blockdev) + } + /// Read, ownership-check, parse, and validate the config file. See the module /// docs for the trust model. pub fn safe_load() -> Result { @@ -82,7 +139,18 @@ impl Config { let safe_path: SafePath = path.clone().try_into()?; let file: SafeFile = - SafeFile::open(&safe_path, OFlag::O_RDONLY)?; + match SafeFile::open(&safe_path, OFlag::O_RDONLY) { + Ok(file) => file, + // A genuinely-absent file means "use the built-in defaults": those + // are compiled into this root-owned binary, so they are trusted. Any + // OTHER failure - a present but wrong-owner/mode file, an I/O error - + // stays fatal, because it is a signal (someone put an untrusted file + // there), not an absence. + Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => { + return Ok(Self::default()) + } + Err(e) => return Err(e.into()), + }; let body = std::io::read_to_string(std::fs::File::from(file.into_owned_fd())) .map_err(|_| LoadingError::Unreadable(path.clone()))?; diff --git a/native/suidhelper/src/tools/dmsetup/mod.rs b/native/suidhelper/src/tools/dmsetup/mod.rs index eef016a..ca40561 100644 --- a/native/suidhelper/src/tools/dmsetup/mod.rs +++ b/native/suidhelper/src/tools/dmsetup/mod.rs @@ -52,6 +52,9 @@ enum DmOp { #[arg(long)] message: ThinMessage, }, + /// List the target types the kernel device-mapper exposes. Read-only, but + /// still needs root: it opens `/dev/mapper/control`. + Targets, } #[derive(Serialize)] @@ -60,6 +63,7 @@ pub enum DmsetupOut { Created { device: PathBuf }, Removed, Messaged, + Targets { output: String }, } pub struct Dmsetup { @@ -105,6 +109,9 @@ impl IsTool for Dmsetup { .arg("0") .arg(message.to_string()); } + DmOp::Targets => { + cmd.arg("targets"); + } } cmd.env_clear().output() @@ -124,6 +131,9 @@ impl IsTool for Dmsetup { }, DmOp::Remove { .. } => DmsetupOut::Removed, DmOp::Message { .. } => DmsetupOut::Messaged, + DmOp::Targets => DmsetupOut::Targets { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/mod.rs b/native/suidhelper/src/tools/mod.rs index 10c53c4..24a2be4 100644 --- a/native/suidhelper/src/tools/mod.rs +++ b/native/suidhelper/src/tools/mod.rs @@ -1,7 +1,8 @@ //! Per-tool CLI fragments and their `IsTool` implementations. Each tool lives in -//! its own submodule and owns its own error type, operand validation, and `--bin` -//! parser; this module owns the shared trait, the `Tool` subcommand tree, and the -//! privilege boundary. +//! its own submodule and owns its own error type and operand validation; this +//! module owns the shared trait, the `Tool` subcommand tree, and the privilege +//! boundary. The binary each tool runs is resolved from the trusted config here, +//! never passed by the caller. mod blockdev; pub mod chroot_jail; @@ -13,15 +14,15 @@ pub use chroot_jail::ChrootJailOp; pub use dmsetup::{DmTable, Dmsetup, DmsetupArgs, ThinMessage}; pub use losetup::{Losetup, LosetupArgs}; -use crate::util::safe_bin::SafeBin; +use crate::config::Config; use crate::util::setuid_privileged::{self, Privileged}; use clap::Subcommand; use serde::Serialize; use thiserror::Error as ThisError; -/// Errors of the dispatch layer: whatever the privilege guard or the chosen tool -/// raises on the way out. (`--bin` and operand validation are handled by clap at -/// parse time, so they never reach here.) +/// Errors of the dispatch layer: an invalid configured binary (`SafeBin`), the +/// privilege guard, or the chosen tool's own failure on the way out. (Operand +/// validation is handled by clap at parse time, so it never reaches here.) #[derive(Debug, ThisError)] pub enum Error { #[error(transparent)] @@ -65,28 +66,23 @@ pub trait IsTool { } } -/// The subcommand tree: one subcommand per tool, each taking its own `--bin` -/// with the tool-specific args flattened in from the submodule. +/// The subcommand tree: one subcommand per tool, with the tool-specific args +/// flattened in from the submodule. The binary each tool runs is not a caller +/// argument - it comes from the root-owned config (see [`Config`]). #[derive(Subcommand)] pub enum Tool { /// Attach/detach loop devices. Losetup { - #[arg(long)] - bin: SafeBin<"losetup">, #[command(flatten)] args: LosetupArgs, }, /// Create/remove device-mapper snapshot devices. Dmsetup { - #[arg(long)] - bin: SafeBin<"dmsetup">, #[command(flatten)] args: DmsetupArgs, }, /// Query a block device's size. Blockdev { - #[arg(long)] - bin: SafeBin<"blockdev">, #[command(flatten)] args: BlockdevArgs, }, @@ -99,13 +95,24 @@ pub enum Tool { impl Tool { /// Dispatch to the selected tool's `run` (or, for `chroot-jail`, its nested - /// op), returning its already-serialized `Value`. The `--bin` is already - /// validated (it is a `SafeBin`, constructed only by its value parser). + /// op), returning its already-serialized `Value`. The binary path is taken + /// from the trusted config and validated (`SafeBin`) here, as the real uid, + /// before any privilege is acquired. pub fn run(self) -> Result { + let config = Config::get(); match self { - Tool::Losetup { bin, args } => Losetup::new(bin.into(), args).run(), - Tool::Dmsetup { bin, args } => Dmsetup::new(bin.into(), args).run(), - Tool::Blockdev { bin, args } => Blockdev::new(bin.into(), args).run(), + Tool::Losetup { args } => { + let bin = config.losetup().map_err(|e| Error::Tool(Box::new(e)))?; + Losetup::new(bin.into(), args).run() + } + Tool::Dmsetup { args } => { + let bin = config.dmsetup().map_err(|e| Error::Tool(Box::new(e)))?; + Dmsetup::new(bin.into(), args).run() + } + Tool::Blockdev { args } => { + let bin = config.blockdev().map_err(|e| Error::Tool(Box::new(e)))?; + Blockdev::new(bin.into(), args).run() + } Tool::ChrootJail { op } => op.run(), } } diff --git a/native/suidhelper/src/util/safe_bin.rs b/native/suidhelper/src/util/safe_bin.rs index 8f323aa..9108440 100644 --- a/native/suidhelper/src/util/safe_bin.rs +++ b/native/suidhelper/src/util/safe_bin.rs @@ -1,17 +1,16 @@ -//! A validated `--bin` path. +//! A validated tool-binary path. //! -//! The caller names the binary to run, but it must be the expected tool and a -//! binary only root could have produced. [`SafeBin`] is a newtype whose only -//! constructor runs those checks, so holding one is proof the path was -//! validated. The const string parameter `NAME` is the basename it was validated -//! against - a `SafeBin<"losetup">` can never be passed where a -//! `SafeBin<"dmsetup">` is wanted. Combined with the [`FromStr`] impl (see -//! `tools`), clap validates the path at argument-parse time with no per-tool -//! boilerplate. +//! The path of each device tool (`losetup`, `dmsetup`, `blockdev`) comes from +//! the root-owned config file, never from the unprivileged caller. [`SafeBin`] +//! is a newtype whose only constructor runs the safety checks, so holding one is +//! proof the path was validated. The const string parameter `NAME` is the +//! basename it was validated against - a `SafeBin<"losetup">` can never be +//! passed where a `SafeBin<"dmsetup">` is wanted. //! -//! These checks are what keep this from being arbitrary-root-execution: an -//! unprivileged caller cannot point us at a binary it controls (must be -//! root-owned, not group/other-writable, not a symlink, exact basename). +//! These checks are what keep this from being arbitrary-root-execution: even a +//! mistaken config entry cannot point us at a binary a non-root user controls +//! (must be an absolute path, the exact basename, root-owned, not a symlink, not +//! group/other-writable). use std::ffi::OsStr; use std::fs; @@ -23,14 +22,14 @@ use thiserror::Error as ThisError; #[derive(Debug, ThisError)] pub enum Error { - #[error("--bin must be an absolute path: {0}")] + #[error("binary path must be absolute: {0}")] NotAbsolute(PathBuf), - #[error("--bin basename must be `{expected}`: {got}")] + #[error("binary basename must be `{expected}`: {got}")] Name { expected: &'static str, got: PathBuf, }, - #[error("--bin {path}: {source}")] + #[error("binary {path}: {source}")] Stat { path: PathBuf, #[source] @@ -44,22 +43,17 @@ pub enum Error { Writable(PathBuf), } -/// A `--bin` path validated to have basename `NAME`. The wrapped path is private -/// and the only constructor is the [`FromStr`] impl, so a `SafeBin` value cannot -/// exist without having been checked. +/// A tool-binary path validated to have basename `NAME`. The wrapped path is +/// private and the only constructor is [`SafeBin::from_path`], so a `SafeBin` +/// value cannot exist without having been checked. #[derive(Debug, Clone)] pub struct SafeBin(PathBuf); -// Lets clap validate `--bin` at parse time straight into a `SafeBin`, with -// no per-tool value parser: the const basename is the whole spec. Validates that -// `s` is an absolute path with basename `NAME`, a regular root-owned file no -// non-root user could have written. -impl FromStr for SafeBin { - type Err = Error; - - fn from_str(s: &str) -> Result { - let bin = Path::new(s); - +impl SafeBin { + /// Validate `bin` as the `NAME` tool binary: an absolute path with basename + /// `NAME`, a real (non-symlink) regular file owned by root that no non-root + /// user could have written. These checks are the whole point of the type. + pub fn from_path(bin: &Path) -> Result { if !bin.is_absolute() { return Err(Error::NotAbsolute(bin.to_path_buf())); } @@ -93,6 +87,16 @@ impl FromStr for SafeBin { } } +// Lets a string parse straight into a validated `SafeBin` (used by the +// test suite); delegates to the single `from_path` constructor. +impl FromStr for SafeBin { + type Err = Error; + + fn from_str(s: &str) -> Result { + Self::from_path(Path::new(s)) + } +} + // Read the validated path back out; the "validated" guarantee stays attached to // the `SafeBin` type until this conversion. impl From> for PathBuf { diff --git a/native/suidhelper/tests/e2e/argv.rs b/native/suidhelper/tests/e2e/argv.rs index 6ebf997..519d664 100644 --- a/native/suidhelper/tests/e2e/argv.rs +++ b/native/suidhelper/tests/e2e/argv.rs @@ -1,7 +1,7 @@ //! L4: prove the exact argv (and empty env) the helper hands to the child tool — -//! the one thing the design deliberately hides from the caller. We point `--bin` -//! at a root-owned fake that writes its argv+env to a file as JSON, then assert -//! on the reconstructed command line. +//! the one thing the design deliberately hides from the caller. We point the +//! tool's config path at a root-owned fake that writes its argv+env to a file as +//! JSON, then assert on the reconstructed command line. #![cfg(feature = "insecure_test_seams")] use std::fs; @@ -31,9 +31,16 @@ fn install_fake(dir: &Path, basename: &str, record: &Path, stdout_line: &str) -> path // root-owned because this test runs as root } -fn write_root_config(dir: &Path) -> PathBuf { +/// Write a root-owned config that points the named tools at the given (fake) +/// binaries, so the helper resolves each tool's path from config rather than a +/// caller argument. +fn write_root_config(dir: &Path, bins: &[(&str, &Path)]) -> PathBuf { let p = dir.join("config.toml"); - fs::write(&p, "work_dir = \"/srv/hyper\"\n").unwrap(); + let mut body = String::from("work_dir = \"/srv/hyper\"\n"); + for (key, path) in bins { + body.push_str(&format!("{key} = \"{}\"\n", path.display())); + } + fs::write(&p, body).unwrap(); fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); p } @@ -60,17 +67,15 @@ fn dmsetup_create_snapshot_reconstructs_canonical_table_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); // Deliberately weird inner spacing; the helper must re-render canonically. let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "create", "hyper-vm1", "--readonly", @@ -106,21 +111,11 @@ fn dmsetup_remove_retry_toggle_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); - let out = run( - &cfg, - &[ - "dmsetup", - "--bin", - bin.to_str().unwrap(), - "remove", - "--retry", - "hyper-vm1", - ], - ); + let out = run(&cfg, &["dmsetup", "remove", "--retry", "hyper-vm1"]); assert_eq!(out.status.code(), Some(0)); assert_eq!(recorded_argv(&rec), vec!["remove", "--retry", "hyper-vm1"]); } @@ -132,16 +127,14 @@ fn dmsetup_message_create_thin_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "message", "hyper-pool", "--message", @@ -156,6 +149,51 @@ fn dmsetup_message_create_thin_as_root() { ); } +#[test] +fn dmsetup_targets_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_targets: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + // Fake prints one `dmsetup targets` row; the helper returns it verbatim. + let bin = install_fake(tmp.path(), "dmsetup", &rec, "snapshot v1.16.0"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["targets"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "targets"); + assert_eq!(json["output"], "snapshot v1.16.0\n"); +} + +#[test] +fn dmsetup_rejects_configured_bin_with_wrong_basename_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_rejects_bin: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // A real, root-owned system file, but the wrong basename for `dmsetup`. + let cfg = write_root_config(tmp.path(), &[("dmsetup", Path::new("/usr/bin/env"))]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_ne!( + out.status.code(), + Some(0), + "a configured binary with the wrong basename must be refused" + ); + let err = String::from_utf8_lossy(&out.stderr); + assert!(err.contains("basename must be"), "stderr: {err}"); +} + #[test] fn blockdev_getsz_argv_and_parse_as_root() { if !is_root() { @@ -163,21 +201,12 @@ fn blockdev_getsz_argv_and_parse_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); // Fake prints "2048" as the sector count for the helper to parse. let bin = install_fake(tmp.path(), "blockdev", &rec, "2048"); + let cfg = write_root_config(tmp.path(), &[("blockdev", &bin)]); - let out = run( - &cfg, - &[ - "blockdev", - "--bin", - bin.to_str().unwrap(), - "--getsz", - "/dev/loop0", - ], - ); + let out = run(&cfg, &["blockdev", "--getsz", "/dev/loop0"]); assert_eq!( out.status.code(), Some(0), diff --git a/native/suidhelper/tests/e2e/config.rs b/native/suidhelper/tests/e2e/config.rs index cd3ba33..100a686 100644 --- a/native/suidhelper/tests/e2e/config.rs +++ b/native/suidhelper/tests/e2e/config.rs @@ -34,17 +34,28 @@ fn write_root_config(dir: &Path, body: &str) -> std::path::PathBuf { p } -/// A missing config file is refused at SafeFile::open with exit code 2. -/// The error is LoadingError::File(ValidationError::Open(ENOENT)), -/// which displays as "open failed: ". +/// A genuinely-absent config file is NOT an error: the helper falls back to the +/// built-in defaults (compiled into this root-owned binary, hence trusted). The +/// default `work_dir` is `/srv/hyper`. Needs root because `sys-test` then +/// acquires privileges to prove it can promote. #[test] -fn missing_config_exits_2() { +fn missing_config_falls_back_to_defaults_as_root() { + if !is_root() { + eprintln!("SKIP missing_config defaults: sys-test needs root"); + return; + } let tmp = tempfile::tempdir().unwrap(); let missing = tmp.path().join("nope.toml"); let out = run_with_config(&missing, &["sys-test"]); - assert_eq!(out.status.code(), Some(2), "missing config must exit 2"); - let err = String::from_utf8_lossy(&out.stderr); - assert!(err.contains("open failed"), "stderr: {err}"); + assert_eq!( + out.status.code(), + Some(0), + "absent config should use defaults; stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).expect("stdout is JSON"); + assert_eq!(json["sys_test"], "ok"); + assert_eq!(json["hyper_base"], "/srv/hyper"); } #[test] diff --git a/native/suidhelper/tests/util/safe_bin.rs b/native/suidhelper/tests/util/safe_bin.rs index 950e06b..e752243 100644 --- a/native/suidhelper/tests/util/safe_bin.rs +++ b/native/suidhelper/tests/util/safe_bin.rs @@ -1,5 +1,5 @@ -//! `SafeBin` is what stops `--bin` from pointing the helper at an -//! attacker-controlled binary it would then run as root. The constructor demands +//! `SafeBin` is what stops a configured path from pointing the helper at +//! an attacker-controlled binary it would then run as root. The constructor demands //! an absolute path, exact basename, a real (non-symlink) regular file owned by //! root and not group/other-writable. These assert the refusal axes; the symlink //! axis is root-independent, the owner axis is asserted both ways. From f340571089761f7ae686f630ac4bfbba03f1ed23 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 20:51:19 +0000 Subject: [PATCH 06/34] docs: Postgres quickstart, optional helper config, install caveats Document the Docker Postgres setup matching config.exs defaults, the now config-sourced (and optional) device-binary paths, and the sudo/tty caveat for mix suidhelper.install. --- docs/cookbook/intro.md | 72 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index 526c7e9..7da1909 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -24,23 +24,70 @@ The absolute best way to get started with `Hyper` is to play with it. Hyper needs a **PostgreSQL** server reachable from every node - it is the image database and the only stateful external dependency. +For local development the quickest path is Docker. The connection details below +match the defaults in `config/config.exs` (`Hyper.Img.Db.Repo`): + +```sh +docker run -d --name hyper-pg \ + -e POSTGRES_USER=postgres \ + -e POSTGRES_PASSWORD=postgres \ + -e POSTGRES_DB=hyper_dev \ + -p 5432:5432 \ + postgres:16 +``` + +Once it is up, create and migrate the schema (the repo is not in `ecto_repos`, +so pass it with `-r`): + +```sh +mix ecto.create -r Hyper.Img.Db.Repo +mix ecto.migrate -r Hyper.Img.Db.Repo +``` + +The container is ephemeral; `docker start hyper-pg` brings it back after a +reboot. To point Hyper at an existing server instead, override the +`Hyper.Img.Db.Repo` block in your `config.exs`. + #### System binaries -The following must be on each node's `PATH` (the bracketed override is the -`config :hyper` key you can set if the binary lives elsewhere): +These are used by the unprivileged node directly; each must be on the node's +`PATH` (the bracketed override is the `config :hyper` key you can set if the +binary lives elsewhere): - [`skopeo`](https://github.com/containers/skopeo) - pulls OCI images (`skopeo_path`) - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - provides `mke2fs`, which builds the ext4 rootfs (`mke2fs_path`) - - `losetup`, `blockdev` (from **util-linux**) - loop-device setup - (`losetup_path`, `blockdev_path`) - - `dmsetup` (from **lvm2** / device-mapper) - dm-snapshot and thin-pool - layering (`dmsetup_path`). Frequently *not* installed by default - check - this one first. - `du`, `getent` (from **coreutils** and **glibc**) - rootfs sizing and user resolution. Present on essentially every distro. +The privileged device binaries - `losetup`, `blockdev` (from **util-linux**) +and `dmsetup` (from **lvm2** / device-mapper) - are run only by the setuid +helper, never named by the unprivileged caller. Their paths therefore live in +the helper's own config, `/etc/hyper/config.toml`, and default to +`/usr/sbin/{losetup,blockdev,dmsetup}`. + +**The config file is optional.** If it is absent the helper uses the built-in +defaults below (and `work_dir = "/srv/hyper"`, matching the node's own +fallback). Create one only to override a default - and if you do, it must be +root-owned and not group/other-writable, or the helper refuses to start (a +present-but-untrusted file is treated as an attack signal, unlike a missing +one): + +```toml +# /etc/hyper/config.toml (root-owned, mode 0644) - every line optional +work_dir = "/srv/hyper" + +# Each must be an absolute path to a root-owned, non-world-writable binary; +# the helper validates this before it will exec the tool. +dmsetup = "/usr/sbin/dmsetup" +losetup = "/usr/sbin/losetup" +blockdev = "/usr/sbin/blockdev" +``` + +`dmsetup` (lvm2) is frequently *not* installed by default - check that one +first. + #### Kernel features The host kernel must provide: @@ -61,6 +108,17 @@ The host kernel must provide: Run `mix suidhelper.install`, which builds, stamps, and places it setuid-root on `PATH`. Every privileged operation (losetup, dmsetup, mknod, chroot jails) routes through it; the BEAM itself runs unprivileged. + + The final `sudo install` step runs without a controlling terminal (Mix + captures the nested `cargo` output), so on a typical `tty_tickets` sudo + setup it cannot prompt for a password. If it fails, the build has already + stamped the binary -- just run the copy yourself: + + ```sh + sudo install -o root -g root -m 4755 \ + native/suidhelper/target/release/hyper-suidhelper \ + /usr/local/bin/hyper-suidhelper + ``` - A **parent cgroup** named by `cgroup_parent` (default `hyper`) must exist under `/sys/fs/cgroup`; Hyper creates each VM's cgroup beneath it. - The host UID/GID range given by `uid_gid_range` must be free for Hyper to From 23a58830bb91aadf146f8199ad09bcf9d85b735a Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:12:00 +0000 Subject: [PATCH 07/34] docs: how to load device-mapper modules for the dm targets Spell out modprobe + dmsetup targets verify, the linux-modules-extra fallback for stripped cloud kernels, and persisting via modules-load.d - the missing_dm_targets boot failure points here. --- docs/cookbook/intro.md | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index 7da1909..57615f6 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -96,12 +96,35 @@ The host kernel must provide: the `uid_gid_range` configuration). - **cgroup v2** - the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only hosts are not supported. - - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` - load the - `dm_snapshot` and `dm_thin_pool` modules (`modprobe dm_snapshot - dm_thin_pool`). Hyper refuses to start its device helper without them. + - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` - from the + `dm_snapshot` (provides `snapshot`) and `dm_thin_pool` (provides `thin` and + `thin-pool`) modules. Hyper refuses to start without all three; on boot it + fails with `{:missing_dm_targets, [...]}` listing whichever are absent. - **loop devices** - the `loop` module, used to attach layer images as block devices. +Load the modules and confirm the targets are present: + +```sh +sudo modprobe dm_snapshot dm_thin_pool loop +sudo dmsetup targets # must list snapshot, thin, and thin-pool +``` + +If `modprobe` reports the module is missing, the running kernel lacks it - +minimal cloud images often strip device-mapper. On Debian/Ubuntu, install the +extra modules for the running kernel, then load them: + +```sh +sudo apt-get install -y linux-modules-extra-$(uname -r) +sudo modprobe dm_snapshot dm_thin_pool loop +``` + +Make the modules load on every boot: + +```sh +printf 'dm_snapshot\ndm_thin_pool\nloop\n' | sudo tee /etc/modules-load.d/hyper.conf +``` + #### Privileged setup - The **setuid-root device helper** (`hyper-suidhelper`) must be installed. From b0c0b5f882520510104fb5abf0d0dd8185d8aafb Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:25:57 +0000 Subject: [PATCH 08/34] docs: how to create the parent cgroup + delegate controllers mkdir under the cgroup-v2 hierarchy, enable cpu/memory in subtree_control (root-level fallback), and persist via systemd-tmpfiles since cgroupfs is memory-backed. The :missing_parent_cgroup boot failure points here. --- docs/cookbook/intro.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index 57615f6..99c8012 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -143,7 +143,31 @@ printf 'dm_snapshot\ndm_thin_pool\nloop\n' | sudo tee /etc/modules-load.d/hyper. /usr/local/bin/hyper-suidhelper ``` - A **parent cgroup** named by `cgroup_parent` (default `hyper`) must exist - under `/sys/fs/cgroup`; Hyper creates each VM's cgroup beneath it. + under the cgroup-v2 hierarchy; Hyper creates each VM's cgroup beneath it and + fails to boot with `:missing_parent_cgroup` if it is absent. Create it and + delegate the `cpu` and `memory` controllers so the per-VM cgroups can set + `cpu.max` / `memory.max`: + + ```sh + sudo mkdir -p /sys/fs/cgroup/hyper + echo '+cpu +memory' | sudo tee /sys/fs/cgroup/hyper/cgroup.subtree_control + ``` + + If that last write errors, the root hierarchy is not delegating those + controllers down yet - enable them there first, then retry the line above: + + ```sh + echo '+cpu +memory' | sudo tee /sys/fs/cgroup/cgroup.subtree_control + ``` + + The cgroup hierarchy is memory-backed, so `/sys/fs/cgroup/hyper` does **not** + survive a reboot. Re-create it each boot, or persist it with + `systemd-tmpfiles`: + + ```sh + echo 'd /sys/fs/cgroup/hyper 0755 root root -' \ + | sudo tee /etc/tmpfiles.d/hyper-cgroup.conf + ``` - The host UID/GID range given by `uid_gid_range` must be free for Hyper to allocate per-VM users from. From 5761a4ef82f4aa3a49c5f9d07e523229e5ef8caf Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:27:52 +0000 Subject: [PATCH 09/34] fix(node): start Layer before Budget.Supervisor Budget.Advertiser advertises on init, which calls Hyper.Node.Layer.active/0 and selects on Hyper.Node.Layer.Registry. That registry is owned by Hyper.Node.Layer, which was ordered after Budget.Supervisor - so boot crashed with `unknown registry: Hyper.Node.Layer.Registry`. Order Layer first. --- lib/hyper/node.ex | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 6282eaa..6892eef 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -49,9 +49,12 @@ defmodule Hyper.Node do def init(_opts) do children = [ Hyper.Node.Users, + # Layer owns Hyper.Node.Layer.Registry, which Budget.Advertiser queries + # (via Hyper.Node.Layer.active/0) as it advertises on init - so Layer must + # be up first. + Hyper.Node.Layer, Hyper.Node.Budget.Supervisor, {DynamicSupervisor, name: @vm_sup, strategy: :one_for_one}, - Hyper.Node.Layer, Hyper.Node.Img ] From 8dc61d35439af8c27c56348492897c664e067462 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:33:47 +0000 Subject: [PATCH 10/34] fix: quiet benign ThinPool port exits and keyless OTLP export ThinPool traps exits (for terminate/2 device teardown), so the normal close of each System.cmd port arrived as an unhandled {:EXIT, port, :normal} and logged as an error. Add a handle_info clause to ignore normal exits. Tracing defaulted to Honeycombs endpoint but only sent the auth header when HONEYCOMB_API_KEY was set, so keyless runs 401d on every batch. Export only when a key or a custom OTEL endpoint is configured; otherwise disable the exporter. --- config/runtime.exs | 32 +++++++++++++++++++++----------- lib/hyper/node/img/thin_pool.ex | 7 +++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/config/runtime.exs b/config/runtime.exs index a3b4fd0..c9ca89f 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -14,17 +14,27 @@ config :hyper, Hyper.Node.Config.Budget, # Where to send traces. Defaults to Honeycomb; override OTEL_EXPORTER_OTLP_* # to point at any OTLP/HTTP backend (Collector, Grafana, etc). if config_env() != :test do - endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT", "https://api.honeycomb.io") + custom_endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT") + api_key = System.get_env("HONEYCOMB_API_KEY") - headers = - case System.get_env("HONEYCOMB_API_KEY") do - nil -> [] - "" -> [] - key -> [{"x-honeycomb-team", key}] - end + cond do + api_key not in [nil, ""] -> + config :opentelemetry_exporter, + otlp_protocol: :http_protobuf, + otlp_endpoint: custom_endpoint || "https://api.honeycomb.io", + otlp_headers: [{"x-honeycomb-team", api_key}] - config :opentelemetry_exporter, - otlp_protocol: :http_protobuf, - otlp_endpoint: endpoint, - otlp_headers: headers + custom_endpoint not in [nil, ""] -> + # A custom OTLP backend (e.g. a local Collector) needs no Honeycomb key. + config :opentelemetry_exporter, + otlp_protocol: :http_protobuf, + otlp_endpoint: custom_endpoint, + otlp_headers: [] + + true -> + # No backend configured: exporting to the Honeycomb default with no key + # 401s on every batch. Stay silent instead (typical for local dev). Set + # HONEYCOMB_API_KEY or OTEL_EXPORTER_OTLP_ENDPOINT to enable tracing. + config :opentelemetry, traces_exporter: :none + end end diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex index f3ec3e1..9599da2 100644 --- a/lib/hyper/node/img/thin_pool.ex +++ b/lib/hyper/node/img/thin_pool.ex @@ -94,6 +94,13 @@ defmodule Hyper.Node.Img.ThinPool do {:reply, :ok, id_free(state, id)} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. An abnormal exit + # carries a non-`:normal` reason and falls through to the default handler. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do _ = SuidHelper.Dmsetup.remove(@pool_name) From 1320700f0d3d088828382a0dd9d40b6ace50a255 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:42:40 +0000 Subject: [PATCH 11/34] fix(thin_pool): reclaim a stale dm pool on init An unclean shutdown (SIGKILL, so terminate/2 never ran) leaves hyper-thinpool behind; the next boot failed with "device-mapper: create ioctl ... Device or resource busy". Best-effort remove any pre-existing pool before rebuilding, ahead of zero_metadata so a still-live pool is never corrupted. --- lib/hyper/node/img/thin_pool.ex | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex index 9599da2..2302df8 100644 --- a/lib/hyper/node/img/thin_pool.ex +++ b/lib/hyper/node/img/thin_pool.ex @@ -53,6 +53,7 @@ defmodule Hyper.Node.Img.ThinPool do with :ok <- File.mkdir_p(Hyper.Config.scratch_dir()), {:ok, meta} <- ensure_backing(@meta_file, ImgConfig.thin_pool_meta_size()), {:ok, data} <- ensure_backing(@data_file, ImgConfig.thin_pool_data_size()), + :ok <- reclaim_stale_pool(), :ok <- zero_metadata(meta), {:ok, meta_loop} <- SuidHelper.Losetup.attach_rw(meta), {:ok, data_loop} <- SuidHelper.Losetup.attach_rw(data), @@ -118,6 +119,17 @@ defmodule Hyper.Node.Img.ThinPool do @spec id_free(map(), non_neg_integer()) :: map() def id_free(%{freed: freed} = s, id), do: %{s | freed: [id | freed]} + # A run that did not shut down cleanly (e.g. SIGKILL, so `terminate/2` never + # ran) leaves the dm pool behind, and recreating it fails with "Device or + # resource busy". Best-effort remove any stale pool of our name so this boot + # can build a fresh one. Runs before `zero_metadata`, which would otherwise + # corrupt a still-live pool. + @spec reclaim_stale_pool() :: :ok + defp reclaim_stale_pool do + _ = SuidHelper.Dmsetup.remove(@pool_name) + :ok + end + # Create a sparse file of `size` if absent; reuse it if already present. @spec ensure_backing(String.t(), Information.t()) :: {:ok, Path.t()} | {:error, term()} defp ensure_backing(file, size) do From 6ef56d28ea96d95a4d41c7a230f2b64bafe8bb9d Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:50:14 +0000 Subject: [PATCH 12/34] fix(node): validate OCI loader tools at boot Hyper.Img.OciLoader.test_system/0 (checks skopeo/umoci/mke2fs) existed but was never called from Hyper.Node.test_system, so a missing skopeo only surfaced at image-load time. Wire it in after Umoci.ensure_installed so the node refuses to boot when an OCI loader tool is absent, like every other host requirement. --- lib/hyper/node.ex | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 6892eef..c9ab6ad 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -151,6 +151,7 @@ defmodule Hyper.Node do :ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(), :ok <- Hyper.Node.Vmlinux.test_system(), :ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(), + :ok <- Hyper.Img.OciLoader.test_system(), :ok <- Hyper.Node.Users.test_system(), :ok <- Hyper.Node.Layer.Repo.test_system(), :ok <- Hyper.SuidHelper.test_system(), From 06cc05bab14841dc843a14f02acc589594cdc800 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:55:07 +0000 Subject: [PATCH 13/34] deslop --- native/suidhelper/src/config.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 2c94ba0..42079a1 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -42,11 +42,6 @@ fn config_path() -> PathBuf { } /// Hyper's /etc/hyper/config.toml file format. -/// -/// The device-tool paths are read from here (never from the unprivileged -/// caller, which is why there is no `--bin` argument): the helper alone decides -/// which binary it escalates to run. Each defaults to its usual location and is -/// validated as a [`SafeBin`] before use. #[derive(Debug, Clone, Deserialize)] pub struct Config { work_dir: PathBuf, From c0685ae553d4a113aa19ebcde76df2923cbb1bcc Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 21:57:07 +0000 Subject: [PATCH 14/34] fix: ignore benign port exits in the remaining trap_exit servers Layer.Server, Img.Mutable and Img.Server all trap exits (for terminate/2 device teardown) and shell privileged commands through System.cmd, which links a transient port to the process. The port's normal close arrived as {:EXIT, port, :normal} with no matching handle_info clause - Layer.Server crash-looped on it while mounting a layer, which surfaced to create_vm as :no_capacity. Add the same ignore-clause already in ThinPool to each. --- lib/hyper/node/img/mutable.ex | 6 ++++++ lib/hyper/node/img/server.ex | 6 ++++++ lib/hyper/node/layer/server.ex | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index 386e36f..4dbab64 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -121,6 +121,12 @@ defmodule Hyper.Node.Img.Mutable do @impl true def handle_info(:idle_timeout, state), do: {:noreply, state} + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do # Destroy the thin volume, then release the image (its monitor on us also diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex index d51fe3b..de6579f 100644 --- a/lib/hyper/node/img/server.ex +++ b/lib/hyper/node/img/server.ex @@ -131,6 +131,12 @@ defmodule Hyper.Node.Img.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{dm_names: dm_names}) do # Remove top-down (a snapshot's origin is the device below it). Layers are diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex index 188e135..f717e15 100644 --- a/lib/hyper/node/layer/server.ex +++ b/lib/hyper/node/layer/server.ex @@ -122,6 +122,12 @@ defmodule Hyper.Node.Layer.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{blk_path: blk_path}) do case SuidHelper.Losetup.detach(blk_path) do From 7fa0480ce7ec495f5c2b14f6c9c376a496ba85fc Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 22:04:37 +0000 Subject: [PATCH 15/34] fix(fire_vmm): child_spec key must be :id, not :vm_id DynamicSupervisor.start_child rejected the FireVMM child spec with {:invalid_child_spec, ...} because the map used a :vm_id key where a supervisor child spec requires :id - so no VM could ever boot. Use :id. Add @spec child_spec(Opts.t()) :: Supervisor.child_spec(). child_spec/1 is a plain def (not a typed @callback), so without a spec dialyzer never compared the returned map against the child-spec contract and the typo passed the gate. With the spec, the bad key fails dialyzer as invalid_contract. --- lib/hyper/node/fire_vmm.ex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex index 7e5603b..5938862 100644 --- a/lib/hyper/node/fire_vmm.ex +++ b/lib/hyper/node/fire_vmm.ex @@ -50,11 +50,12 @@ defmodule Hyper.Node.FireVMM do Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id)) end + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(opts) do # Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by # the node-level DynamicSupervisor. %{ - vm_id: {__MODULE__, opts.vm_id}, + id: {__MODULE__, opts.vm_id}, start: {__MODULE__, :start_link, [opts]}, type: :supervisor, restart: :transient From 2ac546ad5f2e23f01c7adc93e6c2ea271ea53fb1 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 22:04:37 +0000 Subject: [PATCH 16/34] fix(scheduler): log why a candidate refused placement place/3 reduced every candidate failure into a blanket {:error, :no_capacity}, so a real boot error on the only candidate was indistinguishable from genuine lack of capacity. Log the actual refusal reason at each candidate. --- lib/hyper/cluster/scheduler.ex | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/hyper/cluster/scheduler.ex b/lib/hyper/cluster/scheduler.ex index 1412a0b..4877658 100644 --- a/lib/hyper/cluster/scheduler.ex +++ b/lib/hyper/cluster/scheduler.ex @@ -16,6 +16,8 @@ defmodule Hyper.Cluster.Scheduler do alias Hyper.Vm.Instance.Spec alias Unit.Information + require Logger + use OpenTelemetryDecorator @type layer_sizes :: [{Hyper.Layer.id(), Unit.Information.t()}] @@ -45,8 +47,15 @@ defmodule Hyper.Cluster.Scheduler do |> candidates(layers) |> Enum.reduce_while({:error, :no_capacity}, fn node, acc -> case attempt.(node) do - {:ok, result} -> {:halt, {:ok, {node, result}}} - {:error, _reason} -> {:cont, acc} + {:ok, result} -> + {:halt, {:ok, {node, result}}} + + {:error, reason} -> + # The candidate fit the snapshot but refused at confirmation time. + # Log the real reason: otherwise an actual boot failure on the only + # candidate is indistinguishable from genuine `:no_capacity`. + Logger.warning("scheduler: #{inspect(node)} refused placement: #{inspect(reason)}") + {:cont, acc} end end) end From 68c8d4b189ddd28463d4e6f563d829bd5cd23b56 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 22:25:29 +0000 Subject: [PATCH 17/34] feat(node): reclaim orphaned dm/loop devices at boot An unclean shutdown (SIGKILL / :erlang.halt, where terminate/2 never runs) leaves hyper dm devices and loop devices behind. The next boot then crashed in ThinPool.init with "device-mapper: create ioctl on hyper-thinpool: Device or resource busy", and a leaked thin volume held the pool open so the previous remove-the-pool-only reclaim could not clear it. Add read-only `dmsetup ls` and `losetup --list` ops to the suidhelper, and a Hyper.Node.Reclaim pass that runs once before any device GenServer starts: it removes every hyper-prefixed dm device leaf-first (retrying leftovers until a pass clears nothing new, so stacked snapshots and the pool-under-volume case resolve) then detaches loop devices backing files under the data dirs. Replaces ThinPool.init's pool-only reclaim. Best-effort; logs and continues. --- lib/hyper/node.ex | 10 +- lib/hyper/node/img/thin_pool.ex | 12 --- lib/hyper/node/reclaim.ex | 99 +++++++++++++++++++ lib/hyper/suid_helper/dmsetup.ex | 26 +++++ lib/hyper/suid_helper/losetup.ex | 25 +++++ native/suidhelper/src/tools/dmsetup/mod.rs | 9 ++ native/suidhelper/src/tools/losetup.rs | 15 +++ native/suidhelper/tests/e2e/argv.rs | 65 ++++++++++++ .../suid_helper/dmsetup_properties_test.exs | 13 +++ test/hyper/suid_helper/losetup_test.exs | 37 +++++++ 10 files changed, 297 insertions(+), 14 deletions(-) create mode 100644 lib/hyper/node/reclaim.ex create mode 100644 test/hyper/suid_helper/losetup_test.exs diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index c9ab6ad..eb3bb73 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -40,8 +40,14 @@ defmodule Hyper.Node do def start_link(opts \\ []) do case test_system() do - :ok -> Supervisor.start_link(__MODULE__, opts, name: __MODULE__) - {:error, reason} -> {:error, reason} + :ok -> + # Clear any dm/loop devices a previous unclean shutdown left behind, + # before the device-owning children start and collide with them. + :ok = Hyper.Node.Reclaim.run() + Supervisor.start_link(__MODULE__, opts, name: __MODULE__) + + {:error, reason} -> + {:error, reason} end end diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex index 2302df8..9599da2 100644 --- a/lib/hyper/node/img/thin_pool.ex +++ b/lib/hyper/node/img/thin_pool.ex @@ -53,7 +53,6 @@ defmodule Hyper.Node.Img.ThinPool do with :ok <- File.mkdir_p(Hyper.Config.scratch_dir()), {:ok, meta} <- ensure_backing(@meta_file, ImgConfig.thin_pool_meta_size()), {:ok, data} <- ensure_backing(@data_file, ImgConfig.thin_pool_data_size()), - :ok <- reclaim_stale_pool(), :ok <- zero_metadata(meta), {:ok, meta_loop} <- SuidHelper.Losetup.attach_rw(meta), {:ok, data_loop} <- SuidHelper.Losetup.attach_rw(data), @@ -119,17 +118,6 @@ defmodule Hyper.Node.Img.ThinPool do @spec id_free(map(), non_neg_integer()) :: map() def id_free(%{freed: freed} = s, id), do: %{s | freed: [id | freed]} - # A run that did not shut down cleanly (e.g. SIGKILL, so `terminate/2` never - # ran) leaves the dm pool behind, and recreating it fails with "Device or - # resource busy". Best-effort remove any stale pool of our name so this boot - # can build a fresh one. Runs before `zero_metadata`, which would otherwise - # corrupt a still-live pool. - @spec reclaim_stale_pool() :: :ok - defp reclaim_stale_pool do - _ = SuidHelper.Dmsetup.remove(@pool_name) - :ok - end - # Create a sparse file of `size` if absent; reuse it if already present. @spec ensure_backing(String.t(), Information.t()) :: {:ok, Path.t()} | {:error, term()} defp ensure_backing(file, size) do diff --git a/lib/hyper/node/reclaim.ex b/lib/hyper/node/reclaim.ex new file mode 100644 index 0000000..f9513f8 --- /dev/null +++ b/lib/hyper/node/reclaim.ex @@ -0,0 +1,99 @@ +defmodule Hyper.Node.Reclaim do + @moduledoc """ + Boot-time reclamation of device-mapper and loop devices orphaned by an unclean + shutdown (SIGKILL or `:erlang.halt`, where the owning GenServers' `terminate/2` + never ran to tear them down). + + Hyper names every dm device it creates with a `hyper-` prefix (`hyper-thinpool`, + `hyper-rw-`, `hyper-img--`), so this removes exactly those - never an + operator's unrelated dm devices. Removal is leaf-first: a device still open by + another (the pool under a thin volume, a snapshot under the next in its chain) + refuses until its dependents are gone, so leftovers are retried until a pass + removes nothing new. Loop devices backing files under Hyper's data dirs are then + detached (the dm devices that held them are gone by that point). + + Entirely best-effort: every failure is logged and boot continues. It runs once, + before any device-owning GenServer starts, so the freshly-booting node never + collides with its own previous instance's leftovers. + """ + + alias Hyper.SuidHelper.{Dmsetup, Losetup} + + require Logger + + @dm_prefix "hyper-" + + @spec run() :: :ok + def run do + reclaim_dm() + reclaim_loops() + :ok + end + + defp reclaim_dm do + case Dmsetup.list() do + {:ok, names} -> + case Enum.filter(names, &String.starts_with?(&1, @dm_prefix)) do + [] -> + :ok + + stale -> + Logger.warning( + "reclaim: removing #{length(stale)} stale dm device(s): #{inspect(stale)}" + ) + + remove_dm(stale) + end + + {:error, reason} -> + Logger.warning("reclaim: could not list dm devices: #{inspect(reason)}") + end + end + + @spec remove_dm([String.t()]) :: :ok + defp remove_dm([]), do: :ok + + defp remove_dm(names) do + {failed, removed_any?} = + Enum.reduce(names, {[], false}, fn name, {failed, any?} -> + case Dmsetup.remove(name) do + :ok -> {failed, true} + {:error, _} -> {[name | failed], any?} + end + end) + + cond do + failed == [] -> :ok + # A pass made progress: a retry may now clear the devices that were still + # held by the ones just removed. + removed_any? -> remove_dm(failed) + true -> Logger.error("reclaim: could not remove dm devices: #{inspect(failed)}") + end + end + + defp reclaim_loops do + case Losetup.list() do + {:ok, pairs} -> + for {dev, backing} <- pairs, under_data_dirs?(backing) do + case Losetup.detach(dev) do + :ok -> + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not detach #{dev} (#{backing}): #{inspect(reason)}") + end + end + + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not list loop devices: #{inspect(reason)}") + end + end + + @spec under_data_dirs?(Path.t()) :: boolean() + defp under_data_dirs?(path) do + String.starts_with?(path, Hyper.Config.scratch_dir() <> "/") or + String.starts_with?(path, Hyper.Config.layer_dir() <> "/") + end +end diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex index e32ec84..3c57c6b 100644 --- a/lib/hyper/suid_helper/dmsetup.ex +++ b/lib/hyper/suid_helper/dmsetup.ex @@ -107,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do |> MapSet.new() end + @doc "Names of every device-mapper device currently present on this host." + @spec list() :: {:ok, [String.t()]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Dmsetup.list") + def list do + case SuidHelper.exec(["dmsetup", "ls"]) do + {:ok, %{"output" => out}} -> {:ok, parse_names(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_names(String.t()) :: [String.t()] + def parse_names(out) do + case String.trim(out) do + # `dmsetup ls` prints this sentinel (not a device row) when there are none. + "No devices found" -> + [] + + _ -> + out + |> String.split("\n", trim: true) + |> Enum.map(&(&1 |> String.split() |> List.first())) + |> Enum.reject(&is_nil/1) + end + end + @doc false @spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t() def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex index 405c5ca..3abf4fb 100644 --- a/lib/hyper/suid_helper/losetup.ex +++ b/lib/hyper/suid_helper/losetup.ex @@ -36,4 +36,29 @@ defmodule Hyper.SuidHelper.Losetup do {:error, _} = err -> err end end + + @doc "Currently-attached loop devices as `{device, backing_file}` pairs." + @spec list() :: {:ok, [{Path.t(), Path.t()}]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Losetup.list") + def list do + case SuidHelper.exec(["losetup", "list"]) do + {:ok, %{"output" => out}} -> {:ok, parse_list(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_list(String.t()) :: [{Path.t(), Path.t()}] + def parse_list(out) do + out + |> String.split("\n", trim: true) + |> Enum.flat_map(fn line -> + # `NAME BACK-FILE` rows; a loop with no backing file has only one column + # (nothing for us to reclaim by file), so skip it. + case String.split(line, " ", parts: 2, trim: true) do + [dev, backing] -> [{dev, String.trim(backing)}] + _ -> [] + end + end) + end end diff --git a/native/suidhelper/src/tools/dmsetup/mod.rs b/native/suidhelper/src/tools/dmsetup/mod.rs index ca40561..d77674d 100644 --- a/native/suidhelper/src/tools/dmsetup/mod.rs +++ b/native/suidhelper/src/tools/dmsetup/mod.rs @@ -55,6 +55,8 @@ enum DmOp { /// List the target types the kernel device-mapper exposes. Read-only, but /// still needs root: it opens `/dev/mapper/control`. Targets, + /// List the names of existing dm devices (for stale-device reclaim). + Ls, } #[derive(Serialize)] @@ -64,6 +66,7 @@ pub enum DmsetupOut { Removed, Messaged, Targets { output: String }, + Listed { output: String }, } pub struct Dmsetup { @@ -112,6 +115,9 @@ impl IsTool for Dmsetup { DmOp::Targets => { cmd.arg("targets"); } + DmOp::Ls => { + cmd.arg("ls"); + } } cmd.env_clear().output() @@ -134,6 +140,9 @@ impl IsTool for Dmsetup { DmOp::Targets => DmsetupOut::Targets { output: String::from_utf8_lossy(&out.stdout).into_owned(), }, + DmOp::Ls => DmsetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/losetup.rs b/native/suidhelper/src/tools/losetup.rs index fa8a49d..aadb5b6 100644 --- a/native/suidhelper/src/tools/losetup.rs +++ b/native/suidhelper/src/tools/losetup.rs @@ -66,6 +66,8 @@ enum LosetupOp { Attach(AttachArgs), /// Detach a loop device. Detach { dev: LoopDev }, + /// List loop devices as `NAME BACK-FILE` rows (for stale-device reclaim). + List, } #[derive(Serialize)] @@ -73,6 +75,7 @@ enum LosetupOp { pub enum LosetupOut { Attached { device: PathBuf }, Detached, + Listed { output: String }, } pub struct Losetup { @@ -101,6 +104,15 @@ impl IsTool for Losetup { let dev: &Path = dev.as_ref(); cmd.arg("-d").arg(dev); } + LosetupOp::List => { + cmd.args([ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE", + ]); + } } cmd.env_clear().output() @@ -119,6 +131,9 @@ impl IsTool for Losetup { device: PathBuf::from(String::from_utf8_lossy(&out.stdout).trim()), }, LosetupOp::Detach { .. } => LosetupOut::Detached, + LosetupOp::List => LosetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/tests/e2e/argv.rs b/native/suidhelper/tests/e2e/argv.rs index 519d664..dcda8db 100644 --- a/native/suidhelper/tests/e2e/argv.rs +++ b/native/suidhelper/tests/e2e/argv.rs @@ -174,6 +174,71 @@ fn dmsetup_targets_argv_and_parse_as_root() { assert_eq!(json["output"], "snapshot v1.16.0\n"); } +#[test] +fn dmsetup_ls_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_ls: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake(tmp.path(), "dmsetup", &rec, "hyper-thinpool\\nhyper-rw-abc"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "ls"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["ls"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!(json["output"], "hyper-thinpool\nhyper-rw-abc\n"); +} + +#[test] +fn losetup_list_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP losetup_list: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake( + tmp.path(), + "losetup", + &rec, + "/dev/loop0 /srv/hyper/scratch/thinpool.meta", + ); + let cfg = write_root_config(tmp.path(), &[("losetup", &bin)]); + + let out = run(&cfg, &["losetup", "list"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!( + recorded_argv(&rec), + vec![ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE" + ] + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!( + json["output"], + "/dev/loop0 /srv/hyper/scratch/thinpool.meta\n" + ); +} + #[test] fn dmsetup_rejects_configured_bin_with_wrong_basename_as_root() { if !is_root() { diff --git a/test/hyper/suid_helper/dmsetup_properties_test.exs b/test/hyper/suid_helper/dmsetup_properties_test.exs index 73858fc..a7e49d8 100644 --- a/test/hyper/suid_helper/dmsetup_properties_test.exs +++ b/test/hyper/suid_helper/dmsetup_properties_test.exs @@ -73,4 +73,17 @@ defmodule Hyper.SuidHelper.DmsetupPropertiesTest do assert Dmsetup.parse_targets(out) == MapSet.new(targets) end end + + property "parse_names recovers the device name from each `dmsetup ls` row" do + check all(names <- uniq_list_of(dev(), min_length: 1, max_length: 6)) do + # `dmsetup ls` rows are "\t(major:minor)"; order is preserved. + out = Enum.map_join(names, "\n", fn n -> "#{n}\t(254:0)" end) + assert Dmsetup.parse_names(out) == names + end + end + + test "parse_names reads the empty-table sentinel as no devices" do + assert Dmsetup.parse_names("No devices found\n") == [] + assert Dmsetup.parse_names("") == [] + end end diff --git a/test/hyper/suid_helper/losetup_test.exs b/test/hyper/suid_helper/losetup_test.exs new file mode 100644 index 0000000..4cdc6f0 --- /dev/null +++ b/test/hyper/suid_helper/losetup_test.exs @@ -0,0 +1,37 @@ +defmodule Hyper.SuidHelper.LosetupTest do + @moduledoc """ + `parse_list/1` turns `losetup --list --output NAME,BACK-FILE` rows into + `{device, backing_file}` pairs. The reclaim pass relies on it to recognise loop + devices backing Hyper's files, so the edges that matter are: a loop with no + backing file (skipped, nothing to reclaim by file) and a `(deleted)` backing + suffix (kept, so the data-dir prefix still matches). + """ + use ExUnit.Case, async: true + + alias Hyper.SuidHelper.Losetup + + test "pairs device with backing file and skips rows that have no backing file" do + out = """ + /dev/loop0 /srv/hyper/scratch/thinpool.meta + /dev/loop1 /srv/hyper/layers/blob + /dev/loop2 + """ + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.meta"}, + {"/dev/loop1", "/srv/hyper/layers/blob"} + ] + end + + test "keeps a `(deleted)` backing suffix so data-dir prefix matching still works" do + out = "/dev/loop0 /srv/hyper/scratch/thinpool.data (deleted)\n" + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.data (deleted)"} + ] + end + + test "an empty listing yields no pairs" do + assert Losetup.parse_list("") == [] + end +end From b6ce604a0e2231a4e04fd50b665c03287983fa1b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 23:07:40 +0000 Subject: [PATCH 18/34] feat(suidhelper): add firecracker/jailer/uid_gid_range to config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the suidhelper config with four new fields needed before the jailer exec path can be wired up: - firecracker / jailer: Option — no built-in default; accessors return BinError::Unconfigured when absent so callers get a clear "operator must configure this" signal rather than a path-validation error. - parent_cgroup: String — defaults to "hyper", matching Elixir's @parent_cgroup (operators narrowing must keep both in sync). - uid_gid_range: Option — total accessor returning (900_000, 999_999) when absent; a present range with min==0 or min>max is fatal at safe_load time, consistent with the "present-but-untrusted is fatal" trust model. Adds BinError (Unconfigured | Bin) so firecracker/jailer accessors have a richer error type than the existing tool accessors (which can never be Unconfigured). Adds validate_uid_gid_range as a public pure function so the refusal contract can be property-tested without touching the file system. New test target config_uid_gid_range exercises: absent → default, valid range round-trips via TOML, min==0 always rejected, min>max always rejected. Extends e2e/config with: Unconfigured on absent keys, basename mismatch rejected, Ok on root-owned correct-name binaries (root-guarded), bad uid_gid_range binary exit 2 (root-guarded). --- native/suidhelper/Cargo.toml | 5 + native/suidhelper/src/config.rs | 108 +++++++++++++++++- .../suidhelper/tests/config_uid_gid_range.rs | 59 ++++++++++ native/suidhelper/tests/e2e/config.rs | 94 +++++++++++++++ 4 files changed, 263 insertions(+), 3 deletions(-) create mode 100644 native/suidhelper/tests/config_uid_gid_range.rs diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index f5b4b1b..550ba2d 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -72,6 +72,10 @@ path = "tests/e2e/argv.rs" name = "e2e_chroot_jail" path = "tests/e2e/chroot_jail.rs" +[[test]] +name = "config_uid_gid_range" +path = "tests/config_uid_gid_range.rs" + [dependencies] clap = { version = "4", features = ["derive"] } hyper-suidhelper-meta = { path = "meta" } @@ -84,6 +88,7 @@ toml = { version = "0.8", default-features = false, features = ["parse"] } [dev-dependencies] proptest = "1" tempfile = "3" +toml = { version = "0.8", default-features = false, features = ["parse"] } [profile.release] strip = true diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 42079a1..76a7f22 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -1,5 +1,12 @@ // SPDX-License-Identifier: AGPL-3.0-only //! Runtime host configuration, read from a single root-owned TOML file. +//! +//! ## UID/GID range divergence +//! +//! Elixir keeps `compile_env` default `{900_000, 999_999}` that governs which +//! UIDs the node hands *out*; this helper reads `[uid_gid_range]` from +//! config.toml to decide which UIDs it *accepts* (default `{900_000, 999_999}` +//! when the key is absent). Operators narrowing the range must set **both**. use crate::util::safe_bin::{self, SafeBin}; use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile}; @@ -22,11 +29,48 @@ pub enum LoadingError { Malformed(PathBuf), #[error("work_dir in {0:?} must be an absolute path")] Relative(PathBuf), + #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")] + BadUidGidRange { min: u32, max: u32 }, +} + +/// Error returned by config accessors for tool binaries derived from config. +#[derive(Debug, Error)] +pub enum BinError { + #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")] + Unconfigured(&'static str), + #[error(transparent)] + Bin(#[from] safe_bin::Error), } const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml"; const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH"; +/// UID/GID allocation band, read from `[uid_gid_range]` in config.toml. +/// Controls which UIDs the helper *accepts* from the BEAM — see module docs. +#[derive(Debug, Clone, Copy, Deserialize)] +pub struct UidGidRange { + pub min: u32, + pub max: u32, +} + +// Band defaults match Elixir's `compile_env` allocation defaults so that an +// unconfigured helper and an unconfigured node agree out of the box. +const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999); + +/// Validate a uid_gid_range value. A present range where min==0 or min>max is +/// treated as a config trust violation — fatal at load, consistent with the +/// "present but untrusted" model. Exposed so tests can verify the contract +/// without touching the file system. +pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> { + if r.min == 0 || r.min > r.max { + return Err(LoadingError::BadUidGidRange { + min: r.min, + max: r.max, + }); + } + Ok(()) +} + /// The config file path. In production this is the fixed `/etc/hyper/config.toml`. /// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the /// secure arm is always the hardcoded path, so a release build cannot be steered. @@ -51,6 +95,14 @@ pub struct Config { losetup: PathBuf, #[serde(default = "default_blockdev")] blockdev: PathBuf, + #[serde(default)] + firecracker: Option, + #[serde(default)] + jailer: Option, + #[serde(default = "default_parent_cgroup")] + parent_cgroup: String, + #[serde(default)] + uid_gid_range: Option, } // The default data root. Must match the Elixir node's `@dev_work_dir`, which it @@ -72,6 +124,11 @@ fn default_blockdev() -> PathBuf { PathBuf::from("/usr/sbin/blockdev") } +fn default_parent_cgroup() -> String { + // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync. + "hyper".into() +} + impl Default for Config { fn default() -> Self { Self { @@ -79,6 +136,10 @@ impl Default for Config { dmsetup: default_dmsetup(), losetup: default_losetup(), blockdev: default_blockdev(), + firecracker: None, + jailer: None, + parent_cgroup: default_parent_cgroup(), + uid_gid_range: None, } } } @@ -86,7 +147,7 @@ impl Default for Config { impl Config { /// The process-wide config, loaded once (and forced unprivileged by /// [`Config::init`]). An absent file yields the built-in defaults; a - /// *present but untrusted* file (wrong owner/mode, malformed) is fatal - + /// *present but untrusted* file (wrong owner/mode, malformed) is fatal — /// the helper prints the error and exits rather than trusting it. pub fn get() -> &'static Config { LazyLock::force(&CONFIG) @@ -95,7 +156,7 @@ impl Config { /// Force the config to load now. Call this once at the very start of `main`, /// after privileges have already been dropped (the `.preinit_array` entry in /// `setuid_privileged` runs before `main`), so the file is never first read - /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read + /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read /// as the real uid, not as root. pub fn init() { let _ = Self::get(); @@ -126,6 +187,42 @@ impl Config { SafeBin::from_path(&self.blockdev) } + /// The Firecracker VMM binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set this key before any VM can be launched. + pub fn firecracker(&self) -> Result, BinError> { + self.firecracker + .as_deref() + .ok_or(BinError::Unconfigured("firecracker")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The Firecracker jailer binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set this key before any VM can be launched. + pub fn jailer(&self) -> Result, BinError> { + self.jailer + .as_deref() + .ok_or(BinError::Unconfigured("jailer")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The jailer `--parent-cgroup` value. Defaults to `"hyper"`, matching the + /// Elixir node's `@parent_cgroup`. + pub fn parent_cgroup(&self) -> &str { + &self.parent_cgroup + } + + /// The UID/GID band the helper accepts from the BEAM. Defaults to + /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults). + /// A present range with min==0 or min>max is rejected at load time by + /// [`Config::safe_load`], so this accessor is always total. + pub fn uid_gid_range(&self) -> (u32, u32) { + self.uid_gid_range + .map(|r| (r.min, r.max)) + .unwrap_or(DEFAULT_UID_GID) + } + /// Read, ownership-check, parse, and validate the config file. See the module /// docs for the trust model. pub fn safe_load() -> Result { @@ -138,7 +235,7 @@ impl Config { Ok(file) => file, // A genuinely-absent file means "use the built-in defaults": those // are compiled into this root-owned binary, so they are trusted. Any - // OTHER failure - a present but wrong-owner/mode file, an I/O error - + // OTHER failure — a present but wrong-owner/mode file, an I/O error — // stays fatal, because it is a signal (someone put an untrusted file // there), not an absence. Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => { @@ -155,6 +252,11 @@ impl Config { if !config.work_dir.is_absolute() { return Err(LoadingError::Relative(path)); } + + if let Some(r) = &config.uid_gid_range { + validate_uid_gid_range(r)?; + } + Ok(config) } } diff --git a/native/suidhelper/tests/config_uid_gid_range.rs b/native/suidhelper/tests/config_uid_gid_range.rs new file mode 100644 index 0000000..1f176e9 --- /dev/null +++ b/native/suidhelper/tests/config_uid_gid_range.rs @@ -0,0 +1,59 @@ +//! Properties of the `uid_gid_range` configuration field. +//! +//! Contracts under test: +//! - A valid range (min >= 1, min <= max) is always accepted. +//! - Absent range yields the built-in default (900_000, 999_999). +//! - A valid range round-trips through TOML deserialization + uid_gid_range(). +//! - min == 0 is always rejected (uid 0 means root; the jailer must never +//! receive it — it skips its privilege drop when uid == 0). +//! - min > max is always rejected (incoherent range; likely a config typo). + +use hyper_suidhelper::config::{validate_uid_gid_range, Config, LoadingError, UidGidRange}; +use proptest::prelude::*; + +#[test] +fn absent_range_yields_default() { + assert_eq!(Config::default().uid_gid_range(), (900_000, 999_999)); +} + +proptest! { + #[test] + fn valid_range_accepted(min in 1u32.., delta in 0u32..) { + // max = min + delta, saturating so it never wraps past u32::MAX. + let max = min.saturating_add(delta); + let r = UidGidRange { min, max }; + prop_assert!(validate_uid_gid_range(&r).is_ok()); + } + + #[test] + fn valid_range_round_trips_via_toml(min in 1u32.., delta in 0u32..) { + let max = min.saturating_add(delta); + let body = format!( + "work_dir = \"/srv/hyper\"\n[uid_gid_range]\nmin = {min}\nmax = {max}\n" + ); + let config: Config = toml::from_str(&body).expect("valid TOML"); + prop_assert_eq!(config.uid_gid_range(), (min, max)); + } + + #[test] + fn zero_min_always_rejected(max in 0u32..) { + let r = UidGidRange { min: 0, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { min: 0, .. }) + ); + prop_assert!(rejected); + } + + #[test] + fn min_exceeds_max_always_rejected(max in 0u32..u32::MAX) { + // min = max + 1 is always strictly greater than max and always >= 1. + let min = max + 1; + let r = UidGidRange { min, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { .. }) + ); + prop_assert!(rejected); + } +} diff --git a/native/suidhelper/tests/e2e/config.rs b/native/suidhelper/tests/e2e/config.rs index 100a686..ef57c97 100644 --- a/native/suidhelper/tests/e2e/config.rs +++ b/native/suidhelper/tests/e2e/config.rs @@ -3,6 +3,8 @@ //! tempfile instead of /etc/hyper, so tests never touch the real host. #![cfg(feature = "insecure_test_seams")] +use hyper_suidhelper::config::{BinError, Config}; +use hyper_suidhelper::util::safe_bin; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -126,3 +128,95 @@ fn valid_config_and_setuid_yields_sys_test_ok_as_root() { assert_eq!(json["sys_test"], "ok"); assert_eq!(json["hyper_base"], "/srv/hyper"); } + +#[test] +fn firecracker_unconfigured_when_absent() { + // Config::default() has firecracker == None; the accessor must signal this + // distinctly so callers can report a missing-configuration error rather than + // a safe_bin validation error. + let err = Config::default() + .firecracker() + .expect_err("absent firecracker must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("firecracker")), + "expected Unconfigured(\"firecracker\"), got {err:?}", + ); +} + +#[test] +fn jailer_unconfigured_when_absent() { + let err = Config::default() + .jailer() + .expect_err("absent jailer must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("jailer")), + "expected Unconfigured(\"jailer\"), got {err:?}", + ); +} + +#[test] +fn jailer_basename_mismatch_rejected() { + // The basename check in SafeBin::from_path precedes the stat, so we do not + // need a real file — any absolute path with the wrong leaf name is enough. + let body = "work_dir = \"/srv/hyper\"\njailer = \"/usr/local/bin/not-jailer\"\n"; + let config: Config = toml::from_str(body).unwrap(); + let err = config + .jailer() + .expect_err("wrong-basename jailer path must be rejected"); + assert!( + matches!(err, BinError::Bin(safe_bin::Error::Name { .. })), + "expected a Name error, got {err:?}", + ); +} + +#[test] +fn firecracker_and_jailer_return_ok_when_root_owned_as_root() { + if !is_root() { + eprintln!("SKIP firecracker_jailer_configured: needs root to create root-owned binaries"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let fc = tmp.path().join("firecracker"); + let jr = tmp.path().join("jailer"); + // 0o755: root-owned, not group/other-writable — satisfies SafeBin's checks. + for p in [&fc, &jr] { + fs::write(p, b"#!/bin/sh\n").unwrap(); + fs::set_permissions(p, fs::Permissions::from_mode(0o755)).unwrap(); + } + let body = format!( + "work_dir = \"/srv/hyper\"\nfirecracker = \"{}\"\njailer = \"{}\"\n", + fc.display(), + jr.display(), + ); + let config: Config = toml::from_str(&body).unwrap(); + assert!( + config.firecracker().is_ok(), + "root-owned firecracker with correct basename must be accepted" + ); + assert!( + config.jailer().is_ok(), + "root-owned jailer with correct basename must be accepted" + ); +} + +#[test] +fn bad_uid_gid_range_exits_2_as_root() { + if !is_root() { + eprintln!("SKIP bad_uid_gid_range: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // min = 0 is the clearest violation: uid 0 is root, which the jailer must + // never receive because it skips its privilege drop when uid == 0. + let p = write_root_config( + tmp.path(), + "work_dir = \"/srv/hyper\"\n[uid_gid_range]\nmin = 0\nmax = 100\n", + ); + let out = run_with_config(&p, &["sys-test"]); + assert_eq!(out.status.code(), Some(2)); + let err = String::from_utf8_lossy(&out.stderr); + assert!( + err.contains("uid_gid_range"), + "expected a uid_gid_range error in stderr, got: {err}", + ); +} From 8f671a34906f32a1990d0ea1bb46c794c541415b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 23:22:41 +0000 Subject: [PATCH 19/34] feat(suidhelper): add jailer subcommand that execs the jailer as root --- native/suidhelper/Cargo.toml | 10 +- native/suidhelper/src/main.rs | 13 +- native/suidhelper/src/tools/jailer.rs | 357 ++++++++++++++++++ native/suidhelper/src/tools/mod.rs | 1 + .../suidhelper/src/util/setuid_privileged.rs | 39 +- native/suidhelper/tests/e2e/jailer.rs | 193 ++++++++++ native/suidhelper/tests/tools/jailer.rs | 162 ++++++++ 7 files changed, 770 insertions(+), 5 deletions(-) create mode 100644 native/suidhelper/src/tools/jailer.rs create mode 100644 native/suidhelper/tests/e2e/jailer.rs create mode 100644 native/suidhelper/tests/tools/jailer.rs diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index 550ba2d..9b2b6cb 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs" name = "tools_dmsetup_parsers" path = "tests/tools/dmsetup_parsers.rs" +[[test]] +name = "tools_jailer" +path = "tests/tools/jailer.rs" + [[test]] name = "util_confinement" path = "tests/util/confinement.rs" @@ -68,6 +72,10 @@ path = "tests/e2e/config.rs" name = "e2e_argv" path = "tests/e2e/argv.rs" +[[test]] +name = "e2e_jailer" +path = "tests/e2e/jailer.rs" + [[test]] name = "e2e_chroot_jail" path = "tests/e2e/chroot_jail.rs" @@ -79,7 +87,7 @@ path = "tests/config_uid_gid_range.rs" [dependencies] clap = { version = "4", features = ["derive"] } hyper-suidhelper-meta = { path = "meta" } -nix = { version = "0.29", features = ["user", "fs", "dir"] } +nix = { version = "0.29", features = ["user", "fs", "dir", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "1" diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs index 617fd6d..665370b 100644 --- a/native/suidhelper/src/main.rs +++ b/native/suidhelper/src/main.rs @@ -15,6 +15,7 @@ use clap::{Parser, Subcommand}; use hyper_suidhelper::config; +use hyper_suidhelper::tools::jailer::{self, JailerArgs}; use hyper_suidhelper::tools::Tool; use hyper_suidhelper::util::setuid_privileged::{self, Privileged}; use serde::Serialize; @@ -37,6 +38,9 @@ enum Command { Tool(Tool), /// Check the helper is correctly installed (can promote to root). SysTest, + /// Validate the caller's args, become root, and `execve` the firecracker + /// jailer in our place. Prints nothing on success (the image is replaced). + Jailer(JailerArgs), /// Print the build version and BLAKE3 checksum of this binary. Version, } @@ -101,12 +105,19 @@ fn main() { config::Config::init(); // Each command yields a serializable value (errors stringified to unify); we - // render the final JSON line here. + // render the final JSON line here. The jailer is the exception: it `execve`s + // in place, so on success it never returns and never emits JSON, and on + // failure it reports to stderr and exits before reaching the JSON pipeline. let output = match command { Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()), Command::SysTest => SysTest::perform() .map(Output::SysTest) .map_err(|e| e.to_string()), + Command::Jailer(args) => { + let e = jailer::run(args).expect_err("jailer::run only returns on error"); + eprintln!("{e}"); + std::process::exit(2); + } Command::Version => unreachable!("handled above"), }; diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs new file mode 100644 index 0000000..d816de1 --- /dev/null +++ b/native/suidhelper/src/tools/jailer.rs @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! The `jailer` subcommand: validate the BEAM's arguments, re-acquire root +//! permanently, and `execve` the firecracker jailer in our place. +//! +//! Unlike the device tools this is **not** an [`crate::tools::IsTool`]: it does +//! not run a child and parse JSON, it *becomes* the jailer via `execve`, so the +//! unprivileged BEAM's MuonTrap port keeps supervising the resulting process +//! across the image replacement. There is no output and no return on success. +//! +//! Threat model: the BEAM is untrusted. It supplies only `--id`, `--uid`, +//! `--gid`, repeated `--cgroup KEY=VALUE`, and `--api-sock`. Every privileged +//! path (the jailer binary, the firecracker `--exec-file`, the chroot base, the +//! parent cgroup) comes from the root-owned config, never the caller. The +//! refusal contracts on the newtypes below are the security core: a compromised +//! BEAM must not be able to name a privileged path, request uid 0, traverse out +//! of the chroot base, inject a flag, or smuggle an environment/fd into root. +//! +//! ## Validator laws (property-tested in `tests/tools/jailer.rs`) +//! - [`validate_id_number`] accepts iff `n != 0 && lo <= n <= hi`; 0 is rejected +//! for *every* range (uid 0 makes the jailer skip its privilege drop). +//! - [`VmId`] round-trips exactly the allowed charset/length and rejects any +//! separator, dot, NUL, whitespace, leading dash, empty, or over-long input. +//! - [`CgroupSetting`] re-renders a valid pair to its canonical `key=value` and +//! rejects unknown keys and values outside the per-key grammar. +//! - [`JailSock`] accepts exactly `/` + one filename and rejects multi-component, +//! relative, `..`, and NUL/whitespace inputs. + +use crate::config::{BinError, Config}; +use crate::util::setuid_privileged; +use clap::Args; +use nix::errno::Errno; +use std::ffi::CString; +use std::fmt; +use std::os::unix::ffi::OsStrExt; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use thiserror::Error as ThisError; + +/// The jailer protects at most a handful of controllers; an unbounded `--cgroup` +/// list is a caller trying something. Cap it well above any legitimate need. +const MAX_CGROUPS: usize = 16; + +/// `sun_path` in `sockaddr_un` is 108 bytes on Linux; a socket path longer than +/// that can never be bound, so reject it up front. +const MAX_SOCK_LEN: usize = 108; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("invalid --id {0:?}: must be 1..=64 chars of [A-Za-z0-9_-], not starting with '-'")] + VmId(String), + #[error("invalid --cgroup {0:?}: unknown key or value outside its grammar")] + Cgroup(String), + #[error("invalid --api-sock {0:?}: must be / with name in [A-Za-z0-9_.-]")] + Sock(String), + #[error("--uid/--gid {value} is zero or outside the configured range [{lo}, {hi}]")] + IdNumber { value: u32, lo: u32, hi: u32 }, + #[error("too many --cgroup settings: {0} (max {MAX_CGROUPS})")] + TooManyCgroups(usize), + #[error(transparent)] + Bin(#[from] BinError), + #[error(transparent)] + Privilege(#[from] setuid_privileged::Error), + #[error("argument contains an interior NUL byte")] + NulArgument, + #[error("execve {path:?} failed: {errno}")] + Exec { path: PathBuf, errno: Errno }, +} + +/// `n != 0 && lo <= n <= hi`. uid/gid 0 is rejected unconditionally: a jailer run +/// with uid 0 skips its privilege drop and leaves firecracker running as root. +pub fn validate_id_number(n: u32, range: (u32, u32)) -> Result { + let (lo, hi) = range; + if n != 0 && lo <= n && n <= hi { + Ok(n) + } else { + Err(Error::IdNumber { value: n, lo, hi }) + } +} + +/// A VM id used as a chroot subdirectory name: `[A-Za-z0-9_-]`, length `1..=64`, +/// first character not `-` (so it can never be read as a flag). No `/`, `.`, NUL, +/// or whitespace can appear, so it can never traverse out of the chroot base. +#[derive(Debug, Clone)] +pub struct VmId(String); + +impl FromStr for VmId { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::VmId(s.to_string()); + let bytes = s.as_bytes(); + if !(1..=64).contains(&bytes.len()) { + return Err(reject()); + } + if bytes[0] == b'-' { + return Err(reject()); + } + if !bytes + .iter() + .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for VmId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +/// `1..=20` ASCII digits — bounds a numeric cgroup limit without pulling in a +/// regex engine. The upper bound comfortably exceeds `u64::MAX`'s 20 digits. +fn is_digits_1_20(s: &str) -> bool { + !s.is_empty() && s.len() <= 20 && s.bytes().all(|b| b.is_ascii_digit()) +} + +/// A single `KEY=VALUE` cgroup setting from an allowlist. The helper re-emits +/// `key=value` itself from the canonical key, so the jailer never sees the +/// caller's raw bytes. Per-key value grammar: +/// - `memory.max` : `[0-9]{1,20}` or the literal `max` +/// - `cpu.max` : `[0-9]{1,20} [0-9]{1,20}` or `max [0-9]{1,20}` +#[derive(Debug, Clone)] +pub struct CgroupSetting { + key: &'static str, + value: String, +} + +impl FromStr for CgroupSetting { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Cgroup(s.to_string()); + // Split on the FIRST `=`. None of the value grammars contains a `=`, so a + // second `=` lands in `value` and is rejected by the grammar check below. + let (raw_key, value) = s.split_once('=').ok_or_else(reject)?; + + let key: &'static str = match raw_key { + "memory.max" => "memory.max", + "cpu.max" => "cpu.max", + _ => return Err(reject()), + }; + + let valid = match key { + "memory.max" => value == "max" || is_digits_1_20(value), + "cpu.max" => match value.split_once(' ') { + Some((quota, period)) => { + (quota == "max" || is_digits_1_20(quota)) && is_digits_1_20(period) + } + None => false, + }, + _ => false, + }; + + if valid { + Ok(Self { + key, + value: value.to_string(), + }) + } else { + Err(reject()) + } + } +} + +impl fmt::Display for CgroupSetting { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// The firecracker API socket path: an absolute path that is exactly `/` plus one +/// filename in `[A-Za-z0-9_.-]`. The charset excludes `/`, so the value is always +/// a direct child of `/` with no extra components and no traversal; `.`/`..` as +/// the whole filename are rejected explicitly. +#[derive(Debug, Clone)] +pub struct JailSock(String); + +impl FromStr for JailSock { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Sock(s.to_string()); + if s.len() > MAX_SOCK_LEN { + return Err(reject()); + } + let name = s.strip_prefix('/').ok_or_else(reject)?; + if name.is_empty() || name == "." || name == ".." { + return Err(reject()); + } + if !name + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for JailSock { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +#[derive(Args)] +pub struct JailerArgs { + /// Microvm id; becomes the chroot subdirectory name. + #[arg(long)] + id: VmId, + /// Unprivileged uid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + uid: u32, + /// Unprivileged gid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + gid: u32, + /// Repeatable `KEY=VALUE` cgroup setting from the allowlist. + #[arg(long = "cgroup")] + cgroup: Vec, + /// Absolute firecracker API socket path (single filename under `/`). + #[arg(long = "api-sock")] + api_sock: JailSock, +} + +fn cstr_bytes(bytes: &[u8]) -> Result { + CString::new(bytes).map_err(|_| Error::NulArgument) +} + +fn cstr_str(s: &str) -> Result { + cstr_bytes(s.as_bytes()) +} + +fn cstr_path(p: &Path) -> Result { + cstr_bytes(p.as_os_str().as_bytes()) +} + +/// Build the exact argv handed to the jailer. argv[0] is the jailer path. The +/// caller never names the jailer, the `--exec-file`, the chroot base, the cgroup +/// version, or the parent cgroup — those are derived from trusted config here. +#[allow(clippy::too_many_arguments)] +fn build_argv( + jailer: &Path, + id: &VmId, + firecracker: &Path, + uid: u32, + gid: u32, + jail_base: &Path, + parent_cgroup: &str, + cgroups: &[CgroupSetting], + api_sock: &JailSock, +) -> Result, Error> { + let mut argv = vec![ + cstr_path(jailer)?, + cstr_str("--id")?, + cstr_str(&id.to_string())?, + cstr_str("--exec-file")?, + cstr_path(firecracker)?, + cstr_str("--uid")?, + cstr_str(&uid.to_string())?, + cstr_str("--gid")?, + cstr_str(&gid.to_string())?, + cstr_str("--chroot-base-dir")?, + cstr_path(jail_base)?, + cstr_str("--cgroup-version")?, + cstr_str("2")?, + cstr_str("--parent-cgroup")?, + cstr_str(parent_cgroup)?, + ]; + + for cg in cgroups { + argv.push(cstr_str("--cgroup")?); + argv.push(cstr_str(&cg.to_string())?); + } + + argv.push(cstr_str("--")?); + argv.push(cstr_str("--api-sock")?); + argv.push(cstr_str(&api_sock.to_string())?); + + Ok(argv) +} + +/// Close every inherited fd above stdio so a compromised BEAM cannot smuggle an +/// open fd into the root jailer. Keep 0/1/2: MuonTrap supervises the jailer +/// through stdio, and stderr carries our exec-failure message. `close_range(2)` +/// needs Linux 5.9+; on ENOSYS we fall back to closing each fd up to the limit. +fn close_inherited_fds() { + const FIRST: u32 = 3; + // SAFETY: raw syscall with no memory operands; closing fds has no UB. + let rc = unsafe { nix::libc::close_range(FIRST, u32::MAX, 0) }; + if rc == 0 || Errno::last() != Errno::ENOSYS { + return; + } + + // SAFETY: sysconf is a pure query of a system limit. + let max = unsafe { nix::libc::sysconf(nix::libc::_SC_OPEN_MAX) }; + let max = if max < 0 { 4096 } else { max as i32 }; + for fd in (FIRST as i32)..max { + // SAFETY: closing an arbitrary fd is safe; EBADF for unused fds is ignored. + unsafe { + nix::libc::close(fd); + } + } +} + +/// Validate the caller's args, then permanently become root and `execve` the +/// jailer. On success this never returns (the process image is replaced); the +/// `Ok` arm is therefore [`std::convert::Infallible`]. Every failure is returned +/// as [`Error`] for the caller to print and exit non-zero. +pub fn run(args: JailerArgs) -> Result { + let config = Config::get(); + + // Resolve everything that can fail as the REAL uid, before any privilege is + // raised: config accessors, binary validation, range, and arg validation. + let jailer: PathBuf = config.jailer()?.into(); + let firecracker: PathBuf = config.firecracker()?.into(); + let jail_base = config.jail_base(); + let parent_cgroup = config.parent_cgroup(); + let range = config.uid_gid_range(); + + let uid = validate_id_number(args.uid, range)?; + let gid = validate_id_number(args.gid, range)?; + + if args.cgroup.len() > MAX_CGROUPS { + return Err(Error::TooManyCgroups(args.cgroup.len())); + } + + let argv = build_argv( + &jailer, + &args.id, + &firecracker, + uid, + gid, + &jail_base, + parent_cgroup, + &args.cgroup, + &args.api_sock, + )?; + let jailer_cstr = cstr_path(&jailer)?; + + // Point of no return: from here we are permanently root and must execve. + setuid_privileged::become_root_permanently()?; + close_inherited_fds(); + + // Empty envp: once ruid==0 the kernel clears AT_SECURE, so a smuggled + // LD_PRELOAD/LD_LIBRARY_PATH would be honored by the dynamic loader and + // hijack the root jailer. We pass nothing and let the jailer build its own. + let empty_env: [CString; 0] = []; + let errno = nix::unistd::execve(&jailer_cstr, &argv, &empty_env) + .expect_err("execve only returns on failure"); + Err(Error::Exec { + path: jailer, + errno, + }) +} diff --git a/native/suidhelper/src/tools/mod.rs b/native/suidhelper/src/tools/mod.rs index 24a2be4..9f9feda 100644 --- a/native/suidhelper/src/tools/mod.rs +++ b/native/suidhelper/src/tools/mod.rs @@ -7,6 +7,7 @@ mod blockdev; pub mod chroot_jail; mod dmsetup; +pub mod jailer; mod losetup; pub use blockdev::{Blockdev, BlockdevArgs}; diff --git a/native/suidhelper/src/util/setuid_privileged.rs b/native/suidhelper/src/util/setuid_privileged.rs index 3c58bd3..e334b11 100644 --- a/native/suidhelper/src/util/setuid_privileged.rs +++ b/native/suidhelper/src/util/setuid_privileged.rs @@ -12,17 +12,20 @@ //! constructor nor a Drop impl can report an error, and silently staying root //! would defeat the point. -use nix::unistd::{getuid, seteuid, Uid}; +use nix::unistd::{getuid, seteuid, setgroups, setresgid, setresuid, Gid, Uid}; use thiserror::Error as ThisError; -/// Failures of the privilege transitions. Both are fatal: if we can't raise we -/// aren't installed setuid root, and if we can't lower we refuse to keep running. +/// Failures of the privilege transitions. All fatal: if we can't raise we aren't +/// installed setuid root, if we can't lower we refuse to keep running, and if we +/// can't seal permanent root we refuse to hand off to the execve target. #[derive(Debug, ThisError)] pub enum Error { #[error("not installed setuid root")] NotSetuidRoot, #[error("failed to drop privileges")] DropPrivileges, + #[error("failed to acquire permanent root for execve handoff")] + PermanentRoot, } /// `.preinit_array` runs before `.init_array` and before any shared-library @@ -76,6 +79,36 @@ impl Privileged { } } +/// Re-acquire full root **permanently** for an `execve` handoff and return — +/// there is deliberately no [`Privileged`] Drop guard, because `execve` replaces +/// the entire process image: nothing of this process survives to run a destructor, +/// and the new image (the firecracker jailer) is responsible for dropping its own +/// privileges. We must hand it a *genuine* root process (all of real, effective +/// and saved uid == 0) so the jailer's own privilege-drop is the real thing. +/// +/// Order matters and each step needs the privilege the previous one preserves: +/// 1. `seteuid(0)` — regain effective root; without it the rest are EPERM. +/// 2. `setresgid(0,0,0)` — set every gid to root *before* touching uids, while +/// we still hold euid 0 (gid changes require privilege). +/// 3. `setgroups([0])` — `drop_to_real` only lowered euid; it never touched the +/// caller's supplementary groups, so the jailer would otherwise inherit them. +/// Reset to just {0} now, while still privileged. +/// 4. `setresuid(0,0,0)` LAST — this seals real+effective+saved uid to root. It +/// goes last because once the saved uid is 0 there is no escape hatch left +/// (which is the point: permanent), and because it must follow the gid/group +/// changes that needed our euid-0 to be permitted. +pub fn become_root_permanently() -> Result<(), Error> { + let root_uid = Uid::from_raw(0); + let root_gid = Gid::from_raw(0); + + seteuid(root_uid).map_err(|_| Error::NotSetuidRoot)?; + setresgid(root_gid, root_gid, root_gid).map_err(|_| Error::PermanentRoot)?; + setgroups(&[root_gid]).map_err(|_| Error::PermanentRoot)?; + setresuid(root_uid, root_uid, root_uid).map_err(|_| Error::PermanentRoot)?; + + Ok(()) +} + impl Drop for Privileged { fn drop(&mut self) { if lower().is_err() { diff --git a/native/suidhelper/tests/e2e/jailer.rs b/native/suidhelper/tests/e2e/jailer.rs new file mode 100644 index 0000000..87084ae --- /dev/null +++ b/native/suidhelper/tests/e2e/jailer.rs @@ -0,0 +1,193 @@ +//! L4 for the jailer handoff: prove the exact argv (and an EMPTY environment) the +//! helper hands to the jailer via `execve`, and that the jailer's exit status +//! propagates. We point the config's `jailer` at a root-owned recorder that +//! writes its argv and its `/proc/self/environ` to files, then exits with a known +//! code. Root-only: `become_root_permanently` requires we are already root. +#![cfg(feature = "insecure_test_seams")] + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::process::Command; + +const HELPER: &str = env!("CARGO_BIN_EXE_hyper-suidhelper"); +const RECORDER_EXIT: i32 = 7; + +fn is_root() -> bool { + nix::unistd::geteuid().is_root() +} + +fn cat_bin() -> &'static str { + ["/bin/cat", "/usr/bin/cat"] + .into_iter() + .find(|p| Path::new(p).exists()) + .expect("a `cat` binary for the recorder") +} + +/// Install a root-owned recorder named `jailer` that writes its argv (minus +/// argv[0]) as a JSON array to `argv_rec` and copies its `/proc/self/environ` to +/// `env_rec`, then exits `RECORDER_EXIT`. Paths are baked into the script text +/// because the recorder runs with an empty environment and absolute `cat` so it +/// needs no `PATH`. Returns the recorder's absolute path. +fn install_recorder(dir: &Path, argv_rec: &Path, env_rec: &Path) -> PathBuf { + let path = dir.join("jailer"); + let script = format!( + "#!/bin/sh\n\ + (\n printf '['\n sep=''\n for a in \"$@\"; do printf '%s\"%s\"' \"$sep\" \"$a\"; sep=','; done\n printf ']'\n) > '{argv}'\n\ + {cat} /proc/self/environ > '{env}'\n\ + exit {code}\n", + argv = argv_rec.display(), + cat = cat_bin(), + env = env_rec.display(), + code = RECORDER_EXIT, + ); + fs::write(&path, script).unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o755)).unwrap(); + path +} + +/// A root-owned plain file with basename `firecracker` — the `--exec-file`. It is +/// never executed by us (the jailer would), only validated as a `SafeBin`. +fn install_firecracker(dir: &Path) -> PathBuf { + let path = dir.join("firecracker"); + fs::write(&path, b"#!/bin/true\n").unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap(); + path +} + +fn write_root_config(dir: &Path, jailer: &Path, firecracker: &Path) -> PathBuf { + let p = dir.join("config.toml"); + let body = format!( + "work_dir = \"/srv/hyper\"\njailer = \"{}\"\nfirecracker = \"{}\"\n", + jailer.display(), + firecracker.display(), + ); + fs::write(&p, body).unwrap(); + fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); + p +} + +fn run(config: &Path, args: &[&str]) -> std::process::Output { + Command::new(HELPER) + .args(args) + .env_clear() + .env("HYPER_SETUIDHELPER_IS_INSECURE_MODE", "1") + .env("HYPER_SETUIDHELPER_CONFIG_PATH", config) + .output() + .expect("spawn helper") +} + +#[test] +fn execs_jailer_with_canonical_argv_and_empty_env_as_root() { + if !is_root() { + eprintln!("SKIP jailer exec: needs root to become_root_permanently + own the fakes"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "900001", + "--gid", + "900002", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--api-sock", + "/api.sock", + ], + ); + + // The jailer's own exit status must propagate through the execve handoff. + assert_eq!( + out.status.code(), + Some(RECORDER_EXIT), + "exit status did not propagate; stderr: {}", + String::from_utf8_lossy(&out.stderr), + ); + + let argv: Vec = + serde_json::from_str(&fs::read_to_string(&argv_rec).expect("recorded argv")).unwrap(); + assert_eq!( + argv, + vec![ + "--id", + "vm1", + "--exec-file", + &firecracker.to_string_lossy(), + "--uid", + "900001", + "--gid", + "900002", + "--chroot-base-dir", + "/srv/hyper/jails", + "--cgroup-version", + "2", + "--parent-cgroup", + "hyper", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--", + "--api-sock", + "/api.sock", + ], + "helper handed the jailer a non-canonical argv", + ); + + // The execve envp must be empty: once ruid==0 a smuggled LD_PRELOAD would be + // honored, so nothing of the caller's environment may reach the root jailer. + let environ = fs::read(&env_rec).expect("recorded environ"); + assert!( + environ.is_empty(), + "jailer inherited a non-empty environment: {environ:?}", + ); +} + +#[test] +fn refuses_uid_zero_without_exec_as_root() { + if !is_root() { + eprintln!("SKIP jailer uid 0: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "0", + "--gid", + "900002", + "--api-sock", + "/api.sock", + ], + ); + + assert_ne!(out.status.code(), Some(0), "uid 0 must be refused"); + assert_eq!(out.status.code(), Some(2), "validation failure exits 2"); + assert!( + !argv_rec.exists(), + "the jailer must never have been exec'd for uid 0", + ); +} diff --git a/native/suidhelper/tests/tools/jailer.rs b/native/suidhelper/tests/tools/jailer.rs new file mode 100644 index 0000000..9ab7544 --- /dev/null +++ b/native/suidhelper/tests/tools/jailer.rs @@ -0,0 +1,162 @@ +//! Refusal contracts for the jailer's pure validators — the security core. A +//! valid input must round-trip to its canonical form; an invalid one must +//! *always* be rejected, never silently accepted. These properties are what stop +//! a compromised BEAM from naming uid 0, a privileged path, a traversal, or a +//! flag through the jailer subcommand. + +use hyper_suidhelper::tools::jailer::{validate_id_number, CgroupSetting, JailSock, VmId}; +use proptest::prelude::*; +use std::str::FromStr; + +proptest! { + /// uid/gid 0 is rejected for EVERY range — a jailer run with uid 0 skips its + /// privilege drop and leaves firecracker running as root. + #[test] + fn id_zero_always_rejected(lo in any::(), span in any::()) { + let hi = lo.saturating_add(span); + prop_assert!(validate_id_number(0, (lo, hi)).is_err()); + } + + /// Any nonzero value inside the (nonempty) range is accepted unchanged. + #[test] + fn id_in_range_nonzero_accepted( + lo in 1u32..=1_000_000, + span in 0u32..=1_000_000, + off in 0u32..=1_000_000, + ) { + let hi = lo + span; + let n = lo + (off % (span + 1)); // off bounded into [0, span] => n in [lo, hi] + prop_assert_eq!(validate_id_number(n, (lo, hi)).ok(), Some(n)); + } + + /// Values just below `lo` (still nonzero) and just above `hi` are rejected. + #[test] + fn id_out_of_range_rejected(lo in 2u32..=1_000_000, span in 0u32..=1_000_000) { + let hi = lo + span; + prop_assert!(validate_id_number(lo - 1, (lo, hi)).is_err()); + if hi < u32::MAX { + prop_assert!(validate_id_number(hi + 1, (lo, hi)).is_err()); + } + } + + /// Every string over the allowed charset/length with a non-dash leader parses + /// and renders back to itself. + #[test] + fn vmid_valid_round_trips(s in "[A-Za-z0-9_][A-Za-z0-9_-]{0,63}") { + prop_assert_eq!(VmId::from_str(&s).unwrap().to_string(), s); + } + + /// A leading dash is always rejected (no flag injection via `--id`). + #[test] + fn vmid_leading_dash_rejected(s in "-[A-Za-z0-9_-]{0,63}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Any embedded path separator is rejected (no chroot traversal via `--id`). + #[test] + fn vmid_slash_rejected(s in "[A-Za-z0-9_]{0,10}/[A-Za-z0-9_]{0,10}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Over-length ids (>64) are rejected. + #[test] + fn vmid_too_long_rejected(s in "[A-Za-z][A-Za-z0-9_-]{64,90}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// A valid `memory.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_memory_round_trips(s in "memory[.]max=([0-9]{1,20}|max)") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A valid `cpu.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_cpu_round_trips(s in "cpu[.]max=([0-9]{1,20} [0-9]{1,20}|max [0-9]{1,20})") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A single-filename absolute socket path round-trips; `.`/`..` filenames are + /// rejected even though they are within the charset. + #[test] + fn jailsock_single_filename(name in "[A-Za-z0-9_.-]{1,40}") { + let s = format!("/{name}"); + let res = JailSock::from_str(&s); + if name == "." || name == ".." { + prop_assert!(res.is_err()); + } else { + prop_assert_eq!(res.unwrap().to_string(), s); + } + } + + /// A second path component is always rejected (the socket must be a direct + /// child of `/`). + #[test] + fn jailsock_multi_component_rejected( + a in "[A-Za-z0-9_]{1,10}", + b in "[A-Za-z0-9_]{1,10}", + ) { + let s = format!("/{a}/{b}"); + prop_assert!(JailSock::from_str(&s).is_err()); + } +} + +#[test] +fn vmid_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "-leading", // leading dash + "a/b", // separator + "a.b", // dot + "a b", // whitespace + "a\tb", // tab + "a\0b", // NUL + "naïve", // non-ascii + &"x".repeat(65), // too long + ] { + assert!(VmId::from_str(bad).is_err(), "VmId accepted {bad:?}"); + } +} + +#[test] +fn cgroup_rejects_known_bad_shapes() { + for bad in [ + "linear=10", // unknown key + "memory.high=10", // unknown key + "memory.max=", // empty value + "memory.max=12x", // non-digit + "memory.max=1=2", // second '=' + "memory.max", // no '=' + "cpu.max=100000", // missing period field + "cpu.max=100000 100000 5", // extra field + "cpu.max=x 100000", // bad quota + "cpu.max=max max", // bad period + "cpu.max=max", // missing period + ] { + assert!( + CgroupSetting::from_str(bad).is_err(), + "CgroupSetting accepted {bad:?}" + ); + } +} + +#[test] +fn jailsock_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "relative", // not absolute + "/", // no filename + "/a/b", // multi-component + "/../etc", // traversal + "/..", // traversal as whole filename + "/.", // current dir + "/a b", // whitespace + "/a\0b", // NUL + "//x", // empty leading component + ] { + assert!( + JailSock::from_str(bad).is_err(), + "JailSock accepted {bad:?}" + ); + } +} From a5e098c732ef4e2609f92f3bae347bce8fe47c4c Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 23:27:22 +0000 Subject: [PATCH 20/34] fix(suidhelper): fail closed on close_range error; _exit on jailer exec failure --- native/suidhelper/src/main.rs | 4 +++- native/suidhelper/src/tools/jailer.rs | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs index 665370b..11b0831 100644 --- a/native/suidhelper/src/main.rs +++ b/native/suidhelper/src/main.rs @@ -116,7 +116,9 @@ fn main() { Command::Jailer(args) => { let e = jailer::run(args).expect_err("jailer::run only returns on error"); eprintln!("{e}"); - std::process::exit(2); + // _exit bypasses atexit handlers; safe because we are permanently root + // at this point and must not run any cleanup registered before escalation. + unsafe { nix::libc::_exit(2) } } Command::Version => unreachable!("handled above"), }; diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs index d816de1..ec29ac8 100644 --- a/native/suidhelper/src/tools/jailer.rs +++ b/native/suidhelper/src/tools/jailer.rs @@ -285,12 +285,13 @@ fn build_argv( /// Close every inherited fd above stdio so a compromised BEAM cannot smuggle an /// open fd into the root jailer. Keep 0/1/2: MuonTrap supervises the jailer /// through stdio, and stderr carries our exec-failure message. `close_range(2)` -/// needs Linux 5.9+; on ENOSYS we fall back to closing each fd up to the limit. +/// needs Linux 5.9+; on any failure (ENOSYS or otherwise) we fall back to +/// closing each fd individually — fail closed before handing root to the jailer. fn close_inherited_fds() { const FIRST: u32 = 3; // SAFETY: raw syscall with no memory operands; closing fds has no UB. let rc = unsafe { nix::libc::close_range(FIRST, u32::MAX, 0) }; - if rc == 0 || Errno::last() != Errno::ENOSYS { + if rc == 0 { return; } From 5f9f36b9ba0addca6dbd6d95e2e1e560a637d384 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 23:41:35 +0000 Subject: [PATCH 21/34] refactor(fire_vmm): drive jailer through suidhelper; drop Provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BEAM no longer names a privileged binary path. `Jailer.command/1` now launches `hyper-suidhelper jailer` with only the id, uid/gid, cgroup flags, and api-sock; the helper reads firecracker/jailer binary paths, chroot base, parent cgroup, and cgroup version from its trusted /etc/hyper/config.toml, re-acquires root, and execve's the jailer (same pid, so MuonTrap owns the lifetime). Changes: - config.ex: add config_toml/0 (file → persistent_term cache); rewrite work_dir/0 over it; add firecracker_bin/0 + jailer_bin/0 via fetch_bin!/1 (raises when key absent); remove firecracker_install_dir/0. - provider.ex: deleted. - jailer.ex: rewire command/1 and exec_name/0; drop Provider alias. - daemon.ex: note that supervised process is the helper (execve → jailer). - node.ex: replace Provider.ensure_installed() with check_firecracker_bins/0. - hyper.ex: prefix gen_vm_id/0 with "v" so ids never start with "-". - jailer_test.exs: new pure test suite; load-bearing assertion that args contain no --exec-file / --chroot-base-dir / -- flags. --- lib/hyper.ex | 2 +- lib/hyper/config.ex | 79 +++++++++++++------- lib/hyper/node.ex | 15 +++- lib/hyper/node/fire_vmm/daemon.ex | 10 +-- lib/hyper/node/fire_vmm/jailer.ex | 48 +++++-------- lib/hyper/node/fire_vmm/provider.ex | 92 ------------------------ test/hyper/node/fire_vmm/jailer_test.exs | 81 +++++++++++++++++++++ 7 files changed, 171 insertions(+), 156 deletions(-) delete mode 100644 lib/hyper/node/fire_vmm/provider.ex create mode 100644 test/hyper/node/fire_vmm/jailer_test.exs diff --git a/lib/hyper.ex b/lib/hyper.ex index fb0a626..c29d2f0 100644 --- a/lib/hyper.ex +++ b/lib/hyper.ex @@ -36,7 +36,7 @@ defmodule Hyper do @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)." @spec gen_vm_id() :: Hyper.Vm.id() - def gen_vm_id, do: Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false) + def gen_vm_id, do: "v" <> Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false) @spec resolve_arch(Hyper.Vm.Instance.arch() | nil) :: {:ok, Hyper.Vm.Instance.arch()} | {:error, term()} diff --git a/lib/hyper/config.ex b/lib/hyper/config.ex index e3c056d..250bfa0 100644 --- a/lib/hyper/config.ex +++ b/lib/hyper/config.ex @@ -2,10 +2,10 @@ defmodule Hyper.Config do @moduledoc """ Host configuration, read from `config :hyper, ...` (see `config/config.exs`). - `work_dir` is the one value shared with the setuid helper - (`native/suidhelper`); both sides read it from `/etc/hyper/config.toml` so the - data root has a single source of truth (see `work_dir/0`). Everything else is - compile-time. + Runtime values shared with the setuid helper (`native/suidhelper`) — `work_dir`, + `firecracker_bin`, `jailer_bin` — are read from `/etc/hyper/config.toml` (the + single source of truth for both sides) the first time they are needed, then + cached in `:persistent_term`. Everything else is compile-time. """ # The shared data-root config file, read by both this node and the setuid @@ -24,39 +24,66 @@ defmodule Hyper.Config do Root work directory for this node. All firecracker paths derive from it. Read from `#{@config_path}` (the single source of truth shared with the setuid - helper) the first time it is needed, then cached in `:persistent_term`. Falls - back to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the - helper is not installed anyway). + helper) the first time it is needed, then cached via `config_toml/0`. Falls back + to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the helper + is not installed anyway). """ @spec work_dir :: Path.t() - def work_dir do - case :persistent_term.get({__MODULE__, :work_dir}, nil) do + def work_dir, do: Map.get(config_toml(), "work_dir", @dev_work_dir) + + @doc "Directory holding redistributable binaries downloaded by the node." + @spec redist_dir :: Path.t() + def redist_dir, do: Path.join(work_dir(), "redist") + + @doc """ + Absolute path to the firecracker binary, as set in `#{@config_path}` under the + `firecracker_bin` key. Raises if the key is absent — the operator must configure + it; there is no default. + """ + @spec firecracker_bin :: Path.t() + def firecracker_bin, do: fetch_bin!("firecracker_bin") + + @doc """ + Absolute path to the jailer binary, as set in `#{@config_path}` under the + `jailer_bin` key. Raises if the key is absent — the operator must configure it; + there is no default. + """ + @spec jailer_bin :: Path.t() + def jailer_bin, do: fetch_bin!("jailer_bin") + + @spec fetch_bin!(String.t()) :: Path.t() + defp fetch_bin!(key) do + case Map.fetch(config_toml(), key) do + {:ok, path} -> + path + + :error -> + raise "#{@config_path}: key #{inspect(key)} is not set; " <> + "operator must configure it before starting the node" + end + end + + @spec config_toml :: map() + defp config_toml do + case :persistent_term.get({__MODULE__, :config_toml}, nil) do nil -> - work_dir = load_work_dir() - :persistent_term.put({__MODULE__, :work_dir}, work_dir) - work_dir + cfg = load_config_toml() + :persistent_term.put({__MODULE__, :config_toml}, cfg) + cfg - work_dir -> - work_dir + cfg -> + cfg end end - @spec load_work_dir :: Path.t() - defp load_work_dir do + @spec load_config_toml :: map() + defp load_config_toml do case File.read(@config_path) do - {:ok, body} -> body |> Toml.decode!() |> Map.fetch!("work_dir") - {:error, _} -> @dev_work_dir + {:ok, body} -> Toml.decode!(body) + {:error, _} -> %{} end end - @doc "Directory holding redistributable binaries downloaded by the node." - @spec redist_dir :: Path.t() - def redist_dir, do: Path.join(work_dir(), "redist") - - @doc "Directory where `Hyper.Node.FireVMM.Provider` installs the firecracker release." - @spec firecracker_install_dir :: Path.t() - def firecracker_install_dir, do: Path.join(redist_dir(), "firecracker") - @doc "Directory where `Hyper.Node.FireVMM.VmLinux.Provider` installs guest kernels." @spec vmlinux_install_dir :: Path.t() def vmlinux_install_dir, do: Path.join(redist_dir(), "vmlinux") diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index eb3bb73..5c2bd3f 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -153,7 +153,7 @@ defmodule Hyper.Node do @spec test_system :: :ok | {:error, term()} def test_system do with {:ok, _} <- Hyper.Node.Config.Budget.load(), - :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(), + :ok <- check_firecracker_bins(), :ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(), :ok <- Hyper.Node.Vmlinux.test_system(), :ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(), @@ -167,6 +167,19 @@ defmodule Hyper.Node do end end + @spec check_firecracker_bins :: + :ok | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + defp check_firecracker_bins do + fc = Hyper.Config.firecracker_bin() + jail = Hyper.Config.jailer_bin() + + cond do + not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} + not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} + true -> :ok + end + end + @spec check_helper_base(Path.t()) :: :ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}} defp check_helper_base(base) do diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex index 7cbc159..c139340 100644 --- a/lib/hyper/node/fire_vmm/daemon.ex +++ b/lib/hyper/node/fire_vmm/daemon.ex @@ -4,15 +4,17 @@ defmodule Hyper.Node.FireVMM.Daemon do `Hyper.Node.FireVMM.Core`. Lifecycle is supervisor-owned. On every (re)start it first resets any stale - jail left by a prior incarnation - the firecracker jailer refuses to reuse an - existing chroot - then builds the jailer command and runs it under + jail left by a prior incarnation — the firecracker jailer refuses to reuse an + existing chroot — then builds the jailer command and runs it under `MuonTrap.Daemon`, which kills the OS process when its port closes (controller crash, container teardown, or BEAM death). So no firecracker process outlives the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a firecracker crash) cleanly cold-boots against a fresh jail. - The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the - reset, then delegates and returns that pid. + The supervised process is `hyper-suidhelper jailer ...`, which `execve`s into + the jailer (same pid) so MuonTrap owns the firecracker lifetime without needing + to know the jailer path. `start_link/1` does the reset, then delegates and + returns that pid. """ alias Hyper.Node.FireVMM.{Jailer, Opts} diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex index 6dd1b80..355ad0d 100644 --- a/lib/hyper/node/fire_vmm/jailer.ex +++ b/lib/hyper/node/fire_vmm/jailer.ex @@ -1,27 +1,26 @@ defmodule Hyper.Node.FireVMM.Jailer do @moduledoc """ - Builds the firecracker - [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md) - command for one VM. + Builds the `hyper-suidhelper jailer` command for one VM. - The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance` - flags) and drops privileges, then exec's firecracker. We run the jailer (not - firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS - process, the jailer owns isolation. + The BEAM does not invoke the jailer directly. Instead it calls the setuid helper + with the `jailer` subcommand; the helper reads the firecracker binary path, chroot + base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`, + re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps + supervising it). + + This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`, + repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates + everything else; it also inserts the `--` separator between its own flags and + firecracker's flags. Because firecracker is chrooted to `///root`, the API - socket it opens at `/api.socket` lives at `host_socket` on the host - that's the + socket it opens at `/api.socket` lives at `host_socket` on the host — that's the path the controller connects to. - - Host config: paths are derived from `config :hyper, work_dir: ...`. The - firecracker + jailer binaries are installed under `/redist/firecracker` - by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`. """ use OpenTelemetryDecorator alias Hyper.Node.FireVMM - alias Hyper.Node.FireVMM.Provider alias Hyper.Vm.Instance # firecracker's API socket path *inside* the chroot. @@ -94,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do @spec command(FireVMM.Opts.t()) :: t() def command(opts) do args = - [ - "--id", - opts.vm_id, - "--exec-file", - Provider.firecracker_bin(), - "--uid", - to_string(opts.uid), - "--gid", - to_string(opts.gid), - "--chroot-base-dir", - Hyper.Config.chroot_base(), - "--cgroup-version", - "2", - "--parent-cgroup", - Hyper.Config.parent_cgroup() - ] ++ + ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++ cgroup_flags(opts.type) ++ - ["--", "--api-sock", "/" <> @jail_socket] + ["--api-sock", "/" <> @jail_socket] - %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)} + %{binary: Hyper.Config.suid_helper(), args: args, host_socket: host_socket(opts.vm_id)} end # Find the appropriate jailer cgroup flags for the given instance type. @@ -166,5 +150,5 @@ defmodule Hyper.Node.FireVMM.Jailer do ]) end - defp exec_name, do: Path.basename(Provider.firecracker_bin()) + defp exec_name, do: Path.basename(Hyper.Config.firecracker_bin()) end diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex deleted file mode 100644 index e50aee1..0000000 --- a/lib/hyper/node/fire_vmm/provider.ex +++ /dev/null @@ -1,92 +0,0 @@ -defmodule Hyper.Node.FireVMM.Provider do - @moduledoc """ - Installs the firecracker release for the current architecture into - `Hyper.Config.firecracker_install_dir/0` (`/redist/firecracker`). - - `ensure_installed/0` is idempotent: if the binaries are already present and - executable it returns `:ok` without touching the network. Otherwise it fetches - the official firecracker release tarball for the detected architecture via - `Redist.Targz` (download, SHA-256 verify, extract). The archive is - extracted as-is - the binaries live under `release-v-/` exactly as - firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those - paths. - """ - - alias Redist.Targz - - @downloads %{ - x86_64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz", - sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", - firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64", - jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64" - }, - aarch64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz", - sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", - firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64", - jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64" - } - } - - @doc "Absolute path to the installed firecracker binary." - @spec firecracker_bin() :: Path.t() - def firecracker_bin, do: bin_path(:firecracker) - - @doc "Absolute path to the installed jailer binary." - @spec jailer_bin() :: Path.t() - def jailer_bin, do: bin_path(:jailer) - - @doc "Ensure the firecracker release is installed for this node." - @spec ensure_installed() :: :ok | {:error, term()} - def ensure_installed do - with {:ok, arch} <- Sys.Arch.current() do - dl = Map.fetch!(@downloads, arch) - - case check_install(dl) do - :ok -> :ok - {:error, :not_installed} -> install(dl) - {:error, :bad_install} -> reinstall(dl) - end - end - end - - # `:ok` if `dl`'s version-specific binaries are present and executable; - # `{:error, :not_installed}` if the install dir is empty/absent; otherwise - # `{:error, :bad_install}` - something is there but it's the wrong version, - # partial, or corrupt, which we cannot fix in place because `Targz` keeps - # existing files. The remedy is to wipe and reinstall. - @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install} - defp check_install(dl) do - fc = Path.join(install_dir(), dl.firecracker) - jail = Path.join(install_dir(), dl.jailer) - - cond do - Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) -> - :ok - - File.dir?(install_dir()) and File.ls!(install_dir()) != [] -> - {:error, :bad_install} - - true -> - {:error, :not_installed} - end - end - - defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir()) - - defp reinstall(dl) do - _ = File.rm_rf!(install_dir()) - install(dl) - end - - defp bin_path(key) do - {:ok, arch} = Sys.Arch.current() - dl = Map.fetch!(@downloads, arch) - Path.join(install_dir(), Map.fetch!(dl, key)) - end - - defp install_dir, do: Hyper.Config.firecracker_install_dir() -end diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs new file mode 100644 index 0000000..9a23111 --- /dev/null +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -0,0 +1,81 @@ +defmodule Hyper.Node.FireVMM.JailerTest do + @moduledoc """ + Properties and examples for `Hyper.Node.FireVMM.Jailer.command/1`. + + Load-bearing invariant: the BEAM must never place a privileged binary path + (firecracker, jailer) or lifecycle flags owned by the suidhelper (`--exec-file`, + `--chroot-base-dir`, `--cgroup-version`, `--parent-cgroup`, `--`) in the args + it hands to the helper. The helper derives those from its trusted config. + """ + + use ExUnit.Case, async: false + use ExUnitProperties + + alias Hyper.Node.FireVMM + alias Hyper.Node.FireVMM.Jailer + + @vm_id "vmtest01" + + # Stub config_toml persistent_term so firecracker_bin/jailer_bin resolve + # to dummy paths without requiring /etc/hyper/config.toml on the test host. + # async: false because persistent_term is global state. + setup do + :persistent_term.put({Hyper.Config, :config_toml}, %{ + "firecracker_bin" => "/usr/local/bin/firecracker-v1.16.0-x86_64", + "jailer_bin" => "/usr/local/bin/jailer-v1.16.0-x86_64" + }) + + on_exit(fn -> :persistent_term.erase({Hyper.Config, :config_toml}) end) + end + + defp micro_opts do + %FireVMM.Opts{ + vm_id: @vm_id, + uid: 900_001, + gid: 900_001, + type: :micro, + arch: :x86_64, + mutable: nil, + kernel: "/srv/hyper/redist/vmlinux/vmlinux-x86_64-6.1", + boot_args: nil + } + end + + test "binary is the suid helper" do + assert Jailer.command(micro_opts()).binary == Hyper.Config.suid_helper() + end + + test "args start with the jailer subcommand" do + %{args: [first | _]} = Jailer.command(micro_opts()) + assert first == "jailer" + end + + test "args contain --id, --uid, --gid with the opts values" do + %{args: args} = Jailer.command(micro_opts()) + assert "--id" in args + assert @vm_id in args + assert "--uid" in args + assert "--gid" in args + assert "900001" in args + end + + test "args end with --api-sock /api.socket" do + %{args: args} = Jailer.command(micro_opts()) + assert Enum.take(args, -2) == ["--api-sock", "/api.socket"] + end + + test "args do not contain privileged flags owned by the suidhelper" do + %{args: args} = Jailer.command(micro_opts()) + refute "--exec-file" in args + refute "--chroot-base-dir" in args + refute "--cgroup-version" in args + refute "--parent-cgroup" in args + refute "--" in args + end + + property "gen_vm_id/0 never produces an id starting with -" do + check all(_ <- StreamData.constant(nil)) do + refute String.starts_with?(Hyper.gen_vm_id(), "-") + end + end +end From 3a58b77a38bc29db83664f87026e8a62fdea53d8 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Thu, 25 Jun 2026 23:53:37 +0000 Subject: [PATCH 22/34] feat(mix): add firecracker.install task to fetch+configure firecracker/jailer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downloads the pinned Firecracker v1.16.0 release via Redist.Targz.install/3 (download → SHA-256 verify → extract), copies the version-stamped binaries to bare-basename paths (/firecracker and /jailer) required by the suidhelper's SafeBin validator, and prints the config snippets the operator needs to paste. --- lib/mix/tasks/firecracker.install.ex | 141 +++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 lib/mix/tasks/firecracker.install.ex diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex new file mode 100644 index 0000000..5d2a872 --- /dev/null +++ b/lib/mix/tasks/firecracker.install.ex @@ -0,0 +1,141 @@ +defmodule Mix.Tasks.Firecracker.Install do + @shortdoc "Download, verify, and install the pinned Firecracker release" + @moduledoc """ + Downloads, verifies, and installs the pinned Firecracker release (v1.16.0) + for the current CPU architecture. + + mix firecracker.install [--prefix DIR] + + Steps performed: + + 1. Detects the CPU architecture (`x86_64` or `aarch64`). + 2. Downloads the release tarball and verifies its SHA-256 checksum. + 3. Extracts the tarball, then copies the binaries to `/firecracker` + and `/jailer` using the **bare basenames** `firecracker` and + `jailer`. The setuid helper validates binaries via `SafeBin<"firecracker">` + and `SafeBin<"jailer">`, which match on basename only — version-stamped + names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally. + 4. Marks both binaries executable (`0o755`). + 5. Prints the configuration snippets the operator needs to paste. + + This task installs **unprivileged** binaries and prints configuration. + Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper). + This task does **not** setuid `firecracker` or `jailer`. Install and setuid + the helper separately with `mix suidhelper.install`. + + ## Options + + * `--prefix DIR` — installation directory (default: `/opt/firecracker`). + + ## Security requirements + + After installing, ensure: + + * The binaries are root-owned and **not** group- or world-writable. + The suidhelper refuses binaries with loose permissions. + * `/etc/hyper/config.toml` is root-owned with mode `0644`. + """ + + use Mix.Task + + @version "1.16.0" + @default_prefix "/opt/firecracker" + + @impl Mix.Task + @spec run([String.t()]) :: :ok + def run(argv) do + {opts, _rest, _invalid} = OptionParser.parse(argv, strict: [prefix: :string]) + prefix = Keyword.get(opts, :prefix, @default_prefix) + + arch = detect_arch!() + + case Application.ensure_all_started(:req) do + {:ok, _} -> :ok + {:error, {reason, app}} -> Mix.raise("Cannot start HTTP client #{app}: #{inspect(reason)}") + end + + install!(release_for(arch), prefix) + print_config(prefix) + end + + defp detect_arch! do + case Sys.Arch.current() do + {:ok, arch} -> + arch + + {:error, {:unsupported_arch, raw}} -> + Mix.raise( + "Unsupported CPU architecture #{inspect(raw)}; " <> + "Firecracker supports x86_64 and aarch64." + ) + end + end + + defp release_for(:x86_64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-x86_64.tgz", + sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", + firecracker_path: "release-v#{@version}-x86_64/firecracker-v#{@version}-x86_64", + jailer_path: "release-v#{@version}-x86_64/jailer-v#{@version}-x86_64" + } + end + + defp release_for(:aarch64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-aarch64.tgz", + sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", + firecracker_path: "release-v#{@version}-aarch64/firecracker-v#{@version}-aarch64", + jailer_path: "release-v#{@version}-aarch64/jailer-v#{@version}-aarch64" + } + end + + defp install!( + %{url: url, sha256: sha256, firecracker_path: fc_rel, jailer_path: jailer_rel}, + prefix + ) do + extract_dir = Path.join(prefix, ".firecracker-extract") + + Mix.shell().info("Downloading Firecracker v#{@version} from #{url} ...") + + case Redist.Targz.install(url, sha256, extract_dir) do + :ok -> :ok + {:error, reason} -> Mix.raise("Download from #{url} failed: #{inspect(reason)}") + end + + dst_fc = Path.join(prefix, "firecracker") + dst_jailer = Path.join(prefix, "jailer") + + # The release ships version-stamped names; copy to bare basenames so SafeBin + # validation passes. The helper matches on basename, not full path. + File.cp!(Path.join(extract_dir, fc_rel), dst_fc) + File.cp!(Path.join(extract_dir, jailer_rel), dst_jailer) + File.chmod!(dst_fc, 0o755) + File.chmod!(dst_jailer, 0o755) + _ = File.rm_rf!(extract_dir) + + Mix.shell().info("Installed #{dst_fc}") + Mix.shell().info("Installed #{dst_jailer}") + end + + defp print_config(prefix) do + fc = Path.join(prefix, "firecracker") + jailer = Path.join(prefix, "jailer") + + Mix.shell().info(""" + + Add to /etc/hyper/config.toml (file: root-owned, mode 0644; binaries: + root-owned, not group- or world-writable): + + firecracker = "#{fc}" + jailer = "#{jailer}" + + Or in your Elixir config (e.g. config/runtime.exs): + + config :hyper, firecracker_bin: "#{fc}", jailer_bin: "#{jailer}" + """) + end +end From a8743fc9e3d8a47a6376298709ede041b4c30d0b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 00:06:04 +0000 Subject: [PATCH 23/34] docs: firecracker/jailer are operator-installed via mix firecracker.install Remove firecracker/jailer from the auto-redistributed list; operators now install them via `mix firecracker.install [--prefix ]`. Update /etc/hyper/config.toml example to show the required firecracker/jailer keys (no default, root-owned + non-world-writable, bare basenames validated by the helper) and the optional [uid_gid_range] table. Update `config :hyper` snippet to drop the auto-download TODOs and point at the install-produced paths. Document that uid_gid_range in config :hyper and [uid_gid_range] in config.toml must be kept in sync. --- docs/cookbook/intro.md | 65 +++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index 99c8012..ce3c3bf 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -67,22 +67,38 @@ helper, never named by the unprivileged caller. Their paths therefore live in the helper's own config, `/etc/hyper/config.toml`, and default to `/usr/sbin/{losetup,blockdev,dmsetup}`. -**The config file is optional.** If it is absent the helper uses the built-in -defaults below (and `work_dir = "/srv/hyper"`, matching the node's own -fallback). Create one only to override a default - and if you do, it must be -root-owned and not group/other-writable, or the helper refuses to start (a -present-but-untrusted file is treated as an attack signal, unlike a missing -one): +**The config file must exist** to set `firecracker` and `jailer` (no built-in +defaults for those). The device-tool paths (`dmsetup`, `losetup`, `blockdev`) +and `work_dir` do have built-in defaults, so if you only need those defaults +and are not running VMs you may omit the file entirely. When the file is +present it must be root-owned and not group/other-writable, or the helper +refuses to start (a present-but-untrusted file is treated as an attack signal, +unlike a missing one): ```toml -# /etc/hyper/config.toml (root-owned, mode 0644) - every line optional +# /etc/hyper/config.toml (root-owned, mode 0644) work_dir = "/srv/hyper" -# Each must be an absolute path to a root-owned, non-world-writable binary; -# the helper validates this before it will exec the tool. +# REQUIRED - no default. Each must be an absolute path to a root-owned, +# non-group/world-writable binary named exactly "firecracker" or "jailer" +# (the helper validates the basename). Run `mix firecracker.install` to +# download the pinned release and print these values. +firecracker = "/opt/firecracker/firecracker" +jailer = "/opt/firecracker/jailer" + +# Optional device-tool overrides; default to /usr/sbin/{dmsetup,losetup,blockdev}. +# Each must be root-owned and not group/world-writable. dmsetup = "/usr/sbin/dmsetup" losetup = "/usr/sbin/losetup" blockdev = "/usr/sbin/blockdev" + +# Optional. Governs which uid/gid values the helper accepts when launching the +# jailer. Must satisfy min > 0 and min <= max. Defaults to {900000, 999999}. +# If you narrow this range, set the same bounds in `config :hyper, uid_gid_range:` +# so the node hands out only uids the helper will accept. +[uid_gid_range] +min = 900000 +max = 999999 ``` `dmsetup` (lvm2) is frequently *not* installed by default - check that one @@ -168,14 +184,22 @@ printf 'dm_snapshot\ndm_thin_pool\nloop\n' | sudo tee /etc/modules-load.d/hyper. echo 'd /sys/fs/cgroup/hyper 0755 root root -' \ | sudo tee /etc/tmpfiles.d/hyper-cgroup.conf ``` - - The host UID/GID range given by `uid_gid_range` must be free for Hyper to - allocate per-VM users from. + - The host UID/GID range must be free for Hyper to allocate per-VM users + from. The node's range is set by `uid_gid_range` in `config :hyper`; the + helper independently reads `[uid_gid_range]` from `/etc/hyper/config.toml` + (see below) and only accepts jailer `--uid`/`--gid` within that range. + Keep the two in sync. #### Auto-redistributed -The remaining runtime dependencies - `firecracker`, `jailer`, `umoci`, and the -guest `vmlinux` kernels - are downloaded, checksum-verified, and managed by -Hyper itself; you do not install them. +`umoci` and the guest `vmlinux` kernels are downloaded, checksum-verified, and +managed by Hyper itself; you do not install them. + +`firecracker` and `jailer` are not auto-downloaded. Install them with +`mix firecracker.install [--prefix ]` (default prefix `/opt/firecracker`), +which downloads the pinned v1.16.0 release, places the binaries at +`/firecracker` and `/jailer`, and prints the config snippets to +paste into `/etc/hyper/config.toml` and `config.exs`. ### Installation @@ -190,18 +214,19 @@ configuration. ```elixir config :hyper, - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - jailer_bin: "/opt/firecracker/jailer-v1.16.0-x86_64", - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - firecracker_bin: "/opt/firecracker/firecracker-v1.16.0-x86_64", + # REQUIRED. Must point at the bare-basename binaries installed by + # `mix firecracker.install`. The setuid helper validates these paths + # (root-owned, non-group/world-writable, basename exactly "firecracker"/"jailer"). + firecracker_bin: "/opt/firecracker/firecracker", + jailer_bin: "/opt/firecracker/jailer", # You must create a parent cgroup on your system. Continue reading for # further details. cgroup_parent: "hyper", - # TODO(markovejnovic): Merge these directories into one. jailer_chroot_base: "/srv/hyper/jails", socket_dir: "/srv/hyper/socks", scratch_dir: "/srv/hyper/scratch", - # Hyper requires that each VM you pass + # Must match the [uid_gid_range] table in /etc/hyper/config.toml so the node + # hands out only uids the helper will accept. uid_gid_range: {900_000, 999_999}, layer_dir: "/srv/hyper/layers" ``` From 5770f57fd0a09e26cc9087b793e00ac41f24453f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 00:17:21 +0000 Subject: [PATCH 24/34] fix(config): node and helper read the same firecracker/jailer TOML keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Elixir node was fetching TOML keys "firecracker_bin"/"jailer_bin" while the setuid helper (and the TOML example in docs) uses "firecracker"/"jailer". A correctly-installed host therefore crashed on launch with an unset-key error even when the file was present. - config.ex: fix fetch_bin! keys to "firecracker"/"jailer"; add non-raising firecracker_bin_configured/0 and jailer_bin_configured/0 returning {:ok, path} | :error for use by pre-launch checks. - node.ex: rewrite check_firecracker_bins/0 to use the non-raising accessors so a missing key returns {:error, :firecracker_not_configured} instead of raising, honouring the @spec contract. - firecracker.install.ex: drop the dead "config :hyper, firecracker_bin:" snippet from print_config/1 — nothing reads Application env for these paths. - docs/cookbook/intro.md: remove firecracker_bin/jailer_bin from config :hyper block; clarify paths live only in /etc/hyper/config.toml. - jailer_test.exs: align stub map keys to "firecracker"/"jailer" (the bug that masked this); add positive --cgroup assertion for :micro type. - Cargo.toml: remove redundant toml dev-dependency (already a normal dep). --- docs/cookbook/intro.md | 13 ++++----- lib/hyper/config.ex | 36 ++++++++++++++++++------ lib/hyper/node.ex | 21 ++++++++------ lib/mix/tasks/firecracker.install.ex | 8 ++---- native/suidhelper/Cargo.toml | 1 - test/hyper/node/fire_vmm/jailer_test.exs | 11 ++++++-- 6 files changed, 58 insertions(+), 32 deletions(-) diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index ce3c3bf..57aa4ff 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -198,8 +198,8 @@ managed by Hyper itself; you do not install them. `firecracker` and `jailer` are not auto-downloaded. Install them with `mix firecracker.install [--prefix ]` (default prefix `/opt/firecracker`), which downloads the pinned v1.16.0 release, places the binaries at -`/firecracker` and `/jailer`, and prints the config snippets to -paste into `/etc/hyper/config.toml` and `config.exs`. +`/firecracker` and `/jailer`, and prints the `/etc/hyper/config.toml` +snippet to paste in. ### Installation @@ -214,11 +214,6 @@ configuration. ```elixir config :hyper, - # REQUIRED. Must point at the bare-basename binaries installed by - # `mix firecracker.install`. The setuid helper validates these paths - # (root-owned, non-group/world-writable, basename exactly "firecracker"/"jailer"). - firecracker_bin: "/opt/firecracker/firecracker", - jailer_bin: "/opt/firecracker/jailer", # You must create a parent cgroup on your system. Continue reading for # further details. cgroup_parent: "hyper", @@ -231,6 +226,10 @@ config :hyper, layer_dir: "/srv/hyper/layers" ``` +The `firecracker` and `jailer` binary paths are **not** set here — they are read +from `/etc/hyper/config.toml` (the single source of truth shared with the setuid +helper). See the `config.toml` example above. + ### Usage diff --git a/lib/hyper/config.ex b/lib/hyper/config.ex index 250bfa0..0704c64 100644 --- a/lib/hyper/config.ex +++ b/lib/hyper/config.ex @@ -3,9 +3,9 @@ defmodule Hyper.Config do Host configuration, read from `config :hyper, ...` (see `config/config.exs`). Runtime values shared with the setuid helper (`native/suidhelper`) — `work_dir`, - `firecracker_bin`, `jailer_bin` — are read from `/etc/hyper/config.toml` (the - single source of truth for both sides) the first time they are needed, then - cached in `:persistent_term`. Everything else is compile-time. + `firecracker`, `jailer` — are read from `/etc/hyper/config.toml` (the single + source of truth for both sides) the first time they are needed, then cached in + `:persistent_term`. Everything else is compile-time. """ # The shared data-root config file, read by both this node and the setuid @@ -37,19 +37,39 @@ defmodule Hyper.Config do @doc """ Absolute path to the firecracker binary, as set in `#{@config_path}` under the - `firecracker_bin` key. Raises if the key is absent — the operator must configure - it; there is no default. + `firecracker` key. Raises if the key is absent — the operator must configure it; + there is no default. + + For the launch path only. Pre-launch checks should use `firecracker_bin_configured/0` + so a missing key returns a typed error rather than crashing. """ @spec firecracker_bin :: Path.t() - def firecracker_bin, do: fetch_bin!("firecracker_bin") + def firecracker_bin, do: fetch_bin!("firecracker") + + @doc """ + Non-raising form of `firecracker_bin/0`. Returns `{:ok, path}` when the + `firecracker` key is present in `#{@config_path}`, or `:error` when it is absent. + """ + @spec firecracker_bin_configured :: {:ok, Path.t()} | :error + def firecracker_bin_configured, do: Map.fetch(config_toml(), "firecracker") @doc """ Absolute path to the jailer binary, as set in `#{@config_path}` under the - `jailer_bin` key. Raises if the key is absent — the operator must configure it; + `jailer` key. Raises if the key is absent — the operator must configure it; there is no default. + + For the launch path only. Pre-launch checks should use `jailer_bin_configured/0` + so a missing key returns a typed error rather than crashing. """ @spec jailer_bin :: Path.t() - def jailer_bin, do: fetch_bin!("jailer_bin") + def jailer_bin, do: fetch_bin!("jailer") + + @doc """ + Non-raising form of `jailer_bin/0`. Returns `{:ok, path}` when the `jailer` key + is present in `#{@config_path}`, or `:error` when it is absent. + """ + @spec jailer_bin_configured :: {:ok, Path.t()} | :error + def jailer_bin_configured, do: Map.fetch(config_toml(), "jailer") @spec fetch_bin!(String.t()) :: Path.t() defp fetch_bin!(key) do diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 5c2bd3f..b472116 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -168,15 +168,20 @@ defmodule Hyper.Node do end @spec check_firecracker_bins :: - :ok | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + :ok + | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + | {:error, :firecracker_not_configured | :jailer_not_configured} defp check_firecracker_bins do - fc = Hyper.Config.firecracker_bin() - jail = Hyper.Config.jailer_bin() - - cond do - not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} - not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} - true -> :ok + with {:fc, {:ok, fc}} <- {:fc, Hyper.Config.firecracker_bin_configured()}, + {:jail, {:ok, jail}} <- {:jail, Hyper.Config.jailer_bin_configured()} do + cond do + not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} + not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} + true -> :ok + end + else + {:fc, :error} -> {:error, :firecracker_not_configured} + {:jail, :error} -> {:error, :jailer_not_configured} end end diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex index 5d2a872..9db3e2e 100644 --- a/lib/mix/tasks/firecracker.install.ex +++ b/lib/mix/tasks/firecracker.install.ex @@ -16,7 +16,7 @@ defmodule Mix.Tasks.Firecracker.Install do and `SafeBin<"jailer">`, which match on basename only — version-stamped names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally. 4. Marks both binaries executable (`0o755`). - 5. Prints the configuration snippets the operator needs to paste. + 5. Prints the `/etc/hyper/config.toml` snippet the operator needs to paste. This task installs **unprivileged** binaries and prints configuration. Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper). @@ -131,11 +131,7 @@ defmodule Mix.Tasks.Firecracker.Install do root-owned, not group- or world-writable): firecracker = "#{fc}" - jailer = "#{jailer}" - - Or in your Elixir config (e.g. config/runtime.exs): - - config :hyper, firecracker_bin: "#{fc}", jailer_bin: "#{jailer}" + jailer = "#{jailer}" """) end end diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index 9b2b6cb..154a592 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -96,7 +96,6 @@ toml = { version = "0.8", default-features = false, features = ["parse"] } [dev-dependencies] proptest = "1" tempfile = "3" -toml = { version = "0.8", default-features = false, features = ["parse"] } [profile.release] strip = true diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs index 9a23111..9ac1753 100644 --- a/test/hyper/node/fire_vmm/jailer_test.exs +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -21,8 +21,8 @@ defmodule Hyper.Node.FireVMM.JailerTest do # async: false because persistent_term is global state. setup do :persistent_term.put({Hyper.Config, :config_toml}, %{ - "firecracker_bin" => "/usr/local/bin/firecracker-v1.16.0-x86_64", - "jailer_bin" => "/usr/local/bin/jailer-v1.16.0-x86_64" + "firecracker" => "/usr/local/bin/firecracker", + "jailer" => "/usr/local/bin/jailer" }) on_exit(fn -> :persistent_term.erase({Hyper.Config, :config_toml}) end) @@ -73,6 +73,13 @@ defmodule Hyper.Node.FireVMM.JailerTest do refute "--" in args end + test "args include --cgroup cpu.max and memory.max for :micro type" do + %{args: args} = Jailer.command(micro_opts()) + assert "--cgroup" in args + assert Enum.any?(args, &String.starts_with?(&1, "cpu.max=")) + assert Enum.any?(args, &String.starts_with?(&1, "memory.max=")) + end + property "gen_vm_id/0 never produces an id starting with -" do check all(_ <- StreamData.constant(nil)) do refute String.starts_with?(Hyper.gen_vm_id(), "-") From ffd46a4c4e19f425a26986807d9fd08d421f9af7 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 00:33:23 +0000 Subject: [PATCH 25/34] feat(mix): firecracker.install prints the chown/chmod root commands The task runs unprivileged, so the binaries land owned by the invoking user. SafeBin in the suidhelper refuses any jailer/firecracker not owned by root, so every launch fails closed until they are chowned. Print the exact chown/chmod commands (with the real installed paths) instead of a vague 'ensure root-owned' note. --- lib/mix/tasks/firecracker.install.ex | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex index 9db3e2e..03ada2b 100644 --- a/lib/mix/tasks/firecracker.install.ex +++ b/lib/mix/tasks/firecracker.install.ex @@ -125,10 +125,20 @@ defmodule Mix.Tasks.Firecracker.Install do fc = Path.join(prefix, "firecracker") jailer = Path.join(prefix, "jailer") + # This task runs unprivileged, so the binaries land owned by the invoking + # user. The suidhelper's SafeBin refuses any binary not owned by root and not + # free of group/other write bits, so the operator MUST chown/chmod them or + # every jailer launch fails closed. Print the exact commands rather than a + # vague "ensure root-owned". Mix.shell().info(""" - Add to /etc/hyper/config.toml (file: root-owned, mode 0644; binaries: - root-owned, not group- or world-writable): + Almost done. Run these as root so the setuid helper will accept the binaries + (it refuses any jailer/firecracker not owned by root): + + sudo chown root:root #{fc} #{jailer} + sudo chmod 0755 #{fc} #{jailer} + + Then add to /etc/hyper/config.toml (file: root-owned, mode 0644): firecracker = "#{fc}" jailer = "#{jailer}" From 202141be1e259d5d7775c181e91edcae27d3338f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 01:11:18 +0000 Subject: [PATCH 26/34] feat(fire_vmm): log jailer/firecracker output and real exit status The supervised process is MuonTrap.Daemon, so route the jailed process's stdout+stderr (guest serial console included) to the Logger via log_output, and map the exit status into {:firecracker_exited, status} so a crash names the real exit code instead of the opaque :error_exit_status. Add explicit launch/launch-failure log lines keyed by vm id, so a boot loop is visible without reading console scrollback. --- lib/hyper/node/fire_vmm/daemon.ex | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex index c139340..b625773 100644 --- a/lib/hyper/node/fire_vmm/daemon.ex +++ b/lib/hyper/node/fire_vmm/daemon.ex @@ -23,6 +23,8 @@ defmodule Hyper.Node.FireVMM.Daemon do use OpenTelemetryDecorator + require Logger + @shutdown_timeout Time.s(5) @spec child_spec(Opts.t()) :: Supervisor.child_spec() @@ -46,9 +48,26 @@ defmodule Hyper.Node.FireVMM.Daemon do with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do cmd = Jailer.command(opts) - case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do - {:ok, pid} -> {:ok, pid} - {:error, _} = err -> err + # Surface what the jailed process actually does: `log_output` routes the + # helper/jailer/firecracker stdout+stderr (guest serial console included) + # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque + # `:error_exit_status` into `{:firecracker_exited, status}` so a crash + # report names the real exit code instead of hiding it. + daemon_opts = [ + log_output: :info, + log_prefix: "vm #{id} firecracker: ", + stderr_to_stdout: true, + exit_status_to_reason: &{:firecracker_exited, &1} + ] + + case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do + {:ok, pid} -> + Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})") + {:ok, pid} + + {:error, reason} = err -> + Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}") + err end end end From 0ecccf558d9ee6753ac66c91f68acec2e7cac5a2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 01:33:11 +0000 Subject: [PATCH 27/34] feat(fire_vmm): surface the API readiness-probe failure reason The :awaiting_api probe swallowed the describe_instance error and stopped with a bare :daemon_unready, hiding why a healthy-looking firecracker is unreachable. Log the last probe error on deadline and carry it in the stop reason ({:daemon_unready, reason}), so a host->jail socket permission/path problem is diagnosable instead of looking like an unexplained 5s restart loop. --- lib/hyper/node/fire_vmm/state.ex | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index b390965..552da0f 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -113,6 +113,8 @@ defmodule Hyper.Node.FireVMM.State do alias Hyper.Node.FireVMM.{Client, Opts} alias Unit.Time + require Logger + # How often to probe the daemon's API while waiting for it. @probe_interval Time.ms(50) @@ -123,9 +125,19 @@ defmodule Hyper.Node.FireVMM.State do {:ok, %InstanceInfo{}} -> {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} - {:error, _reason} -> + {:error, reason} -> if System.monotonic_time(:millisecond) >= data.boot_deadline do - {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data} + # firecracker's own log shows the API server is up, so a persistent + # probe failure points at the host->jail socket (path or, more often, + # permissions: the jailer drops firecracker to the per-VM uid and the + # unprivileged controller cannot reach the jailed socket). Surface the + # last error rather than swallowing it into a bare :daemon_unready. + Logger.warning( + "vm #{id}: firecracker API not reachable before deadline; " <> + "last probe error: #{inspect(reason)}" + ) + + {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} else {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} end From 2d48872d902c311776e1739d345f642e7912101b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 01:51:13 +0000 Subject: [PATCH 28/34] feat(suidhelper): add chroot-jail grant-api to hand the API socket to the node user The jailer drops firecracker to a per-VM uid/gid and chroots it, so the API socket it creates at /root/api.socket is owned by that per-VM id, mode 0755. Connecting a unix socket needs write permission, so the unprivileged node controller gets EACCES on connect(). grant-api confines the socket path under JAIL_BASE (SafePath + O_NOFOLLOW walk, fd-relative on the pinned root dir), verifies the leaf is a real socket via fstatat(AT_SYMLINK_NOFOLLOW) (a planted file/symlink is refused, never touched), then chowns it to the helper's caller (getuid/getgid, the real=caller ids inside the privileged scope) and chmods 0660. A not-yet-created socket is reported Pending, not an error. Adds SafeDir::stat and SafeDir::chmod fd-relative primitives. --- native/suidhelper/Cargo.toml | 4 + .../src/tools/chroot_jail/grant_api.rs | 160 +++++++++++++ .../suidhelper/src/tools/chroot_jail/mod.rs | 5 + native/suidhelper/src/util/safe_dir.rs | 37 ++- .../tests/tools/chroot_jail_grant_api.rs | 224 ++++++++++++++++++ 5 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 native/suidhelper/src/tools/chroot_jail/grant_api.rs create mode 100644 native/suidhelper/tests/tools/chroot_jail_grant_api.rs diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index 154a592..be6a449 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -60,6 +60,10 @@ path = "tests/util/confinement.rs" name = "tools_chroot_jail_remove" path = "tests/tools/chroot_jail_remove.rs" +[[test]] +name = "tools_chroot_jail_grant_api" +path = "tests/tools/chroot_jail_grant_api.rs" + [[test]] name = "util_chroot_jail" path = "tests/util/chroot_jail.rs" diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs new file mode 100644 index 0000000..a74b548 --- /dev/null +++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so +//! the unprivileged controller can `connect()` to it. +//! +//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker +//! then creates its API socket at `/root/api.socket` owned by that per-VM +//! id, mode `0755`. Connecting a unix socket needs *write* permission on the +//! node, so the node user (a different uid) gets `EACCES`. This op chowns just +//! that one socket to the helper's CALLER — `getuid()`/`getgid()`, which inside +//! the privileged scope are the real (caller) ids while euid is 0 — and chmods +//! it `0660`. The node thus connects as owner, and humans added to the node's +//! group connect via the group bit. Per-VM isolation is otherwise untouched: +//! only this single socket moves, nothing else in the jail. +//! +//! Security: the socket path is validated as a `SafePath` and reached by an +//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect +//! the chown outside the jail, and every op is fd-relative on the pinned `root` +//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket` +//! `//root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must +//! report a *socket* — a regular file or symlink planted at that name is an +//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on +//! the path) is `Pending`, not an error: firecracker has not created it yet, so +//! the controller keeps probing. + +use crate::config::Config; +use crate::tools::IsTool; +use crate::util::safe_dir::{self, SafeDir}; +use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; +use clap::Args; +use nix::errno::Errno; +use nix::sys::stat::SFlag; +use nix::unistd::{getgid, getuid}; +use serde::Serialize; +use std::path::{Path, PathBuf}; +use thiserror::Error as ThisError; + +/// The fixed in-jail socket name firecracker opens (mirrors the Elixir +/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`). +const SOCKET_NAME: &str = "api.socket"; + +/// The socket sits at `///root/api.socket`: three parent +/// components (``, ``, `root`) before the leaf. +const SOCKET_PARENT_DEPTH: usize = 3; + +/// Mode handed to the node: owner+group read/write, no world access. +const SOCKET_MODE: u32 = 0o660; + +type LexicalPath = SafePath; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("--socket path: {0}")] + SocketPath(#[source] safe_path::ValidationError), + #[error("--socket must be exactly //root/api.socket below JAIL_BASE: {0:?}")] + SocketShape(PathBuf), + #[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")] + SocketName(PathBuf), + #[error("walking to the jail root: {0}")] + Walk(#[source] safe_dir::Error), + #[error("api.socket is not a socket (or is a symlink); refusing to touch it")] + NotASocket, + #[error("statting the socket: {0}")] + Stat(#[source] safe_dir::Error), + #[error("chowning the socket to the caller: {0}")] + Chown(#[source] safe_dir::Error), + #[error("chmoding the socket: {0}")] + Chmod(#[source] safe_dir::Error), +} + +#[derive(Args)] +pub struct GrantApiArgs { + /// Host path of the firecracker API socket, shape + /// ///root/api.socket. + #[arg(long)] + socket: PathBuf, +} + +#[derive(Debug, Serialize)] +#[serde(tag = "result", rename_all = "snake_case")] +pub enum GrantOut { + /// The socket was handed to the caller (chowned + chmoded). + Granted, + /// The socket does not exist yet; the caller should keep waiting. + Pending, +} + +/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`). +pub fn run(args: GrantApiArgs) -> Result { + GrantApi { args }.run() +} + +struct GrantApi { + args: GrantApiArgs, +} + +impl IsTool for GrantApi { + type Args = GrantApiArgs; + type Output = GrantOut; + type RunT = Result; + + fn run_privileged(&self) -> Self::RunT { + grant_api_under(&Config::get().jail_base(), &self.args.socket) + } + + fn parse(&self, res: Self::RunT) -> Result> { + Ok(res?) + } +} + +/// Hand `socket` (`///root/api.socket`) to the helper's +/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns +/// `Pending` if any path component or the socket itself is not yet present. +pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result { + let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?; + let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?; + if parents.len() != SOCKET_PARENT_DEPTH { + return Err(Error::SocketShape(socket.to_path_buf())); + } + if leaf != Path::new(SOCKET_NAME) { + return Err(Error::SocketName(socket.to_path_buf())); + } + + let Some(root) = walk(jail_base.to_path_buf(), &parents)? else { + return Ok(GrantOut::Pending); // jail not fully created yet + }; + + let leaf = Path::new(SOCKET_NAME); + let stat = match root.stat(leaf) { + Ok(stat) => stat, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending), + Err(e) => return Err(Error::Stat(e)), + }; + // `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails + // this check too: only a real socket is accepted, anything else is refused. + if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() { + return Err(Error::NotASocket); + } + + root.chown(leaf, getuid().as_raw(), getgid().as_raw()) + .map_err(Error::Chown)?; + root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?; + Ok(GrantOut::Granted) +} + +/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns +/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the +/// caller can treat a half-built jail as `Pending` rather than an error. +fn walk(base: PathBuf, parents: &[PathBuf]) -> Result, Error> { + let base_path: LexicalPath = base.try_into().map_err(Error::SocketPath)?; + let anchor = match SafeDir::open(&base_path) { + Ok(dir) => dir, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(None), + Err(e) => return Err(Error::Walk(e)), + }; + match anchor.descend(parents) { + Ok(dir) => Ok(Some(dir)), + Err(e) if e.errno() == Some(Errno::ENOENT) => Ok(None), + Err(e) => Err(Error::Walk(e)), + } +} diff --git a/native/suidhelper/src/tools/chroot_jail/mod.rs b/native/suidhelper/src/tools/chroot_jail/mod.rs index d65bac7..3a06c35 100644 --- a/native/suidhelper/src/tools/chroot_jail/mod.rs +++ b/native/suidhelper/src/tools/chroot_jail/mod.rs @@ -1,9 +1,11 @@ // SPDX-License-Identifier: AGPL-3.0-only //! `chroot-jail`: per-VM chroot/jail lifecycle. +pub mod grant_api; mod prepare; pub mod remove; +pub use grant_api::GrantApiArgs; pub use prepare::PrepareArgs; pub use remove::RemoveArgs; @@ -15,6 +17,8 @@ pub enum ChrootJailOp { Prepare(PrepareArgs), /// Remove a VM's stale chroot and cgroup leaf before relaunching the jailer. Remove(RemoveArgs), + /// Hand the firecracker API socket to the node user (chown to caller, 0660). + GrantApi(GrantApiArgs), } impl ChrootJailOp { @@ -25,6 +29,7 @@ impl ChrootJailOp { match self { ChrootJailOp::Prepare(args) => prepare::run(args), ChrootJailOp::Remove(args) => remove::run(args), + ChrootJailOp::GrantApi(args) => grant_api::run(args), } } } diff --git a/native/suidhelper/src/util/safe_dir.rs b/native/suidhelper/src/util/safe_dir.rs index 388ae83..ed88f07 100644 --- a/native/suidhelper/src/util/safe_dir.rs +++ b/native/suidhelper/src/util/safe_dir.rs @@ -17,7 +17,7 @@ use super::safe_path::SafePath; use nix::dir::{Dir, Type}; use nix::fcntl::{openat, AtFlags, OFlag}; use nix::libc::dev_t; -use nix::sys::stat::{mknodat, Mode, SFlag}; +use nix::sys::stat::{fchmodat, fstatat, mknodat, FchmodatFlags, FileStat, Mode, SFlag}; use nix::unistd::{dup, fchownat, linkat, unlinkat, Gid, Uid, UnlinkatFlags}; use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; @@ -37,6 +37,10 @@ pub enum Error { Mknod { name: PathBuf, source: nix::Error }, #[error("fchownat {name:?}: {source}")] Chown { name: PathBuf, source: nix::Error }, + #[error("fchmodat {name:?}: {source}")] + Chmod { name: PathBuf, source: nix::Error }, + #[error("fstatat {name:?}: {source}")] + Stat { name: PathBuf, source: nix::Error }, #[error("linkat -> {name:?}: {source}")] Link { name: PathBuf, source: nix::Error }, #[error("dup: {0}")] @@ -52,6 +56,8 @@ impl Error { | Error::Unlink { source, .. } | Error::Mknod { source, .. } | Error::Chown { source, .. } + | Error::Chmod { source, .. } + | Error::Stat { source, .. } | Error::Link { source, .. } => Some(*source), Error::ReadDir(source) | Error::Dup(source) => Some(*source), } @@ -180,6 +186,35 @@ impl SafeDir { }) } + /// `fstat` entry `name` relative to this dir's fd without following a final + /// symlink (`AT_SYMLINK_NOFOLLOW`). A symlink stats as itself (`S_IFLNK`), + /// never its target, so a caller inspecting the file type can reject one. + pub fn stat(&self, name: &Path) -> Result { + fstatat(Some(self.0.as_raw_fd()), name, AtFlags::AT_SYMLINK_NOFOLLOW).map_err(|source| { + Error::Stat { + name: name.to_path_buf(), + source, + } + }) + } + + /// `chmod` entry `name` to `mode`. Linux's `fchmodat` has no working + /// no-follow mode (it returns `ENOTSUP`), so this follows a final symlink; + /// call it only after [`stat`](Self::stat) has proven `name` is not a + /// symlink, so the follow is a no-op on a real (non-link) entry. + pub fn chmod(&self, name: &Path, mode: u32) -> Result<(), Error> { + fchmodat( + Some(self.0.as_raw_fd()), + name, + Mode::from_bits_truncate(mode), + FchmodatFlags::FollowSymlink, + ) + .map_err(|source| Error::Chmod { + name: name.to_path_buf(), + source, + }) + } + /// Remove the non-directory entry `name` from this directory. pub fn unlink(&self, name: &Path) -> Result<(), Error> { unlinkat(Some(self.0.as_raw_fd()), name, UnlinkatFlags::NoRemoveDir).map_err(|source| { diff --git a/native/suidhelper/tests/tools/chroot_jail_grant_api.rs b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs new file mode 100644 index 0000000..3e6092f --- /dev/null +++ b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs @@ -0,0 +1,224 @@ +//! Contracts of the `chroot-jail grant-api` op, driven through the base-injected +//! `grant_api_under` seam so they run unprivileged in a tempdir. The promises +//! under test (refusal contracts first — they are the security boundary): +//! * shape — the socket is accepted iff it is exactly +//! `//root/api.socket` below the jail base; any other depth or a +//! leaf that is not `api.socket` is refused before any chown; +//! * lexical — a `.`/`..`/empty component or a relative path is always rejected +//! before any filesystem access; +//! * type — a regular file or a symlink planted at `api.socket` is refused +//! (`NotASocket`) and left untouched, never chmod'd; only a real socket is +//! granted; +//! * confinement — a symlinked path component is never followed, so the chown +//! can never escape the anchored jail tree (the core TOCTOU guarantee); +//! * pending — a not-yet-created socket (or half-built jail) is `Pending`, not +//! an error, so the controller keeps probing; +//! * grant — a real socket is chowned to the caller and left mode 0660. + +use hyper_suidhelper::tools::chroot_jail::grant_api::{grant_api_under, Error, GrantOut}; +use hyper_suidhelper::util::safe_path::ValidationError; +use proptest::prelude::*; +use std::os::unix::fs::{symlink, PermissionsExt}; +use std::os::unix::net::UnixListener; +use std::path::{Path, PathBuf}; +use std::{fs, os::unix::fs::MetadataExt}; + +/// Build the canonical `/exec/id/root` parent dirs and return that dir. +fn make_root(jail: &Path) -> PathBuf { + let root = jail.join("exec").join("id").join("root"); + fs::create_dir_all(&root).unwrap(); + root +} + +#[test] +fn socket_outside_jail_base_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + let outside = tmp.path().join("elsewhere/exec/id/root/api.socket"); + let err = grant_api_under(&jail, &outside).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotUnderBase)), + "got {err:?}", + ); +} + +#[test] +fn wrong_leaf_basename_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("root").join("evil.sock"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketName(_)), "got {err:?}"); +} + +#[test] +fn too_shallow_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("api.socket"); // missing root/ + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn too_deep_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail + .join("exec") + .join("id") + .join("root") + .join("extra") + .join("api.socket"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn dotdot_traversal_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = PathBuf::from(format!("{}/exec/../id/root/api.socket", jail.display())); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::LooseComponents)), + "got {err:?}", + ); +} + +#[test] +fn relative_socket_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let err = grant_api_under(tmp.path(), Path::new("exec/id/root/api.socket")).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotAbsolute)), + "got {err:?}", + ); +} + +#[test] +fn missing_socket_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); // never created + let out = grant_api_under(jail, &socket).expect("missing socket must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +#[test] +fn missing_jail_tree_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let socket = jail.join("exec").join("id").join("root").join("api.socket"); // nothing created + let out = grant_api_under(jail, &socket).expect("half-built jail must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +// A real socket is granted: chowned to the caller (our own uid/gid, which a +// non-root process is allowed to set on a file it owns) and chmod'd 0660. +#[test] +fn real_socket_is_granted_and_chmod_0660() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); + let _listener = UnixListener::bind(&socket).unwrap(); + fs::set_permissions(&socket, fs::Permissions::from_mode(0o755)).unwrap(); + + let out = grant_api_under(jail, &socket).expect("real socket must grant"); + assert!(matches!(out, GrantOut::Granted), "got {out:?}"); + + let meta = fs::symlink_metadata(&socket).unwrap(); + assert_eq!(meta.mode() & 0o777, 0o660, "socket must be chmod'd 0660"); + assert_eq!(meta.uid(), nix::unistd::getuid().as_raw()); + assert_eq!(meta.gid(), nix::unistd::getgid().as_raw()); +} + +// A regular file planted at api.socket is refused and left untouched (not chmod'd). +#[test] +fn regular_file_at_leaf_is_refused_and_untouched() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let imposter = root.join("api.socket"); + fs::write(&imposter, b"not a socket").unwrap(); + fs::set_permissions(&imposter, fs::Permissions::from_mode(0o600)).unwrap(); + + let err = grant_api_under(jail, &imposter).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); + assert_eq!( + fs::symlink_metadata(&imposter).unwrap().mode() & 0o777, + 0o600, + "imposter file must not be chmod'd", + ); +} + +// A symlink planted at api.socket is refused: fstatat(AT_SYMLINK_NOFOLLOW) stats +// the link itself (S_IFLNK), so it is never seen as a socket and never followed. +#[test] +fn symlink_at_leaf_is_refused() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let target = tmp.path().join("real-target"); + fs::write(&target, b"secret").unwrap(); + let link = root.join("api.socket"); + symlink(&target, &link).unwrap(); + + let err = grant_api_under(jail, &link).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); +} + +// A symlinked path component must NOT be followed: the walk fails rather than +// reaching through it, so nothing outside the jail is touched. +#[test] +fn symlinked_component_does_not_escape() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + + let sentinel = tmp.path().join("sentinel"); + fs::create_dir_all(sentinel.join("id").join("root")).unwrap(); + let outside_socket = sentinel.join("id").join("root").join("api.socket"); + let _listener = UnixListener::bind(&outside_socket).unwrap(); + fs::set_permissions(&outside_socket, fs::Permissions::from_mode(0o700)).unwrap(); + + // `/exec` is a symlink to the external sentinel dir. + symlink(&sentinel, jail.join("exec")).unwrap(); + + let socket = jail.join("exec").join("id").join("root").join("api.socket"); + let _ = grant_api_under(&jail, &socket); // O_NOFOLLOW makes the walk refuse + + assert_eq!( + fs::symlink_metadata(&outside_socket).unwrap().mode() & 0o777, + 0o700, + "grant escaped through a symlinked component", + ); +} + +proptest! { + // For a socket `depth` components below the jail base with leaf `api.socket` + // (target never created), grant_api_under returns Ok(Pending) iff depth == 4 + // (i.e. 3 parents), else SocketShape. The generator emits only plain names so + // the lexical gate never fires and the leaf is always `api.socket`. + #[test] + fn shape_classification( + parents in prop::collection::vec("[a-z][a-z0-9]{0,5}", 1..6) + ) { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let mut socket = jail.to_path_buf(); + for c in &parents { + socket.push(c); + } + socket.push("api.socket"); + let res = grant_api_under(jail, &socket); + if parents.len() == 3 { + prop_assert!(matches!(res, Ok(GrantOut::Pending)), "depth 3 must be Pending, got {res:?}"); + } else { + prop_assert!(matches!(res, Err(Error::SocketShape(_))), "got {res:?}"); + } + } +} From 85bc4a4455ccae5ca3aadca76a253fa8b126eb3b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 01:51:19 +0000 Subject: [PATCH 29/34] feat(fire_vmm): grant the API socket before probing so the controller can connect AwaitingApi now hands the jailed API socket to the node user (via the new chroot-jail grant-api op) before each readiness probe. firecracker creates the socket owned by the per-VM uid, so the controller gets EACCES until the helper chowns it; granting first lets the probe (and every later API call) connect. The grant runs once (tracked by State.api_granted); :socket_pending and transient grant errors keep the controller waiting until the existing boot deadline rather than crashing, via a shared deadline-aware keep_probing path. --- lib/hyper/node/fire_vmm/state.ex | 87 ++++++++++++++++++++-------- lib/hyper/suid_helper/chroot_jail.ex | 21 +++++++ 2 files changed, 84 insertions(+), 24 deletions(-) diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index 552da0f..0ca98e7 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do } @enforce_keys [:opts] - defstruct [:opts, :spec, :boot_deadline] + defstruct [:opts, :spec, :boot_deadline, api_granted: false] @type t :: %State{ opts: Opts.t(), spec: BootSpec.Cold.t() | nil, - boot_deadline: integer() | nil + boot_deadline: integer() | nil, + api_granted: boolean() } # How long to wait for the daemon's API to come up before failing the boot. @@ -110,7 +111,8 @@ defmodule Hyper.Node.FireVMM.State do @moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`." alias Hyper.Firecracker.Api.{InstanceInfo, Operations} - alias Hyper.Node.FireVMM.{Client, Opts} + alias Hyper.Node.FireVMM.{Client, Jailer, Opts} + alias Hyper.SuidHelper.ChrootJail alias Unit.Time require Logger @@ -118,35 +120,72 @@ defmodule Hyper.Node.FireVMM.State do # How often to probe the daemon's API while waiting for it. @probe_interval Time.ms(50) - # Poll the daemon's API until it answers, then configure. Give up if the - # readiness deadline passes first. + # Hand the jailed API socket to the node user, then poll the daemon's API + # until it answers and advance to `:configuring`. Give up if the readiness + # deadline passes first. The grant must happen before the probe: firecracker + # creates the socket owned by the per-VM uid, so the unprivileged controller + # gets EACCES on connect until the helper chowns it to us. def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do - case Client.run(Client.via(id), &Operations.describe_instance/1) do - {:ok, %InstanceInfo{}} -> - {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} - - {:error, reason} -> - if System.monotonic_time(:millisecond) >= data.boot_deadline do - # firecracker's own log shows the API server is up, so a persistent - # probe failure points at the host->jail socket (path or, more often, - # permissions: the jailer drops firecracker to the per-VM uid and the - # unprivileged controller cannot reach the jailed socket). Surface the - # last error rather than swallowing it into a bare :daemon_unready. - Logger.warning( - "vm #{id}: firecracker API not reachable before deadline; " <> - "last probe error: #{inspect(reason)}" - ) - - {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} - else - {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + case ensure_api_granted(id, data) do + {:cont, data} -> + case Client.run(Client.via(id), &Operations.describe_instance/1) do + {:ok, %InstanceInfo{}} -> + {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} + + {:error, reason} -> + keep_probing(id, data, reason) end + + {:wait, data, reason} -> + keep_probing(id, data, reason) end end def handle({:call, from}, :stop, data) do {:next_state, :stopping, data, [{:reply, from, :ok}]} end + + # Ensure the jailed API socket has been handed to the node user. Idempotent + # once granted (we record it in `data` so we ask the helper only once). + # `:socket_pending` means firecracker has not created the socket yet, so we + # keep waiting; a hard error is logged but also tolerated until the deadline + # (the probe that follows would fail with EACCES anyway and drive the stop). + @spec ensure_api_granted(Hyper.Vm.id(), State.t()) :: + {:cont, State.t()} | {:wait, State.t(), term()} + defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data} + + defp ensure_api_granted(id, data) do + case ChrootJail.grant_api(Jailer.host_socket(id)) do + :ok -> + {:cont, %{data | api_granted: true}} + + {:error, :socket_pending} -> + {:wait, data, :socket_pending} + + {:error, reason} -> + Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}") + {:wait, data, {:grant_api, reason}} + end + end + + # Keep waiting for readiness, re-arming the probe timer, unless the deadline + # has lapsed - then fail the boot, surfacing `reason` rather than swallowing + # it into a bare `:daemon_unready`. A persistent failure here points at the + # host->jail socket (path or, more often, the grant/permission step above). + @spec keep_probing(Hyper.Vm.id(), State.t(), term()) :: + {:keep_state, State.t(), list()} | {:stop, term(), State.t()} + defp keep_probing(id, data, reason) do + if System.monotonic_time(:millisecond) >= data.boot_deadline do + Logger.warning( + "vm #{id}: firecracker API not reachable before deadline; " <> + "last probe error: #{inspect(reason)}" + ) + + {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} + else + {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + end + end end defmodule Configuring do diff --git a/lib/hyper/suid_helper/chroot_jail.ex b/lib/hyper/suid_helper/chroot_jail.ex index fdc0f1c..cf8370c 100644 --- a/lib/hyper/suid_helper/chroot_jail.ex +++ b/lib/hyper/suid_helper/chroot_jail.ex @@ -57,4 +57,25 @@ defmodule Hyper.SuidHelper.ChrootJail do {:error, _} = err -> err end end + + @doc """ + Hand the firecracker API `socket` to the node user so the unprivileged + controller can `connect()` to it. The jailer drops firecracker to a per-VM + uid/gid and chroots it, so the socket it creates is owned by that per-VM id and + the node (a different uid) gets `EACCES` on connect. The helper chowns just + that one socket to its caller (the node user) and chmods it `0660`, leaving the + rest of the per-VM isolation intact. + + Returns `{:error, :socket_pending}` while firecracker has not yet created the + socket, so the caller can keep waiting. + """ + @spec grant_api(Path.t()) :: :ok | {:error, :socket_pending} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.ChrootJail.grant_api", include: [:socket]) + def grant_api(socket) do + case SuidHelper.exec(["chroot-jail", "grant-api", "--socket", socket]) do + {:ok, %{"result" => "granted"}} -> :ok + {:ok, %{"result" => "pending"}} -> {:error, :socket_pending} + {:error, _} = err -> err + end + end end From 71d64403d43880f5889844bb2819bf24f7228905 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 02:19:17 +0000 Subject: [PATCH 30/34] fix(hyper): generate alphanumeric vm ids (firecracker rejects _) gen_vm_id used Base.url_encode64, which emits - and _. firecracker rejects _ in an instance id (InvalidInstanceId / "Invalid char (_)"), so any id containing one crash-looped the jailer at boot. Switch to lowercase base32 ([a-z2-7], alphanumeric only) - the intersection of the firecracker, dm/jailer, and registry-key constraints. Strengthen the property from "no leading -" to "strictly alphanumeric". --- lib/hyper.ex | 19 +++++++++++++++++-- test/hyper/node/fire_vmm/jailer_test.exs | 7 +++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lib/hyper.ex b/lib/hyper.ex index c29d2f0..ce946a6 100644 --- a/lib/hyper.ex +++ b/lib/hyper.ex @@ -34,9 +34,24 @@ defmodule Hyper do end end - @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)." + @doc """ + Generate a fresh VM id: a `v` prefix followed by lowercase base32 of 10 random + bytes, charset `[a-z2-7]`. + + Alphanumeric only - no `-`, `_`, or other punctuation. That is the intersection + of three independent constraints the id must satisfy at once: + + * firecracker rejects `_` in an instance id (`InvalidInstanceId`); + * dm/jailer names must not start with `-`; + * registry keys and chroot path components stay trivially safe. + + The previous base64url encoding emitted `-` and `_`, so it could produce ids + firecracker refused at boot (`Invalid char (_)`). + """ @spec gen_vm_id() :: Hyper.Vm.id() - def gen_vm_id, do: "v" <> Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false) + def gen_vm_id do + "v" <> Base.encode32(:crypto.strong_rand_bytes(10), padding: false, case: :lower) + end @spec resolve_arch(Hyper.Vm.Instance.arch() | nil) :: {:ok, Hyper.Vm.Instance.arch()} | {:error, term()} diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs index 9ac1753..1ca862f 100644 --- a/test/hyper/node/fire_vmm/jailer_test.exs +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -80,9 +80,12 @@ defmodule Hyper.Node.FireVMM.JailerTest do assert Enum.any?(args, &String.starts_with?(&1, "memory.max=")) end - property "gen_vm_id/0 never produces an id starting with -" do + # firecracker rejects an instance id containing `_` (and dm/jailer names must + # not lead with `-`), so the id must be strictly alphanumeric. + property "gen_vm_id/0 produces only alphanumeric ids" do check all(_ <- StreamData.constant(nil)) do - refute String.starts_with?(Hyper.gen_vm_id(), "-") + id = Hyper.gen_vm_id() + assert id =~ ~r/\A[A-Za-z0-9]+\z/ end end end From b3c2315bf94462af071fc1203a9bf1cfe5d7d599 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 02:19:26 +0000 Subject: [PATCH 31/34] fix(fire_vmm): self-register per-VM names in init, not via a start name Starting a process with a {:via, Horde.Registry, _} name makes OTP run gen:get_proc_name right after start: it calls whereis_name immediately after the synchronous register. Horde materialises the name into local ETS only asynchronously (DeltaCRDT diff loop), so under registry churn the read loses the race and startup aborts with {:process_not_registered_via, Horde.Registry}. The crash-storm from the bad vm_id flooded the CRDT and tipped this over, killing FireVMM.State. Add Routing.register_self/1 and have the supervisor, client, and state machine register from their own init (started unnamed). Names stay cluster-resolvable via via/1 once the diff propagates - callers already tolerate that lag. Core starts unnamed entirely (resolved by nobody). --- lib/hyper/cluster/routing.ex | 20 ++++++++++++++++++++ lib/hyper/node/fire_vmm.ex | 30 +++++++++++++++++++----------- lib/hyper/node/fire_vmm/client.ex | 31 +++++++++++++++++++++---------- lib/hyper/node/fire_vmm/core.ex | 5 ++++- lib/hyper/node/fire_vmm/state.ex | 28 ++++++++++++++++++++-------- 5 files changed, 84 insertions(+), 30 deletions(-) diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex index c780259..7b47836 100644 --- a/lib/hyper/cluster/routing.ex +++ b/lib/hyper/cluster/routing.ex @@ -28,6 +28,26 @@ defmodule Hyper.Cluster.Routing do @spec via(term()) :: {:via, module(), {atom(), term()}} def via(key), do: {:via, Horde.Registry, {@name, key}} + @doc """ + Register the calling process under `key` from inside its own `init`. + + Prefer this over starting a process with a `{:via, Horde.Registry, _}` name. + OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name` + immediately after the synchronous `register`, but Horde materialises the name + into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under + registry churn that read loses the race and OTP aborts startup with + `{:process_not_registered_via, Horde.Registry}`. Registering from within + `init` carries no such self-check, while leaving the name cluster-resolvable + through `via/1` once the diff propagates (callers already tolerate that lag). + """ + @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}} + def register_self(key) do + case Horde.Registry.register(@name, key, nil) do + {:ok, _pid} -> :ok + {:error, {:already_registered, _pid}} = err -> err + end + end + @doc "Which node currently runs `vm_id`? `nil` if unknown." @spec whereis(Hyper.Vm.id()) :: node() | nil @decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id]) diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex index 5938862..f9e45fb 100644 --- a/lib/hyper/node/fire_vmm.ex +++ b/lib/hyper/node/fire_vmm.ex @@ -47,7 +47,7 @@ defmodule Hyper.Node.FireVMM do @spec start_link(Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id)) + Supervisor.start_link(__MODULE__, opts) end @spec child_spec(Opts.t()) :: Supervisor.child_spec() @@ -64,18 +64,26 @@ defmodule Hyper.Node.FireVMM do @impl true def init(opts) do - children = [ - # Client must be registered before Core: Core starts the State machine, - # which calls Client.run while waiting for the daemon's API. Client depends - # only on vm_id (an independent peer), so it has no reverse dependency. - {Client, %Client.Opts{vm_id: opts.vm_id}}, - {Core, opts} - ] + # Self-register the cluster routing entry here rather than via a start name; + # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never + # collides, so `:already_registered` only happens against a stale dead + # incarnation - decline the start and let the supervisor retry clean. + case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do + :ok -> + children = [ + # Client must be registered before Core: Core starts the State machine, + # which calls Client.run while waiting for the daemon's API. Client + # depends only on vm_id (an independent peer), so no reverse dependency. + {Client, %Client.Opts{vm_id: opts.vm_id}}, + {Core, opts} + ] - Supervisor.init(children, strategy: :one_for_one) - end + Supervisor.init(children, strategy: :one_for_one) - defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor}) + {:error, _} -> + :ignore + end + end @doc "Test whether the system can run firecracker VMMs." @spec test_system() :: :ok | {:error, term()} diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex index a336b09..2ed991c 100644 --- a/lib/hyper/node/fire_vmm/client.ex +++ b/lib/hyper/node/fire_vmm/client.ex @@ -57,15 +57,12 @@ defmodule Hyper.Node.FireVMM.Client do @type t :: %__MODULE__{socket_path: Path.t()} end + # Prod path (vm_id, no explicit name) starts unnamed and self-registers in + # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override + # (test stand-ins) is honoured as a plain local name and skips registration. @spec start_link(Opts.t()) :: GenServer.on_start() def start_link(%Opts{} = opts) do - name = - case opts.name do - nil when not is_nil(opts.vm_id) -> via(opts.vm_id) - other -> other - end - - GenServer.start_link(__MODULE__, opts, gen_opts(name)) + GenServer.start_link(__MODULE__, opts, gen_opts(opts.name)) end @spec via(Hyper.Vm.id()) :: GenServer.name() @@ -78,12 +75,26 @@ defmodule Hyper.Node.FireVMM.Client do end @impl true - @spec init(Opts.t()) :: {:ok, State.t()} + @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}} def init(%Opts{} = opts) do - socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) - {:ok, %State{socket_path: socket_path}} + with :ok <- register(opts) do + socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) + {:ok, %State{socket_path: socket_path}} + end + end + + # Register cluster-wide under {vm_id, :client} on the prod path. With an + # explicit name (test stand-in), the name is the local registration, so skip. + @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}} + defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do + case Hyper.Cluster.Routing.register_self({vm_id, :client}) do + :ok -> :ok + {:error, reason} -> {:stop, reason} + end end + defp register(%Opts{}), do: :ok + @impl true def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do {:reply, op_fun.(socket_path: socket_path), state} diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex index 8ccf618..18dfeb5 100644 --- a/lib/hyper/node/fire_vmm/core.ex +++ b/lib/hyper/node/fire_vmm/core.ex @@ -28,9 +28,12 @@ defmodule Hyper.Node.FireVMM.Core do alias Hyper.Node.FireVMM.Daemon alias Hyper.Node.FireVMM.State + # Started unnamed: nothing resolves the core by name (it is addressed as a + # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a + # needless racy Horde registration at startup. @spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core})) + Supervisor.start_link(__MODULE__, opts) end @impl true diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index 0ca98e7..7a1749d 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -55,8 +55,11 @@ defmodule Hyper.Node.FireVMM.State do %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} end - def start_link(%Opts{vm_id: id} = opts) do - :gen_statem.start_link(via(id), __MODULE__, opts, []) + # Started unnamed; the controller self-registers under `{id, :state}` from + # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves + # it cluster-wide through `via/1`. + def start_link(%Opts{} = opts) do + :gen_statem.start_link(__MODULE__, opts, []) end @spec stop(Hyper.Vm.id()) :: :ok @@ -73,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do # The daemon is already (being) started by `Core` as our sibling. Read the root # device off the per-VM mutable layer, resolve the boot spec, set the readiness # deadline, and start probing the API. - def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do - spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) - deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) - data = %State{opts: opts, spec: spec, boot_deadline: deadline} - - {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + def init( + %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = + opts + ) do + case Hyper.Cluster.Routing.register_self({id, :state}) do + :ok -> + spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) + deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) + data = %State{opts: opts, spec: spec, boot_deadline: deadline} + + {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + + {:error, reason} -> + {:stop, reason} + end end # Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel + From 41961533b98b65020457acaf4e61ca222bf8d2d1 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 02:44:27 +0000 Subject: [PATCH 32/34] fix(suidhelper): grant-api opens the jail root dir for node traversal grant-api chowned the API socket to the caller, but the jailer leaves /root as 0700 owned by the per-VM uid. connect() needs search (+x) on every ancestor, so the unprivileged node still got EACCES traversing into root - the socket's owner was irrelevant. The op now also opens that one directory to the caller's group: owner stays the per-VM uid (firecracker needs it), chgrp to the caller's gid, chmod 0710 (owner rwx, group --x traverse-not-list, other none). Unrelated users stay locked out; only the socket and its parent's group/mode move. Add SafeDir::chmod_self/chgrp_self (fchmod/fchown on the pinned root fd, not by name - TOCTOU-safe). Extend the grant test to assert root is chgrp'd to the caller and chmod'd 0710. --- .../src/tools/chroot_jail/grant_api.rs | 40 +++++++++++++++---- native/suidhelper/src/util/safe_dir.rs | 24 ++++++++++- .../tests/tools/chroot_jail_grant_api.rs | 19 ++++++++- 3 files changed, 73 insertions(+), 10 deletions(-) diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs index a74b548..4ec94a2 100644 --- a/native/suidhelper/src/tools/chroot_jail/grant_api.rs +++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs @@ -4,13 +4,22 @@ //! //! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker //! then creates its API socket at `/root/api.socket` owned by that per-VM -//! id, mode `0755`. Connecting a unix socket needs *write* permission on the -//! node, so the node user (a different uid) gets `EACCES`. This op chowns just -//! that one socket to the helper's CALLER — `getuid()`/`getgid()`, which inside -//! the privileged scope are the real (caller) ids while euid is 0 — and chmods -//! it `0660`. The node thus connects as owner, and humans added to the node's -//! group connect via the group bit. Per-VM isolation is otherwise untouched: -//! only this single socket moves, nothing else in the jail. +//! id. Connecting a unix socket needs *write* permission on the node, so the +//! node user (a different uid) gets `EACCES`. This op chowns just that one +//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the +//! privileged scope are the real (caller) ids while euid is 0 — and chmods it +//! `0660`. The node thus connects as owner, and humans added to the node's +//! group connect via the group bit. +//! +//! That alone is not enough: the jailer leaves `/root` as `0700` owned by +//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so +//! the node cannot even traverse into `root` to reach the (now its own) socket. +//! So this op also opens just that one directory to the caller's group: it keeps +//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the +//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not +//! list), other none. Per-VM isolation is otherwise untouched: only this socket +//! and its immediate parent's group/mode move, nothing else in the jail, and +//! unrelated users stay locked out. //! //! Security: the socket path is validated as a `SafePath` and reached by an //! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect @@ -45,6 +54,11 @@ const SOCKET_PARENT_DEPTH: usize = 3; /// Mode handed to the node: owner+group read/write, no world access. const SOCKET_MODE: u32 = 0o660; +/// Mode set on the jail `root` dir so the node's group can *traverse* it to +/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x` +/// (traverse, not list), other none. +const JAIL_ROOT_MODE: u32 = 0o710; + type LexicalPath = SafePath; #[derive(Debug, ThisError)] @@ -65,6 +79,10 @@ pub enum Error { Chown(#[source] safe_dir::Error), #[error("chmoding the socket: {0}")] Chmod(#[source] safe_dir::Error), + #[error("chgrp-ing the jail root dir to the caller: {0}")] + ChgrpRoot(#[source] safe_dir::Error), + #[error("chmoding the jail root dir for traversal: {0}")] + ChmodRoot(#[source] safe_dir::Error), } #[derive(Args)] @@ -139,6 +157,14 @@ pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result Result<(), Error> { + fchmod(self.0.as_raw_fd(), Mode::from_bits_truncate(mode)).map_err(|source| Error::Chmod { + name: PathBuf::from("."), + source, + }) + } + + /// `fchown` this directory's group through its own held fd, preserving its + /// owner (no uid passed). Same TOCTOU guarantee as [`chmod_self`](Self::chmod_self). + pub fn chgrp_self(&self, gid: u32) -> Result<(), Error> { + fchown(self.0.as_raw_fd(), None, Some(Gid::from_raw(gid))).map_err(|source| Error::Chown { + name: PathBuf::from("."), + source, + }) + } + /// Remove the non-directory entry `name` from this directory. pub fn unlink(&self, name: &Path) -> Result<(), Error> { unlinkat(Some(self.0.as_raw_fd()), name, UnlinkatFlags::NoRemoveDir).map_err(|source| { diff --git a/native/suidhelper/tests/tools/chroot_jail_grant_api.rs b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs index 3e6092f..e55a679 100644 --- a/native/suidhelper/tests/tools/chroot_jail_grant_api.rs +++ b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs @@ -13,7 +13,9 @@ //! can never escape the anchored jail tree (the core TOCTOU guarantee); //! * pending — a not-yet-created socket (or half-built jail) is `Pending`, not //! an error, so the controller keeps probing; -//! * grant — a real socket is chowned to the caller and left mode 0660. +//! * grant — a real socket is chowned to the caller and left mode 0660, and +//! its parent `root` dir is opened for the caller's group to traverse +//! (chgrp'd to the caller, chmod'd 0710) so the node can reach the socket. use hyper_suidhelper::tools::chroot_jail::grant_api::{grant_api_under, Error, GrantOut}; use hyper_suidhelper::util::safe_path::ValidationError; @@ -134,6 +136,21 @@ fn real_socket_is_granted_and_chmod_0660() { assert_eq!(meta.mode() & 0o777, 0o660, "socket must be chmod'd 0660"); assert_eq!(meta.uid(), nix::unistd::getuid().as_raw()); assert_eq!(meta.gid(), nix::unistd::getgid().as_raw()); + + // The parent `root` dir must also be opened for the caller's group to + // traverse, else the node could not reach the socket: chgrp'd to the caller, + // chmod'd 0710 (owner rwx, group --x, other none). + let root_meta = fs::symlink_metadata(&root).unwrap(); + assert_eq!( + root_meta.mode() & 0o777, + 0o710, + "jail root must be chmod'd 0710 for traversal", + ); + assert_eq!( + root_meta.gid(), + nix::unistd::getgid().as_raw(), + "jail root must be chgrp'd to the caller", + ); } // A regular file planted at api.socket is refused and left untouched (not chmod'd). From 528487855a9960e12750d5fa022b14fdd2d28da2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 05:22:04 +0000 Subject: [PATCH 33/34] fix(suidhelper): resolve dm symlink to real node before rootfs open The rootfs device handed to chroot-jail staging is /dev/mapper/hyper-rw-, a symlink into the dm farm. SafeFile opens it O_NOFOLLOW (it must never follow an attacker-supplied symlink), so the dm symlink itself was rejected with "file is not of the required type" and the guest never booted. Resolve the device to its real /dev/dm-N node via canonicalize before the open. Safe because the BlockDev lexical guard already proved the name is a hyper-owned dm device and /dev/mapper is a root-owned 0755 dir an unprivileged caller cannot redirect; loop nodes canonicalize to themselves. The re-opened target is still verified IsBlockDevice. --- native/suidhelper/src/util/chroot_jail.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/native/suidhelper/src/util/chroot_jail.rs b/native/suidhelper/src/util/chroot_jail.rs index 70d4ce7..6a61818 100644 --- a/native/suidhelper/src/util/chroot_jail.rs +++ b/native/suidhelper/src/util/chroot_jail.rs @@ -43,6 +43,12 @@ pub enum Error { }, #[error("copy kernel: {0}")] Copy(#[source] io::Error), + #[error("resolving block device {path}: {source}")] + ResolveDev { + path: PathBuf, + #[source] + source: io::Error, + }, } /// An unfilled artifact slot. @@ -190,8 +196,18 @@ pub fn stage_kernel_under( /// `uid:gid`. The device is opened as a verified `SafeFile`, so the /// number comes from a real device node, never a caller-supplied value. fn make_rootfs(chroot: &SafeDir, device: &BlockDev, uid: u32, gid: u32) -> Result<(), Error> { - let dev_path: SafePath = - device.as_ref().to_path_buf().try_into()?; + // `device` is lexically validated as `/dev/loopN` or `/dev/mapper/hyper-*`. + // The dm form is a symlink under `/dev/mapper` — a root-owned `0755` dir an + // unprivileged caller cannot write — so resolving it to its real `/dev/dm-N` + // node cannot be redirected by the caller. We must resolve first: `SafeFile` + // opens `O_NOFOLLOW` (it must never follow an attacker-supplied symlink) and + // would otherwise reject the dm symlink itself. Loop nodes resolve to + // themselves. The re-opened target is still verified `IsBlockDevice`. + let real = std::fs::canonicalize(device.as_ref()).map_err(|source| Error::ResolveDev { + path: device.as_ref().to_path_buf(), + source, + })?; + let dev_path: SafePath = real.try_into()?; let dev = SafeFile::::open(&dev_path, OFlag::O_PATH)?; let rdev = dev.rdev()?; chroot.mknod_block(Path::new(ROOTFS_NAME), rdev, uid, gid)?; From f2dc209171a3e6af495cb34ca86abe0f201678f5 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 26 Jun 2026 05:22:08 +0000 Subject: [PATCH 34/34] feat(fire_vmm): log boot failures in the Configuring state Both staging and config-apply failures end the boot via a :one_for_all supervisor restart, so without an explicit log the reason vanished into the restart cycle and the VM just appeared to relaunch for no visible cause. Log the reason at each failure path before stopping. --- lib/hyper/node/fire_vmm/state.ex | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index 7a1749d..fa87d9b 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -208,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations} alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts} + require Logger + # Stage boot artifacts into the chroot, then issue the pre-boot config and - # start the guest. + # start the guest. Both failure paths end the boot via a supervisor restart, + # so log the reason here - otherwise it vanishes into the `:one_for_all` + # cycle and the VM just appears to relaunch for no visible cause. def handle( :state_timeout, :configure, @@ -218,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do case ChrootJail.stage(id, uid, gid, spec) do {:ok, jailed_spec} -> case apply_spec(id, jailed_spec) do - :ok -> {:next_state, :running, data} - {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data} + :ok -> + Logger.info("vm #{id}: configured, guest starting") + {:next_state, :running, data} + + {:error, reason} -> + Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}") + {:stop, {:shutdown, {:boot_failed, reason}}, data} end {:error, reason} -> + Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}") {:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data} end end