diff --git a/config/runtime.exs b/config/runtime.exs index a3b4fd0f..c9ca89fb 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -14,17 +14,27 @@ config :hyper, Hyper.Node.Config.Budget, # Where to send traces. Defaults to Honeycomb; override OTEL_EXPORTER_OTLP_* # to point at any OTLP/HTTP backend (Collector, Grafana, etc). if config_env() != :test do - endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT", "https://api.honeycomb.io") + custom_endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT") + api_key = System.get_env("HONEYCOMB_API_KEY") - headers = - case System.get_env("HONEYCOMB_API_KEY") do - nil -> [] - "" -> [] - key -> [{"x-honeycomb-team", key}] - end + cond do + api_key not in [nil, ""] -> + config :opentelemetry_exporter, + otlp_protocol: :http_protobuf, + otlp_endpoint: custom_endpoint || "https://api.honeycomb.io", + otlp_headers: [{"x-honeycomb-team", api_key}] - config :opentelemetry_exporter, - otlp_protocol: :http_protobuf, - otlp_endpoint: endpoint, - otlp_headers: headers + custom_endpoint not in [nil, ""] -> + # A custom OTLP backend (e.g. a local Collector) needs no Honeycomb key. + config :opentelemetry_exporter, + otlp_protocol: :http_protobuf, + otlp_endpoint: custom_endpoint, + otlp_headers: [] + + true -> + # No backend configured: exporting to the Honeycomb default with no key + # 401s on every batch. Stay silent instead (typical for local dev). Set + # HONEYCOMB_API_KEY or OTEL_EXPORTER_OTLP_ENDPOINT to enable tracing. + config :opentelemetry, traces_exporter: :none + end end diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index a0030d35..57aa4ff0 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -19,13 +19,187 @@ The absolute best way to get started with `Hyper` is to play with it. ### Requirements -Hyper requires the following software be installed on each node running it: +#### External services - - [`skopeo`](https://github.com/containers/skopeo) - - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) +Hyper needs a **PostgreSQL** server reachable from every node - it is the image +database and the only stateful external dependency. -Hyper has more runtime dependencies, but they are automatically redistributed -by Hyper. +For local development the quickest path is Docker. The connection details below +match the defaults in `config/config.exs` (`Hyper.Img.Db.Repo`): + +```sh +docker run -d --name hyper-pg \ + -e POSTGRES_USER=postgres \ + -e POSTGRES_PASSWORD=postgres \ + -e POSTGRES_DB=hyper_dev \ + -p 5432:5432 \ + postgres:16 +``` + +Once it is up, create and migrate the schema (the repo is not in `ecto_repos`, +so pass it with `-r`): + +```sh +mix ecto.create -r Hyper.Img.Db.Repo +mix ecto.migrate -r Hyper.Img.Db.Repo +``` + +The container is ephemeral; `docker start hyper-pg` brings it back after a +reboot. To point Hyper at an existing server instead, override the +`Hyper.Img.Db.Repo` block in your `config.exs`. + +#### System binaries + +These are used by the unprivileged node directly; each must be on the node's +`PATH` (the bracketed override is the `config :hyper` key you can set if the +binary lives elsewhere): + + - [`skopeo`](https://github.com/containers/skopeo) - pulls OCI images + (`skopeo_path`) + - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - provides `mke2fs`, which + builds the ext4 rootfs (`mke2fs_path`) + - `du`, `getent` (from **coreutils** and **glibc**) - rootfs sizing and user + resolution. Present on essentially every distro. + +The privileged device binaries - `losetup`, `blockdev` (from **util-linux**) +and `dmsetup` (from **lvm2** / device-mapper) - are run only by the setuid +helper, never named by the unprivileged caller. Their paths therefore live in +the helper's own config, `/etc/hyper/config.toml`, and default to +`/usr/sbin/{losetup,blockdev,dmsetup}`. + +**The config file must exist** to set `firecracker` and `jailer` (no built-in +defaults for those). The device-tool paths (`dmsetup`, `losetup`, `blockdev`) +and `work_dir` do have built-in defaults, so if you only need those defaults +and are not running VMs you may omit the file entirely. When the file is +present it must be root-owned and not group/other-writable, or the helper +refuses to start (a present-but-untrusted file is treated as an attack signal, +unlike a missing one): + +```toml +# /etc/hyper/config.toml (root-owned, mode 0644) +work_dir = "/srv/hyper" + +# REQUIRED - no default. Each must be an absolute path to a root-owned, +# non-group/world-writable binary named exactly "firecracker" or "jailer" +# (the helper validates the basename). Run `mix firecracker.install` to +# download the pinned release and print these values. +firecracker = "/opt/firecracker/firecracker" +jailer = "/opt/firecracker/jailer" + +# Optional device-tool overrides; default to /usr/sbin/{dmsetup,losetup,blockdev}. +# Each must be root-owned and not group/world-writable. +dmsetup = "/usr/sbin/dmsetup" +losetup = "/usr/sbin/losetup" +blockdev = "/usr/sbin/blockdev" + +# Optional. Governs which uid/gid values the helper accepts when launching the +# jailer. Must satisfy min > 0 and min <= max. Defaults to {900000, 999999}. +# If you narrow this range, set the same bounds in `config :hyper, uid_gid_range:` +# so the node hands out only uids the helper will accept. +[uid_gid_range] +min = 900000 +max = 999999 +``` + +`dmsetup` (lvm2) is frequently *not* installed by default - check that one +first. + +#### Kernel features + +The host kernel must provide: + + - **KVM** - `/dev/kvm` must exist and be accessible to the per-VM users (see + the `uid_gid_range` configuration). + - **cgroup v2** - the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only + hosts are not supported. + - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` - from the + `dm_snapshot` (provides `snapshot`) and `dm_thin_pool` (provides `thin` and + `thin-pool`) modules. Hyper refuses to start without all three; on boot it + fails with `{:missing_dm_targets, [...]}` listing whichever are absent. + - **loop devices** - the `loop` module, used to attach layer images as block + devices. + +Load the modules and confirm the targets are present: + +```sh +sudo modprobe dm_snapshot dm_thin_pool loop +sudo dmsetup targets # must list snapshot, thin, and thin-pool +``` + +If `modprobe` reports the module is missing, the running kernel lacks it - +minimal cloud images often strip device-mapper. On Debian/Ubuntu, install the +extra modules for the running kernel, then load them: + +```sh +sudo apt-get install -y linux-modules-extra-$(uname -r) +sudo modprobe dm_snapshot dm_thin_pool loop +``` + +Make the modules load on every boot: + +```sh +printf 'dm_snapshot\ndm_thin_pool\nloop\n' | sudo tee /etc/modules-load.d/hyper.conf +``` + +#### Privileged setup + + - The **setuid-root device helper** (`hyper-suidhelper`) must be installed. + Run `mix suidhelper.install`, which builds, stamps, and places it + setuid-root on `PATH`. Every privileged operation (losetup, dmsetup, mknod, + chroot jails) routes through it; the BEAM itself runs unprivileged. + + The final `sudo install` step runs without a controlling terminal (Mix + captures the nested `cargo` output), so on a typical `tty_tickets` sudo + setup it cannot prompt for a password. If it fails, the build has already + stamped the binary -- just run the copy yourself: + + ```sh + sudo install -o root -g root -m 4755 \ + native/suidhelper/target/release/hyper-suidhelper \ + /usr/local/bin/hyper-suidhelper + ``` + - A **parent cgroup** named by `cgroup_parent` (default `hyper`) must exist + under the cgroup-v2 hierarchy; Hyper creates each VM's cgroup beneath it and + fails to boot with `:missing_parent_cgroup` if it is absent. Create it and + delegate the `cpu` and `memory` controllers so the per-VM cgroups can set + `cpu.max` / `memory.max`: + + ```sh + sudo mkdir -p /sys/fs/cgroup/hyper + echo '+cpu +memory' | sudo tee /sys/fs/cgroup/hyper/cgroup.subtree_control + ``` + + If that last write errors, the root hierarchy is not delegating those + controllers down yet - enable them there first, then retry the line above: + + ```sh + echo '+cpu +memory' | sudo tee /sys/fs/cgroup/cgroup.subtree_control + ``` + + The cgroup hierarchy is memory-backed, so `/sys/fs/cgroup/hyper` does **not** + survive a reboot. Re-create it each boot, or persist it with + `systemd-tmpfiles`: + + ```sh + echo 'd /sys/fs/cgroup/hyper 0755 root root -' \ + | sudo tee /etc/tmpfiles.d/hyper-cgroup.conf + ``` + - The host UID/GID range must be free for Hyper to allocate per-VM users + from. The node's range is set by `uid_gid_range` in `config :hyper`; the + helper independently reads `[uid_gid_range]` from `/etc/hyper/config.toml` + (see below) and only accepts jailer `--uid`/`--gid` within that range. + Keep the two in sync. + +#### Auto-redistributed + +`umoci` and the guest `vmlinux` kernels are downloaded, checksum-verified, and +managed by Hyper itself; you do not install them. + +`firecracker` and `jailer` are not auto-downloaded. Install them with +`mix firecracker.install [--prefix ]` (default prefix `/opt/firecracker`), +which downloads the pinned v1.16.0 release, places the binaries at +`/firecracker` and `/jailer`, and prints the `/etc/hyper/config.toml` +snippet to paste in. ### Installation @@ -40,22 +214,22 @@ configuration. ```elixir config :hyper, - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - jailer_bin: "/opt/firecracker/jailer-v1.16.0-x86_64", - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - firecracker_bin: "/opt/firecracker/firecracker-v1.16.0-x86_64", # You must create a parent cgroup on your system. Continue reading for # further details. cgroup_parent: "hyper", - # TODO(markovejnovic): Merge these directories into one. jailer_chroot_base: "/srv/hyper/jails", socket_dir: "/srv/hyper/socks", scratch_dir: "/srv/hyper/scratch", - # Hyper requires that each VM you pass + # Must match the [uid_gid_range] table in /etc/hyper/config.toml so the node + # hands out only uids the helper will accept. uid_gid_range: {900_000, 999_999}, layer_dir: "/srv/hyper/layers" ``` +The `firecracker` and `jailer` binary paths are **not** set here — they are read +from `/etc/hyper/config.toml` (the single source of truth shared with the setuid +helper). See the `config.toml` example above. + ### Usage diff --git a/lib/hyper.ex b/lib/hyper.ex index fb0a626e..ce946a6a 100644 --- a/lib/hyper.ex +++ b/lib/hyper.ex @@ -34,9 +34,24 @@ defmodule Hyper do end end - @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)." + @doc """ + Generate a fresh VM id: a `v` prefix followed by lowercase base32 of 10 random + bytes, charset `[a-z2-7]`. + + Alphanumeric only - no `-`, `_`, or other punctuation. That is the intersection + of three independent constraints the id must satisfy at once: + + * firecracker rejects `_` in an instance id (`InvalidInstanceId`); + * dm/jailer names must not start with `-`; + * registry keys and chroot path components stay trivially safe. + + The previous base64url encoding emitted `-` and `_`, so it could produce ids + firecracker refused at boot (`Invalid char (_)`). + """ @spec gen_vm_id() :: Hyper.Vm.id() - def gen_vm_id, do: Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false) + def gen_vm_id do + "v" <> Base.encode32(:crypto.strong_rand_bytes(10), padding: false, case: :lower) + end @spec resolve_arch(Hyper.Vm.Instance.arch() | nil) :: {:ok, Hyper.Vm.Instance.arch()} | {:error, term()} diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex index c7802595..7b478363 100644 --- a/lib/hyper/cluster/routing.ex +++ b/lib/hyper/cluster/routing.ex @@ -28,6 +28,26 @@ defmodule Hyper.Cluster.Routing do @spec via(term()) :: {:via, module(), {atom(), term()}} def via(key), do: {:via, Horde.Registry, {@name, key}} + @doc """ + Register the calling process under `key` from inside its own `init`. + + Prefer this over starting a process with a `{:via, Horde.Registry, _}` name. + OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name` + immediately after the synchronous `register`, but Horde materialises the name + into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under + registry churn that read loses the race and OTP aborts startup with + `{:process_not_registered_via, Horde.Registry}`. Registering from within + `init` carries no such self-check, while leaving the name cluster-resolvable + through `via/1` once the diff propagates (callers already tolerate that lag). + """ + @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}} + def register_self(key) do + case Horde.Registry.register(@name, key, nil) do + {:ok, _pid} -> :ok + {:error, {:already_registered, _pid}} = err -> err + end + end + @doc "Which node currently runs `vm_id`? `nil` if unknown." @spec whereis(Hyper.Vm.id()) :: node() | nil @decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id]) diff --git a/lib/hyper/cluster/scheduler.ex b/lib/hyper/cluster/scheduler.ex index 1412a0b2..4877658c 100644 --- a/lib/hyper/cluster/scheduler.ex +++ b/lib/hyper/cluster/scheduler.ex @@ -16,6 +16,8 @@ defmodule Hyper.Cluster.Scheduler do alias Hyper.Vm.Instance.Spec alias Unit.Information + require Logger + use OpenTelemetryDecorator @type layer_sizes :: [{Hyper.Layer.id(), Unit.Information.t()}] @@ -45,8 +47,15 @@ defmodule Hyper.Cluster.Scheduler do |> candidates(layers) |> Enum.reduce_while({:error, :no_capacity}, fn node, acc -> case attempt.(node) do - {:ok, result} -> {:halt, {:ok, {node, result}}} - {:error, _reason} -> {:cont, acc} + {:ok, result} -> + {:halt, {:ok, {node, result}}} + + {:error, reason} -> + # The candidate fit the snapshot but refused at confirmation time. + # Log the real reason: otherwise an actual boot failure on the only + # candidate is indistinguishable from genuine `:no_capacity`. + Logger.warning("scheduler: #{inspect(node)} refused placement: #{inspect(reason)}") + {:cont, acc} end end) end diff --git a/lib/hyper/config.ex b/lib/hyper/config.ex index 1332d9ee..0704c647 100644 --- a/lib/hyper/config.ex +++ b/lib/hyper/config.ex @@ -2,10 +2,10 @@ defmodule Hyper.Config do @moduledoc """ Host configuration, read from `config :hyper, ...` (see `config/config.exs`). - `work_dir` is the one value shared with the setuid helper - (`native/suidhelper`); both sides read it from `/etc/hyper/config.toml` so the - data root has a single source of truth (see `work_dir/0`). Everything else is - compile-time. + Runtime values shared with the setuid helper (`native/suidhelper`) — `work_dir`, + `firecracker`, `jailer` — are read from `/etc/hyper/config.toml` (the single + source of truth for both sides) the first time they are needed, then cached in + `:persistent_term`. Everything else is compile-time. """ # The shared data-root config file, read by both this node and the setuid @@ -16,9 +16,6 @@ defmodule Hyper.Config do @parent_cgroup Application.compile_env(:hyper, :cgroup_parent, "hyper") @uid_gid_range Application.compile_env!(:hyper, :uid_gid_range) @layer_dir Application.compile_env!(:hyper, :layer_dir) - @losetup_path Application.compile_env(:hyper, :losetup_path, "losetup") - @dmsetup_path Application.compile_env(:hyper, :dmsetup_path, "dmsetup") - @blockdev_path Application.compile_env(:hyper, :blockdev_path, "blockdev") @skopeo_path Application.compile_env(:hyper, :skopeo_path, "skopeo") @umoci_path Application.compile_env(:hyper, :umoci_path, nil) @mke2fs_path Application.compile_env(:hyper, :mke2fs_path, "mke2fs") @@ -27,39 +24,86 @@ defmodule Hyper.Config do Root work directory for this node. All firecracker paths derive from it. Read from `#{@config_path}` (the single source of truth shared with the setuid - helper) the first time it is needed, then cached in `:persistent_term`. Falls - back to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the - helper is not installed anyway). + helper) the first time it is needed, then cached via `config_toml/0`. Falls back + to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the helper + is not installed anyway). """ @spec work_dir :: Path.t() - def work_dir do - case :persistent_term.get({__MODULE__, :work_dir}, nil) do + def work_dir, do: Map.get(config_toml(), "work_dir", @dev_work_dir) + + @doc "Directory holding redistributable binaries downloaded by the node." + @spec redist_dir :: Path.t() + def redist_dir, do: Path.join(work_dir(), "redist") + + @doc """ + Absolute path to the firecracker binary, as set in `#{@config_path}` under the + `firecracker` key. Raises if the key is absent — the operator must configure it; + there is no default. + + For the launch path only. Pre-launch checks should use `firecracker_bin_configured/0` + so a missing key returns a typed error rather than crashing. + """ + @spec firecracker_bin :: Path.t() + def firecracker_bin, do: fetch_bin!("firecracker") + + @doc """ + Non-raising form of `firecracker_bin/0`. Returns `{:ok, path}` when the + `firecracker` key is present in `#{@config_path}`, or `:error` when it is absent. + """ + @spec firecracker_bin_configured :: {:ok, Path.t()} | :error + def firecracker_bin_configured, do: Map.fetch(config_toml(), "firecracker") + + @doc """ + Absolute path to the jailer binary, as set in `#{@config_path}` under the + `jailer` key. Raises if the key is absent — the operator must configure it; + there is no default. + + For the launch path only. Pre-launch checks should use `jailer_bin_configured/0` + so a missing key returns a typed error rather than crashing. + """ + @spec jailer_bin :: Path.t() + def jailer_bin, do: fetch_bin!("jailer") + + @doc """ + Non-raising form of `jailer_bin/0`. Returns `{:ok, path}` when the `jailer` key + is present in `#{@config_path}`, or `:error` when it is absent. + """ + @spec jailer_bin_configured :: {:ok, Path.t()} | :error + def jailer_bin_configured, do: Map.fetch(config_toml(), "jailer") + + @spec fetch_bin!(String.t()) :: Path.t() + defp fetch_bin!(key) do + case Map.fetch(config_toml(), key) do + {:ok, path} -> + path + + :error -> + raise "#{@config_path}: key #{inspect(key)} is not set; " <> + "operator must configure it before starting the node" + end + end + + @spec config_toml :: map() + defp config_toml do + case :persistent_term.get({__MODULE__, :config_toml}, nil) do nil -> - work_dir = load_work_dir() - :persistent_term.put({__MODULE__, :work_dir}, work_dir) - work_dir + cfg = load_config_toml() + :persistent_term.put({__MODULE__, :config_toml}, cfg) + cfg - work_dir -> - work_dir + cfg -> + cfg end end - @spec load_work_dir :: Path.t() - defp load_work_dir do + @spec load_config_toml :: map() + defp load_config_toml do case File.read(@config_path) do - {:ok, body} -> body |> Toml.decode!() |> Map.fetch!("work_dir") - {:error, _} -> @dev_work_dir + {:ok, body} -> Toml.decode!(body) + {:error, _} -> %{} end end - @doc "Directory holding redistributable binaries downloaded by the node." - @spec redist_dir :: Path.t() - def redist_dir, do: Path.join(work_dir(), "redist") - - @doc "Directory where `Hyper.Node.FireVMM.Provider` installs the firecracker release." - @spec firecracker_install_dir :: Path.t() - def firecracker_install_dir, do: Path.join(redist_dir(), "firecracker") - @doc "Directory where `Hyper.Node.FireVMM.VmLinux.Provider` installs guest kernels." @spec vmlinux_install_dir :: Path.t() def vmlinux_install_dir, do: Path.join(redist_dir(), "vmlinux") @@ -111,15 +155,6 @@ defmodule Hyper.Config do @spec layer_dir :: Path.t() def layer_dir, do: @layer_dir - @doc "Path to the losetup binary." - def losetup_path, do: @losetup_path - - @doc "Path to the dmsetup binary." - def dmsetup_path, do: @dmsetup_path - - @doc "Path to the blockdev binary." - def blockdev_path, do: @blockdev_path - @doc "Path to the skopeo binary (used by `Hyper.Img.OciLoader` to pull OCI images)." def skopeo_path, do: @skopeo_path @@ -132,15 +167,20 @@ defmodule Hyper.Config do @doc "Path to the mke2fs binary (used by `Hyper.Img.OciLoader` to build the ext4 rootfs)." def mke2fs_path, do: @mke2fs_path + # Where `cargo xtask install` (via `mix suidhelper.install`) drops the helper. + @default_suid_helper "/usr/local/bin/hyper-suidhelper" + @doc """ - Path to the setuid-root device helper (`hyper-suidhelper`). Required: the node - runs unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation - through it. + Path to the setuid-root device helper (`hyper-suidhelper`). The node runs + unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation through + it. - Runtime config (host-specific), so it can be set per node without recompiling. + Defaults to `#{@default_suid_helper}`, the install path used by `mix + suidhelper.install`. Runtime config (host-specific), so an operator who + installs it elsewhere can override per node without recompiling. """ @spec suid_helper :: Path.t() - def suid_helper, do: Application.fetch_env!(:hyper, :suid_helper) + def suid_helper, do: Application.get_env(:hyper, :suid_helper, @default_suid_helper) @doc """ Directory for per-VM scratch (writable-layer COW) files. Must be node-local and diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index 6282eaa7..b4721167 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -40,8 +40,14 @@ defmodule Hyper.Node do def start_link(opts \\ []) do case test_system() do - :ok -> Supervisor.start_link(__MODULE__, opts, name: __MODULE__) - {:error, reason} -> {:error, reason} + :ok -> + # Clear any dm/loop devices a previous unclean shutdown left behind, + # before the device-owning children start and collide with them. + :ok = Hyper.Node.Reclaim.run() + Supervisor.start_link(__MODULE__, opts, name: __MODULE__) + + {:error, reason} -> + {:error, reason} end end @@ -49,9 +55,12 @@ defmodule Hyper.Node do def init(_opts) do children = [ Hyper.Node.Users, + # Layer owns Hyper.Node.Layer.Registry, which Budget.Advertiser queries + # (via Hyper.Node.Layer.active/0) as it advertises on init - so Layer must + # be up first. + Hyper.Node.Layer, Hyper.Node.Budget.Supervisor, {DynamicSupervisor, name: @vm_sup, strategy: :one_for_one}, - Hyper.Node.Layer, Hyper.Node.Img ] @@ -144,10 +153,11 @@ defmodule Hyper.Node do @spec test_system :: :ok | {:error, term()} def test_system do with {:ok, _} <- Hyper.Node.Config.Budget.load(), - :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(), + :ok <- check_firecracker_bins(), :ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(), :ok <- Hyper.Node.Vmlinux.test_system(), :ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(), + :ok <- Hyper.Img.OciLoader.test_system(), :ok <- Hyper.Node.Users.test_system(), :ok <- Hyper.Node.Layer.Repo.test_system(), :ok <- Hyper.SuidHelper.test_system(), @@ -157,6 +167,24 @@ defmodule Hyper.Node do end end + @spec check_firecracker_bins :: + :ok + | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + | {:error, :firecracker_not_configured | :jailer_not_configured} + defp check_firecracker_bins do + with {:fc, {:ok, fc}} <- {:fc, Hyper.Config.firecracker_bin_configured()}, + {:jail, {:ok, jail}} <- {:jail, Hyper.Config.jailer_bin_configured()} do + cond do + not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} + not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} + true -> :ok + end + else + {:fc, :error} -> {:error, :firecracker_not_configured} + {:jail, :error} -> {:error, :jailer_not_configured} + end + end + @spec check_helper_base(Path.t()) :: :ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}} defp check_helper_base(base) do diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex index 7e5603b2..f9e45fbc 100644 --- a/lib/hyper/node/fire_vmm.ex +++ b/lib/hyper/node/fire_vmm.ex @@ -47,14 +47,15 @@ defmodule Hyper.Node.FireVMM do @spec start_link(Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id)) + Supervisor.start_link(__MODULE__, opts) end + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(opts) do # Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by # the node-level DynamicSupervisor. %{ - vm_id: {__MODULE__, opts.vm_id}, + id: {__MODULE__, opts.vm_id}, start: {__MODULE__, :start_link, [opts]}, type: :supervisor, restart: :transient @@ -63,18 +64,26 @@ defmodule Hyper.Node.FireVMM do @impl true def init(opts) do - children = [ - # Client must be registered before Core: Core starts the State machine, - # which calls Client.run while waiting for the daemon's API. Client depends - # only on vm_id (an independent peer), so it has no reverse dependency. - {Client, %Client.Opts{vm_id: opts.vm_id}}, - {Core, opts} - ] + # Self-register the cluster routing entry here rather than via a start name; + # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never + # collides, so `:already_registered` only happens against a stale dead + # incarnation - decline the start and let the supervisor retry clean. + case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do + :ok -> + children = [ + # Client must be registered before Core: Core starts the State machine, + # which calls Client.run while waiting for the daemon's API. Client + # depends only on vm_id (an independent peer), so no reverse dependency. + {Client, %Client.Opts{vm_id: opts.vm_id}}, + {Core, opts} + ] - Supervisor.init(children, strategy: :one_for_one) - end + Supervisor.init(children, strategy: :one_for_one) - defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor}) + {:error, _} -> + :ignore + end + end @doc "Test whether the system can run firecracker VMMs." @spec test_system() :: :ok | {:error, term()} diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex index a336b095..2ed991ce 100644 --- a/lib/hyper/node/fire_vmm/client.ex +++ b/lib/hyper/node/fire_vmm/client.ex @@ -57,15 +57,12 @@ defmodule Hyper.Node.FireVMM.Client do @type t :: %__MODULE__{socket_path: Path.t()} end + # Prod path (vm_id, no explicit name) starts unnamed and self-registers in + # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override + # (test stand-ins) is honoured as a plain local name and skips registration. @spec start_link(Opts.t()) :: GenServer.on_start() def start_link(%Opts{} = opts) do - name = - case opts.name do - nil when not is_nil(opts.vm_id) -> via(opts.vm_id) - other -> other - end - - GenServer.start_link(__MODULE__, opts, gen_opts(name)) + GenServer.start_link(__MODULE__, opts, gen_opts(opts.name)) end @spec via(Hyper.Vm.id()) :: GenServer.name() @@ -78,12 +75,26 @@ defmodule Hyper.Node.FireVMM.Client do end @impl true - @spec init(Opts.t()) :: {:ok, State.t()} + @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}} def init(%Opts{} = opts) do - socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) - {:ok, %State{socket_path: socket_path}} + with :ok <- register(opts) do + socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) + {:ok, %State{socket_path: socket_path}} + end + end + + # Register cluster-wide under {vm_id, :client} on the prod path. With an + # explicit name (test stand-in), the name is the local registration, so skip. + @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}} + defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do + case Hyper.Cluster.Routing.register_self({vm_id, :client}) do + :ok -> :ok + {:error, reason} -> {:stop, reason} + end end + defp register(%Opts{}), do: :ok + @impl true def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do {:reply, op_fun.(socket_path: socket_path), state} diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex index 8ccf6180..18dfeb50 100644 --- a/lib/hyper/node/fire_vmm/core.ex +++ b/lib/hyper/node/fire_vmm/core.ex @@ -28,9 +28,12 @@ defmodule Hyper.Node.FireVMM.Core do alias Hyper.Node.FireVMM.Daemon alias Hyper.Node.FireVMM.State + # Started unnamed: nothing resolves the core by name (it is addressed as a + # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a + # needless racy Horde registration at startup. @spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core})) + Supervisor.start_link(__MODULE__, opts) end @impl true diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex index 7cbc159c..b6257732 100644 --- a/lib/hyper/node/fire_vmm/daemon.ex +++ b/lib/hyper/node/fire_vmm/daemon.ex @@ -4,15 +4,17 @@ defmodule Hyper.Node.FireVMM.Daemon do `Hyper.Node.FireVMM.Core`. Lifecycle is supervisor-owned. On every (re)start it first resets any stale - jail left by a prior incarnation - the firecracker jailer refuses to reuse an - existing chroot - then builds the jailer command and runs it under + jail left by a prior incarnation — the firecracker jailer refuses to reuse an + existing chroot — then builds the jailer command and runs it under `MuonTrap.Daemon`, which kills the OS process when its port closes (controller crash, container teardown, or BEAM death). So no firecracker process outlives the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a firecracker crash) cleanly cold-boots against a fresh jail. - The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the - reset, then delegates and returns that pid. + The supervised process is `hyper-suidhelper jailer ...`, which `execve`s into + the jailer (same pid) so MuonTrap owns the firecracker lifetime without needing + to know the jailer path. `start_link/1` does the reset, then delegates and + returns that pid. """ alias Hyper.Node.FireVMM.{Jailer, Opts} @@ -21,6 +23,8 @@ defmodule Hyper.Node.FireVMM.Daemon do use OpenTelemetryDecorator + require Logger + @shutdown_timeout Time.s(5) @spec child_spec(Opts.t()) :: Supervisor.child_spec() @@ -44,9 +48,26 @@ defmodule Hyper.Node.FireVMM.Daemon do with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do cmd = Jailer.command(opts) - case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do - {:ok, pid} -> {:ok, pid} - {:error, _} = err -> err + # Surface what the jailed process actually does: `log_output` routes the + # helper/jailer/firecracker stdout+stderr (guest serial console included) + # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque + # `:error_exit_status` into `{:firecracker_exited, status}` so a crash + # report names the real exit code instead of hiding it. + daemon_opts = [ + log_output: :info, + log_prefix: "vm #{id} firecracker: ", + stderr_to_stdout: true, + exit_status_to_reason: &{:firecracker_exited, &1} + ] + + case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do + {:ok, pid} -> + Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})") + {:ok, pid} + + {:error, reason} = err -> + Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}") + err end end end diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex index 6dd1b800..355ad0d7 100644 --- a/lib/hyper/node/fire_vmm/jailer.ex +++ b/lib/hyper/node/fire_vmm/jailer.ex @@ -1,27 +1,26 @@ defmodule Hyper.Node.FireVMM.Jailer do @moduledoc """ - Builds the firecracker - [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md) - command for one VM. + Builds the `hyper-suidhelper jailer` command for one VM. - The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance` - flags) and drops privileges, then exec's firecracker. We run the jailer (not - firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS - process, the jailer owns isolation. + The BEAM does not invoke the jailer directly. Instead it calls the setuid helper + with the `jailer` subcommand; the helper reads the firecracker binary path, chroot + base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`, + re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps + supervising it). + + This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`, + repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates + everything else; it also inserts the `--` separator between its own flags and + firecracker's flags. Because firecracker is chrooted to `///root`, the API - socket it opens at `/api.socket` lives at `host_socket` on the host - that's the + socket it opens at `/api.socket` lives at `host_socket` on the host — that's the path the controller connects to. - - Host config: paths are derived from `config :hyper, work_dir: ...`. The - firecracker + jailer binaries are installed under `/redist/firecracker` - by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`. """ use OpenTelemetryDecorator alias Hyper.Node.FireVMM - alias Hyper.Node.FireVMM.Provider alias Hyper.Vm.Instance # firecracker's API socket path *inside* the chroot. @@ -94,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do @spec command(FireVMM.Opts.t()) :: t() def command(opts) do args = - [ - "--id", - opts.vm_id, - "--exec-file", - Provider.firecracker_bin(), - "--uid", - to_string(opts.uid), - "--gid", - to_string(opts.gid), - "--chroot-base-dir", - Hyper.Config.chroot_base(), - "--cgroup-version", - "2", - "--parent-cgroup", - Hyper.Config.parent_cgroup() - ] ++ + ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++ cgroup_flags(opts.type) ++ - ["--", "--api-sock", "/" <> @jail_socket] + ["--api-sock", "/" <> @jail_socket] - %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)} + %{binary: Hyper.Config.suid_helper(), args: args, host_socket: host_socket(opts.vm_id)} end # Find the appropriate jailer cgroup flags for the given instance type. @@ -166,5 +150,5 @@ defmodule Hyper.Node.FireVMM.Jailer do ]) end - defp exec_name, do: Path.basename(Provider.firecracker_bin()) + defp exec_name, do: Path.basename(Hyper.Config.firecracker_bin()) end diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex deleted file mode 100644 index e50aee16..00000000 --- a/lib/hyper/node/fire_vmm/provider.ex +++ /dev/null @@ -1,92 +0,0 @@ -defmodule Hyper.Node.FireVMM.Provider do - @moduledoc """ - Installs the firecracker release for the current architecture into - `Hyper.Config.firecracker_install_dir/0` (`/redist/firecracker`). - - `ensure_installed/0` is idempotent: if the binaries are already present and - executable it returns `:ok` without touching the network. Otherwise it fetches - the official firecracker release tarball for the detected architecture via - `Redist.Targz` (download, SHA-256 verify, extract). The archive is - extracted as-is - the binaries live under `release-v-/` exactly as - firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those - paths. - """ - - alias Redist.Targz - - @downloads %{ - x86_64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz", - sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", - firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64", - jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64" - }, - aarch64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz", - sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", - firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64", - jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64" - } - } - - @doc "Absolute path to the installed firecracker binary." - @spec firecracker_bin() :: Path.t() - def firecracker_bin, do: bin_path(:firecracker) - - @doc "Absolute path to the installed jailer binary." - @spec jailer_bin() :: Path.t() - def jailer_bin, do: bin_path(:jailer) - - @doc "Ensure the firecracker release is installed for this node." - @spec ensure_installed() :: :ok | {:error, term()} - def ensure_installed do - with {:ok, arch} <- Sys.Arch.current() do - dl = Map.fetch!(@downloads, arch) - - case check_install(dl) do - :ok -> :ok - {:error, :not_installed} -> install(dl) - {:error, :bad_install} -> reinstall(dl) - end - end - end - - # `:ok` if `dl`'s version-specific binaries are present and executable; - # `{:error, :not_installed}` if the install dir is empty/absent; otherwise - # `{:error, :bad_install}` - something is there but it's the wrong version, - # partial, or corrupt, which we cannot fix in place because `Targz` keeps - # existing files. The remedy is to wipe and reinstall. - @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install} - defp check_install(dl) do - fc = Path.join(install_dir(), dl.firecracker) - jail = Path.join(install_dir(), dl.jailer) - - cond do - Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) -> - :ok - - File.dir?(install_dir()) and File.ls!(install_dir()) != [] -> - {:error, :bad_install} - - true -> - {:error, :not_installed} - end - end - - defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir()) - - defp reinstall(dl) do - _ = File.rm_rf!(install_dir()) - install(dl) - end - - defp bin_path(key) do - {:ok, arch} = Sys.Arch.current() - dl = Map.fetch!(@downloads, arch) - Path.join(install_dir(), Map.fetch!(dl, key)) - end - - defp install_dir, do: Hyper.Config.firecracker_install_dir() -end diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index b390965c..fa87d9b4 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do } @enforce_keys [:opts] - defstruct [:opts, :spec, :boot_deadline] + defstruct [:opts, :spec, :boot_deadline, api_granted: false] @type t :: %State{ opts: Opts.t(), spec: BootSpec.Cold.t() | nil, - boot_deadline: integer() | nil + boot_deadline: integer() | nil, + api_granted: boolean() } # How long to wait for the daemon's API to come up before failing the boot. @@ -54,8 +55,11 @@ defmodule Hyper.Node.FireVMM.State do %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} end - def start_link(%Opts{vm_id: id} = opts) do - :gen_statem.start_link(via(id), __MODULE__, opts, []) + # Started unnamed; the controller self-registers under `{id, :state}` from + # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves + # it cluster-wide through `via/1`. + def start_link(%Opts{} = opts) do + :gen_statem.start_link(__MODULE__, opts, []) end @spec stop(Hyper.Vm.id()) :: :ok @@ -72,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do # The daemon is already (being) started by `Core` as our sibling. Read the root # device off the per-VM mutable layer, resolve the boot spec, set the readiness # deadline, and start probing the API. - def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do - spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) - deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) - data = %State{opts: opts, spec: spec, boot_deadline: deadline} - - {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + def init( + %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = + opts + ) do + case Hyper.Cluster.Routing.register_self({id, :state}) do + :ok -> + spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) + deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) + data = %State{opts: opts, spec: spec, boot_deadline: deadline} + + {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + + {:error, reason} -> + {:stop, reason} + end end # Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel + @@ -110,31 +123,81 @@ defmodule Hyper.Node.FireVMM.State do @moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`." alias Hyper.Firecracker.Api.{InstanceInfo, Operations} - alias Hyper.Node.FireVMM.{Client, Opts} + alias Hyper.Node.FireVMM.{Client, Jailer, Opts} + alias Hyper.SuidHelper.ChrootJail alias Unit.Time + require Logger + # How often to probe the daemon's API while waiting for it. @probe_interval Time.ms(50) - # Poll the daemon's API until it answers, then configure. Give up if the - # readiness deadline passes first. + # Hand the jailed API socket to the node user, then poll the daemon's API + # until it answers and advance to `:configuring`. Give up if the readiness + # deadline passes first. The grant must happen before the probe: firecracker + # creates the socket owned by the per-VM uid, so the unprivileged controller + # gets EACCES on connect until the helper chowns it to us. def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do - case Client.run(Client.via(id), &Operations.describe_instance/1) do - {:ok, %InstanceInfo{}} -> - {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} - - {:error, _reason} -> - if System.monotonic_time(:millisecond) >= data.boot_deadline do - {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data} - else - {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + case ensure_api_granted(id, data) do + {:cont, data} -> + case Client.run(Client.via(id), &Operations.describe_instance/1) do + {:ok, %InstanceInfo{}} -> + {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} + + {:error, reason} -> + keep_probing(id, data, reason) end + + {:wait, data, reason} -> + keep_probing(id, data, reason) end end def handle({:call, from}, :stop, data) do {:next_state, :stopping, data, [{:reply, from, :ok}]} end + + # Ensure the jailed API socket has been handed to the node user. Idempotent + # once granted (we record it in `data` so we ask the helper only once). + # `:socket_pending` means firecracker has not created the socket yet, so we + # keep waiting; a hard error is logged but also tolerated until the deadline + # (the probe that follows would fail with EACCES anyway and drive the stop). + @spec ensure_api_granted(Hyper.Vm.id(), State.t()) :: + {:cont, State.t()} | {:wait, State.t(), term()} + defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data} + + defp ensure_api_granted(id, data) do + case ChrootJail.grant_api(Jailer.host_socket(id)) do + :ok -> + {:cont, %{data | api_granted: true}} + + {:error, :socket_pending} -> + {:wait, data, :socket_pending} + + {:error, reason} -> + Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}") + {:wait, data, {:grant_api, reason}} + end + end + + # Keep waiting for readiness, re-arming the probe timer, unless the deadline + # has lapsed - then fail the boot, surfacing `reason` rather than swallowing + # it into a bare `:daemon_unready`. A persistent failure here points at the + # host->jail socket (path or, more often, the grant/permission step above). + @spec keep_probing(Hyper.Vm.id(), State.t(), term()) :: + {:keep_state, State.t(), list()} | {:stop, term(), State.t()} + defp keep_probing(id, data, reason) do + if System.monotonic_time(:millisecond) >= data.boot_deadline do + Logger.warning( + "vm #{id}: firecracker API not reachable before deadline; " <> + "last probe error: #{inspect(reason)}" + ) + + {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} + else + {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + end + end end defmodule Configuring do @@ -145,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations} alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts} + require Logger + # Stage boot artifacts into the chroot, then issue the pre-boot config and - # start the guest. + # start the guest. Both failure paths end the boot via a supervisor restart, + # so log the reason here - otherwise it vanishes into the `:one_for_all` + # cycle and the VM just appears to relaunch for no visible cause. def handle( :state_timeout, :configure, @@ -155,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do case ChrootJail.stage(id, uid, gid, spec) do {:ok, jailed_spec} -> case apply_spec(id, jailed_spec) do - :ok -> {:next_state, :running, data} - {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data} + :ok -> + Logger.info("vm #{id}: configured, guest starting") + {:next_state, :running, data} + + {:error, reason} -> + Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}") + {:stop, {:shutdown, {:boot_failed, reason}}, data} end {:error, reason} -> + Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}") {:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data} end end diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index 386e36f0..4dbab644 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -121,6 +121,12 @@ defmodule Hyper.Node.Img.Mutable do @impl true def handle_info(:idle_timeout, state), do: {:noreply, state} + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do # Destroy the thin volume, then release the image (its monitor on us also diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex index d51fe3ba..de6579f5 100644 --- a/lib/hyper/node/img/server.ex +++ b/lib/hyper/node/img/server.ex @@ -131,6 +131,12 @@ defmodule Hyper.Node.Img.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{dm_names: dm_names}) do # Remove top-down (a snapshot's origin is the device below it). Layers are diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex index f3ec3e19..9599da23 100644 --- a/lib/hyper/node/img/thin_pool.ex +++ b/lib/hyper/node/img/thin_pool.ex @@ -94,6 +94,13 @@ defmodule Hyper.Node.Img.ThinPool do {:reply, :ok, id_free(state, id)} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. An abnormal exit + # carries a non-`:normal` reason and falls through to the default handler. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do _ = SuidHelper.Dmsetup.remove(@pool_name) diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex index 188e135b..f717e155 100644 --- a/lib/hyper/node/layer/server.ex +++ b/lib/hyper/node/layer/server.ex @@ -122,6 +122,12 @@ defmodule Hyper.Node.Layer.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{blk_path: blk_path}) do case SuidHelper.Losetup.detach(blk_path) do diff --git a/lib/hyper/node/reclaim.ex b/lib/hyper/node/reclaim.ex new file mode 100644 index 00000000..f9513f89 --- /dev/null +++ b/lib/hyper/node/reclaim.ex @@ -0,0 +1,99 @@ +defmodule Hyper.Node.Reclaim do + @moduledoc """ + Boot-time reclamation of device-mapper and loop devices orphaned by an unclean + shutdown (SIGKILL or `:erlang.halt`, where the owning GenServers' `terminate/2` + never ran to tear them down). + + Hyper names every dm device it creates with a `hyper-` prefix (`hyper-thinpool`, + `hyper-rw-`, `hyper-img--`), so this removes exactly those - never an + operator's unrelated dm devices. Removal is leaf-first: a device still open by + another (the pool under a thin volume, a snapshot under the next in its chain) + refuses until its dependents are gone, so leftovers are retried until a pass + removes nothing new. Loop devices backing files under Hyper's data dirs are then + detached (the dm devices that held them are gone by that point). + + Entirely best-effort: every failure is logged and boot continues. It runs once, + before any device-owning GenServer starts, so the freshly-booting node never + collides with its own previous instance's leftovers. + """ + + alias Hyper.SuidHelper.{Dmsetup, Losetup} + + require Logger + + @dm_prefix "hyper-" + + @spec run() :: :ok + def run do + reclaim_dm() + reclaim_loops() + :ok + end + + defp reclaim_dm do + case Dmsetup.list() do + {:ok, names} -> + case Enum.filter(names, &String.starts_with?(&1, @dm_prefix)) do + [] -> + :ok + + stale -> + Logger.warning( + "reclaim: removing #{length(stale)} stale dm device(s): #{inspect(stale)}" + ) + + remove_dm(stale) + end + + {:error, reason} -> + Logger.warning("reclaim: could not list dm devices: #{inspect(reason)}") + end + end + + @spec remove_dm([String.t()]) :: :ok + defp remove_dm([]), do: :ok + + defp remove_dm(names) do + {failed, removed_any?} = + Enum.reduce(names, {[], false}, fn name, {failed, any?} -> + case Dmsetup.remove(name) do + :ok -> {failed, true} + {:error, _} -> {[name | failed], any?} + end + end) + + cond do + failed == [] -> :ok + # A pass made progress: a retry may now clear the devices that were still + # held by the ones just removed. + removed_any? -> remove_dm(failed) + true -> Logger.error("reclaim: could not remove dm devices: #{inspect(failed)}") + end + end + + defp reclaim_loops do + case Losetup.list() do + {:ok, pairs} -> + for {dev, backing} <- pairs, under_data_dirs?(backing) do + case Losetup.detach(dev) do + :ok -> + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not detach #{dev} (#{backing}): #{inspect(reason)}") + end + end + + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not list loop devices: #{inspect(reason)}") + end + end + + @spec under_data_dirs?(Path.t()) :: boolean() + defp under_data_dirs?(path) do + String.starts_with?(path, Hyper.Config.scratch_dir() <> "/") or + String.starts_with?(path, Hyper.Config.layer_dir() <> "/") + end +end diff --git a/lib/hyper/suid_helper.ex b/lib/hyper/suid_helper.ex index 690f004a..6d4ff56d 100644 --- a/lib/hyper/suid_helper.ex +++ b/lib/hyper/suid_helper.ex @@ -16,7 +16,7 @@ defmodule Hyper.SuidHelper do self-test and reports the base path it was compiled against. """ - alias Hyper.SuidHelper.{Blockdev, Dmsetup, Expected, Losetup} + alias Hyper.SuidHelper.{Dmsetup, Expected} use OpenTelemetryDecorator @@ -51,18 +51,20 @@ defmodule Hyper.SuidHelper do end @doc """ - Check that the setuid helper and every tool it execs are usable on this - machine: the helper binary is present, is the build this release expects - (`verify_version/0`), then each tool submodule's own check. + Check that the setuid helper is usable on this machine: the helper binary is + present, is the build this release expects (`verify_version/0`), and the kernel + exposes the device-mapper targets we need (`Dmsetup.test_system/0`, which also + exercises the helper's configured `dmsetup` binary). + + The `losetup`/`blockdev` binaries are validated by the helper the first time + each is used; their paths live in the helper's own config, not here. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.test_system") def test_system do with :ok <- helper_present(), - :ok <- verify_version(), - :ok <- Losetup.test_system(), - :ok <- Dmsetup.test_system() do - Blockdev.test_system() + :ok <- verify_version() do + Dmsetup.test_system() end end diff --git a/lib/hyper/suid_helper/blockdev.ex b/lib/hyper/suid_helper/blockdev.ex index 9f675ea5..760fae84 100644 --- a/lib/hyper/suid_helper/blockdev.ex +++ b/lib/hyper/suid_helper/blockdev.ex @@ -11,18 +11,9 @@ defmodule Hyper.SuidHelper.Blockdev do @spec device_sectors(Path.t()) :: {:ok, pos_integer()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Blockdev.device_sectors", include: [:path]) def device_sectors(path) do - case SuidHelper.exec(["blockdev", "--bin", Hyper.Config.blockdev_path(), "--getsz", path]) do + case SuidHelper.exec(["blockdev", "--getsz", path]) do {:ok, %{"sectors" => n}} -> {:ok, n} {:error, _} = err -> err end end - - @doc "Check the blockdev binary is present." - @spec test_system() :: :ok | {:error, :blockdev_not_found} - @decorate with_span("Hyper.SuidHelper.Blockdev.test_system") - def test_system do - if System.find_executable(Hyper.Config.blockdev_path()), - do: :ok, - else: {:error, :blockdev_not_found} - end end diff --git a/lib/hyper/suid_helper/chroot_jail.ex b/lib/hyper/suid_helper/chroot_jail.ex index fdc0f1ce..cf8370c0 100644 --- a/lib/hyper/suid_helper/chroot_jail.ex +++ b/lib/hyper/suid_helper/chroot_jail.ex @@ -57,4 +57,25 @@ defmodule Hyper.SuidHelper.ChrootJail do {:error, _} = err -> err end end + + @doc """ + Hand the firecracker API `socket` to the node user so the unprivileged + controller can `connect()` to it. The jailer drops firecracker to a per-VM + uid/gid and chroots it, so the socket it creates is owned by that per-VM id and + the node (a different uid) gets `EACCES` on connect. The helper chowns just + that one socket to its caller (the node user) and chmods it `0660`, leaving the + rest of the per-VM isolation intact. + + Returns `{:error, :socket_pending}` while firecracker has not yet created the + socket, so the caller can keep waiting. + """ + @spec grant_api(Path.t()) :: :ok | {:error, :socket_pending} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.ChrootJail.grant_api", include: [:socket]) + def grant_api(socket) do + case SuidHelper.exec(["chroot-jail", "grant-api", "--socket", socket]) do + {:ok, %{"result" => "granted"}} -> :ok + {:ok, %{"result" => "pending"}} -> {:error, :socket_pending} + {:error, _} = err -> err + end + end end diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex index ef670634..3c57c6b5 100644 --- a/lib/hyper/suid_helper/dmsetup.ex +++ b/lib/hyper/suid_helper/dmsetup.ex @@ -59,14 +59,7 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec remove(String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.remove", include: [:name]) def remove(name) do - case SuidHelper.exec([ - "dmsetup", - "--bin", - Hyper.Config.dmsetup_path(), - "remove", - "--retry", - name - ]) do + case SuidHelper.exec(["dmsetup", "remove", "--retry", name]) do {:ok, _} -> :ok {:error, _} = err -> err end @@ -76,39 +69,31 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec message(String.t(), String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message]) def message(name, message) do - argv = - ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "message", name, "--message", message] - - case SuidHelper.exec(argv) do + case SuidHelper.exec(["dmsetup", "message", name, "--message", message]) do {:ok, _} -> :ok {:error, _} = err -> err end end @doc """ - Check the dmsetup binary is present and the kernel exposes the dm targets we - use (snapshot, thin, thin-pool). + Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool). + + Routes through the setuid helper: `dmsetup targets` opens `/dev/mapper/control`, + which needs root, and the BEAM runs unprivileged. The helper validates its + configured `dmsetup` binary before running it, so a missing or unsafe binary + surfaces here too. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.Dmsetup.test_system") def test_system do - if System.find_executable(Hyper.Config.dmsetup_path()), - do: test_targets(), - else: {:error, :dmsetup_not_found} - end - - @doc "Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool)." - @spec test_targets() :: :ok | {:error, term()} - @decorate with_span("Hyper.SuidHelper.Dmsetup.test_targets") - def test_targets do - case System.cmd(Hyper.Config.dmsetup_path(), ["targets"], stderr_to_stdout: true) do - {out, 0} -> + case SuidHelper.exec(["dmsetup", "targets"]) do + {:ok, %{"output" => out}} -> have = parse_targets(out) missing = Enum.reject(@required_targets, &MapSet.member?(have, &1)) if missing == [], do: :ok, else: {:error, {:missing_dm_targets, missing}} - {out, code} -> - {:error, {:dmsetup_targets_failed, code, String.trim(out)}} + {:error, {code, msg}} -> + {:error, {:dmsetup_targets_failed, code, msg}} end end @@ -122,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do |> MapSet.new() end + @doc "Names of every device-mapper device currently present on this host." + @spec list() :: {:ok, [String.t()]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Dmsetup.list") + def list do + case SuidHelper.exec(["dmsetup", "ls"]) do + {:ok, %{"output" => out}} -> {:ok, parse_names(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_names(String.t()) :: [String.t()] + def parse_names(out) do + case String.trim(out) do + # `dmsetup ls` prints this sentinel (not a device row) when there are none. + "No devices found" -> + [] + + _ -> + out + |> String.split("\n", trim: true) + |> Enum.map(&(&1 |> String.split() |> List.first())) + |> Enum.reject(&is_nil/1) + end + end + @doc false @spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t() def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do @@ -145,9 +156,7 @@ defmodule Hyper.SuidHelper.Dmsetup do # create flags (e.g. `--readonly`). Returns the `/dev/mapper/` path. @spec create(String.t(), String.t(), [String.t()]) :: {:ok, Path.t()} | {:error, err()} defp create(name, table, flags) do - argv = - ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "create", name] ++ - flags ++ ["--table", table] + argv = ["dmsetup", "create", name] ++ flags ++ ["--table", table] case SuidHelper.exec(argv) do {:ok, %{"device" => dev}} -> {:ok, dev} diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex index d825b731..3abf4fba 100644 --- a/lib/hyper/suid_helper/losetup.ex +++ b/lib/hyper/suid_helper/losetup.ex @@ -11,7 +11,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_ro(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_ro", include: [:path]) def attach_ro(path) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "attach", path]) do + case SuidHelper.exec(["losetup", "attach", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -21,14 +21,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_rw(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_rw", include: [:path]) def attach_rw(path) do - case SuidHelper.exec([ - "losetup", - "--bin", - Hyper.Config.losetup_path(), - "attach", - "--rw", - path - ]) do + case SuidHelper.exec(["losetup", "attach", "--rw", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -38,18 +31,34 @@ defmodule Hyper.SuidHelper.Losetup do @spec detach(Path.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.detach", include: [:dev]) def detach(dev) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "detach", dev]) do + case SuidHelper.exec(["losetup", "detach", dev]) do {:ok, _} -> :ok {:error, _} = err -> err end end - @doc "Check the losetup binary is present." - @spec test_system() :: :ok | {:error, :losetup_not_found} - @decorate with_span("Hyper.SuidHelper.Losetup.test_system") - def test_system do - if System.find_executable(Hyper.Config.losetup_path()), - do: :ok, - else: {:error, :losetup_not_found} + @doc "Currently-attached loop devices as `{device, backing_file}` pairs." + @spec list() :: {:ok, [{Path.t(), Path.t()}]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Losetup.list") + def list do + case SuidHelper.exec(["losetup", "list"]) do + {:ok, %{"output" => out}} -> {:ok, parse_list(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_list(String.t()) :: [{Path.t(), Path.t()}] + def parse_list(out) do + out + |> String.split("\n", trim: true) + |> Enum.flat_map(fn line -> + # `NAME BACK-FILE` rows; a loop with no backing file has only one column + # (nothing for us to reclaim by file), so skip it. + case String.split(line, " ", parts: 2, trim: true) do + [dev, backing] -> [{dev, String.trim(backing)}] + _ -> [] + end + end) end end diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex new file mode 100644 index 00000000..03ada2be --- /dev/null +++ b/lib/mix/tasks/firecracker.install.ex @@ -0,0 +1,147 @@ +defmodule Mix.Tasks.Firecracker.Install do + @shortdoc "Download, verify, and install the pinned Firecracker release" + @moduledoc """ + Downloads, verifies, and installs the pinned Firecracker release (v1.16.0) + for the current CPU architecture. + + mix firecracker.install [--prefix DIR] + + Steps performed: + + 1. Detects the CPU architecture (`x86_64` or `aarch64`). + 2. Downloads the release tarball and verifies its SHA-256 checksum. + 3. Extracts the tarball, then copies the binaries to `/firecracker` + and `/jailer` using the **bare basenames** `firecracker` and + `jailer`. The setuid helper validates binaries via `SafeBin<"firecracker">` + and `SafeBin<"jailer">`, which match on basename only — version-stamped + names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally. + 4. Marks both binaries executable (`0o755`). + 5. Prints the `/etc/hyper/config.toml` snippet the operator needs to paste. + + This task installs **unprivileged** binaries and prints configuration. + Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper). + This task does **not** setuid `firecracker` or `jailer`. Install and setuid + the helper separately with `mix suidhelper.install`. + + ## Options + + * `--prefix DIR` — installation directory (default: `/opt/firecracker`). + + ## Security requirements + + After installing, ensure: + + * The binaries are root-owned and **not** group- or world-writable. + The suidhelper refuses binaries with loose permissions. + * `/etc/hyper/config.toml` is root-owned with mode `0644`. + """ + + use Mix.Task + + @version "1.16.0" + @default_prefix "/opt/firecracker" + + @impl Mix.Task + @spec run([String.t()]) :: :ok + def run(argv) do + {opts, _rest, _invalid} = OptionParser.parse(argv, strict: [prefix: :string]) + prefix = Keyword.get(opts, :prefix, @default_prefix) + + arch = detect_arch!() + + case Application.ensure_all_started(:req) do + {:ok, _} -> :ok + {:error, {reason, app}} -> Mix.raise("Cannot start HTTP client #{app}: #{inspect(reason)}") + end + + install!(release_for(arch), prefix) + print_config(prefix) + end + + defp detect_arch! do + case Sys.Arch.current() do + {:ok, arch} -> + arch + + {:error, {:unsupported_arch, raw}} -> + Mix.raise( + "Unsupported CPU architecture #{inspect(raw)}; " <> + "Firecracker supports x86_64 and aarch64." + ) + end + end + + defp release_for(:x86_64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-x86_64.tgz", + sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", + firecracker_path: "release-v#{@version}-x86_64/firecracker-v#{@version}-x86_64", + jailer_path: "release-v#{@version}-x86_64/jailer-v#{@version}-x86_64" + } + end + + defp release_for(:aarch64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-aarch64.tgz", + sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", + firecracker_path: "release-v#{@version}-aarch64/firecracker-v#{@version}-aarch64", + jailer_path: "release-v#{@version}-aarch64/jailer-v#{@version}-aarch64" + } + end + + defp install!( + %{url: url, sha256: sha256, firecracker_path: fc_rel, jailer_path: jailer_rel}, + prefix + ) do + extract_dir = Path.join(prefix, ".firecracker-extract") + + Mix.shell().info("Downloading Firecracker v#{@version} from #{url} ...") + + case Redist.Targz.install(url, sha256, extract_dir) do + :ok -> :ok + {:error, reason} -> Mix.raise("Download from #{url} failed: #{inspect(reason)}") + end + + dst_fc = Path.join(prefix, "firecracker") + dst_jailer = Path.join(prefix, "jailer") + + # The release ships version-stamped names; copy to bare basenames so SafeBin + # validation passes. The helper matches on basename, not full path. + File.cp!(Path.join(extract_dir, fc_rel), dst_fc) + File.cp!(Path.join(extract_dir, jailer_rel), dst_jailer) + File.chmod!(dst_fc, 0o755) + File.chmod!(dst_jailer, 0o755) + _ = File.rm_rf!(extract_dir) + + Mix.shell().info("Installed #{dst_fc}") + Mix.shell().info("Installed #{dst_jailer}") + end + + defp print_config(prefix) do + fc = Path.join(prefix, "firecracker") + jailer = Path.join(prefix, "jailer") + + # This task runs unprivileged, so the binaries land owned by the invoking + # user. The suidhelper's SafeBin refuses any binary not owned by root and not + # free of group/other write bits, so the operator MUST chown/chmod them or + # every jailer launch fails closed. Print the exact commands rather than a + # vague "ensure root-owned". + Mix.shell().info(""" + + Almost done. Run these as root so the setuid helper will accept the binaries + (it refuses any jailer/firecracker not owned by root): + + sudo chown root:root #{fc} #{jailer} + sudo chmod 0755 #{fc} #{jailer} + + Then add to /etc/hyper/config.toml (file: root-owned, mode 0644): + + firecracker = "#{fc}" + jailer = "#{jailer}" + """) + end +end diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex new file mode 100644 index 00000000..2085de3c --- /dev/null +++ b/lib/mix/tasks/suidhelper.install.ex @@ -0,0 +1,93 @@ +defmodule Mix.Tasks.Suidhelper.Install do + @shortdoc "Build, stamp, and install the setuid helper" + @moduledoc """ + Builds, stamps, and installs the Rust setuid helper. + + mix suidhelper.install + + Two steps: + + 1. `cargo xtask stamp` in `native/suidhelper` builds the release binary and + writes its BLAKE3 self-checksum into `.note.sum` (the same step the + `:suidhelper_stamp` compiler runs). + 2. The stamped binary is copied setuid-root (mode `4755`) to + `/usr/local/bin/hyper-suidhelper`. + + The copy needs root, but Mix runs every subprocess in its own session with no + controlling terminal (`erl_child_setup` calls `setsid`), so a nested `sudo` + cannot open `/dev/tty` to prompt for a password. This task therefore only runs + `sudo` itself when it is already non-interactive (`sudo -n` succeeds, e.g. + `NOPASSWD` or a usable cached credential). Otherwise it prints the exact + privileged command for you to run in your own terminal. + + This is the privileged counterpart to `mix suidhelper.stamp`, which stamps + only. `cargo` and the helper's toolchain (see + `native/suidhelper/rust-toolchain.toml`) must be installed. + """ + + use Mix.Task + + @helper_dir "native/suidhelper" + @source Path.join(@helper_dir, "target/release/hyper-suidhelper") + # Must match `Hyper.Config`'s default `suid_helper` path and the xtask's + # `INSTALL_PATH`: a `PATH` location the unprivileged node can exec. + @install_path "/usr/local/bin/hyper-suidhelper" + + @impl Mix.Task + def run(argv) do + stamp!(argv) + install_privileged() + end + + defp stamp!(argv) do + case System.cmd("cargo", ["xtask", "stamp" | argv], + cd: @helper_dir, + into: IO.stream(:stdio, :line) + ) do + {_, 0} -> + :ok + + {_, _} -> + Mix.raise(""" + `cargo xtask stamp` failed building the suidhelper. + + Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml) + are installed. + """) + end + end + + defp install_privileged do + if passwordless_sudo?() do + Mix.shell().info("Installing #{@source} -> #{@install_path} (setuid root)") + + case System.cmd("sudo", install_argv(), into: IO.stream(:stdio, :line)) do + {_, 0} -> Mix.shell().info("installed #{@install_path} (setuid root)") + {_, _} -> Mix.raise(manual_instructions()) + end + else + Mix.shell().info(manual_instructions()) + end + end + + # `sudo -n true` exits 0 only when sudo can run without prompting. With no + # controlling terminal a cached `tty_tickets` credential is invisible, so this + # is true essentially only under `NOPASSWD` -- exactly the case where the + # nested `sudo install` below can succeed. + defp passwordless_sudo? do + match?({_, 0}, System.cmd("sudo", ["-n", "true"], stderr_to_stdout: true)) + end + + defp install_argv, + do: ["install", "-o", "root", "-g", "root", "-m", "4755", @source, @install_path] + + defp manual_instructions do + """ + + The binary is built and stamped, but installing it setuid-root needs a + password and `sudo` has no terminal to prompt on here. Run the copy yourself: + + sudo #{Enum.join(install_argv(), " ")} + """ + end +end diff --git a/mix.exs b/mix.exs index 7906c6a1..5c620b81 100644 --- a/mix.exs +++ b/mix.exs @@ -24,6 +24,10 @@ defmodule Hyper.MixProject do # Cache the PLTs in a stable, gitignored dir so CI can cache them. plt_local_path: "priv/plts", plt_core_path: "priv/plts", + # `:mix` is needed so the Mix tasks under `lib/mix/tasks` (which call + # `Mix.raise/1`, `Mix.shell/0`, and implement the `Mix.Task` behaviour) + # resolve instead of tripping `unknown_function`. + plt_add_apps: [:mix], # Verify @specs against actual returns, and flag ignored return values. flags: [:unmatched_returns, :extra_return, :missing_return] ] diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index f5b4b1b7..be6a449c 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs" name = "tools_dmsetup_parsers" path = "tests/tools/dmsetup_parsers.rs" +[[test]] +name = "tools_jailer" +path = "tests/tools/jailer.rs" + [[test]] name = "util_confinement" path = "tests/util/confinement.rs" @@ -56,6 +60,10 @@ path = "tests/util/confinement.rs" name = "tools_chroot_jail_remove" path = "tests/tools/chroot_jail_remove.rs" +[[test]] +name = "tools_chroot_jail_grant_api" +path = "tests/tools/chroot_jail_grant_api.rs" + [[test]] name = "util_chroot_jail" path = "tests/util/chroot_jail.rs" @@ -68,14 +76,22 @@ path = "tests/e2e/config.rs" name = "e2e_argv" path = "tests/e2e/argv.rs" +[[test]] +name = "e2e_jailer" +path = "tests/e2e/jailer.rs" + [[test]] name = "e2e_chroot_jail" path = "tests/e2e/chroot_jail.rs" +[[test]] +name = "config_uid_gid_range" +path = "tests/config_uid_gid_range.rs" + [dependencies] clap = { version = "4", features = ["derive"] } hyper-suidhelper-meta = { path = "meta" } -nix = { version = "0.29", features = ["user", "fs", "dir"] } +nix = { version = "0.29", features = ["user", "fs", "dir", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "1" diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 425ba386..76a7f221 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: AGPL-3.0-only //! Runtime host configuration, read from a single root-owned TOML file. +//! +//! ## UID/GID range divergence +//! +//! Elixir keeps `compile_env` default `{900_000, 999_999}` that governs which +//! UIDs the node hands *out*; this helper reads `[uid_gid_range]` from +//! config.toml to decide which UIDs it *accepts* (default `{900_000, 999_999}` +//! when the key is absent). Operators narrowing the range must set **both**. +use crate::util::safe_bin::{self, SafeBin}; use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile}; use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; use nix::fcntl::OFlag; @@ -21,11 +29,48 @@ pub enum LoadingError { Malformed(PathBuf), #[error("work_dir in {0:?} must be an absolute path")] Relative(PathBuf), + #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")] + BadUidGidRange { min: u32, max: u32 }, +} + +/// Error returned by config accessors for tool binaries derived from config. +#[derive(Debug, Error)] +pub enum BinError { + #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")] + Unconfigured(&'static str), + #[error(transparent)] + Bin(#[from] safe_bin::Error), } const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml"; const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH"; +/// UID/GID allocation band, read from `[uid_gid_range]` in config.toml. +/// Controls which UIDs the helper *accepts* from the BEAM — see module docs. +#[derive(Debug, Clone, Copy, Deserialize)] +pub struct UidGidRange { + pub min: u32, + pub max: u32, +} + +// Band defaults match Elixir's `compile_env` allocation defaults so that an +// unconfigured helper and an unconfigured node agree out of the box. +const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999); + +/// Validate a uid_gid_range value. A present range where min==0 or min>max is +/// treated as a config trust violation — fatal at load, consistent with the +/// "present but untrusted" model. Exposed so tests can verify the contract +/// without touching the file system. +pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> { + if r.min == 0 || r.min > r.max { + return Err(LoadingError::BadUidGidRange { + min: r.min, + max: r.max, + }); + } + Ok(()) +} + /// The config file path. In production this is the fixed `/etc/hyper/config.toml`. /// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the /// secure arm is always the hardcoded path, so a release build cannot be steered. @@ -44,13 +89,66 @@ fn config_path() -> PathBuf { #[derive(Debug, Clone, Deserialize)] pub struct Config { work_dir: PathBuf, + #[serde(default = "default_dmsetup")] + dmsetup: PathBuf, + #[serde(default = "default_losetup")] + losetup: PathBuf, + #[serde(default = "default_blockdev")] + blockdev: PathBuf, + #[serde(default)] + firecracker: Option, + #[serde(default)] + jailer: Option, + #[serde(default = "default_parent_cgroup")] + parent_cgroup: String, + #[serde(default)] + uid_gid_range: Option, +} + +// The default data root. Must match the Elixir node's `@dev_work_dir`, which it +// uses when the same config file is absent, so both sides agree (see +// `Hyper.Node.check_helper_base`). +fn default_work_dir() -> PathBuf { + PathBuf::from("/srv/hyper") +} + +fn default_dmsetup() -> PathBuf { + PathBuf::from("/usr/sbin/dmsetup") +} + +fn default_losetup() -> PathBuf { + PathBuf::from("/usr/sbin/losetup") +} + +fn default_blockdev() -> PathBuf { + PathBuf::from("/usr/sbin/blockdev") +} + +fn default_parent_cgroup() -> String { + // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync. + "hyper".into() +} + +impl Default for Config { + fn default() -> Self { + Self { + work_dir: default_work_dir(), + dmsetup: default_dmsetup(), + losetup: default_losetup(), + blockdev: default_blockdev(), + firecracker: None, + jailer: None, + parent_cgroup: default_parent_cgroup(), + uid_gid_range: None, + } + } } impl Config { /// The process-wide config, loaded once (and forced unprivileged by - /// [`Config::init`]). A load failure is fatal: the helper cannot safely - /// operate without a trusted data root, so it prints the error and exits - /// rather than guessing a default. + /// [`Config::init`]). An absent file yields the built-in defaults; a + /// *present but untrusted* file (wrong owner/mode, malformed) is fatal — + /// the helper prints the error and exits rather than trusting it. pub fn get() -> &'static Config { LazyLock::force(&CONFIG) } @@ -58,7 +156,7 @@ impl Config { /// Force the config to load now. Call this once at the very start of `main`, /// after privileges have already been dropped (the `.preinit_array` entry in /// `setuid_privileged` runs before `main`), so the file is never first read - /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read + /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read /// as the real uid, not as root. pub fn init() { let _ = Self::get(); @@ -74,6 +172,57 @@ impl Config { self.work_dir.join("jails") } + /// The validated `dmsetup` binary the helper will run. + pub fn dmsetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.dmsetup) + } + + /// The validated `losetup` binary the helper will run. + pub fn losetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.losetup) + } + + /// The validated `blockdev` binary the helper will run. + pub fn blockdev(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.blockdev) + } + + /// The Firecracker VMM binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set this key before any VM can be launched. + pub fn firecracker(&self) -> Result, BinError> { + self.firecracker + .as_deref() + .ok_or(BinError::Unconfigured("firecracker")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The Firecracker jailer binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set this key before any VM can be launched. + pub fn jailer(&self) -> Result, BinError> { + self.jailer + .as_deref() + .ok_or(BinError::Unconfigured("jailer")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The jailer `--parent-cgroup` value. Defaults to `"hyper"`, matching the + /// Elixir node's `@parent_cgroup`. + pub fn parent_cgroup(&self) -> &str { + &self.parent_cgroup + } + + /// The UID/GID band the helper accepts from the BEAM. Defaults to + /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults). + /// A present range with min==0 or min>max is rejected at load time by + /// [`Config::safe_load`], so this accessor is always total. + pub fn uid_gid_range(&self) -> (u32, u32) { + self.uid_gid_range + .map(|r| (r.min, r.max)) + .unwrap_or(DEFAULT_UID_GID) + } + /// Read, ownership-check, parse, and validate the config file. See the module /// docs for the trust model. pub fn safe_load() -> Result { @@ -82,7 +231,18 @@ impl Config { let safe_path: SafePath = path.clone().try_into()?; let file: SafeFile = - SafeFile::open(&safe_path, OFlag::O_RDONLY)?; + match SafeFile::open(&safe_path, OFlag::O_RDONLY) { + Ok(file) => file, + // A genuinely-absent file means "use the built-in defaults": those + // are compiled into this root-owned binary, so they are trusted. Any + // OTHER failure — a present but wrong-owner/mode file, an I/O error — + // stays fatal, because it is a signal (someone put an untrusted file + // there), not an absence. + Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => { + return Ok(Self::default()) + } + Err(e) => return Err(e.into()), + }; let body = std::io::read_to_string(std::fs::File::from(file.into_owned_fd())) .map_err(|_| LoadingError::Unreadable(path.clone()))?; @@ -92,6 +252,11 @@ impl Config { if !config.work_dir.is_absolute() { return Err(LoadingError::Relative(path)); } + + if let Some(r) = &config.uid_gid_range { + validate_uid_gid_range(r)?; + } + Ok(config) } } diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs index 617fd6d5..11b08319 100644 --- a/native/suidhelper/src/main.rs +++ b/native/suidhelper/src/main.rs @@ -15,6 +15,7 @@ use clap::{Parser, Subcommand}; use hyper_suidhelper::config; +use hyper_suidhelper::tools::jailer::{self, JailerArgs}; use hyper_suidhelper::tools::Tool; use hyper_suidhelper::util::setuid_privileged::{self, Privileged}; use serde::Serialize; @@ -37,6 +38,9 @@ enum Command { Tool(Tool), /// Check the helper is correctly installed (can promote to root). SysTest, + /// Validate the caller's args, become root, and `execve` the firecracker + /// jailer in our place. Prints nothing on success (the image is replaced). + Jailer(JailerArgs), /// Print the build version and BLAKE3 checksum of this binary. Version, } @@ -101,12 +105,21 @@ fn main() { config::Config::init(); // Each command yields a serializable value (errors stringified to unify); we - // render the final JSON line here. + // render the final JSON line here. The jailer is the exception: it `execve`s + // in place, so on success it never returns and never emits JSON, and on + // failure it reports to stderr and exits before reaching the JSON pipeline. let output = match command { Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()), Command::SysTest => SysTest::perform() .map(Output::SysTest) .map_err(|e| e.to_string()), + Command::Jailer(args) => { + let e = jailer::run(args).expect_err("jailer::run only returns on error"); + eprintln!("{e}"); + // _exit bypasses atexit handlers; safe because we are permanently root + // at this point and must not run any cleanup registered before escalation. + unsafe { nix::libc::_exit(2) } + } Command::Version => unreachable!("handled above"), }; diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs new file mode 100644 index 00000000..4ec94a21 --- /dev/null +++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so +//! the unprivileged controller can `connect()` to it. +//! +//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker +//! then creates its API socket at `/root/api.socket` owned by that per-VM +//! id. Connecting a unix socket needs *write* permission on the node, so the +//! node user (a different uid) gets `EACCES`. This op chowns just that one +//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the +//! privileged scope are the real (caller) ids while euid is 0 — and chmods it +//! `0660`. The node thus connects as owner, and humans added to the node's +//! group connect via the group bit. +//! +//! That alone is not enough: the jailer leaves `/root` as `0700` owned by +//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so +//! the node cannot even traverse into `root` to reach the (now its own) socket. +//! So this op also opens just that one directory to the caller's group: it keeps +//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the +//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not +//! list), other none. Per-VM isolation is otherwise untouched: only this socket +//! and its immediate parent's group/mode move, nothing else in the jail, and +//! unrelated users stay locked out. +//! +//! Security: the socket path is validated as a `SafePath` and reached by an +//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect +//! the chown outside the jail, and every op is fd-relative on the pinned `root` +//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket` +//! `//root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must +//! report a *socket* — a regular file or symlink planted at that name is an +//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on +//! the path) is `Pending`, not an error: firecracker has not created it yet, so +//! the controller keeps probing. + +use crate::config::Config; +use crate::tools::IsTool; +use crate::util::safe_dir::{self, SafeDir}; +use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; +use clap::Args; +use nix::errno::Errno; +use nix::sys::stat::SFlag; +use nix::unistd::{getgid, getuid}; +use serde::Serialize; +use std::path::{Path, PathBuf}; +use thiserror::Error as ThisError; + +/// The fixed in-jail socket name firecracker opens (mirrors the Elixir +/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`). +const SOCKET_NAME: &str = "api.socket"; + +/// The socket sits at `///root/api.socket`: three parent +/// components (``, ``, `root`) before the leaf. +const SOCKET_PARENT_DEPTH: usize = 3; + +/// Mode handed to the node: owner+group read/write, no world access. +const SOCKET_MODE: u32 = 0o660; + +/// Mode set on the jail `root` dir so the node's group can *traverse* it to +/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x` +/// (traverse, not list), other none. +const JAIL_ROOT_MODE: u32 = 0o710; + +type LexicalPath = SafePath; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("--socket path: {0}")] + SocketPath(#[source] safe_path::ValidationError), + #[error("--socket must be exactly //root/api.socket below JAIL_BASE: {0:?}")] + SocketShape(PathBuf), + #[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")] + SocketName(PathBuf), + #[error("walking to the jail root: {0}")] + Walk(#[source] safe_dir::Error), + #[error("api.socket is not a socket (or is a symlink); refusing to touch it")] + NotASocket, + #[error("statting the socket: {0}")] + Stat(#[source] safe_dir::Error), + #[error("chowning the socket to the caller: {0}")] + Chown(#[source] safe_dir::Error), + #[error("chmoding the socket: {0}")] + Chmod(#[source] safe_dir::Error), + #[error("chgrp-ing the jail root dir to the caller: {0}")] + ChgrpRoot(#[source] safe_dir::Error), + #[error("chmoding the jail root dir for traversal: {0}")] + ChmodRoot(#[source] safe_dir::Error), +} + +#[derive(Args)] +pub struct GrantApiArgs { + /// Host path of the firecracker API socket, shape + /// ///root/api.socket. + #[arg(long)] + socket: PathBuf, +} + +#[derive(Debug, Serialize)] +#[serde(tag = "result", rename_all = "snake_case")] +pub enum GrantOut { + /// The socket was handed to the caller (chowned + chmoded). + Granted, + /// The socket does not exist yet; the caller should keep waiting. + Pending, +} + +/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`). +pub fn run(args: GrantApiArgs) -> Result { + GrantApi { args }.run() +} + +struct GrantApi { + args: GrantApiArgs, +} + +impl IsTool for GrantApi { + type Args = GrantApiArgs; + type Output = GrantOut; + type RunT = Result; + + fn run_privileged(&self) -> Self::RunT { + grant_api_under(&Config::get().jail_base(), &self.args.socket) + } + + fn parse(&self, res: Self::RunT) -> Result> { + Ok(res?) + } +} + +/// Hand `socket` (`///root/api.socket`) to the helper's +/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns +/// `Pending` if any path component or the socket itself is not yet present. +pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result { + let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?; + let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?; + if parents.len() != SOCKET_PARENT_DEPTH { + return Err(Error::SocketShape(socket.to_path_buf())); + } + if leaf != Path::new(SOCKET_NAME) { + return Err(Error::SocketName(socket.to_path_buf())); + } + + let Some(root) = walk(jail_base.to_path_buf(), &parents)? else { + return Ok(GrantOut::Pending); // jail not fully created yet + }; + + let leaf = Path::new(SOCKET_NAME); + let stat = match root.stat(leaf) { + Ok(stat) => stat, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending), + Err(e) => return Err(Error::Stat(e)), + }; + // `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails + // this check too: only a real socket is accepted, anything else is refused. + if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() { + return Err(Error::NotASocket); + } + + root.chown(leaf, getuid().as_raw(), getgid().as_raw()) + .map_err(Error::Chown)?; + root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?; + + // Open `root` itself to the caller's group so the node can traverse into it + // to reach the socket (the jailer leaves it 0700 / per-VM uid). Owner stays + // the per-VM uid; only the group and mode move. Operate on the pinned `root` + // fd (already opened `O_NOFOLLOW`), never by name - TOCTOU-safe. + root.chgrp_self(getgid().as_raw()) + .map_err(Error::ChgrpRoot)?; + root.chmod_self(JAIL_ROOT_MODE).map_err(Error::ChmodRoot)?; + Ok(GrantOut::Granted) +} + +/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns +/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the +/// caller can treat a half-built jail as `Pending` rather than an error. +fn walk(base: PathBuf, parents: &[PathBuf]) -> Result, Error> { + let base_path: LexicalPath = base.try_into().map_err(Error::SocketPath)?; + let anchor = match SafeDir::open(&base_path) { + Ok(dir) => dir, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(None), + Err(e) => return Err(Error::Walk(e)), + }; + match anchor.descend(parents) { + Ok(dir) => Ok(Some(dir)), + Err(e) if e.errno() == Some(Errno::ENOENT) => Ok(None), + Err(e) => Err(Error::Walk(e)), + } +} diff --git a/native/suidhelper/src/tools/chroot_jail/mod.rs b/native/suidhelper/src/tools/chroot_jail/mod.rs index d65bac7b..3a06c35a 100644 --- a/native/suidhelper/src/tools/chroot_jail/mod.rs +++ b/native/suidhelper/src/tools/chroot_jail/mod.rs @@ -1,9 +1,11 @@ // SPDX-License-Identifier: AGPL-3.0-only //! `chroot-jail`: per-VM chroot/jail lifecycle. +pub mod grant_api; mod prepare; pub mod remove; +pub use grant_api::GrantApiArgs; pub use prepare::PrepareArgs; pub use remove::RemoveArgs; @@ -15,6 +17,8 @@ pub enum ChrootJailOp { Prepare(PrepareArgs), /// Remove a VM's stale chroot and cgroup leaf before relaunching the jailer. Remove(RemoveArgs), + /// Hand the firecracker API socket to the node user (chown to caller, 0660). + GrantApi(GrantApiArgs), } impl ChrootJailOp { @@ -25,6 +29,7 @@ impl ChrootJailOp { match self { ChrootJailOp::Prepare(args) => prepare::run(args), ChrootJailOp::Remove(args) => remove::run(args), + ChrootJailOp::GrantApi(args) => grant_api::run(args), } } } diff --git a/native/suidhelper/src/tools/dmsetup/mod.rs b/native/suidhelper/src/tools/dmsetup/mod.rs index eef016ad..d77674df 100644 --- a/native/suidhelper/src/tools/dmsetup/mod.rs +++ b/native/suidhelper/src/tools/dmsetup/mod.rs @@ -52,6 +52,11 @@ enum DmOp { #[arg(long)] message: ThinMessage, }, + /// List the target types the kernel device-mapper exposes. Read-only, but + /// still needs root: it opens `/dev/mapper/control`. + Targets, + /// List the names of existing dm devices (for stale-device reclaim). + Ls, } #[derive(Serialize)] @@ -60,6 +65,8 @@ pub enum DmsetupOut { Created { device: PathBuf }, Removed, Messaged, + Targets { output: String }, + Listed { output: String }, } pub struct Dmsetup { @@ -105,6 +112,12 @@ impl IsTool for Dmsetup { .arg("0") .arg(message.to_string()); } + DmOp::Targets => { + cmd.arg("targets"); + } + DmOp::Ls => { + cmd.arg("ls"); + } } cmd.env_clear().output() @@ -124,6 +137,12 @@ impl IsTool for Dmsetup { }, DmOp::Remove { .. } => DmsetupOut::Removed, DmOp::Message { .. } => DmsetupOut::Messaged, + DmOp::Targets => DmsetupOut::Targets { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, + DmOp::Ls => DmsetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs new file mode 100644 index 00000000..ec29ac8a --- /dev/null +++ b/native/suidhelper/src/tools/jailer.rs @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! The `jailer` subcommand: validate the BEAM's arguments, re-acquire root +//! permanently, and `execve` the firecracker jailer in our place. +//! +//! Unlike the device tools this is **not** an [`crate::tools::IsTool`]: it does +//! not run a child and parse JSON, it *becomes* the jailer via `execve`, so the +//! unprivileged BEAM's MuonTrap port keeps supervising the resulting process +//! across the image replacement. There is no output and no return on success. +//! +//! Threat model: the BEAM is untrusted. It supplies only `--id`, `--uid`, +//! `--gid`, repeated `--cgroup KEY=VALUE`, and `--api-sock`. Every privileged +//! path (the jailer binary, the firecracker `--exec-file`, the chroot base, the +//! parent cgroup) comes from the root-owned config, never the caller. The +//! refusal contracts on the newtypes below are the security core: a compromised +//! BEAM must not be able to name a privileged path, request uid 0, traverse out +//! of the chroot base, inject a flag, or smuggle an environment/fd into root. +//! +//! ## Validator laws (property-tested in `tests/tools/jailer.rs`) +//! - [`validate_id_number`] accepts iff `n != 0 && lo <= n <= hi`; 0 is rejected +//! for *every* range (uid 0 makes the jailer skip its privilege drop). +//! - [`VmId`] round-trips exactly the allowed charset/length and rejects any +//! separator, dot, NUL, whitespace, leading dash, empty, or over-long input. +//! - [`CgroupSetting`] re-renders a valid pair to its canonical `key=value` and +//! rejects unknown keys and values outside the per-key grammar. +//! - [`JailSock`] accepts exactly `/` + one filename and rejects multi-component, +//! relative, `..`, and NUL/whitespace inputs. + +use crate::config::{BinError, Config}; +use crate::util::setuid_privileged; +use clap::Args; +use nix::errno::Errno; +use std::ffi::CString; +use std::fmt; +use std::os::unix::ffi::OsStrExt; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use thiserror::Error as ThisError; + +/// The jailer protects at most a handful of controllers; an unbounded `--cgroup` +/// list is a caller trying something. Cap it well above any legitimate need. +const MAX_CGROUPS: usize = 16; + +/// `sun_path` in `sockaddr_un` is 108 bytes on Linux; a socket path longer than +/// that can never be bound, so reject it up front. +const MAX_SOCK_LEN: usize = 108; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("invalid --id {0:?}: must be 1..=64 chars of [A-Za-z0-9_-], not starting with '-'")] + VmId(String), + #[error("invalid --cgroup {0:?}: unknown key or value outside its grammar")] + Cgroup(String), + #[error("invalid --api-sock {0:?}: must be / with name in [A-Za-z0-9_.-]")] + Sock(String), + #[error("--uid/--gid {value} is zero or outside the configured range [{lo}, {hi}]")] + IdNumber { value: u32, lo: u32, hi: u32 }, + #[error("too many --cgroup settings: {0} (max {MAX_CGROUPS})")] + TooManyCgroups(usize), + #[error(transparent)] + Bin(#[from] BinError), + #[error(transparent)] + Privilege(#[from] setuid_privileged::Error), + #[error("argument contains an interior NUL byte")] + NulArgument, + #[error("execve {path:?} failed: {errno}")] + Exec { path: PathBuf, errno: Errno }, +} + +/// `n != 0 && lo <= n <= hi`. uid/gid 0 is rejected unconditionally: a jailer run +/// with uid 0 skips its privilege drop and leaves firecracker running as root. +pub fn validate_id_number(n: u32, range: (u32, u32)) -> Result { + let (lo, hi) = range; + if n != 0 && lo <= n && n <= hi { + Ok(n) + } else { + Err(Error::IdNumber { value: n, lo, hi }) + } +} + +/// A VM id used as a chroot subdirectory name: `[A-Za-z0-9_-]`, length `1..=64`, +/// first character not `-` (so it can never be read as a flag). No `/`, `.`, NUL, +/// or whitespace can appear, so it can never traverse out of the chroot base. +#[derive(Debug, Clone)] +pub struct VmId(String); + +impl FromStr for VmId { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::VmId(s.to_string()); + let bytes = s.as_bytes(); + if !(1..=64).contains(&bytes.len()) { + return Err(reject()); + } + if bytes[0] == b'-' { + return Err(reject()); + } + if !bytes + .iter() + .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for VmId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +/// `1..=20` ASCII digits — bounds a numeric cgroup limit without pulling in a +/// regex engine. The upper bound comfortably exceeds `u64::MAX`'s 20 digits. +fn is_digits_1_20(s: &str) -> bool { + !s.is_empty() && s.len() <= 20 && s.bytes().all(|b| b.is_ascii_digit()) +} + +/// A single `KEY=VALUE` cgroup setting from an allowlist. The helper re-emits +/// `key=value` itself from the canonical key, so the jailer never sees the +/// caller's raw bytes. Per-key value grammar: +/// - `memory.max` : `[0-9]{1,20}` or the literal `max` +/// - `cpu.max` : `[0-9]{1,20} [0-9]{1,20}` or `max [0-9]{1,20}` +#[derive(Debug, Clone)] +pub struct CgroupSetting { + key: &'static str, + value: String, +} + +impl FromStr for CgroupSetting { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Cgroup(s.to_string()); + // Split on the FIRST `=`. None of the value grammars contains a `=`, so a + // second `=` lands in `value` and is rejected by the grammar check below. + let (raw_key, value) = s.split_once('=').ok_or_else(reject)?; + + let key: &'static str = match raw_key { + "memory.max" => "memory.max", + "cpu.max" => "cpu.max", + _ => return Err(reject()), + }; + + let valid = match key { + "memory.max" => value == "max" || is_digits_1_20(value), + "cpu.max" => match value.split_once(' ') { + Some((quota, period)) => { + (quota == "max" || is_digits_1_20(quota)) && is_digits_1_20(period) + } + None => false, + }, + _ => false, + }; + + if valid { + Ok(Self { + key, + value: value.to_string(), + }) + } else { + Err(reject()) + } + } +} + +impl fmt::Display for CgroupSetting { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// The firecracker API socket path: an absolute path that is exactly `/` plus one +/// filename in `[A-Za-z0-9_.-]`. The charset excludes `/`, so the value is always +/// a direct child of `/` with no extra components and no traversal; `.`/`..` as +/// the whole filename are rejected explicitly. +#[derive(Debug, Clone)] +pub struct JailSock(String); + +impl FromStr for JailSock { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Sock(s.to_string()); + if s.len() > MAX_SOCK_LEN { + return Err(reject()); + } + let name = s.strip_prefix('/').ok_or_else(reject)?; + if name.is_empty() || name == "." || name == ".." { + return Err(reject()); + } + if !name + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for JailSock { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +#[derive(Args)] +pub struct JailerArgs { + /// Microvm id; becomes the chroot subdirectory name. + #[arg(long)] + id: VmId, + /// Unprivileged uid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + uid: u32, + /// Unprivileged gid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + gid: u32, + /// Repeatable `KEY=VALUE` cgroup setting from the allowlist. + #[arg(long = "cgroup")] + cgroup: Vec, + /// Absolute firecracker API socket path (single filename under `/`). + #[arg(long = "api-sock")] + api_sock: JailSock, +} + +fn cstr_bytes(bytes: &[u8]) -> Result { + CString::new(bytes).map_err(|_| Error::NulArgument) +} + +fn cstr_str(s: &str) -> Result { + cstr_bytes(s.as_bytes()) +} + +fn cstr_path(p: &Path) -> Result { + cstr_bytes(p.as_os_str().as_bytes()) +} + +/// Build the exact argv handed to the jailer. argv[0] is the jailer path. The +/// caller never names the jailer, the `--exec-file`, the chroot base, the cgroup +/// version, or the parent cgroup — those are derived from trusted config here. +#[allow(clippy::too_many_arguments)] +fn build_argv( + jailer: &Path, + id: &VmId, + firecracker: &Path, + uid: u32, + gid: u32, + jail_base: &Path, + parent_cgroup: &str, + cgroups: &[CgroupSetting], + api_sock: &JailSock, +) -> Result, Error> { + let mut argv = vec![ + cstr_path(jailer)?, + cstr_str("--id")?, + cstr_str(&id.to_string())?, + cstr_str("--exec-file")?, + cstr_path(firecracker)?, + cstr_str("--uid")?, + cstr_str(&uid.to_string())?, + cstr_str("--gid")?, + cstr_str(&gid.to_string())?, + cstr_str("--chroot-base-dir")?, + cstr_path(jail_base)?, + cstr_str("--cgroup-version")?, + cstr_str("2")?, + cstr_str("--parent-cgroup")?, + cstr_str(parent_cgroup)?, + ]; + + for cg in cgroups { + argv.push(cstr_str("--cgroup")?); + argv.push(cstr_str(&cg.to_string())?); + } + + argv.push(cstr_str("--")?); + argv.push(cstr_str("--api-sock")?); + argv.push(cstr_str(&api_sock.to_string())?); + + Ok(argv) +} + +/// Close every inherited fd above stdio so a compromised BEAM cannot smuggle an +/// open fd into the root jailer. Keep 0/1/2: MuonTrap supervises the jailer +/// through stdio, and stderr carries our exec-failure message. `close_range(2)` +/// needs Linux 5.9+; on any failure (ENOSYS or otherwise) we fall back to +/// closing each fd individually — fail closed before handing root to the jailer. +fn close_inherited_fds() { + const FIRST: u32 = 3; + // SAFETY: raw syscall with no memory operands; closing fds has no UB. + let rc = unsafe { nix::libc::close_range(FIRST, u32::MAX, 0) }; + if rc == 0 { + return; + } + + // SAFETY: sysconf is a pure query of a system limit. + let max = unsafe { nix::libc::sysconf(nix::libc::_SC_OPEN_MAX) }; + let max = if max < 0 { 4096 } else { max as i32 }; + for fd in (FIRST as i32)..max { + // SAFETY: closing an arbitrary fd is safe; EBADF for unused fds is ignored. + unsafe { + nix::libc::close(fd); + } + } +} + +/// Validate the caller's args, then permanently become root and `execve` the +/// jailer. On success this never returns (the process image is replaced); the +/// `Ok` arm is therefore [`std::convert::Infallible`]. Every failure is returned +/// as [`Error`] for the caller to print and exit non-zero. +pub fn run(args: JailerArgs) -> Result { + let config = Config::get(); + + // Resolve everything that can fail as the REAL uid, before any privilege is + // raised: config accessors, binary validation, range, and arg validation. + let jailer: PathBuf = config.jailer()?.into(); + let firecracker: PathBuf = config.firecracker()?.into(); + let jail_base = config.jail_base(); + let parent_cgroup = config.parent_cgroup(); + let range = config.uid_gid_range(); + + let uid = validate_id_number(args.uid, range)?; + let gid = validate_id_number(args.gid, range)?; + + if args.cgroup.len() > MAX_CGROUPS { + return Err(Error::TooManyCgroups(args.cgroup.len())); + } + + let argv = build_argv( + &jailer, + &args.id, + &firecracker, + uid, + gid, + &jail_base, + parent_cgroup, + &args.cgroup, + &args.api_sock, + )?; + let jailer_cstr = cstr_path(&jailer)?; + + // Point of no return: from here we are permanently root and must execve. + setuid_privileged::become_root_permanently()?; + close_inherited_fds(); + + // Empty envp: once ruid==0 the kernel clears AT_SECURE, so a smuggled + // LD_PRELOAD/LD_LIBRARY_PATH would be honored by the dynamic loader and + // hijack the root jailer. We pass nothing and let the jailer build its own. + let empty_env: [CString; 0] = []; + let errno = nix::unistd::execve(&jailer_cstr, &argv, &empty_env) + .expect_err("execve only returns on failure"); + Err(Error::Exec { + path: jailer, + errno, + }) +} diff --git a/native/suidhelper/src/tools/losetup.rs b/native/suidhelper/src/tools/losetup.rs index fa8a49d9..aadb5b67 100644 --- a/native/suidhelper/src/tools/losetup.rs +++ b/native/suidhelper/src/tools/losetup.rs @@ -66,6 +66,8 @@ enum LosetupOp { Attach(AttachArgs), /// Detach a loop device. Detach { dev: LoopDev }, + /// List loop devices as `NAME BACK-FILE` rows (for stale-device reclaim). + List, } #[derive(Serialize)] @@ -73,6 +75,7 @@ enum LosetupOp { pub enum LosetupOut { Attached { device: PathBuf }, Detached, + Listed { output: String }, } pub struct Losetup { @@ -101,6 +104,15 @@ impl IsTool for Losetup { let dev: &Path = dev.as_ref(); cmd.arg("-d").arg(dev); } + LosetupOp::List => { + cmd.args([ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE", + ]); + } } cmd.env_clear().output() @@ -119,6 +131,9 @@ impl IsTool for Losetup { device: PathBuf::from(String::from_utf8_lossy(&out.stdout).trim()), }, LosetupOp::Detach { .. } => LosetupOut::Detached, + LosetupOp::List => LosetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/mod.rs b/native/suidhelper/src/tools/mod.rs index 10c53c47..9f9fedae 100644 --- a/native/suidhelper/src/tools/mod.rs +++ b/native/suidhelper/src/tools/mod.rs @@ -1,11 +1,13 @@ //! Per-tool CLI fragments and their `IsTool` implementations. Each tool lives in -//! its own submodule and owns its own error type, operand validation, and `--bin` -//! parser; this module owns the shared trait, the `Tool` subcommand tree, and the -//! privilege boundary. +//! its own submodule and owns its own error type and operand validation; this +//! module owns the shared trait, the `Tool` subcommand tree, and the privilege +//! boundary. The binary each tool runs is resolved from the trusted config here, +//! never passed by the caller. mod blockdev; pub mod chroot_jail; mod dmsetup; +pub mod jailer; mod losetup; pub use blockdev::{Blockdev, BlockdevArgs}; @@ -13,15 +15,15 @@ pub use chroot_jail::ChrootJailOp; pub use dmsetup::{DmTable, Dmsetup, DmsetupArgs, ThinMessage}; pub use losetup::{Losetup, LosetupArgs}; -use crate::util::safe_bin::SafeBin; +use crate::config::Config; use crate::util::setuid_privileged::{self, Privileged}; use clap::Subcommand; use serde::Serialize; use thiserror::Error as ThisError; -/// Errors of the dispatch layer: whatever the privilege guard or the chosen tool -/// raises on the way out. (`--bin` and operand validation are handled by clap at -/// parse time, so they never reach here.) +/// Errors of the dispatch layer: an invalid configured binary (`SafeBin`), the +/// privilege guard, or the chosen tool's own failure on the way out. (Operand +/// validation is handled by clap at parse time, so it never reaches here.) #[derive(Debug, ThisError)] pub enum Error { #[error(transparent)] @@ -65,28 +67,23 @@ pub trait IsTool { } } -/// The subcommand tree: one subcommand per tool, each taking its own `--bin` -/// with the tool-specific args flattened in from the submodule. +/// The subcommand tree: one subcommand per tool, with the tool-specific args +/// flattened in from the submodule. The binary each tool runs is not a caller +/// argument - it comes from the root-owned config (see [`Config`]). #[derive(Subcommand)] pub enum Tool { /// Attach/detach loop devices. Losetup { - #[arg(long)] - bin: SafeBin<"losetup">, #[command(flatten)] args: LosetupArgs, }, /// Create/remove device-mapper snapshot devices. Dmsetup { - #[arg(long)] - bin: SafeBin<"dmsetup">, #[command(flatten)] args: DmsetupArgs, }, /// Query a block device's size. Blockdev { - #[arg(long)] - bin: SafeBin<"blockdev">, #[command(flatten)] args: BlockdevArgs, }, @@ -99,13 +96,24 @@ pub enum Tool { impl Tool { /// Dispatch to the selected tool's `run` (or, for `chroot-jail`, its nested - /// op), returning its already-serialized `Value`. The `--bin` is already - /// validated (it is a `SafeBin`, constructed only by its value parser). + /// op), returning its already-serialized `Value`. The binary path is taken + /// from the trusted config and validated (`SafeBin`) here, as the real uid, + /// before any privilege is acquired. pub fn run(self) -> Result { + let config = Config::get(); match self { - Tool::Losetup { bin, args } => Losetup::new(bin.into(), args).run(), - Tool::Dmsetup { bin, args } => Dmsetup::new(bin.into(), args).run(), - Tool::Blockdev { bin, args } => Blockdev::new(bin.into(), args).run(), + Tool::Losetup { args } => { + let bin = config.losetup().map_err(|e| Error::Tool(Box::new(e)))?; + Losetup::new(bin.into(), args).run() + } + Tool::Dmsetup { args } => { + let bin = config.dmsetup().map_err(|e| Error::Tool(Box::new(e)))?; + Dmsetup::new(bin.into(), args).run() + } + Tool::Blockdev { args } => { + let bin = config.blockdev().map_err(|e| Error::Tool(Box::new(e)))?; + Blockdev::new(bin.into(), args).run() + } Tool::ChrootJail { op } => op.run(), } } diff --git a/native/suidhelper/src/util/chroot_jail.rs b/native/suidhelper/src/util/chroot_jail.rs index 70d4ce78..6a618187 100644 --- a/native/suidhelper/src/util/chroot_jail.rs +++ b/native/suidhelper/src/util/chroot_jail.rs @@ -43,6 +43,12 @@ pub enum Error { }, #[error("copy kernel: {0}")] Copy(#[source] io::Error), + #[error("resolving block device {path}: {source}")] + ResolveDev { + path: PathBuf, + #[source] + source: io::Error, + }, } /// An unfilled artifact slot. @@ -190,8 +196,18 @@ pub fn stage_kernel_under( /// `uid:gid`. The device is opened as a verified `SafeFile`, so the /// number comes from a real device node, never a caller-supplied value. fn make_rootfs(chroot: &SafeDir, device: &BlockDev, uid: u32, gid: u32) -> Result<(), Error> { - let dev_path: SafePath = - device.as_ref().to_path_buf().try_into()?; + // `device` is lexically validated as `/dev/loopN` or `/dev/mapper/hyper-*`. + // The dm form is a symlink under `/dev/mapper` — a root-owned `0755` dir an + // unprivileged caller cannot write — so resolving it to its real `/dev/dm-N` + // node cannot be redirected by the caller. We must resolve first: `SafeFile` + // opens `O_NOFOLLOW` (it must never follow an attacker-supplied symlink) and + // would otherwise reject the dm symlink itself. Loop nodes resolve to + // themselves. The re-opened target is still verified `IsBlockDevice`. + let real = std::fs::canonicalize(device.as_ref()).map_err(|source| Error::ResolveDev { + path: device.as_ref().to_path_buf(), + source, + })?; + let dev_path: SafePath = real.try_into()?; let dev = SafeFile::::open(&dev_path, OFlag::O_PATH)?; let rdev = dev.rdev()?; chroot.mknod_block(Path::new(ROOTFS_NAME), rdev, uid, gid)?; diff --git a/native/suidhelper/src/util/safe_bin.rs b/native/suidhelper/src/util/safe_bin.rs index 8f323aa7..91084407 100644 --- a/native/suidhelper/src/util/safe_bin.rs +++ b/native/suidhelper/src/util/safe_bin.rs @@ -1,17 +1,16 @@ -//! A validated `--bin` path. +//! A validated tool-binary path. //! -//! The caller names the binary to run, but it must be the expected tool and a -//! binary only root could have produced. [`SafeBin`] is a newtype whose only -//! constructor runs those checks, so holding one is proof the path was -//! validated. The const string parameter `NAME` is the basename it was validated -//! against - a `SafeBin<"losetup">` can never be passed where a -//! `SafeBin<"dmsetup">` is wanted. Combined with the [`FromStr`] impl (see -//! `tools`), clap validates the path at argument-parse time with no per-tool -//! boilerplate. +//! The path of each device tool (`losetup`, `dmsetup`, `blockdev`) comes from +//! the root-owned config file, never from the unprivileged caller. [`SafeBin`] +//! is a newtype whose only constructor runs the safety checks, so holding one is +//! proof the path was validated. The const string parameter `NAME` is the +//! basename it was validated against - a `SafeBin<"losetup">` can never be +//! passed where a `SafeBin<"dmsetup">` is wanted. //! -//! These checks are what keep this from being arbitrary-root-execution: an -//! unprivileged caller cannot point us at a binary it controls (must be -//! root-owned, not group/other-writable, not a symlink, exact basename). +//! These checks are what keep this from being arbitrary-root-execution: even a +//! mistaken config entry cannot point us at a binary a non-root user controls +//! (must be an absolute path, the exact basename, root-owned, not a symlink, not +//! group/other-writable). use std::ffi::OsStr; use std::fs; @@ -23,14 +22,14 @@ use thiserror::Error as ThisError; #[derive(Debug, ThisError)] pub enum Error { - #[error("--bin must be an absolute path: {0}")] + #[error("binary path must be absolute: {0}")] NotAbsolute(PathBuf), - #[error("--bin basename must be `{expected}`: {got}")] + #[error("binary basename must be `{expected}`: {got}")] Name { expected: &'static str, got: PathBuf, }, - #[error("--bin {path}: {source}")] + #[error("binary {path}: {source}")] Stat { path: PathBuf, #[source] @@ -44,22 +43,17 @@ pub enum Error { Writable(PathBuf), } -/// A `--bin` path validated to have basename `NAME`. The wrapped path is private -/// and the only constructor is the [`FromStr`] impl, so a `SafeBin` value cannot -/// exist without having been checked. +/// A tool-binary path validated to have basename `NAME`. The wrapped path is +/// private and the only constructor is [`SafeBin::from_path`], so a `SafeBin` +/// value cannot exist without having been checked. #[derive(Debug, Clone)] pub struct SafeBin(PathBuf); -// Lets clap validate `--bin` at parse time straight into a `SafeBin`, with -// no per-tool value parser: the const basename is the whole spec. Validates that -// `s` is an absolute path with basename `NAME`, a regular root-owned file no -// non-root user could have written. -impl FromStr for SafeBin { - type Err = Error; - - fn from_str(s: &str) -> Result { - let bin = Path::new(s); - +impl SafeBin { + /// Validate `bin` as the `NAME` tool binary: an absolute path with basename + /// `NAME`, a real (non-symlink) regular file owned by root that no non-root + /// user could have written. These checks are the whole point of the type. + pub fn from_path(bin: &Path) -> Result { if !bin.is_absolute() { return Err(Error::NotAbsolute(bin.to_path_buf())); } @@ -93,6 +87,16 @@ impl FromStr for SafeBin { } } +// Lets a string parse straight into a validated `SafeBin` (used by the +// test suite); delegates to the single `from_path` constructor. +impl FromStr for SafeBin { + type Err = Error; + + fn from_str(s: &str) -> Result { + Self::from_path(Path::new(s)) + } +} + // Read the validated path back out; the "validated" guarantee stays attached to // the `SafeBin` type until this conversion. impl From> for PathBuf { diff --git a/native/suidhelper/src/util/safe_dir.rs b/native/suidhelper/src/util/safe_dir.rs index 388ae838..0e4c6870 100644 --- a/native/suidhelper/src/util/safe_dir.rs +++ b/native/suidhelper/src/util/safe_dir.rs @@ -17,8 +17,8 @@ use super::safe_path::SafePath; use nix::dir::{Dir, Type}; use nix::fcntl::{openat, AtFlags, OFlag}; use nix::libc::dev_t; -use nix::sys::stat::{mknodat, Mode, SFlag}; -use nix::unistd::{dup, fchownat, linkat, unlinkat, Gid, Uid, UnlinkatFlags}; +use nix::sys::stat::{fchmod, fchmodat, fstatat, mknodat, FchmodatFlags, FileStat, Mode, SFlag}; +use nix::unistd::{dup, fchown, fchownat, linkat, unlinkat, Gid, Uid, UnlinkatFlags}; use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd}; @@ -37,6 +37,10 @@ pub enum Error { Mknod { name: PathBuf, source: nix::Error }, #[error("fchownat {name:?}: {source}")] Chown { name: PathBuf, source: nix::Error }, + #[error("fchmodat {name:?}: {source}")] + Chmod { name: PathBuf, source: nix::Error }, + #[error("fstatat {name:?}: {source}")] + Stat { name: PathBuf, source: nix::Error }, #[error("linkat -> {name:?}: {source}")] Link { name: PathBuf, source: nix::Error }, #[error("dup: {0}")] @@ -52,6 +56,8 @@ impl Error { | Error::Unlink { source, .. } | Error::Mknod { source, .. } | Error::Chown { source, .. } + | Error::Chmod { source, .. } + | Error::Stat { source, .. } | Error::Link { source, .. } => Some(*source), Error::ReadDir(source) | Error::Dup(source) => Some(*source), } @@ -180,6 +186,55 @@ impl SafeDir { }) } + /// `fstat` entry `name` relative to this dir's fd without following a final + /// symlink (`AT_SYMLINK_NOFOLLOW`). A symlink stats as itself (`S_IFLNK`), + /// never its target, so a caller inspecting the file type can reject one. + pub fn stat(&self, name: &Path) -> Result { + fstatat(Some(self.0.as_raw_fd()), name, AtFlags::AT_SYMLINK_NOFOLLOW).map_err(|source| { + Error::Stat { + name: name.to_path_buf(), + source, + } + }) + } + + /// `chmod` entry `name` to `mode`. Linux's `fchmodat` has no working + /// no-follow mode (it returns `ENOTSUP`), so this follows a final symlink; + /// call it only after [`stat`](Self::stat) has proven `name` is not a + /// symlink, so the follow is a no-op on a real (non-link) entry. + pub fn chmod(&self, name: &Path, mode: u32) -> Result<(), Error> { + fchmodat( + Some(self.0.as_raw_fd()), + name, + Mode::from_bits_truncate(mode), + FchmodatFlags::FollowSymlink, + ) + .map_err(|source| Error::Chmod { + name: name.to_path_buf(), + source, + }) + } + + /// `fchmod` this directory through its own held fd. Unlike [`chmod`](Self::chmod), + /// which re-resolves a *name*, this targets the fd we already opened + /// `O_NOFOLLOW`, so there is no path component to swap - TOCTOU-safe on the + /// directory itself. + pub fn chmod_self(&self, mode: u32) -> Result<(), Error> { + fchmod(self.0.as_raw_fd(), Mode::from_bits_truncate(mode)).map_err(|source| Error::Chmod { + name: PathBuf::from("."), + source, + }) + } + + /// `fchown` this directory's group through its own held fd, preserving its + /// owner (no uid passed). Same TOCTOU guarantee as [`chmod_self`](Self::chmod_self). + pub fn chgrp_self(&self, gid: u32) -> Result<(), Error> { + fchown(self.0.as_raw_fd(), None, Some(Gid::from_raw(gid))).map_err(|source| Error::Chown { + name: PathBuf::from("."), + source, + }) + } + /// Remove the non-directory entry `name` from this directory. pub fn unlink(&self, name: &Path) -> Result<(), Error> { unlinkat(Some(self.0.as_raw_fd()), name, UnlinkatFlags::NoRemoveDir).map_err(|source| { diff --git a/native/suidhelper/src/util/setuid_privileged.rs b/native/suidhelper/src/util/setuid_privileged.rs index 3c58bd39..e334b115 100644 --- a/native/suidhelper/src/util/setuid_privileged.rs +++ b/native/suidhelper/src/util/setuid_privileged.rs @@ -12,17 +12,20 @@ //! constructor nor a Drop impl can report an error, and silently staying root //! would defeat the point. -use nix::unistd::{getuid, seteuid, Uid}; +use nix::unistd::{getuid, seteuid, setgroups, setresgid, setresuid, Gid, Uid}; use thiserror::Error as ThisError; -/// Failures of the privilege transitions. Both are fatal: if we can't raise we -/// aren't installed setuid root, and if we can't lower we refuse to keep running. +/// Failures of the privilege transitions. All fatal: if we can't raise we aren't +/// installed setuid root, if we can't lower we refuse to keep running, and if we +/// can't seal permanent root we refuse to hand off to the execve target. #[derive(Debug, ThisError)] pub enum Error { #[error("not installed setuid root")] NotSetuidRoot, #[error("failed to drop privileges")] DropPrivileges, + #[error("failed to acquire permanent root for execve handoff")] + PermanentRoot, } /// `.preinit_array` runs before `.init_array` and before any shared-library @@ -76,6 +79,36 @@ impl Privileged { } } +/// Re-acquire full root **permanently** for an `execve` handoff and return — +/// there is deliberately no [`Privileged`] Drop guard, because `execve` replaces +/// the entire process image: nothing of this process survives to run a destructor, +/// and the new image (the firecracker jailer) is responsible for dropping its own +/// privileges. We must hand it a *genuine* root process (all of real, effective +/// and saved uid == 0) so the jailer's own privilege-drop is the real thing. +/// +/// Order matters and each step needs the privilege the previous one preserves: +/// 1. `seteuid(0)` — regain effective root; without it the rest are EPERM. +/// 2. `setresgid(0,0,0)` — set every gid to root *before* touching uids, while +/// we still hold euid 0 (gid changes require privilege). +/// 3. `setgroups([0])` — `drop_to_real` only lowered euid; it never touched the +/// caller's supplementary groups, so the jailer would otherwise inherit them. +/// Reset to just {0} now, while still privileged. +/// 4. `setresuid(0,0,0)` LAST — this seals real+effective+saved uid to root. It +/// goes last because once the saved uid is 0 there is no escape hatch left +/// (which is the point: permanent), and because it must follow the gid/group +/// changes that needed our euid-0 to be permitted. +pub fn become_root_permanently() -> Result<(), Error> { + let root_uid = Uid::from_raw(0); + let root_gid = Gid::from_raw(0); + + seteuid(root_uid).map_err(|_| Error::NotSetuidRoot)?; + setresgid(root_gid, root_gid, root_gid).map_err(|_| Error::PermanentRoot)?; + setgroups(&[root_gid]).map_err(|_| Error::PermanentRoot)?; + setresuid(root_uid, root_uid, root_uid).map_err(|_| Error::PermanentRoot)?; + + Ok(()) +} + impl Drop for Privileged { fn drop(&mut self) { if lower().is_err() { diff --git a/native/suidhelper/tests/config_uid_gid_range.rs b/native/suidhelper/tests/config_uid_gid_range.rs new file mode 100644 index 00000000..1f176e9f --- /dev/null +++ b/native/suidhelper/tests/config_uid_gid_range.rs @@ -0,0 +1,59 @@ +//! Properties of the `uid_gid_range` configuration field. +//! +//! Contracts under test: +//! - A valid range (min >= 1, min <= max) is always accepted. +//! - Absent range yields the built-in default (900_000, 999_999). +//! - A valid range round-trips through TOML deserialization + uid_gid_range(). +//! - min == 0 is always rejected (uid 0 means root; the jailer must never +//! receive it — it skips its privilege drop when uid == 0). +//! - min > max is always rejected (incoherent range; likely a config typo). + +use hyper_suidhelper::config::{validate_uid_gid_range, Config, LoadingError, UidGidRange}; +use proptest::prelude::*; + +#[test] +fn absent_range_yields_default() { + assert_eq!(Config::default().uid_gid_range(), (900_000, 999_999)); +} + +proptest! { + #[test] + fn valid_range_accepted(min in 1u32.., delta in 0u32..) { + // max = min + delta, saturating so it never wraps past u32::MAX. + let max = min.saturating_add(delta); + let r = UidGidRange { min, max }; + prop_assert!(validate_uid_gid_range(&r).is_ok()); + } + + #[test] + fn valid_range_round_trips_via_toml(min in 1u32.., delta in 0u32..) { + let max = min.saturating_add(delta); + let body = format!( + "work_dir = \"/srv/hyper\"\n[uid_gid_range]\nmin = {min}\nmax = {max}\n" + ); + let config: Config = toml::from_str(&body).expect("valid TOML"); + prop_assert_eq!(config.uid_gid_range(), (min, max)); + } + + #[test] + fn zero_min_always_rejected(max in 0u32..) { + let r = UidGidRange { min: 0, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { min: 0, .. }) + ); + prop_assert!(rejected); + } + + #[test] + fn min_exceeds_max_always_rejected(max in 0u32..u32::MAX) { + // min = max + 1 is always strictly greater than max and always >= 1. + let min = max + 1; + let r = UidGidRange { min, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { .. }) + ); + prop_assert!(rejected); + } +} diff --git a/native/suidhelper/tests/e2e/argv.rs b/native/suidhelper/tests/e2e/argv.rs index 6ebf997a..dcda8db9 100644 --- a/native/suidhelper/tests/e2e/argv.rs +++ b/native/suidhelper/tests/e2e/argv.rs @@ -1,7 +1,7 @@ //! L4: prove the exact argv (and empty env) the helper hands to the child tool — -//! the one thing the design deliberately hides from the caller. We point `--bin` -//! at a root-owned fake that writes its argv+env to a file as JSON, then assert -//! on the reconstructed command line. +//! the one thing the design deliberately hides from the caller. We point the +//! tool's config path at a root-owned fake that writes its argv+env to a file as +//! JSON, then assert on the reconstructed command line. #![cfg(feature = "insecure_test_seams")] use std::fs; @@ -31,9 +31,16 @@ fn install_fake(dir: &Path, basename: &str, record: &Path, stdout_line: &str) -> path // root-owned because this test runs as root } -fn write_root_config(dir: &Path) -> PathBuf { +/// Write a root-owned config that points the named tools at the given (fake) +/// binaries, so the helper resolves each tool's path from config rather than a +/// caller argument. +fn write_root_config(dir: &Path, bins: &[(&str, &Path)]) -> PathBuf { let p = dir.join("config.toml"); - fs::write(&p, "work_dir = \"/srv/hyper\"\n").unwrap(); + let mut body = String::from("work_dir = \"/srv/hyper\"\n"); + for (key, path) in bins { + body.push_str(&format!("{key} = \"{}\"\n", path.display())); + } + fs::write(&p, body).unwrap(); fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); p } @@ -60,17 +67,15 @@ fn dmsetup_create_snapshot_reconstructs_canonical_table_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); // Deliberately weird inner spacing; the helper must re-render canonically. let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "create", "hyper-vm1", "--readonly", @@ -106,21 +111,11 @@ fn dmsetup_remove_retry_toggle_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); - let out = run( - &cfg, - &[ - "dmsetup", - "--bin", - bin.to_str().unwrap(), - "remove", - "--retry", - "hyper-vm1", - ], - ); + let out = run(&cfg, &["dmsetup", "remove", "--retry", "hyper-vm1"]); assert_eq!(out.status.code(), Some(0)); assert_eq!(recorded_argv(&rec), vec!["remove", "--retry", "hyper-vm1"]); } @@ -132,16 +127,14 @@ fn dmsetup_message_create_thin_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "message", "hyper-pool", "--message", @@ -156,6 +149,116 @@ fn dmsetup_message_create_thin_as_root() { ); } +#[test] +fn dmsetup_targets_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_targets: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + // Fake prints one `dmsetup targets` row; the helper returns it verbatim. + let bin = install_fake(tmp.path(), "dmsetup", &rec, "snapshot v1.16.0"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["targets"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "targets"); + assert_eq!(json["output"], "snapshot v1.16.0\n"); +} + +#[test] +fn dmsetup_ls_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_ls: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake(tmp.path(), "dmsetup", &rec, "hyper-thinpool\\nhyper-rw-abc"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "ls"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["ls"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!(json["output"], "hyper-thinpool\nhyper-rw-abc\n"); +} + +#[test] +fn losetup_list_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP losetup_list: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake( + tmp.path(), + "losetup", + &rec, + "/dev/loop0 /srv/hyper/scratch/thinpool.meta", + ); + let cfg = write_root_config(tmp.path(), &[("losetup", &bin)]); + + let out = run(&cfg, &["losetup", "list"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!( + recorded_argv(&rec), + vec![ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE" + ] + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!( + json["output"], + "/dev/loop0 /srv/hyper/scratch/thinpool.meta\n" + ); +} + +#[test] +fn dmsetup_rejects_configured_bin_with_wrong_basename_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_rejects_bin: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // A real, root-owned system file, but the wrong basename for `dmsetup`. + let cfg = write_root_config(tmp.path(), &[("dmsetup", Path::new("/usr/bin/env"))]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_ne!( + out.status.code(), + Some(0), + "a configured binary with the wrong basename must be refused" + ); + let err = String::from_utf8_lossy(&out.stderr); + assert!(err.contains("basename must be"), "stderr: {err}"); +} + #[test] fn blockdev_getsz_argv_and_parse_as_root() { if !is_root() { @@ -163,21 +266,12 @@ fn blockdev_getsz_argv_and_parse_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); // Fake prints "2048" as the sector count for the helper to parse. let bin = install_fake(tmp.path(), "blockdev", &rec, "2048"); + let cfg = write_root_config(tmp.path(), &[("blockdev", &bin)]); - let out = run( - &cfg, - &[ - "blockdev", - "--bin", - bin.to_str().unwrap(), - "--getsz", - "/dev/loop0", - ], - ); + let out = run(&cfg, &["blockdev", "--getsz", "/dev/loop0"]); assert_eq!( out.status.code(), Some(0), diff --git a/native/suidhelper/tests/e2e/config.rs b/native/suidhelper/tests/e2e/config.rs index cd3ba333..ef57c97b 100644 --- a/native/suidhelper/tests/e2e/config.rs +++ b/native/suidhelper/tests/e2e/config.rs @@ -3,6 +3,8 @@ //! tempfile instead of /etc/hyper, so tests never touch the real host. #![cfg(feature = "insecure_test_seams")] +use hyper_suidhelper::config::{BinError, Config}; +use hyper_suidhelper::util::safe_bin; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -34,17 +36,28 @@ fn write_root_config(dir: &Path, body: &str) -> std::path::PathBuf { p } -/// A missing config file is refused at SafeFile::open with exit code 2. -/// The error is LoadingError::File(ValidationError::Open(ENOENT)), -/// which displays as "open failed: ". +/// A genuinely-absent config file is NOT an error: the helper falls back to the +/// built-in defaults (compiled into this root-owned binary, hence trusted). The +/// default `work_dir` is `/srv/hyper`. Needs root because `sys-test` then +/// acquires privileges to prove it can promote. #[test] -fn missing_config_exits_2() { +fn missing_config_falls_back_to_defaults_as_root() { + if !is_root() { + eprintln!("SKIP missing_config defaults: sys-test needs root"); + return; + } let tmp = tempfile::tempdir().unwrap(); let missing = tmp.path().join("nope.toml"); let out = run_with_config(&missing, &["sys-test"]); - assert_eq!(out.status.code(), Some(2), "missing config must exit 2"); - let err = String::from_utf8_lossy(&out.stderr); - assert!(err.contains("open failed"), "stderr: {err}"); + assert_eq!( + out.status.code(), + Some(0), + "absent config should use defaults; stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).expect("stdout is JSON"); + assert_eq!(json["sys_test"], "ok"); + assert_eq!(json["hyper_base"], "/srv/hyper"); } #[test] @@ -115,3 +128,95 @@ fn valid_config_and_setuid_yields_sys_test_ok_as_root() { assert_eq!(json["sys_test"], "ok"); assert_eq!(json["hyper_base"], "/srv/hyper"); } + +#[test] +fn firecracker_unconfigured_when_absent() { + // Config::default() has firecracker == None; the accessor must signal this + // distinctly so callers can report a missing-configuration error rather than + // a safe_bin validation error. + let err = Config::default() + .firecracker() + .expect_err("absent firecracker must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("firecracker")), + "expected Unconfigured(\"firecracker\"), got {err:?}", + ); +} + +#[test] +fn jailer_unconfigured_when_absent() { + let err = Config::default() + .jailer() + .expect_err("absent jailer must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("jailer")), + "expected Unconfigured(\"jailer\"), got {err:?}", + ); +} + +#[test] +fn jailer_basename_mismatch_rejected() { + // The basename check in SafeBin::from_path precedes the stat, so we do not + // need a real file — any absolute path with the wrong leaf name is enough. + let body = "work_dir = \"/srv/hyper\"\njailer = \"/usr/local/bin/not-jailer\"\n"; + let config: Config = toml::from_str(body).unwrap(); + let err = config + .jailer() + .expect_err("wrong-basename jailer path must be rejected"); + assert!( + matches!(err, BinError::Bin(safe_bin::Error::Name { .. })), + "expected a Name error, got {err:?}", + ); +} + +#[test] +fn firecracker_and_jailer_return_ok_when_root_owned_as_root() { + if !is_root() { + eprintln!("SKIP firecracker_jailer_configured: needs root to create root-owned binaries"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let fc = tmp.path().join("firecracker"); + let jr = tmp.path().join("jailer"); + // 0o755: root-owned, not group/other-writable — satisfies SafeBin's checks. + for p in [&fc, &jr] { + fs::write(p, b"#!/bin/sh\n").unwrap(); + fs::set_permissions(p, fs::Permissions::from_mode(0o755)).unwrap(); + } + let body = format!( + "work_dir = \"/srv/hyper\"\nfirecracker = \"{}\"\njailer = \"{}\"\n", + fc.display(), + jr.display(), + ); + let config: Config = toml::from_str(&body).unwrap(); + assert!( + config.firecracker().is_ok(), + "root-owned firecracker with correct basename must be accepted" + ); + assert!( + config.jailer().is_ok(), + "root-owned jailer with correct basename must be accepted" + ); +} + +#[test] +fn bad_uid_gid_range_exits_2_as_root() { + if !is_root() { + eprintln!("SKIP bad_uid_gid_range: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // min = 0 is the clearest violation: uid 0 is root, which the jailer must + // never receive because it skips its privilege drop when uid == 0. + let p = write_root_config( + tmp.path(), + "work_dir = \"/srv/hyper\"\n[uid_gid_range]\nmin = 0\nmax = 100\n", + ); + let out = run_with_config(&p, &["sys-test"]); + assert_eq!(out.status.code(), Some(2)); + let err = String::from_utf8_lossy(&out.stderr); + assert!( + err.contains("uid_gid_range"), + "expected a uid_gid_range error in stderr, got: {err}", + ); +} diff --git a/native/suidhelper/tests/e2e/jailer.rs b/native/suidhelper/tests/e2e/jailer.rs new file mode 100644 index 00000000..87084aef --- /dev/null +++ b/native/suidhelper/tests/e2e/jailer.rs @@ -0,0 +1,193 @@ +//! L4 for the jailer handoff: prove the exact argv (and an EMPTY environment) the +//! helper hands to the jailer via `execve`, and that the jailer's exit status +//! propagates. We point the config's `jailer` at a root-owned recorder that +//! writes its argv and its `/proc/self/environ` to files, then exits with a known +//! code. Root-only: `become_root_permanently` requires we are already root. +#![cfg(feature = "insecure_test_seams")] + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::process::Command; + +const HELPER: &str = env!("CARGO_BIN_EXE_hyper-suidhelper"); +const RECORDER_EXIT: i32 = 7; + +fn is_root() -> bool { + nix::unistd::geteuid().is_root() +} + +fn cat_bin() -> &'static str { + ["/bin/cat", "/usr/bin/cat"] + .into_iter() + .find(|p| Path::new(p).exists()) + .expect("a `cat` binary for the recorder") +} + +/// Install a root-owned recorder named `jailer` that writes its argv (minus +/// argv[0]) as a JSON array to `argv_rec` and copies its `/proc/self/environ` to +/// `env_rec`, then exits `RECORDER_EXIT`. Paths are baked into the script text +/// because the recorder runs with an empty environment and absolute `cat` so it +/// needs no `PATH`. Returns the recorder's absolute path. +fn install_recorder(dir: &Path, argv_rec: &Path, env_rec: &Path) -> PathBuf { + let path = dir.join("jailer"); + let script = format!( + "#!/bin/sh\n\ + (\n printf '['\n sep=''\n for a in \"$@\"; do printf '%s\"%s\"' \"$sep\" \"$a\"; sep=','; done\n printf ']'\n) > '{argv}'\n\ + {cat} /proc/self/environ > '{env}'\n\ + exit {code}\n", + argv = argv_rec.display(), + cat = cat_bin(), + env = env_rec.display(), + code = RECORDER_EXIT, + ); + fs::write(&path, script).unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o755)).unwrap(); + path +} + +/// A root-owned plain file with basename `firecracker` — the `--exec-file`. It is +/// never executed by us (the jailer would), only validated as a `SafeBin`. +fn install_firecracker(dir: &Path) -> PathBuf { + let path = dir.join("firecracker"); + fs::write(&path, b"#!/bin/true\n").unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap(); + path +} + +fn write_root_config(dir: &Path, jailer: &Path, firecracker: &Path) -> PathBuf { + let p = dir.join("config.toml"); + let body = format!( + "work_dir = \"/srv/hyper\"\njailer = \"{}\"\nfirecracker = \"{}\"\n", + jailer.display(), + firecracker.display(), + ); + fs::write(&p, body).unwrap(); + fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); + p +} + +fn run(config: &Path, args: &[&str]) -> std::process::Output { + Command::new(HELPER) + .args(args) + .env_clear() + .env("HYPER_SETUIDHELPER_IS_INSECURE_MODE", "1") + .env("HYPER_SETUIDHELPER_CONFIG_PATH", config) + .output() + .expect("spawn helper") +} + +#[test] +fn execs_jailer_with_canonical_argv_and_empty_env_as_root() { + if !is_root() { + eprintln!("SKIP jailer exec: needs root to become_root_permanently + own the fakes"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "900001", + "--gid", + "900002", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--api-sock", + "/api.sock", + ], + ); + + // The jailer's own exit status must propagate through the execve handoff. + assert_eq!( + out.status.code(), + Some(RECORDER_EXIT), + "exit status did not propagate; stderr: {}", + String::from_utf8_lossy(&out.stderr), + ); + + let argv: Vec = + serde_json::from_str(&fs::read_to_string(&argv_rec).expect("recorded argv")).unwrap(); + assert_eq!( + argv, + vec![ + "--id", + "vm1", + "--exec-file", + &firecracker.to_string_lossy(), + "--uid", + "900001", + "--gid", + "900002", + "--chroot-base-dir", + "/srv/hyper/jails", + "--cgroup-version", + "2", + "--parent-cgroup", + "hyper", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--", + "--api-sock", + "/api.sock", + ], + "helper handed the jailer a non-canonical argv", + ); + + // The execve envp must be empty: once ruid==0 a smuggled LD_PRELOAD would be + // honored, so nothing of the caller's environment may reach the root jailer. + let environ = fs::read(&env_rec).expect("recorded environ"); + assert!( + environ.is_empty(), + "jailer inherited a non-empty environment: {environ:?}", + ); +} + +#[test] +fn refuses_uid_zero_without_exec_as_root() { + if !is_root() { + eprintln!("SKIP jailer uid 0: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "0", + "--gid", + "900002", + "--api-sock", + "/api.sock", + ], + ); + + assert_ne!(out.status.code(), Some(0), "uid 0 must be refused"); + assert_eq!(out.status.code(), Some(2), "validation failure exits 2"); + assert!( + !argv_rec.exists(), + "the jailer must never have been exec'd for uid 0", + ); +} diff --git a/native/suidhelper/tests/tools/chroot_jail_grant_api.rs b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs new file mode 100644 index 00000000..e55a679b --- /dev/null +++ b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs @@ -0,0 +1,241 @@ +//! Contracts of the `chroot-jail grant-api` op, driven through the base-injected +//! `grant_api_under` seam so they run unprivileged in a tempdir. The promises +//! under test (refusal contracts first — they are the security boundary): +//! * shape — the socket is accepted iff it is exactly +//! `//root/api.socket` below the jail base; any other depth or a +//! leaf that is not `api.socket` is refused before any chown; +//! * lexical — a `.`/`..`/empty component or a relative path is always rejected +//! before any filesystem access; +//! * type — a regular file or a symlink planted at `api.socket` is refused +//! (`NotASocket`) and left untouched, never chmod'd; only a real socket is +//! granted; +//! * confinement — a symlinked path component is never followed, so the chown +//! can never escape the anchored jail tree (the core TOCTOU guarantee); +//! * pending — a not-yet-created socket (or half-built jail) is `Pending`, not +//! an error, so the controller keeps probing; +//! * grant — a real socket is chowned to the caller and left mode 0660, and +//! its parent `root` dir is opened for the caller's group to traverse +//! (chgrp'd to the caller, chmod'd 0710) so the node can reach the socket. + +use hyper_suidhelper::tools::chroot_jail::grant_api::{grant_api_under, Error, GrantOut}; +use hyper_suidhelper::util::safe_path::ValidationError; +use proptest::prelude::*; +use std::os::unix::fs::{symlink, PermissionsExt}; +use std::os::unix::net::UnixListener; +use std::path::{Path, PathBuf}; +use std::{fs, os::unix::fs::MetadataExt}; + +/// Build the canonical `/exec/id/root` parent dirs and return that dir. +fn make_root(jail: &Path) -> PathBuf { + let root = jail.join("exec").join("id").join("root"); + fs::create_dir_all(&root).unwrap(); + root +} + +#[test] +fn socket_outside_jail_base_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + let outside = tmp.path().join("elsewhere/exec/id/root/api.socket"); + let err = grant_api_under(&jail, &outside).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotUnderBase)), + "got {err:?}", + ); +} + +#[test] +fn wrong_leaf_basename_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("root").join("evil.sock"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketName(_)), "got {err:?}"); +} + +#[test] +fn too_shallow_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("api.socket"); // missing root/ + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn too_deep_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail + .join("exec") + .join("id") + .join("root") + .join("extra") + .join("api.socket"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn dotdot_traversal_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = PathBuf::from(format!("{}/exec/../id/root/api.socket", jail.display())); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::LooseComponents)), + "got {err:?}", + ); +} + +#[test] +fn relative_socket_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let err = grant_api_under(tmp.path(), Path::new("exec/id/root/api.socket")).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotAbsolute)), + "got {err:?}", + ); +} + +#[test] +fn missing_socket_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); // never created + let out = grant_api_under(jail, &socket).expect("missing socket must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +#[test] +fn missing_jail_tree_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let socket = jail.join("exec").join("id").join("root").join("api.socket"); // nothing created + let out = grant_api_under(jail, &socket).expect("half-built jail must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +// A real socket is granted: chowned to the caller (our own uid/gid, which a +// non-root process is allowed to set on a file it owns) and chmod'd 0660. +#[test] +fn real_socket_is_granted_and_chmod_0660() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); + let _listener = UnixListener::bind(&socket).unwrap(); + fs::set_permissions(&socket, fs::Permissions::from_mode(0o755)).unwrap(); + + let out = grant_api_under(jail, &socket).expect("real socket must grant"); + assert!(matches!(out, GrantOut::Granted), "got {out:?}"); + + let meta = fs::symlink_metadata(&socket).unwrap(); + assert_eq!(meta.mode() & 0o777, 0o660, "socket must be chmod'd 0660"); + assert_eq!(meta.uid(), nix::unistd::getuid().as_raw()); + assert_eq!(meta.gid(), nix::unistd::getgid().as_raw()); + + // The parent `root` dir must also be opened for the caller's group to + // traverse, else the node could not reach the socket: chgrp'd to the caller, + // chmod'd 0710 (owner rwx, group --x, other none). + let root_meta = fs::symlink_metadata(&root).unwrap(); + assert_eq!( + root_meta.mode() & 0o777, + 0o710, + "jail root must be chmod'd 0710 for traversal", + ); + assert_eq!( + root_meta.gid(), + nix::unistd::getgid().as_raw(), + "jail root must be chgrp'd to the caller", + ); +} + +// A regular file planted at api.socket is refused and left untouched (not chmod'd). +#[test] +fn regular_file_at_leaf_is_refused_and_untouched() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let imposter = root.join("api.socket"); + fs::write(&imposter, b"not a socket").unwrap(); + fs::set_permissions(&imposter, fs::Permissions::from_mode(0o600)).unwrap(); + + let err = grant_api_under(jail, &imposter).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); + assert_eq!( + fs::symlink_metadata(&imposter).unwrap().mode() & 0o777, + 0o600, + "imposter file must not be chmod'd", + ); +} + +// A symlink planted at api.socket is refused: fstatat(AT_SYMLINK_NOFOLLOW) stats +// the link itself (S_IFLNK), so it is never seen as a socket and never followed. +#[test] +fn symlink_at_leaf_is_refused() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let target = tmp.path().join("real-target"); + fs::write(&target, b"secret").unwrap(); + let link = root.join("api.socket"); + symlink(&target, &link).unwrap(); + + let err = grant_api_under(jail, &link).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); +} + +// A symlinked path component must NOT be followed: the walk fails rather than +// reaching through it, so nothing outside the jail is touched. +#[test] +fn symlinked_component_does_not_escape() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + + let sentinel = tmp.path().join("sentinel"); + fs::create_dir_all(sentinel.join("id").join("root")).unwrap(); + let outside_socket = sentinel.join("id").join("root").join("api.socket"); + let _listener = UnixListener::bind(&outside_socket).unwrap(); + fs::set_permissions(&outside_socket, fs::Permissions::from_mode(0o700)).unwrap(); + + // `/exec` is a symlink to the external sentinel dir. + symlink(&sentinel, jail.join("exec")).unwrap(); + + let socket = jail.join("exec").join("id").join("root").join("api.socket"); + let _ = grant_api_under(&jail, &socket); // O_NOFOLLOW makes the walk refuse + + assert_eq!( + fs::symlink_metadata(&outside_socket).unwrap().mode() & 0o777, + 0o700, + "grant escaped through a symlinked component", + ); +} + +proptest! { + // For a socket `depth` components below the jail base with leaf `api.socket` + // (target never created), grant_api_under returns Ok(Pending) iff depth == 4 + // (i.e. 3 parents), else SocketShape. The generator emits only plain names so + // the lexical gate never fires and the leaf is always `api.socket`. + #[test] + fn shape_classification( + parents in prop::collection::vec("[a-z][a-z0-9]{0,5}", 1..6) + ) { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let mut socket = jail.to_path_buf(); + for c in &parents { + socket.push(c); + } + socket.push("api.socket"); + let res = grant_api_under(jail, &socket); + if parents.len() == 3 { + prop_assert!(matches!(res, Ok(GrantOut::Pending)), "depth 3 must be Pending, got {res:?}"); + } else { + prop_assert!(matches!(res, Err(Error::SocketShape(_))), "got {res:?}"); + } + } +} diff --git a/native/suidhelper/tests/tools/jailer.rs b/native/suidhelper/tests/tools/jailer.rs new file mode 100644 index 00000000..9ab75447 --- /dev/null +++ b/native/suidhelper/tests/tools/jailer.rs @@ -0,0 +1,162 @@ +//! Refusal contracts for the jailer's pure validators — the security core. A +//! valid input must round-trip to its canonical form; an invalid one must +//! *always* be rejected, never silently accepted. These properties are what stop +//! a compromised BEAM from naming uid 0, a privileged path, a traversal, or a +//! flag through the jailer subcommand. + +use hyper_suidhelper::tools::jailer::{validate_id_number, CgroupSetting, JailSock, VmId}; +use proptest::prelude::*; +use std::str::FromStr; + +proptest! { + /// uid/gid 0 is rejected for EVERY range — a jailer run with uid 0 skips its + /// privilege drop and leaves firecracker running as root. + #[test] + fn id_zero_always_rejected(lo in any::(), span in any::()) { + let hi = lo.saturating_add(span); + prop_assert!(validate_id_number(0, (lo, hi)).is_err()); + } + + /// Any nonzero value inside the (nonempty) range is accepted unchanged. + #[test] + fn id_in_range_nonzero_accepted( + lo in 1u32..=1_000_000, + span in 0u32..=1_000_000, + off in 0u32..=1_000_000, + ) { + let hi = lo + span; + let n = lo + (off % (span + 1)); // off bounded into [0, span] => n in [lo, hi] + prop_assert_eq!(validate_id_number(n, (lo, hi)).ok(), Some(n)); + } + + /// Values just below `lo` (still nonzero) and just above `hi` are rejected. + #[test] + fn id_out_of_range_rejected(lo in 2u32..=1_000_000, span in 0u32..=1_000_000) { + let hi = lo + span; + prop_assert!(validate_id_number(lo - 1, (lo, hi)).is_err()); + if hi < u32::MAX { + prop_assert!(validate_id_number(hi + 1, (lo, hi)).is_err()); + } + } + + /// Every string over the allowed charset/length with a non-dash leader parses + /// and renders back to itself. + #[test] + fn vmid_valid_round_trips(s in "[A-Za-z0-9_][A-Za-z0-9_-]{0,63}") { + prop_assert_eq!(VmId::from_str(&s).unwrap().to_string(), s); + } + + /// A leading dash is always rejected (no flag injection via `--id`). + #[test] + fn vmid_leading_dash_rejected(s in "-[A-Za-z0-9_-]{0,63}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Any embedded path separator is rejected (no chroot traversal via `--id`). + #[test] + fn vmid_slash_rejected(s in "[A-Za-z0-9_]{0,10}/[A-Za-z0-9_]{0,10}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Over-length ids (>64) are rejected. + #[test] + fn vmid_too_long_rejected(s in "[A-Za-z][A-Za-z0-9_-]{64,90}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// A valid `memory.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_memory_round_trips(s in "memory[.]max=([0-9]{1,20}|max)") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A valid `cpu.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_cpu_round_trips(s in "cpu[.]max=([0-9]{1,20} [0-9]{1,20}|max [0-9]{1,20})") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A single-filename absolute socket path round-trips; `.`/`..` filenames are + /// rejected even though they are within the charset. + #[test] + fn jailsock_single_filename(name in "[A-Za-z0-9_.-]{1,40}") { + let s = format!("/{name}"); + let res = JailSock::from_str(&s); + if name == "." || name == ".." { + prop_assert!(res.is_err()); + } else { + prop_assert_eq!(res.unwrap().to_string(), s); + } + } + + /// A second path component is always rejected (the socket must be a direct + /// child of `/`). + #[test] + fn jailsock_multi_component_rejected( + a in "[A-Za-z0-9_]{1,10}", + b in "[A-Za-z0-9_]{1,10}", + ) { + let s = format!("/{a}/{b}"); + prop_assert!(JailSock::from_str(&s).is_err()); + } +} + +#[test] +fn vmid_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "-leading", // leading dash + "a/b", // separator + "a.b", // dot + "a b", // whitespace + "a\tb", // tab + "a\0b", // NUL + "naïve", // non-ascii + &"x".repeat(65), // too long + ] { + assert!(VmId::from_str(bad).is_err(), "VmId accepted {bad:?}"); + } +} + +#[test] +fn cgroup_rejects_known_bad_shapes() { + for bad in [ + "linear=10", // unknown key + "memory.high=10", // unknown key + "memory.max=", // empty value + "memory.max=12x", // non-digit + "memory.max=1=2", // second '=' + "memory.max", // no '=' + "cpu.max=100000", // missing period field + "cpu.max=100000 100000 5", // extra field + "cpu.max=x 100000", // bad quota + "cpu.max=max max", // bad period + "cpu.max=max", // missing period + ] { + assert!( + CgroupSetting::from_str(bad).is_err(), + "CgroupSetting accepted {bad:?}" + ); + } +} + +#[test] +fn jailsock_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "relative", // not absolute + "/", // no filename + "/a/b", // multi-component + "/../etc", // traversal + "/..", // traversal as whole filename + "/.", // current dir + "/a b", // whitespace + "/a\0b", // NUL + "//x", // empty leading component + ] { + assert!( + JailSock::from_str(bad).is_err(), + "JailSock accepted {bad:?}" + ); + } +} diff --git a/native/suidhelper/tests/util/safe_bin.rs b/native/suidhelper/tests/util/safe_bin.rs index 950e06b7..e7522439 100644 --- a/native/suidhelper/tests/util/safe_bin.rs +++ b/native/suidhelper/tests/util/safe_bin.rs @@ -1,5 +1,5 @@ -//! `SafeBin` is what stops `--bin` from pointing the helper at an -//! attacker-controlled binary it would then run as root. The constructor demands +//! `SafeBin` is what stops a configured path from pointing the helper at +//! an attacker-controlled binary it would then run as root. The constructor demands //! an absolute path, exact basename, a real (non-symlink) regular file owned by //! root and not group/other-writable. These assert the refusal axes; the symlink //! axis is root-independent, the owner axis is asserted both ways. diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs new file mode 100644 index 00000000..1ca862fe --- /dev/null +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -0,0 +1,91 @@ +defmodule Hyper.Node.FireVMM.JailerTest do + @moduledoc """ + Properties and examples for `Hyper.Node.FireVMM.Jailer.command/1`. + + Load-bearing invariant: the BEAM must never place a privileged binary path + (firecracker, jailer) or lifecycle flags owned by the suidhelper (`--exec-file`, + `--chroot-base-dir`, `--cgroup-version`, `--parent-cgroup`, `--`) in the args + it hands to the helper. The helper derives those from its trusted config. + """ + + use ExUnit.Case, async: false + use ExUnitProperties + + alias Hyper.Node.FireVMM + alias Hyper.Node.FireVMM.Jailer + + @vm_id "vmtest01" + + # Stub config_toml persistent_term so firecracker_bin/jailer_bin resolve + # to dummy paths without requiring /etc/hyper/config.toml on the test host. + # async: false because persistent_term is global state. + setup do + :persistent_term.put({Hyper.Config, :config_toml}, %{ + "firecracker" => "/usr/local/bin/firecracker", + "jailer" => "/usr/local/bin/jailer" + }) + + on_exit(fn -> :persistent_term.erase({Hyper.Config, :config_toml}) end) + end + + defp micro_opts do + %FireVMM.Opts{ + vm_id: @vm_id, + uid: 900_001, + gid: 900_001, + type: :micro, + arch: :x86_64, + mutable: nil, + kernel: "/srv/hyper/redist/vmlinux/vmlinux-x86_64-6.1", + boot_args: nil + } + end + + test "binary is the suid helper" do + assert Jailer.command(micro_opts()).binary == Hyper.Config.suid_helper() + end + + test "args start with the jailer subcommand" do + %{args: [first | _]} = Jailer.command(micro_opts()) + assert first == "jailer" + end + + test "args contain --id, --uid, --gid with the opts values" do + %{args: args} = Jailer.command(micro_opts()) + assert "--id" in args + assert @vm_id in args + assert "--uid" in args + assert "--gid" in args + assert "900001" in args + end + + test "args end with --api-sock /api.socket" do + %{args: args} = Jailer.command(micro_opts()) + assert Enum.take(args, -2) == ["--api-sock", "/api.socket"] + end + + test "args do not contain privileged flags owned by the suidhelper" do + %{args: args} = Jailer.command(micro_opts()) + refute "--exec-file" in args + refute "--chroot-base-dir" in args + refute "--cgroup-version" in args + refute "--parent-cgroup" in args + refute "--" in args + end + + test "args include --cgroup cpu.max and memory.max for :micro type" do + %{args: args} = Jailer.command(micro_opts()) + assert "--cgroup" in args + assert Enum.any?(args, &String.starts_with?(&1, "cpu.max=")) + assert Enum.any?(args, &String.starts_with?(&1, "memory.max=")) + end + + # firecracker rejects an instance id containing `_` (and dm/jailer names must + # not lead with `-`), so the id must be strictly alphanumeric. + property "gen_vm_id/0 produces only alphanumeric ids" do + check all(_ <- StreamData.constant(nil)) do + id = Hyper.gen_vm_id() + assert id =~ ~r/\A[A-Za-z0-9]+\z/ + end + end +end diff --git a/test/hyper/suid_helper/dmsetup_properties_test.exs b/test/hyper/suid_helper/dmsetup_properties_test.exs index 73858fc6..a7e49d81 100644 --- a/test/hyper/suid_helper/dmsetup_properties_test.exs +++ b/test/hyper/suid_helper/dmsetup_properties_test.exs @@ -73,4 +73,17 @@ defmodule Hyper.SuidHelper.DmsetupPropertiesTest do assert Dmsetup.parse_targets(out) == MapSet.new(targets) end end + + property "parse_names recovers the device name from each `dmsetup ls` row" do + check all(names <- uniq_list_of(dev(), min_length: 1, max_length: 6)) do + # `dmsetup ls` rows are "\t(major:minor)"; order is preserved. + out = Enum.map_join(names, "\n", fn n -> "#{n}\t(254:0)" end) + assert Dmsetup.parse_names(out) == names + end + end + + test "parse_names reads the empty-table sentinel as no devices" do + assert Dmsetup.parse_names("No devices found\n") == [] + assert Dmsetup.parse_names("") == [] + end end diff --git a/test/hyper/suid_helper/losetup_test.exs b/test/hyper/suid_helper/losetup_test.exs new file mode 100644 index 00000000..4cdc6f0f --- /dev/null +++ b/test/hyper/suid_helper/losetup_test.exs @@ -0,0 +1,37 @@ +defmodule Hyper.SuidHelper.LosetupTest do + @moduledoc """ + `parse_list/1` turns `losetup --list --output NAME,BACK-FILE` rows into + `{device, backing_file}` pairs. The reclaim pass relies on it to recognise loop + devices backing Hyper's files, so the edges that matter are: a loop with no + backing file (skipped, nothing to reclaim by file) and a `(deleted)` backing + suffix (kept, so the data-dir prefix still matches). + """ + use ExUnit.Case, async: true + + alias Hyper.SuidHelper.Losetup + + test "pairs device with backing file and skips rows that have no backing file" do + out = """ + /dev/loop0 /srv/hyper/scratch/thinpool.meta + /dev/loop1 /srv/hyper/layers/blob + /dev/loop2 + """ + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.meta"}, + {"/dev/loop1", "/srv/hyper/layers/blob"} + ] + end + + test "keeps a `(deleted)` backing suffix so data-dir prefix matching still works" do + out = "/dev/loop0 /srv/hyper/scratch/thinpool.data (deleted)\n" + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.data (deleted)"} + ] + end + + test "an empty listing yields no pairs" do + assert Losetup.parse_list("") == [] + end +end