diff --git a/config/runtime.exs b/config/runtime.exs
index a3b4fd0f..c9ca89fb 100644
--- a/config/runtime.exs
+++ b/config/runtime.exs
@@ -14,17 +14,27 @@ config :hyper, Hyper.Node.Config.Budget,
# Where to send traces. Defaults to Honeycomb; override OTEL_EXPORTER_OTLP_*
# to point at any OTLP/HTTP backend (Collector, Grafana, etc).
if config_env() != :test do
- endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT", "https://api.honeycomb.io")
+ custom_endpoint = System.get_env("OTEL_EXPORTER_OTLP_ENDPOINT")
+ api_key = System.get_env("HONEYCOMB_API_KEY")
- headers =
- case System.get_env("HONEYCOMB_API_KEY") do
- nil -> []
- "" -> []
- key -> [{"x-honeycomb-team", key}]
- end
+ cond do
+ api_key not in [nil, ""] ->
+ config :opentelemetry_exporter,
+ otlp_protocol: :http_protobuf,
+ otlp_endpoint: custom_endpoint || "https://api.honeycomb.io",
+ otlp_headers: [{"x-honeycomb-team", api_key}]
- config :opentelemetry_exporter,
- otlp_protocol: :http_protobuf,
- otlp_endpoint: endpoint,
- otlp_headers: headers
+ custom_endpoint not in [nil, ""] ->
+ # A custom OTLP backend (e.g. a local Collector) needs no Honeycomb key.
+ config :opentelemetry_exporter,
+ otlp_protocol: :http_protobuf,
+ otlp_endpoint: custom_endpoint,
+ otlp_headers: []
+
+ true ->
+ # No backend configured: exporting to the Honeycomb default with no key
+ # 401s on every batch. Stay silent instead (typical for local dev). Set
+ # HONEYCOMB_API_KEY or OTEL_EXPORTER_OTLP_ENDPOINT to enable tracing.
+ config :opentelemetry, traces_exporter: :none
+ end
end
diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md
index a0030d35..57aa4ff0 100644
--- a/docs/cookbook/intro.md
+++ b/docs/cookbook/intro.md
@@ -19,13 +19,187 @@ The absolute best way to get started with `Hyper` is to play with it.
### Requirements
-Hyper requires the following software be installed on each node running it:
+#### External services
- - [`skopeo`](https://github.com/containers/skopeo)
- - [`e2fsprogs`](https://github.com/tytso/e2fsprogs)
+Hyper needs a **PostgreSQL** server reachable from every node - it is the image
+database and the only stateful external dependency.
-Hyper has more runtime dependencies, but they are automatically redistributed
-by Hyper.
+For local development the quickest path is Docker. The connection details below
+match the defaults in `config/config.exs` (`Hyper.Img.Db.Repo`):
+
+```sh
+docker run -d --name hyper-pg \
+ -e POSTGRES_USER=postgres \
+ -e POSTGRES_PASSWORD=postgres \
+ -e POSTGRES_DB=hyper_dev \
+ -p 5432:5432 \
+ postgres:16
+```
+
+Once it is up, create and migrate the schema (the repo is not in `ecto_repos`,
+so pass it with `-r`):
+
+```sh
+mix ecto.create -r Hyper.Img.Db.Repo
+mix ecto.migrate -r Hyper.Img.Db.Repo
+```
+
+The container is ephemeral; `docker start hyper-pg` brings it back after a
+reboot. To point Hyper at an existing server instead, override the
+`Hyper.Img.Db.Repo` block in your `config.exs`.
+
+#### System binaries
+
+These are used by the unprivileged node directly; each must be on the node's
+`PATH` (the bracketed override is the `config :hyper` key you can set if the
+binary lives elsewhere):
+
+ - [`skopeo`](https://github.com/containers/skopeo) - pulls OCI images
+ (`skopeo_path`)
+ - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - provides `mke2fs`, which
+ builds the ext4 rootfs (`mke2fs_path`)
+ - `du`, `getent` (from **coreutils** and **glibc**) - rootfs sizing and user
+ resolution. Present on essentially every distro.
+
+The privileged device binaries - `losetup`, `blockdev` (from **util-linux**)
+and `dmsetup` (from **lvm2** / device-mapper) - are run only by the setuid
+helper, never named by the unprivileged caller. Their paths therefore live in
+the helper's own config, `/etc/hyper/config.toml`, and default to
+`/usr/sbin/{losetup,blockdev,dmsetup}`.
+
+**The config file must exist** to set `firecracker` and `jailer` (no built-in
+defaults for those). The device-tool paths (`dmsetup`, `losetup`, `blockdev`)
+and `work_dir` do have built-in defaults, so if you only need those defaults
+and are not running VMs you may omit the file entirely. When the file is
+present it must be root-owned and not group/other-writable, or the helper
+refuses to start (a present-but-untrusted file is treated as an attack signal,
+unlike a missing one):
+
+```toml
+# /etc/hyper/config.toml (root-owned, mode 0644)
+work_dir = "/srv/hyper"
+
+# REQUIRED - no default. Each must be an absolute path to a root-owned,
+# non-group/world-writable binary named exactly "firecracker" or "jailer"
+# (the helper validates the basename). Run `mix firecracker.install` to
+# download the pinned release and print these values.
+firecracker = "/opt/firecracker/firecracker"
+jailer = "/opt/firecracker/jailer"
+
+# Optional device-tool overrides; default to /usr/sbin/{dmsetup,losetup,blockdev}.
+# Each must be root-owned and not group/world-writable.
+dmsetup = "/usr/sbin/dmsetup"
+losetup = "/usr/sbin/losetup"
+blockdev = "/usr/sbin/blockdev"
+
+# Optional. Governs which uid/gid values the helper accepts when launching the
+# jailer. Must satisfy min > 0 and min <= max. Defaults to {900000, 999999}.
+# If you narrow this range, set the same bounds in `config :hyper, uid_gid_range:`
+# so the node hands out only uids the helper will accept.
+[uid_gid_range]
+min = 900000
+max = 999999
+```
+
+`dmsetup` (lvm2) is frequently *not* installed by default - check that one
+first.
+
+#### Kernel features
+
+The host kernel must provide:
+
+ - **KVM** - `/dev/kvm` must exist and be accessible to the per-VM users (see
+ the `uid_gid_range` configuration).
+ - **cgroup v2** - the unified hierarchy mounted at `/sys/fs/cgroup`. v1-only
+ hosts are not supported.
+ - **device-mapper targets** `snapshot`, `thin`, and `thin-pool` - from the
+ `dm_snapshot` (provides `snapshot`) and `dm_thin_pool` (provides `thin` and
+ `thin-pool`) modules. Hyper refuses to start without all three; on boot it
+ fails with `{:missing_dm_targets, [...]}` listing whichever are absent.
+ - **loop devices** - the `loop` module, used to attach layer images as block
+ devices.
+
+Load the modules and confirm the targets are present:
+
+```sh
+sudo modprobe dm_snapshot dm_thin_pool loop
+sudo dmsetup targets # must list snapshot, thin, and thin-pool
+```
+
+If `modprobe` reports the module is missing, the running kernel lacks it -
+minimal cloud images often strip device-mapper. On Debian/Ubuntu, install the
+extra modules for the running kernel, then load them:
+
+```sh
+sudo apt-get install -y linux-modules-extra-$(uname -r)
+sudo modprobe dm_snapshot dm_thin_pool loop
+```
+
+Make the modules load on every boot:
+
+```sh
+printf 'dm_snapshot\ndm_thin_pool\nloop\n' | sudo tee /etc/modules-load.d/hyper.conf
+```
+
+#### Privileged setup
+
+ - The **setuid-root device helper** (`hyper-suidhelper`) must be installed.
+ Run `mix suidhelper.install`, which builds, stamps, and places it
+ setuid-root on `PATH`. Every privileged operation (losetup, dmsetup, mknod,
+ chroot jails) routes through it; the BEAM itself runs unprivileged.
+
+ The final `sudo install` step runs without a controlling terminal (Mix
+ captures the nested `cargo` output), so on a typical `tty_tickets` sudo
+ setup it cannot prompt for a password. If it fails, the build has already
+ stamped the binary -- just run the copy yourself:
+
+ ```sh
+ sudo install -o root -g root -m 4755 \
+ native/suidhelper/target/release/hyper-suidhelper \
+ /usr/local/bin/hyper-suidhelper
+ ```
+ - A **parent cgroup** named by `cgroup_parent` (default `hyper`) must exist
+ under the cgroup-v2 hierarchy; Hyper creates each VM's cgroup beneath it and
+ fails to boot with `:missing_parent_cgroup` if it is absent. Create it and
+ delegate the `cpu` and `memory` controllers so the per-VM cgroups can set
+ `cpu.max` / `memory.max`:
+
+ ```sh
+ sudo mkdir -p /sys/fs/cgroup/hyper
+ echo '+cpu +memory' | sudo tee /sys/fs/cgroup/hyper/cgroup.subtree_control
+ ```
+
+ If that last write errors, the root hierarchy is not delegating those
+ controllers down yet - enable them there first, then retry the line above:
+
+ ```sh
+ echo '+cpu +memory' | sudo tee /sys/fs/cgroup/cgroup.subtree_control
+ ```
+
+ The cgroup hierarchy is memory-backed, so `/sys/fs/cgroup/hyper` does **not**
+ survive a reboot. Re-create it each boot, or persist it with
+ `systemd-tmpfiles`:
+
+ ```sh
+ echo 'd /sys/fs/cgroup/hyper 0755 root root -' \
+ | sudo tee /etc/tmpfiles.d/hyper-cgroup.conf
+ ```
+ - The host UID/GID range must be free for Hyper to allocate per-VM users
+ from. The node's range is set by `uid_gid_range` in `config :hyper`; the
+ helper independently reads `[uid_gid_range]` from `/etc/hyper/config.toml`
+ (see below) and only accepts jailer `--uid`/`--gid` within that range.
+ Keep the two in sync.
+
+#### Auto-redistributed
+
+`umoci` and the guest `vmlinux` kernels are downloaded, checksum-verified, and
+managed by Hyper itself; you do not install them.
+
+`firecracker` and `jailer` are not auto-downloaded. Install them with
+`mix firecracker.install [--prefix
]` (default prefix `/opt/firecracker`),
+which downloads the pinned v1.16.0 release, places the binaries at
+`/firecracker` and `/jailer`, and prints the `/etc/hyper/config.toml`
+snippet to paste in.
### Installation
@@ -40,22 +214,22 @@ configuration.
```elixir
config :hyper,
- # TODO(markovejnovic): Remove this after it gets auto-downloaded.
- jailer_bin: "/opt/firecracker/jailer-v1.16.0-x86_64",
- # TODO(markovejnovic): Remove this after it gets auto-downloaded.
- firecracker_bin: "/opt/firecracker/firecracker-v1.16.0-x86_64",
# You must create a parent cgroup on your system. Continue reading for
# further details.
cgroup_parent: "hyper",
- # TODO(markovejnovic): Merge these directories into one.
jailer_chroot_base: "/srv/hyper/jails",
socket_dir: "/srv/hyper/socks",
scratch_dir: "/srv/hyper/scratch",
- # Hyper requires that each VM you pass
+ # Must match the [uid_gid_range] table in /etc/hyper/config.toml so the node
+ # hands out only uids the helper will accept.
uid_gid_range: {900_000, 999_999},
layer_dir: "/srv/hyper/layers"
```
+The `firecracker` and `jailer` binary paths are **not** set here — they are read
+from `/etc/hyper/config.toml` (the single source of truth shared with the setuid
+helper). See the `config.toml` example above.
+
### Usage
diff --git a/lib/hyper.ex b/lib/hyper.ex
index fb0a626e..ce946a6a 100644
--- a/lib/hyper.ex
+++ b/lib/hyper.ex
@@ -34,9 +34,24 @@ defmodule Hyper do
end
end
- @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)."
+ @doc """
+ Generate a fresh VM id: a `v` prefix followed by lowercase base32 of 10 random
+ bytes, charset `[a-z2-7]`.
+
+ Alphanumeric only - no `-`, `_`, or other punctuation. That is the intersection
+ of three independent constraints the id must satisfy at once:
+
+ * firecracker rejects `_` in an instance id (`InvalidInstanceId`);
+ * dm/jailer names must not start with `-`;
+ * registry keys and chroot path components stay trivially safe.
+
+ The previous base64url encoding emitted `-` and `_`, so it could produce ids
+ firecracker refused at boot (`Invalid char (_)`).
+ """
@spec gen_vm_id() :: Hyper.Vm.id()
- def gen_vm_id, do: Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false)
+ def gen_vm_id do
+ "v" <> Base.encode32(:crypto.strong_rand_bytes(10), padding: false, case: :lower)
+ end
@spec resolve_arch(Hyper.Vm.Instance.arch() | nil) ::
{:ok, Hyper.Vm.Instance.arch()} | {:error, term()}
diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex
index c7802595..7b478363 100644
--- a/lib/hyper/cluster/routing.ex
+++ b/lib/hyper/cluster/routing.ex
@@ -28,6 +28,26 @@ defmodule Hyper.Cluster.Routing do
@spec via(term()) :: {:via, module(), {atom(), term()}}
def via(key), do: {:via, Horde.Registry, {@name, key}}
+ @doc """
+ Register the calling process under `key` from inside its own `init`.
+
+ Prefer this over starting a process with a `{:via, Horde.Registry, _}` name.
+ OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name`
+ immediately after the synchronous `register`, but Horde materialises the name
+ into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under
+ registry churn that read loses the race and OTP aborts startup with
+ `{:process_not_registered_via, Horde.Registry}`. Registering from within
+ `init` carries no such self-check, while leaving the name cluster-resolvable
+ through `via/1` once the diff propagates (callers already tolerate that lag).
+ """
+ @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}}
+ def register_self(key) do
+ case Horde.Registry.register(@name, key, nil) do
+ {:ok, _pid} -> :ok
+ {:error, {:already_registered, _pid}} = err -> err
+ end
+ end
+
@doc "Which node currently runs `vm_id`? `nil` if unknown."
@spec whereis(Hyper.Vm.id()) :: node() | nil
@decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id])
diff --git a/lib/hyper/cluster/scheduler.ex b/lib/hyper/cluster/scheduler.ex
index 1412a0b2..4877658c 100644
--- a/lib/hyper/cluster/scheduler.ex
+++ b/lib/hyper/cluster/scheduler.ex
@@ -16,6 +16,8 @@ defmodule Hyper.Cluster.Scheduler do
alias Hyper.Vm.Instance.Spec
alias Unit.Information
+ require Logger
+
use OpenTelemetryDecorator
@type layer_sizes :: [{Hyper.Layer.id(), Unit.Information.t()}]
@@ -45,8 +47,15 @@ defmodule Hyper.Cluster.Scheduler do
|> candidates(layers)
|> Enum.reduce_while({:error, :no_capacity}, fn node, acc ->
case attempt.(node) do
- {:ok, result} -> {:halt, {:ok, {node, result}}}
- {:error, _reason} -> {:cont, acc}
+ {:ok, result} ->
+ {:halt, {:ok, {node, result}}}
+
+ {:error, reason} ->
+ # The candidate fit the snapshot but refused at confirmation time.
+ # Log the real reason: otherwise an actual boot failure on the only
+ # candidate is indistinguishable from genuine `:no_capacity`.
+ Logger.warning("scheduler: #{inspect(node)} refused placement: #{inspect(reason)}")
+ {:cont, acc}
end
end)
end
diff --git a/lib/hyper/config.ex b/lib/hyper/config.ex
index 1332d9ee..0704c647 100644
--- a/lib/hyper/config.ex
+++ b/lib/hyper/config.ex
@@ -2,10 +2,10 @@ defmodule Hyper.Config do
@moduledoc """
Host configuration, read from `config :hyper, ...` (see `config/config.exs`).
- `work_dir` is the one value shared with the setuid helper
- (`native/suidhelper`); both sides read it from `/etc/hyper/config.toml` so the
- data root has a single source of truth (see `work_dir/0`). Everything else is
- compile-time.
+ Runtime values shared with the setuid helper (`native/suidhelper`) — `work_dir`,
+ `firecracker`, `jailer` — are read from `/etc/hyper/config.toml` (the single
+ source of truth for both sides) the first time they are needed, then cached in
+ `:persistent_term`. Everything else is compile-time.
"""
# The shared data-root config file, read by both this node and the setuid
@@ -16,9 +16,6 @@ defmodule Hyper.Config do
@parent_cgroup Application.compile_env(:hyper, :cgroup_parent, "hyper")
@uid_gid_range Application.compile_env!(:hyper, :uid_gid_range)
@layer_dir Application.compile_env!(:hyper, :layer_dir)
- @losetup_path Application.compile_env(:hyper, :losetup_path, "losetup")
- @dmsetup_path Application.compile_env(:hyper, :dmsetup_path, "dmsetup")
- @blockdev_path Application.compile_env(:hyper, :blockdev_path, "blockdev")
@skopeo_path Application.compile_env(:hyper, :skopeo_path, "skopeo")
@umoci_path Application.compile_env(:hyper, :umoci_path, nil)
@mke2fs_path Application.compile_env(:hyper, :mke2fs_path, "mke2fs")
@@ -27,39 +24,86 @@ defmodule Hyper.Config do
Root work directory for this node. All firecracker paths derive from it.
Read from `#{@config_path}` (the single source of truth shared with the setuid
- helper) the first time it is needed, then cached in `:persistent_term`. Falls
- back to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the
- helper is not installed anyway).
+ helper) the first time it is needed, then cached via `config_toml/0`. Falls back
+ to `#{@dev_work_dir}` when the file is absent (local dev / CI, where the helper
+ is not installed anyway).
"""
@spec work_dir :: Path.t()
- def work_dir do
- case :persistent_term.get({__MODULE__, :work_dir}, nil) do
+ def work_dir, do: Map.get(config_toml(), "work_dir", @dev_work_dir)
+
+ @doc "Directory holding redistributable binaries downloaded by the node."
+ @spec redist_dir :: Path.t()
+ def redist_dir, do: Path.join(work_dir(), "redist")
+
+ @doc """
+ Absolute path to the firecracker binary, as set in `#{@config_path}` under the
+ `firecracker` key. Raises if the key is absent — the operator must configure it;
+ there is no default.
+
+ For the launch path only. Pre-launch checks should use `firecracker_bin_configured/0`
+ so a missing key returns a typed error rather than crashing.
+ """
+ @spec firecracker_bin :: Path.t()
+ def firecracker_bin, do: fetch_bin!("firecracker")
+
+ @doc """
+ Non-raising form of `firecracker_bin/0`. Returns `{:ok, path}` when the
+ `firecracker` key is present in `#{@config_path}`, or `:error` when it is absent.
+ """
+ @spec firecracker_bin_configured :: {:ok, Path.t()} | :error
+ def firecracker_bin_configured, do: Map.fetch(config_toml(), "firecracker")
+
+ @doc """
+ Absolute path to the jailer binary, as set in `#{@config_path}` under the
+ `jailer` key. Raises if the key is absent — the operator must configure it;
+ there is no default.
+
+ For the launch path only. Pre-launch checks should use `jailer_bin_configured/0`
+ so a missing key returns a typed error rather than crashing.
+ """
+ @spec jailer_bin :: Path.t()
+ def jailer_bin, do: fetch_bin!("jailer")
+
+ @doc """
+ Non-raising form of `jailer_bin/0`. Returns `{:ok, path}` when the `jailer` key
+ is present in `#{@config_path}`, or `:error` when it is absent.
+ """
+ @spec jailer_bin_configured :: {:ok, Path.t()} | :error
+ def jailer_bin_configured, do: Map.fetch(config_toml(), "jailer")
+
+ @spec fetch_bin!(String.t()) :: Path.t()
+ defp fetch_bin!(key) do
+ case Map.fetch(config_toml(), key) do
+ {:ok, path} ->
+ path
+
+ :error ->
+ raise "#{@config_path}: key #{inspect(key)} is not set; " <>
+ "operator must configure it before starting the node"
+ end
+ end
+
+ @spec config_toml :: map()
+ defp config_toml do
+ case :persistent_term.get({__MODULE__, :config_toml}, nil) do
nil ->
- work_dir = load_work_dir()
- :persistent_term.put({__MODULE__, :work_dir}, work_dir)
- work_dir
+ cfg = load_config_toml()
+ :persistent_term.put({__MODULE__, :config_toml}, cfg)
+ cfg
- work_dir ->
- work_dir
+ cfg ->
+ cfg
end
end
- @spec load_work_dir :: Path.t()
- defp load_work_dir do
+ @spec load_config_toml :: map()
+ defp load_config_toml do
case File.read(@config_path) do
- {:ok, body} -> body |> Toml.decode!() |> Map.fetch!("work_dir")
- {:error, _} -> @dev_work_dir
+ {:ok, body} -> Toml.decode!(body)
+ {:error, _} -> %{}
end
end
- @doc "Directory holding redistributable binaries downloaded by the node."
- @spec redist_dir :: Path.t()
- def redist_dir, do: Path.join(work_dir(), "redist")
-
- @doc "Directory where `Hyper.Node.FireVMM.Provider` installs the firecracker release."
- @spec firecracker_install_dir :: Path.t()
- def firecracker_install_dir, do: Path.join(redist_dir(), "firecracker")
-
@doc "Directory where `Hyper.Node.FireVMM.VmLinux.Provider` installs guest kernels."
@spec vmlinux_install_dir :: Path.t()
def vmlinux_install_dir, do: Path.join(redist_dir(), "vmlinux")
@@ -111,15 +155,6 @@ defmodule Hyper.Config do
@spec layer_dir :: Path.t()
def layer_dir, do: @layer_dir
- @doc "Path to the losetup binary."
- def losetup_path, do: @losetup_path
-
- @doc "Path to the dmsetup binary."
- def dmsetup_path, do: @dmsetup_path
-
- @doc "Path to the blockdev binary."
- def blockdev_path, do: @blockdev_path
-
@doc "Path to the skopeo binary (used by `Hyper.Img.OciLoader` to pull OCI images)."
def skopeo_path, do: @skopeo_path
@@ -132,15 +167,20 @@ defmodule Hyper.Config do
@doc "Path to the mke2fs binary (used by `Hyper.Img.OciLoader` to build the ext4 rootfs)."
def mke2fs_path, do: @mke2fs_path
+ # Where `cargo xtask install` (via `mix suidhelper.install`) drops the helper.
+ @default_suid_helper "/usr/local/bin/hyper-suidhelper"
+
@doc """
- Path to the setuid-root device helper (`hyper-suidhelper`). Required: the node
- runs unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation
- through it.
+ Path to the setuid-root device helper (`hyper-suidhelper`). The node runs
+ unprivileged and routes every `losetup`/`dmsetup`/`blockdev` operation through
+ it.
- Runtime config (host-specific), so it can be set per node without recompiling.
+ Defaults to `#{@default_suid_helper}`, the install path used by `mix
+ suidhelper.install`. Runtime config (host-specific), so an operator who
+ installs it elsewhere can override per node without recompiling.
"""
@spec suid_helper :: Path.t()
- def suid_helper, do: Application.fetch_env!(:hyper, :suid_helper)
+ def suid_helper, do: Application.get_env(:hyper, :suid_helper, @default_suid_helper)
@doc """
Directory for per-VM scratch (writable-layer COW) files. Must be node-local and
diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex
index 6282eaa7..b4721167 100644
--- a/lib/hyper/node.ex
+++ b/lib/hyper/node.ex
@@ -40,8 +40,14 @@ defmodule Hyper.Node do
def start_link(opts \\ []) do
case test_system() do
- :ok -> Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
- {:error, reason} -> {:error, reason}
+ :ok ->
+ # Clear any dm/loop devices a previous unclean shutdown left behind,
+ # before the device-owning children start and collide with them.
+ :ok = Hyper.Node.Reclaim.run()
+ Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
+
+ {:error, reason} ->
+ {:error, reason}
end
end
@@ -49,9 +55,12 @@ defmodule Hyper.Node do
def init(_opts) do
children = [
Hyper.Node.Users,
+ # Layer owns Hyper.Node.Layer.Registry, which Budget.Advertiser queries
+ # (via Hyper.Node.Layer.active/0) as it advertises on init - so Layer must
+ # be up first.
+ Hyper.Node.Layer,
Hyper.Node.Budget.Supervisor,
{DynamicSupervisor, name: @vm_sup, strategy: :one_for_one},
- Hyper.Node.Layer,
Hyper.Node.Img
]
@@ -144,10 +153,11 @@ defmodule Hyper.Node do
@spec test_system :: :ok | {:error, term()}
def test_system do
with {:ok, _} <- Hyper.Node.Config.Budget.load(),
- :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(),
+ :ok <- check_firecracker_bins(),
:ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(),
:ok <- Hyper.Node.Vmlinux.test_system(),
:ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(),
+ :ok <- Hyper.Img.OciLoader.test_system(),
:ok <- Hyper.Node.Users.test_system(),
:ok <- Hyper.Node.Layer.Repo.test_system(),
:ok <- Hyper.SuidHelper.test_system(),
@@ -157,6 +167,24 @@ defmodule Hyper.Node do
end
end
+ @spec check_firecracker_bins ::
+ :ok
+ | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}}
+ | {:error, :firecracker_not_configured | :jailer_not_configured}
+ defp check_firecracker_bins do
+ with {:fc, {:ok, fc}} <- {:fc, Hyper.Config.firecracker_bin_configured()},
+ {:jail, {:ok, jail}} <- {:jail, Hyper.Config.jailer_bin_configured()} do
+ cond do
+ not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}}
+ not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}}
+ true -> :ok
+ end
+ else
+ {:fc, :error} -> {:error, :firecracker_not_configured}
+ {:jail, :error} -> {:error, :jailer_not_configured}
+ end
+ end
+
@spec check_helper_base(Path.t()) ::
:ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}}
defp check_helper_base(base) do
diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex
index 7e5603b2..f9e45fbc 100644
--- a/lib/hyper/node/fire_vmm.ex
+++ b/lib/hyper/node/fire_vmm.ex
@@ -47,14 +47,15 @@ defmodule Hyper.Node.FireVMM do
@spec start_link(Opts.t()) :: Supervisor.on_start()
def start_link(opts) do
- Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id))
+ Supervisor.start_link(__MODULE__, opts)
end
+ @spec child_spec(Opts.t()) :: Supervisor.child_spec()
def child_spec(opts) do
# Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by
# the node-level DynamicSupervisor.
%{
- vm_id: {__MODULE__, opts.vm_id},
+ id: {__MODULE__, opts.vm_id},
start: {__MODULE__, :start_link, [opts]},
type: :supervisor,
restart: :transient
@@ -63,18 +64,26 @@ defmodule Hyper.Node.FireVMM do
@impl true
def init(opts) do
- children = [
- # Client must be registered before Core: Core starts the State machine,
- # which calls Client.run while waiting for the daemon's API. Client depends
- # only on vm_id (an independent peer), so it has no reverse dependency.
- {Client, %Client.Opts{vm_id: opts.vm_id}},
- {Core, opts}
- ]
+ # Self-register the cluster routing entry here rather than via a start name;
+ # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never
+ # collides, so `:already_registered` only happens against a stale dead
+ # incarnation - decline the start and let the supervisor retry clean.
+ case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do
+ :ok ->
+ children = [
+ # Client must be registered before Core: Core starts the State machine,
+ # which calls Client.run while waiting for the daemon's API. Client
+ # depends only on vm_id (an independent peer), so no reverse dependency.
+ {Client, %Client.Opts{vm_id: opts.vm_id}},
+ {Core, opts}
+ ]
- Supervisor.init(children, strategy: :one_for_one)
- end
+ Supervisor.init(children, strategy: :one_for_one)
- defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor})
+ {:error, _} ->
+ :ignore
+ end
+ end
@doc "Test whether the system can run firecracker VMMs."
@spec test_system() :: :ok | {:error, term()}
diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex
index a336b095..2ed991ce 100644
--- a/lib/hyper/node/fire_vmm/client.ex
+++ b/lib/hyper/node/fire_vmm/client.ex
@@ -57,15 +57,12 @@ defmodule Hyper.Node.FireVMM.Client do
@type t :: %__MODULE__{socket_path: Path.t()}
end
+ # Prod path (vm_id, no explicit name) starts unnamed and self-registers in
+ # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override
+ # (test stand-ins) is honoured as a plain local name and skips registration.
@spec start_link(Opts.t()) :: GenServer.on_start()
def start_link(%Opts{} = opts) do
- name =
- case opts.name do
- nil when not is_nil(opts.vm_id) -> via(opts.vm_id)
- other -> other
- end
-
- GenServer.start_link(__MODULE__, opts, gen_opts(name))
+ GenServer.start_link(__MODULE__, opts, gen_opts(opts.name))
end
@spec via(Hyper.Vm.id()) :: GenServer.name()
@@ -78,12 +75,26 @@ defmodule Hyper.Node.FireVMM.Client do
end
@impl true
- @spec init(Opts.t()) :: {:ok, State.t()}
+ @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}}
def init(%Opts{} = opts) do
- socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id)
- {:ok, %State{socket_path: socket_path}}
+ with :ok <- register(opts) do
+ socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id)
+ {:ok, %State{socket_path: socket_path}}
+ end
+ end
+
+ # Register cluster-wide under {vm_id, :client} on the prod path. With an
+ # explicit name (test stand-in), the name is the local registration, so skip.
+ @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}}
+ defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do
+ case Hyper.Cluster.Routing.register_self({vm_id, :client}) do
+ :ok -> :ok
+ {:error, reason} -> {:stop, reason}
+ end
end
+ defp register(%Opts{}), do: :ok
+
@impl true
def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do
{:reply, op_fun.(socket_path: socket_path), state}
diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex
index 8ccf6180..18dfeb50 100644
--- a/lib/hyper/node/fire_vmm/core.ex
+++ b/lib/hyper/node/fire_vmm/core.ex
@@ -28,9 +28,12 @@ defmodule Hyper.Node.FireVMM.Core do
alias Hyper.Node.FireVMM.Daemon
alias Hyper.Node.FireVMM.State
+ # Started unnamed: nothing resolves the core by name (it is addressed as a
+ # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a
+ # needless racy Horde registration at startup.
@spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start()
def start_link(opts) do
- Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core}))
+ Supervisor.start_link(__MODULE__, opts)
end
@impl true
diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex
index 7cbc159c..b6257732 100644
--- a/lib/hyper/node/fire_vmm/daemon.ex
+++ b/lib/hyper/node/fire_vmm/daemon.ex
@@ -4,15 +4,17 @@ defmodule Hyper.Node.FireVMM.Daemon do
`Hyper.Node.FireVMM.Core`.
Lifecycle is supervisor-owned. On every (re)start it first resets any stale
- jail left by a prior incarnation - the firecracker jailer refuses to reuse an
- existing chroot - then builds the jailer command and runs it under
+ jail left by a prior incarnation — the firecracker jailer refuses to reuse an
+ existing chroot — then builds the jailer command and runs it under
`MuonTrap.Daemon`, which kills the OS process when its port closes (controller
crash, container teardown, or BEAM death). So no firecracker process outlives
the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a
firecracker crash) cleanly cold-boots against a fresh jail.
- The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the
- reset, then delegates and returns that pid.
+ The supervised process is `hyper-suidhelper jailer ...`, which `execve`s into
+ the jailer (same pid) so MuonTrap owns the firecracker lifetime without needing
+ to know the jailer path. `start_link/1` does the reset, then delegates and
+ returns that pid.
"""
alias Hyper.Node.FireVMM.{Jailer, Opts}
@@ -21,6 +23,8 @@ defmodule Hyper.Node.FireVMM.Daemon do
use OpenTelemetryDecorator
+ require Logger
+
@shutdown_timeout Time.s(5)
@spec child_spec(Opts.t()) :: Supervisor.child_spec()
@@ -44,9 +48,26 @@ defmodule Hyper.Node.FireVMM.Daemon do
with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do
cmd = Jailer.command(opts)
- case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do
- {:ok, pid} -> {:ok, pid}
- {:error, _} = err -> err
+ # Surface what the jailed process actually does: `log_output` routes the
+ # helper/jailer/firecracker stdout+stderr (guest serial console included)
+ # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque
+ # `:error_exit_status` into `{:firecracker_exited, status}` so a crash
+ # report names the real exit code instead of hiding it.
+ daemon_opts = [
+ log_output: :info,
+ log_prefix: "vm #{id} firecracker: ",
+ stderr_to_stdout: true,
+ exit_status_to_reason: &{:firecracker_exited, &1}
+ ]
+
+ case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do
+ {:ok, pid} ->
+ Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})")
+ {:ok, pid}
+
+ {:error, reason} = err ->
+ Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}")
+ err
end
end
end
diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex
index 6dd1b800..355ad0d7 100644
--- a/lib/hyper/node/fire_vmm/jailer.ex
+++ b/lib/hyper/node/fire_vmm/jailer.ex
@@ -1,27 +1,26 @@
defmodule Hyper.Node.FireVMM.Jailer do
@moduledoc """
- Builds the firecracker
- [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md)
- command for one VM.
+ Builds the `hyper-suidhelper jailer` command for one VM.
- The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance`
- flags) and drops privileges, then exec's firecracker. We run the jailer (not
- firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS
- process, the jailer owns isolation.
+ The BEAM does not invoke the jailer directly. Instead it calls the setuid helper
+ with the `jailer` subcommand; the helper reads the firecracker binary path, chroot
+ base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`,
+ re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps
+ supervising it).
+
+ This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`,
+ repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates
+ everything else; it also inserts the `--` separator between its own flags and
+ firecracker's flags.
Because firecracker is chrooted to `///root`, the API
- socket it opens at `/api.socket` lives at `host_socket` on the host - that's the
+ socket it opens at `/api.socket` lives at `host_socket` on the host — that's the
path the controller connects to.
-
- Host config: paths are derived from `config :hyper, work_dir: ...`. The
- firecracker + jailer binaries are installed under `/redist/firecracker`
- by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`.
"""
use OpenTelemetryDecorator
alias Hyper.Node.FireVMM
- alias Hyper.Node.FireVMM.Provider
alias Hyper.Vm.Instance
# firecracker's API socket path *inside* the chroot.
@@ -94,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do
@spec command(FireVMM.Opts.t()) :: t()
def command(opts) do
args =
- [
- "--id",
- opts.vm_id,
- "--exec-file",
- Provider.firecracker_bin(),
- "--uid",
- to_string(opts.uid),
- "--gid",
- to_string(opts.gid),
- "--chroot-base-dir",
- Hyper.Config.chroot_base(),
- "--cgroup-version",
- "2",
- "--parent-cgroup",
- Hyper.Config.parent_cgroup()
- ] ++
+ ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++
cgroup_flags(opts.type) ++
- ["--", "--api-sock", "/" <> @jail_socket]
+ ["--api-sock", "/" <> @jail_socket]
- %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)}
+ %{binary: Hyper.Config.suid_helper(), args: args, host_socket: host_socket(opts.vm_id)}
end
# Find the appropriate jailer cgroup flags for the given instance type.
@@ -166,5 +150,5 @@ defmodule Hyper.Node.FireVMM.Jailer do
])
end
- defp exec_name, do: Path.basename(Provider.firecracker_bin())
+ defp exec_name, do: Path.basename(Hyper.Config.firecracker_bin())
end
diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex
deleted file mode 100644
index e50aee16..00000000
--- a/lib/hyper/node/fire_vmm/provider.ex
+++ /dev/null
@@ -1,92 +0,0 @@
-defmodule Hyper.Node.FireVMM.Provider do
- @moduledoc """
- Installs the firecracker release for the current architecture into
- `Hyper.Config.firecracker_install_dir/0` (`/redist/firecracker`).
-
- `ensure_installed/0` is idempotent: if the binaries are already present and
- executable it returns `:ok` without touching the network. Otherwise it fetches
- the official firecracker release tarball for the detected architecture via
- `Redist.Targz` (download, SHA-256 verify, extract). The archive is
- extracted as-is - the binaries live under `release-v-/` exactly as
- firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those
- paths.
- """
-
- alias Redist.Targz
-
- @downloads %{
- x86_64: %{
- url:
- "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz",
- sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6",
- firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64",
- jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64"
- },
- aarch64: %{
- url:
- "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz",
- sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7",
- firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64",
- jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64"
- }
- }
-
- @doc "Absolute path to the installed firecracker binary."
- @spec firecracker_bin() :: Path.t()
- def firecracker_bin, do: bin_path(:firecracker)
-
- @doc "Absolute path to the installed jailer binary."
- @spec jailer_bin() :: Path.t()
- def jailer_bin, do: bin_path(:jailer)
-
- @doc "Ensure the firecracker release is installed for this node."
- @spec ensure_installed() :: :ok | {:error, term()}
- def ensure_installed do
- with {:ok, arch} <- Sys.Arch.current() do
- dl = Map.fetch!(@downloads, arch)
-
- case check_install(dl) do
- :ok -> :ok
- {:error, :not_installed} -> install(dl)
- {:error, :bad_install} -> reinstall(dl)
- end
- end
- end
-
- # `:ok` if `dl`'s version-specific binaries are present and executable;
- # `{:error, :not_installed}` if the install dir is empty/absent; otherwise
- # `{:error, :bad_install}` - something is there but it's the wrong version,
- # partial, or corrupt, which we cannot fix in place because `Targz` keeps
- # existing files. The remedy is to wipe and reinstall.
- @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install}
- defp check_install(dl) do
- fc = Path.join(install_dir(), dl.firecracker)
- jail = Path.join(install_dir(), dl.jailer)
-
- cond do
- Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) ->
- :ok
-
- File.dir?(install_dir()) and File.ls!(install_dir()) != [] ->
- {:error, :bad_install}
-
- true ->
- {:error, :not_installed}
- end
- end
-
- defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir())
-
- defp reinstall(dl) do
- _ = File.rm_rf!(install_dir())
- install(dl)
- end
-
- defp bin_path(key) do
- {:ok, arch} = Sys.Arch.current()
- dl = Map.fetch!(@downloads, arch)
- Path.join(install_dir(), Map.fetch!(dl, key))
- end
-
- defp install_dir, do: Hyper.Config.firecracker_install_dir()
-end
diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex
index b390965c..fa87d9b4 100644
--- a/lib/hyper/node/fire_vmm/state.ex
+++ b/lib/hyper/node/fire_vmm/state.ex
@@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do
}
@enforce_keys [:opts]
- defstruct [:opts, :spec, :boot_deadline]
+ defstruct [:opts, :spec, :boot_deadline, api_granted: false]
@type t :: %State{
opts: Opts.t(),
spec: BootSpec.Cold.t() | nil,
- boot_deadline: integer() | nil
+ boot_deadline: integer() | nil,
+ api_granted: boolean()
}
# How long to wait for the daemon's API to come up before failing the boot.
@@ -54,8 +55,11 @@ defmodule Hyper.Node.FireVMM.State do
%{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}}
end
- def start_link(%Opts{vm_id: id} = opts) do
- :gen_statem.start_link(via(id), __MODULE__, opts, [])
+ # Started unnamed; the controller self-registers under `{id, :state}` from
+ # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves
+ # it cluster-wide through `via/1`.
+ def start_link(%Opts{} = opts) do
+ :gen_statem.start_link(__MODULE__, opts, [])
end
@spec stop(Hyper.Vm.id()) :: :ok
@@ -72,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do
# The daemon is already (being) started by `Core` as our sibling. Read the root
# device off the per-VM mutable layer, resolve the boot spec, set the readiness
# deadline, and start probing the API.
- def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do
- spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type)
- deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout)
- data = %State{opts: opts, spec: spec, boot_deadline: deadline}
-
- {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]}
+ def init(
+ %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} =
+ opts
+ ) do
+ case Hyper.Cluster.Routing.register_self({id, :state}) do
+ :ok ->
+ spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type)
+ deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout)
+ data = %State{opts: opts, spec: spec, boot_deadline: deadline}
+
+ {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]}
+
+ {:error, reason} ->
+ {:stop, reason}
+ end
end
# Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel +
@@ -110,31 +123,81 @@ defmodule Hyper.Node.FireVMM.State do
@moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`."
alias Hyper.Firecracker.Api.{InstanceInfo, Operations}
- alias Hyper.Node.FireVMM.{Client, Opts}
+ alias Hyper.Node.FireVMM.{Client, Jailer, Opts}
+ alias Hyper.SuidHelper.ChrootJail
alias Unit.Time
+ require Logger
+
# How often to probe the daemon's API while waiting for it.
@probe_interval Time.ms(50)
- # Poll the daemon's API until it answers, then configure. Give up if the
- # readiness deadline passes first.
+ # Hand the jailed API socket to the node user, then poll the daemon's API
+ # until it answers and advance to `:configuring`. Give up if the readiness
+ # deadline passes first. The grant must happen before the probe: firecracker
+ # creates the socket owned by the per-VM uid, so the unprivileged controller
+ # gets EACCES on connect until the helper chowns it to us.
def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do
- case Client.run(Client.via(id), &Operations.describe_instance/1) do
- {:ok, %InstanceInfo{}} ->
- {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]}
-
- {:error, _reason} ->
- if System.monotonic_time(:millisecond) >= data.boot_deadline do
- {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data}
- else
- {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]}
+ case ensure_api_granted(id, data) do
+ {:cont, data} ->
+ case Client.run(Client.via(id), &Operations.describe_instance/1) do
+ {:ok, %InstanceInfo{}} ->
+ {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]}
+
+ {:error, reason} ->
+ keep_probing(id, data, reason)
end
+
+ {:wait, data, reason} ->
+ keep_probing(id, data, reason)
end
end
def handle({:call, from}, :stop, data) do
{:next_state, :stopping, data, [{:reply, from, :ok}]}
end
+
+ # Ensure the jailed API socket has been handed to the node user. Idempotent
+ # once granted (we record it in `data` so we ask the helper only once).
+ # `:socket_pending` means firecracker has not created the socket yet, so we
+ # keep waiting; a hard error is logged but also tolerated until the deadline
+ # (the probe that follows would fail with EACCES anyway and drive the stop).
+ @spec ensure_api_granted(Hyper.Vm.id(), State.t()) ::
+ {:cont, State.t()} | {:wait, State.t(), term()}
+ defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data}
+
+ defp ensure_api_granted(id, data) do
+ case ChrootJail.grant_api(Jailer.host_socket(id)) do
+ :ok ->
+ {:cont, %{data | api_granted: true}}
+
+ {:error, :socket_pending} ->
+ {:wait, data, :socket_pending}
+
+ {:error, reason} ->
+ Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}")
+ {:wait, data, {:grant_api, reason}}
+ end
+ end
+
+ # Keep waiting for readiness, re-arming the probe timer, unless the deadline
+ # has lapsed - then fail the boot, surfacing `reason` rather than swallowing
+ # it into a bare `:daemon_unready`. A persistent failure here points at the
+ # host->jail socket (path or, more often, the grant/permission step above).
+ @spec keep_probing(Hyper.Vm.id(), State.t(), term()) ::
+ {:keep_state, State.t(), list()} | {:stop, term(), State.t()}
+ defp keep_probing(id, data, reason) do
+ if System.monotonic_time(:millisecond) >= data.boot_deadline do
+ Logger.warning(
+ "vm #{id}: firecracker API not reachable before deadline; " <>
+ "last probe error: #{inspect(reason)}"
+ )
+
+ {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data}
+ else
+ {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]}
+ end
+ end
end
defmodule Configuring do
@@ -145,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do
alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations}
alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts}
+ require Logger
+
# Stage boot artifacts into the chroot, then issue the pre-boot config and
- # start the guest.
+ # start the guest. Both failure paths end the boot via a supervisor restart,
+ # so log the reason here - otherwise it vanishes into the `:one_for_all`
+ # cycle and the VM just appears to relaunch for no visible cause.
def handle(
:state_timeout,
:configure,
@@ -155,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do
case ChrootJail.stage(id, uid, gid, spec) do
{:ok, jailed_spec} ->
case apply_spec(id, jailed_spec) do
- :ok -> {:next_state, :running, data}
- {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data}
+ :ok ->
+ Logger.info("vm #{id}: configured, guest starting")
+ {:next_state, :running, data}
+
+ {:error, reason} ->
+ Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}")
+ {:stop, {:shutdown, {:boot_failed, reason}}, data}
end
{:error, reason} ->
+ Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}")
{:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data}
end
end
diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex
index 386e36f0..4dbab644 100644
--- a/lib/hyper/node/img/mutable.ex
+++ b/lib/hyper/node/img/mutable.ex
@@ -121,6 +121,12 @@ defmodule Hyper.Node.Img.Mutable do
@impl true
def handle_info(:idle_timeout, state), do: {:noreply, state}
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, state) do
# Destroy the thin volume, then release the image (its monitor on us also
diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex
index d51fe3ba..de6579f5 100644
--- a/lib/hyper/node/img/server.ex
+++ b/lib/hyper/node/img/server.ex
@@ -131,6 +131,12 @@ defmodule Hyper.Node.Img.Server do
{:noreply, state}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, %State{dm_names: dm_names}) do
# Remove top-down (a snapshot's origin is the device below it). Layers are
diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex
index f3ec3e19..9599da23 100644
--- a/lib/hyper/node/img/thin_pool.ex
+++ b/lib/hyper/node/img/thin_pool.ex
@@ -94,6 +94,13 @@ defmodule Hyper.Node.Img.ThinPool do
{:reply, :ok, id_free(state, id)}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it. An abnormal exit
+ # carries a non-`:normal` reason and falls through to the default handler.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, state) do
_ = SuidHelper.Dmsetup.remove(@pool_name)
diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex
index 188e135b..f717e155 100644
--- a/lib/hyper/node/layer/server.ex
+++ b/lib/hyper/node/layer/server.ex
@@ -122,6 +122,12 @@ defmodule Hyper.Node.Layer.Server do
{:noreply, state}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, %State{blk_path: blk_path}) do
case SuidHelper.Losetup.detach(blk_path) do
diff --git a/lib/hyper/node/reclaim.ex b/lib/hyper/node/reclaim.ex
new file mode 100644
index 00000000..f9513f89
--- /dev/null
+++ b/lib/hyper/node/reclaim.ex
@@ -0,0 +1,99 @@
+defmodule Hyper.Node.Reclaim do
+ @moduledoc """
+ Boot-time reclamation of device-mapper and loop devices orphaned by an unclean
+ shutdown (SIGKILL or `:erlang.halt`, where the owning GenServers' `terminate/2`
+ never ran to tear them down).
+
+ Hyper names every dm device it creates with a `hyper-` prefix (`hyper-thinpool`,
+ `hyper-rw-`, `hyper-img--`), so this removes exactly those - never an
+ operator's unrelated dm devices. Removal is leaf-first: a device still open by
+ another (the pool under a thin volume, a snapshot under the next in its chain)
+ refuses until its dependents are gone, so leftovers are retried until a pass
+ removes nothing new. Loop devices backing files under Hyper's data dirs are then
+ detached (the dm devices that held them are gone by that point).
+
+ Entirely best-effort: every failure is logged and boot continues. It runs once,
+ before any device-owning GenServer starts, so the freshly-booting node never
+ collides with its own previous instance's leftovers.
+ """
+
+ alias Hyper.SuidHelper.{Dmsetup, Losetup}
+
+ require Logger
+
+ @dm_prefix "hyper-"
+
+ @spec run() :: :ok
+ def run do
+ reclaim_dm()
+ reclaim_loops()
+ :ok
+ end
+
+ defp reclaim_dm do
+ case Dmsetup.list() do
+ {:ok, names} ->
+ case Enum.filter(names, &String.starts_with?(&1, @dm_prefix)) do
+ [] ->
+ :ok
+
+ stale ->
+ Logger.warning(
+ "reclaim: removing #{length(stale)} stale dm device(s): #{inspect(stale)}"
+ )
+
+ remove_dm(stale)
+ end
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not list dm devices: #{inspect(reason)}")
+ end
+ end
+
+ @spec remove_dm([String.t()]) :: :ok
+ defp remove_dm([]), do: :ok
+
+ defp remove_dm(names) do
+ {failed, removed_any?} =
+ Enum.reduce(names, {[], false}, fn name, {failed, any?} ->
+ case Dmsetup.remove(name) do
+ :ok -> {failed, true}
+ {:error, _} -> {[name | failed], any?}
+ end
+ end)
+
+ cond do
+ failed == [] -> :ok
+ # A pass made progress: a retry may now clear the devices that were still
+ # held by the ones just removed.
+ removed_any? -> remove_dm(failed)
+ true -> Logger.error("reclaim: could not remove dm devices: #{inspect(failed)}")
+ end
+ end
+
+ defp reclaim_loops do
+ case Losetup.list() do
+ {:ok, pairs} ->
+ for {dev, backing} <- pairs, under_data_dirs?(backing) do
+ case Losetup.detach(dev) do
+ :ok ->
+ :ok
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not detach #{dev} (#{backing}): #{inspect(reason)}")
+ end
+ end
+
+ :ok
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not list loop devices: #{inspect(reason)}")
+ end
+ end
+
+ @spec under_data_dirs?(Path.t()) :: boolean()
+ defp under_data_dirs?(path) do
+ String.starts_with?(path, Hyper.Config.scratch_dir() <> "/") or
+ String.starts_with?(path, Hyper.Config.layer_dir() <> "/")
+ end
+end
diff --git a/lib/hyper/suid_helper.ex b/lib/hyper/suid_helper.ex
index 690f004a..6d4ff56d 100644
--- a/lib/hyper/suid_helper.ex
+++ b/lib/hyper/suid_helper.ex
@@ -16,7 +16,7 @@ defmodule Hyper.SuidHelper do
self-test and reports the base path it was compiled against.
"""
- alias Hyper.SuidHelper.{Blockdev, Dmsetup, Expected, Losetup}
+ alias Hyper.SuidHelper.{Dmsetup, Expected}
use OpenTelemetryDecorator
@@ -51,18 +51,20 @@ defmodule Hyper.SuidHelper do
end
@doc """
- Check that the setuid helper and every tool it execs are usable on this
- machine: the helper binary is present, is the build this release expects
- (`verify_version/0`), then each tool submodule's own check.
+ Check that the setuid helper is usable on this machine: the helper binary is
+ present, is the build this release expects (`verify_version/0`), and the kernel
+ exposes the device-mapper targets we need (`Dmsetup.test_system/0`, which also
+ exercises the helper's configured `dmsetup` binary).
+
+ The `losetup`/`blockdev` binaries are validated by the helper the first time
+ each is used; their paths live in the helper's own config, not here.
"""
@spec test_system() :: :ok | {:error, term()}
@decorate with_span("Hyper.SuidHelper.test_system")
def test_system do
with :ok <- helper_present(),
- :ok <- verify_version(),
- :ok <- Losetup.test_system(),
- :ok <- Dmsetup.test_system() do
- Blockdev.test_system()
+ :ok <- verify_version() do
+ Dmsetup.test_system()
end
end
diff --git a/lib/hyper/suid_helper/blockdev.ex b/lib/hyper/suid_helper/blockdev.ex
index 9f675ea5..760fae84 100644
--- a/lib/hyper/suid_helper/blockdev.ex
+++ b/lib/hyper/suid_helper/blockdev.ex
@@ -11,18 +11,9 @@ defmodule Hyper.SuidHelper.Blockdev do
@spec device_sectors(Path.t()) :: {:ok, pos_integer()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Blockdev.device_sectors", include: [:path])
def device_sectors(path) do
- case SuidHelper.exec(["blockdev", "--bin", Hyper.Config.blockdev_path(), "--getsz", path]) do
+ case SuidHelper.exec(["blockdev", "--getsz", path]) do
{:ok, %{"sectors" => n}} -> {:ok, n}
{:error, _} = err -> err
end
end
-
- @doc "Check the blockdev binary is present."
- @spec test_system() :: :ok | {:error, :blockdev_not_found}
- @decorate with_span("Hyper.SuidHelper.Blockdev.test_system")
- def test_system do
- if System.find_executable(Hyper.Config.blockdev_path()),
- do: :ok,
- else: {:error, :blockdev_not_found}
- end
end
diff --git a/lib/hyper/suid_helper/chroot_jail.ex b/lib/hyper/suid_helper/chroot_jail.ex
index fdc0f1ce..cf8370c0 100644
--- a/lib/hyper/suid_helper/chroot_jail.ex
+++ b/lib/hyper/suid_helper/chroot_jail.ex
@@ -57,4 +57,25 @@ defmodule Hyper.SuidHelper.ChrootJail do
{:error, _} = err -> err
end
end
+
+ @doc """
+ Hand the firecracker API `socket` to the node user so the unprivileged
+ controller can `connect()` to it. The jailer drops firecracker to a per-VM
+ uid/gid and chroots it, so the socket it creates is owned by that per-VM id and
+ the node (a different uid) gets `EACCES` on connect. The helper chowns just
+ that one socket to its caller (the node user) and chmods it `0660`, leaving the
+ rest of the per-VM isolation intact.
+
+ Returns `{:error, :socket_pending}` while firecracker has not yet created the
+ socket, so the caller can keep waiting.
+ """
+ @spec grant_api(Path.t()) :: :ok | {:error, :socket_pending} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.ChrootJail.grant_api", include: [:socket])
+ def grant_api(socket) do
+ case SuidHelper.exec(["chroot-jail", "grant-api", "--socket", socket]) do
+ {:ok, %{"result" => "granted"}} -> :ok
+ {:ok, %{"result" => "pending"}} -> {:error, :socket_pending}
+ {:error, _} = err -> err
+ end
+ end
end
diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex
index ef670634..3c57c6b5 100644
--- a/lib/hyper/suid_helper/dmsetup.ex
+++ b/lib/hyper/suid_helper/dmsetup.ex
@@ -59,14 +59,7 @@ defmodule Hyper.SuidHelper.Dmsetup do
@spec remove(String.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.remove", include: [:name])
def remove(name) do
- case SuidHelper.exec([
- "dmsetup",
- "--bin",
- Hyper.Config.dmsetup_path(),
- "remove",
- "--retry",
- name
- ]) do
+ case SuidHelper.exec(["dmsetup", "remove", "--retry", name]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
@@ -76,39 +69,31 @@ defmodule Hyper.SuidHelper.Dmsetup do
@spec message(String.t(), String.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message])
def message(name, message) do
- argv =
- ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "message", name, "--message", message]
-
- case SuidHelper.exec(argv) do
+ case SuidHelper.exec(["dmsetup", "message", name, "--message", message]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
end
@doc """
- Check the dmsetup binary is present and the kernel exposes the dm targets we
- use (snapshot, thin, thin-pool).
+ Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool).
+
+ Routes through the setuid helper: `dmsetup targets` opens `/dev/mapper/control`,
+ which needs root, and the BEAM runs unprivileged. The helper validates its
+ configured `dmsetup` binary before running it, so a missing or unsafe binary
+ surfaces here too.
"""
@spec test_system() :: :ok | {:error, term()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.test_system")
def test_system do
- if System.find_executable(Hyper.Config.dmsetup_path()),
- do: test_targets(),
- else: {:error, :dmsetup_not_found}
- end
-
- @doc "Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool)."
- @spec test_targets() :: :ok | {:error, term()}
- @decorate with_span("Hyper.SuidHelper.Dmsetup.test_targets")
- def test_targets do
- case System.cmd(Hyper.Config.dmsetup_path(), ["targets"], stderr_to_stdout: true) do
- {out, 0} ->
+ case SuidHelper.exec(["dmsetup", "targets"]) do
+ {:ok, %{"output" => out}} ->
have = parse_targets(out)
missing = Enum.reject(@required_targets, &MapSet.member?(have, &1))
if missing == [], do: :ok, else: {:error, {:missing_dm_targets, missing}}
- {out, code} ->
- {:error, {:dmsetup_targets_failed, code, String.trim(out)}}
+ {:error, {code, msg}} ->
+ {:error, {:dmsetup_targets_failed, code, msg}}
end
end
@@ -122,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do
|> MapSet.new()
end
+ @doc "Names of every device-mapper device currently present on this host."
+ @spec list() :: {:ok, [String.t()]} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.Dmsetup.list")
+ def list do
+ case SuidHelper.exec(["dmsetup", "ls"]) do
+ {:ok, %{"output" => out}} -> {:ok, parse_names(out)}
+ {:error, _} = err -> err
+ end
+ end
+
+ @doc false
+ @spec parse_names(String.t()) :: [String.t()]
+ def parse_names(out) do
+ case String.trim(out) do
+ # `dmsetup ls` prints this sentinel (not a device row) when there are none.
+ "No devices found" ->
+ []
+
+ _ ->
+ out
+ |> String.split("\n", trim: true)
+ |> Enum.map(&(&1 |> String.split() |> List.first()))
+ |> Enum.reject(&is_nil/1)
+ end
+ end
+
@doc false
@spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t()
def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do
@@ -145,9 +156,7 @@ defmodule Hyper.SuidHelper.Dmsetup do
# create flags (e.g. `--readonly`). Returns the `/dev/mapper/` path.
@spec create(String.t(), String.t(), [String.t()]) :: {:ok, Path.t()} | {:error, err()}
defp create(name, table, flags) do
- argv =
- ["dmsetup", "--bin", Hyper.Config.dmsetup_path(), "create", name] ++
- flags ++ ["--table", table]
+ argv = ["dmsetup", "create", name] ++ flags ++ ["--table", table]
case SuidHelper.exec(argv) do
{:ok, %{"device" => dev}} -> {:ok, dev}
diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex
index d825b731..3abf4fba 100644
--- a/lib/hyper/suid_helper/losetup.ex
+++ b/lib/hyper/suid_helper/losetup.ex
@@ -11,7 +11,7 @@ defmodule Hyper.SuidHelper.Losetup do
@spec attach_ro(Path.t()) :: {:ok, Path.t()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.attach_ro", include: [:path])
def attach_ro(path) do
- case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "attach", path]) do
+ case SuidHelper.exec(["losetup", "attach", path]) do
{:ok, %{"device" => dev}} -> {:ok, dev}
{:error, _} = err -> err
end
@@ -21,14 +21,7 @@ defmodule Hyper.SuidHelper.Losetup do
@spec attach_rw(Path.t()) :: {:ok, Path.t()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.attach_rw", include: [:path])
def attach_rw(path) do
- case SuidHelper.exec([
- "losetup",
- "--bin",
- Hyper.Config.losetup_path(),
- "attach",
- "--rw",
- path
- ]) do
+ case SuidHelper.exec(["losetup", "attach", "--rw", path]) do
{:ok, %{"device" => dev}} -> {:ok, dev}
{:error, _} = err -> err
end
@@ -38,18 +31,34 @@ defmodule Hyper.SuidHelper.Losetup do
@spec detach(Path.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.detach", include: [:dev])
def detach(dev) do
- case SuidHelper.exec(["losetup", "--bin", Hyper.Config.losetup_path(), "detach", dev]) do
+ case SuidHelper.exec(["losetup", "detach", dev]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
end
- @doc "Check the losetup binary is present."
- @spec test_system() :: :ok | {:error, :losetup_not_found}
- @decorate with_span("Hyper.SuidHelper.Losetup.test_system")
- def test_system do
- if System.find_executable(Hyper.Config.losetup_path()),
- do: :ok,
- else: {:error, :losetup_not_found}
+ @doc "Currently-attached loop devices as `{device, backing_file}` pairs."
+ @spec list() :: {:ok, [{Path.t(), Path.t()}]} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.Losetup.list")
+ def list do
+ case SuidHelper.exec(["losetup", "list"]) do
+ {:ok, %{"output" => out}} -> {:ok, parse_list(out)}
+ {:error, _} = err -> err
+ end
+ end
+
+ @doc false
+ @spec parse_list(String.t()) :: [{Path.t(), Path.t()}]
+ def parse_list(out) do
+ out
+ |> String.split("\n", trim: true)
+ |> Enum.flat_map(fn line ->
+ # `NAME BACK-FILE` rows; a loop with no backing file has only one column
+ # (nothing for us to reclaim by file), so skip it.
+ case String.split(line, " ", parts: 2, trim: true) do
+ [dev, backing] -> [{dev, String.trim(backing)}]
+ _ -> []
+ end
+ end)
end
end
diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex
new file mode 100644
index 00000000..03ada2be
--- /dev/null
+++ b/lib/mix/tasks/firecracker.install.ex
@@ -0,0 +1,147 @@
+defmodule Mix.Tasks.Firecracker.Install do
+ @shortdoc "Download, verify, and install the pinned Firecracker release"
+ @moduledoc """
+ Downloads, verifies, and installs the pinned Firecracker release (v1.16.0)
+ for the current CPU architecture.
+
+ mix firecracker.install [--prefix DIR]
+
+ Steps performed:
+
+ 1. Detects the CPU architecture (`x86_64` or `aarch64`).
+ 2. Downloads the release tarball and verifies its SHA-256 checksum.
+ 3. Extracts the tarball, then copies the binaries to `/firecracker`
+ and `/jailer` using the **bare basenames** `firecracker` and
+ `jailer`. The setuid helper validates binaries via `SafeBin<"firecracker">`
+ and `SafeBin<"jailer">`, which match on basename only — version-stamped
+ names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally.
+ 4. Marks both binaries executable (`0o755`).
+ 5. Prints the `/etc/hyper/config.toml` snippet the operator needs to paste.
+
+ This task installs **unprivileged** binaries and prints configuration.
+ Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper).
+ This task does **not** setuid `firecracker` or `jailer`. Install and setuid
+ the helper separately with `mix suidhelper.install`.
+
+ ## Options
+
+ * `--prefix DIR` — installation directory (default: `/opt/firecracker`).
+
+ ## Security requirements
+
+ After installing, ensure:
+
+ * The binaries are root-owned and **not** group- or world-writable.
+ The suidhelper refuses binaries with loose permissions.
+ * `/etc/hyper/config.toml` is root-owned with mode `0644`.
+ """
+
+ use Mix.Task
+
+ @version "1.16.0"
+ @default_prefix "/opt/firecracker"
+
+ @impl Mix.Task
+ @spec run([String.t()]) :: :ok
+ def run(argv) do
+ {opts, _rest, _invalid} = OptionParser.parse(argv, strict: [prefix: :string])
+ prefix = Keyword.get(opts, :prefix, @default_prefix)
+
+ arch = detect_arch!()
+
+ case Application.ensure_all_started(:req) do
+ {:ok, _} -> :ok
+ {:error, {reason, app}} -> Mix.raise("Cannot start HTTP client #{app}: #{inspect(reason)}")
+ end
+
+ install!(release_for(arch), prefix)
+ print_config(prefix)
+ end
+
+ defp detect_arch! do
+ case Sys.Arch.current() do
+ {:ok, arch} ->
+ arch
+
+ {:error, {:unsupported_arch, raw}} ->
+ Mix.raise(
+ "Unsupported CPU architecture #{inspect(raw)}; " <>
+ "Firecracker supports x86_64 and aarch64."
+ )
+ end
+ end
+
+ defp release_for(:x86_64) do
+ %{
+ url:
+ "https://github.com/firecracker-microvm/firecracker/releases/download/" <>
+ "v#{@version}/firecracker-v#{@version}-x86_64.tgz",
+ sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6",
+ firecracker_path: "release-v#{@version}-x86_64/firecracker-v#{@version}-x86_64",
+ jailer_path: "release-v#{@version}-x86_64/jailer-v#{@version}-x86_64"
+ }
+ end
+
+ defp release_for(:aarch64) do
+ %{
+ url:
+ "https://github.com/firecracker-microvm/firecracker/releases/download/" <>
+ "v#{@version}/firecracker-v#{@version}-aarch64.tgz",
+ sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7",
+ firecracker_path: "release-v#{@version}-aarch64/firecracker-v#{@version}-aarch64",
+ jailer_path: "release-v#{@version}-aarch64/jailer-v#{@version}-aarch64"
+ }
+ end
+
+ defp install!(
+ %{url: url, sha256: sha256, firecracker_path: fc_rel, jailer_path: jailer_rel},
+ prefix
+ ) do
+ extract_dir = Path.join(prefix, ".firecracker-extract")
+
+ Mix.shell().info("Downloading Firecracker v#{@version} from #{url} ...")
+
+ case Redist.Targz.install(url, sha256, extract_dir) do
+ :ok -> :ok
+ {:error, reason} -> Mix.raise("Download from #{url} failed: #{inspect(reason)}")
+ end
+
+ dst_fc = Path.join(prefix, "firecracker")
+ dst_jailer = Path.join(prefix, "jailer")
+
+ # The release ships version-stamped names; copy to bare basenames so SafeBin
+ # validation passes. The helper matches on basename, not full path.
+ File.cp!(Path.join(extract_dir, fc_rel), dst_fc)
+ File.cp!(Path.join(extract_dir, jailer_rel), dst_jailer)
+ File.chmod!(dst_fc, 0o755)
+ File.chmod!(dst_jailer, 0o755)
+ _ = File.rm_rf!(extract_dir)
+
+ Mix.shell().info("Installed #{dst_fc}")
+ Mix.shell().info("Installed #{dst_jailer}")
+ end
+
+ defp print_config(prefix) do
+ fc = Path.join(prefix, "firecracker")
+ jailer = Path.join(prefix, "jailer")
+
+ # This task runs unprivileged, so the binaries land owned by the invoking
+ # user. The suidhelper's SafeBin refuses any binary not owned by root and not
+ # free of group/other write bits, so the operator MUST chown/chmod them or
+ # every jailer launch fails closed. Print the exact commands rather than a
+ # vague "ensure root-owned".
+ Mix.shell().info("""
+
+ Almost done. Run these as root so the setuid helper will accept the binaries
+ (it refuses any jailer/firecracker not owned by root):
+
+ sudo chown root:root #{fc} #{jailer}
+ sudo chmod 0755 #{fc} #{jailer}
+
+ Then add to /etc/hyper/config.toml (file: root-owned, mode 0644):
+
+ firecracker = "#{fc}"
+ jailer = "#{jailer}"
+ """)
+ end
+end
diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex
new file mode 100644
index 00000000..2085de3c
--- /dev/null
+++ b/lib/mix/tasks/suidhelper.install.ex
@@ -0,0 +1,93 @@
+defmodule Mix.Tasks.Suidhelper.Install do
+ @shortdoc "Build, stamp, and install the setuid helper"
+ @moduledoc """
+ Builds, stamps, and installs the Rust setuid helper.
+
+ mix suidhelper.install
+
+ Two steps:
+
+ 1. `cargo xtask stamp` in `native/suidhelper` builds the release binary and
+ writes its BLAKE3 self-checksum into `.note.sum` (the same step the
+ `:suidhelper_stamp` compiler runs).
+ 2. The stamped binary is copied setuid-root (mode `4755`) to
+ `/usr/local/bin/hyper-suidhelper`.
+
+ The copy needs root, but Mix runs every subprocess in its own session with no
+ controlling terminal (`erl_child_setup` calls `setsid`), so a nested `sudo`
+ cannot open `/dev/tty` to prompt for a password. This task therefore only runs
+ `sudo` itself when it is already non-interactive (`sudo -n` succeeds, e.g.
+ `NOPASSWD` or a usable cached credential). Otherwise it prints the exact
+ privileged command for you to run in your own terminal.
+
+ This is the privileged counterpart to `mix suidhelper.stamp`, which stamps
+ only. `cargo` and the helper's toolchain (see
+ `native/suidhelper/rust-toolchain.toml`) must be installed.
+ """
+
+ use Mix.Task
+
+ @helper_dir "native/suidhelper"
+ @source Path.join(@helper_dir, "target/release/hyper-suidhelper")
+ # Must match `Hyper.Config`'s default `suid_helper` path and the xtask's
+ # `INSTALL_PATH`: a `PATH` location the unprivileged node can exec.
+ @install_path "/usr/local/bin/hyper-suidhelper"
+
+ @impl Mix.Task
+ def run(argv) do
+ stamp!(argv)
+ install_privileged()
+ end
+
+ defp stamp!(argv) do
+ case System.cmd("cargo", ["xtask", "stamp" | argv],
+ cd: @helper_dir,
+ into: IO.stream(:stdio, :line)
+ ) do
+ {_, 0} ->
+ :ok
+
+ {_, _} ->
+ Mix.raise("""
+ `cargo xtask stamp` failed building the suidhelper.
+
+ Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml)
+ are installed.
+ """)
+ end
+ end
+
+ defp install_privileged do
+ if passwordless_sudo?() do
+ Mix.shell().info("Installing #{@source} -> #{@install_path} (setuid root)")
+
+ case System.cmd("sudo", install_argv(), into: IO.stream(:stdio, :line)) do
+ {_, 0} -> Mix.shell().info("installed #{@install_path} (setuid root)")
+ {_, _} -> Mix.raise(manual_instructions())
+ end
+ else
+ Mix.shell().info(manual_instructions())
+ end
+ end
+
+ # `sudo -n true` exits 0 only when sudo can run without prompting. With no
+ # controlling terminal a cached `tty_tickets` credential is invisible, so this
+ # is true essentially only under `NOPASSWD` -- exactly the case where the
+ # nested `sudo install` below can succeed.
+ defp passwordless_sudo? do
+ match?({_, 0}, System.cmd("sudo", ["-n", "true"], stderr_to_stdout: true))
+ end
+
+ defp install_argv,
+ do: ["install", "-o", "root", "-g", "root", "-m", "4755", @source, @install_path]
+
+ defp manual_instructions do
+ """
+
+ The binary is built and stamped, but installing it setuid-root needs a
+ password and `sudo` has no terminal to prompt on here. Run the copy yourself:
+
+ sudo #{Enum.join(install_argv(), " ")}
+ """
+ end
+end
diff --git a/mix.exs b/mix.exs
index 7906c6a1..5c620b81 100644
--- a/mix.exs
+++ b/mix.exs
@@ -24,6 +24,10 @@ defmodule Hyper.MixProject do
# Cache the PLTs in a stable, gitignored dir so CI can cache them.
plt_local_path: "priv/plts",
plt_core_path: "priv/plts",
+ # `:mix` is needed so the Mix tasks under `lib/mix/tasks` (which call
+ # `Mix.raise/1`, `Mix.shell/0`, and implement the `Mix.Task` behaviour)
+ # resolve instead of tripping `unknown_function`.
+ plt_add_apps: [:mix],
# Verify @specs against actual returns, and flag ignored return values.
flags: [:unmatched_returns, :extra_return, :missing_return]
]
diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml
index f5b4b1b7..be6a449c 100644
--- a/native/suidhelper/Cargo.toml
+++ b/native/suidhelper/Cargo.toml
@@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs"
name = "tools_dmsetup_parsers"
path = "tests/tools/dmsetup_parsers.rs"
+[[test]]
+name = "tools_jailer"
+path = "tests/tools/jailer.rs"
+
[[test]]
name = "util_confinement"
path = "tests/util/confinement.rs"
@@ -56,6 +60,10 @@ path = "tests/util/confinement.rs"
name = "tools_chroot_jail_remove"
path = "tests/tools/chroot_jail_remove.rs"
+[[test]]
+name = "tools_chroot_jail_grant_api"
+path = "tests/tools/chroot_jail_grant_api.rs"
+
[[test]]
name = "util_chroot_jail"
path = "tests/util/chroot_jail.rs"
@@ -68,14 +76,22 @@ path = "tests/e2e/config.rs"
name = "e2e_argv"
path = "tests/e2e/argv.rs"
+[[test]]
+name = "e2e_jailer"
+path = "tests/e2e/jailer.rs"
+
[[test]]
name = "e2e_chroot_jail"
path = "tests/e2e/chroot_jail.rs"
+[[test]]
+name = "config_uid_gid_range"
+path = "tests/config_uid_gid_range.rs"
+
[dependencies]
clap = { version = "4", features = ["derive"] }
hyper-suidhelper-meta = { path = "meta" }
-nix = { version = "0.29", features = ["user", "fs", "dir"] }
+nix = { version = "0.29", features = ["user", "fs", "dir", "process"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs
index 425ba386..76a7f221 100644
--- a/native/suidhelper/src/config.rs
+++ b/native/suidhelper/src/config.rs
@@ -1,6 +1,14 @@
// SPDX-License-Identifier: AGPL-3.0-only
//! Runtime host configuration, read from a single root-owned TOML file.
+//!
+//! ## UID/GID range divergence
+//!
+//! Elixir keeps `compile_env` default `{900_000, 999_999}` that governs which
+//! UIDs the node hands *out*; this helper reads `[uid_gid_range]` from
+//! config.toml to decide which UIDs it *accepts* (default `{900_000, 999_999}`
+//! when the key is absent). Operators narrowing the range must set **both**.
+use crate::util::safe_bin::{self, SafeBin};
use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile};
use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents};
use nix::fcntl::OFlag;
@@ -21,11 +29,48 @@ pub enum LoadingError {
Malformed(PathBuf),
#[error("work_dir in {0:?} must be an absolute path")]
Relative(PathBuf),
+ #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")]
+ BadUidGidRange { min: u32, max: u32 },
+}
+
+/// Error returned by config accessors for tool binaries derived from config.
+#[derive(Debug, Error)]
+pub enum BinError {
+ #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")]
+ Unconfigured(&'static str),
+ #[error(transparent)]
+ Bin(#[from] safe_bin::Error),
}
const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml";
const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH";
+/// UID/GID allocation band, read from `[uid_gid_range]` in config.toml.
+/// Controls which UIDs the helper *accepts* from the BEAM — see module docs.
+#[derive(Debug, Clone, Copy, Deserialize)]
+pub struct UidGidRange {
+ pub min: u32,
+ pub max: u32,
+}
+
+// Band defaults match Elixir's `compile_env` allocation defaults so that an
+// unconfigured helper and an unconfigured node agree out of the box.
+const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999);
+
+/// Validate a uid_gid_range value. A present range where min==0 or min>max is
+/// treated as a config trust violation — fatal at load, consistent with the
+/// "present but untrusted" model. Exposed so tests can verify the contract
+/// without touching the file system.
+pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> {
+ if r.min == 0 || r.min > r.max {
+ return Err(LoadingError::BadUidGidRange {
+ min: r.min,
+ max: r.max,
+ });
+ }
+ Ok(())
+}
+
/// The config file path. In production this is the fixed `/etc/hyper/config.toml`.
/// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the
/// secure arm is always the hardcoded path, so a release build cannot be steered.
@@ -44,13 +89,66 @@ fn config_path() -> PathBuf {
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
work_dir: PathBuf,
+ #[serde(default = "default_dmsetup")]
+ dmsetup: PathBuf,
+ #[serde(default = "default_losetup")]
+ losetup: PathBuf,
+ #[serde(default = "default_blockdev")]
+ blockdev: PathBuf,
+ #[serde(default)]
+ firecracker: Option,
+ #[serde(default)]
+ jailer: Option,
+ #[serde(default = "default_parent_cgroup")]
+ parent_cgroup: String,
+ #[serde(default)]
+ uid_gid_range: Option,
+}
+
+// The default data root. Must match the Elixir node's `@dev_work_dir`, which it
+// uses when the same config file is absent, so both sides agree (see
+// `Hyper.Node.check_helper_base`).
+fn default_work_dir() -> PathBuf {
+ PathBuf::from("/srv/hyper")
+}
+
+fn default_dmsetup() -> PathBuf {
+ PathBuf::from("/usr/sbin/dmsetup")
+}
+
+fn default_losetup() -> PathBuf {
+ PathBuf::from("/usr/sbin/losetup")
+}
+
+fn default_blockdev() -> PathBuf {
+ PathBuf::from("/usr/sbin/blockdev")
+}
+
+fn default_parent_cgroup() -> String {
+ // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync.
+ "hyper".into()
+}
+
+impl Default for Config {
+ fn default() -> Self {
+ Self {
+ work_dir: default_work_dir(),
+ dmsetup: default_dmsetup(),
+ losetup: default_losetup(),
+ blockdev: default_blockdev(),
+ firecracker: None,
+ jailer: None,
+ parent_cgroup: default_parent_cgroup(),
+ uid_gid_range: None,
+ }
+ }
}
impl Config {
/// The process-wide config, loaded once (and forced unprivileged by
- /// [`Config::init`]). A load failure is fatal: the helper cannot safely
- /// operate without a trusted data root, so it prints the error and exits
- /// rather than guessing a default.
+ /// [`Config::init`]). An absent file yields the built-in defaults; a
+ /// *present but untrusted* file (wrong owner/mode, malformed) is fatal —
+ /// the helper prints the error and exits rather than trusting it.
pub fn get() -> &'static Config {
LazyLock::force(&CONFIG)
}
@@ -58,7 +156,7 @@ impl Config {
/// Force the config to load now. Call this once at the very start of `main`,
/// after privileges have already been dropped (the `.preinit_array` entry in
/// `setuid_privileged` runs before `main`), so the file is never first read
- /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read
+ /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read
/// as the real uid, not as root.
pub fn init() {
let _ = Self::get();
@@ -74,6 +172,57 @@ impl Config {
self.work_dir.join("jails")
}
+ /// The validated `dmsetup` binary the helper will run.
+ pub fn dmsetup(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.dmsetup)
+ }
+
+ /// The validated `losetup` binary the helper will run.
+ pub fn losetup(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.losetup)
+ }
+
+ /// The validated `blockdev` binary the helper will run.
+ pub fn blockdev(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.blockdev)
+ }
+
+ /// The Firecracker VMM binary, validated as root-owned and correctly named.
+ /// Errors [`BinError::Unconfigured`] when absent from config — an operator
+ /// must set this key before any VM can be launched.
+ pub fn firecracker(&self) -> Result, BinError> {
+ self.firecracker
+ .as_deref()
+ .ok_or(BinError::Unconfigured("firecracker"))
+ .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin))
+ }
+
+ /// The Firecracker jailer binary, validated as root-owned and correctly named.
+ /// Errors [`BinError::Unconfigured`] when absent from config — an operator
+ /// must set this key before any VM can be launched.
+ pub fn jailer(&self) -> Result, BinError> {
+ self.jailer
+ .as_deref()
+ .ok_or(BinError::Unconfigured("jailer"))
+ .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin))
+ }
+
+ /// The jailer `--parent-cgroup` value. Defaults to `"hyper"`, matching the
+ /// Elixir node's `@parent_cgroup`.
+ pub fn parent_cgroup(&self) -> &str {
+ &self.parent_cgroup
+ }
+
+ /// The UID/GID band the helper accepts from the BEAM. Defaults to
+ /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults).
+ /// A present range with min==0 or min>max is rejected at load time by
+ /// [`Config::safe_load`], so this accessor is always total.
+ pub fn uid_gid_range(&self) -> (u32, u32) {
+ self.uid_gid_range
+ .map(|r| (r.min, r.max))
+ .unwrap_or(DEFAULT_UID_GID)
+ }
+
/// Read, ownership-check, parse, and validate the config file. See the module
/// docs for the trust model.
pub fn safe_load() -> Result {
@@ -82,7 +231,18 @@ impl Config {
let safe_path: SafePath = path.clone().try_into()?;
let file: SafeFile =
- SafeFile::open(&safe_path, OFlag::O_RDONLY)?;
+ match SafeFile::open(&safe_path, OFlag::O_RDONLY) {
+ Ok(file) => file,
+ // A genuinely-absent file means "use the built-in defaults": those
+ // are compiled into this root-owned binary, so they are trusted. Any
+ // OTHER failure — a present but wrong-owner/mode file, an I/O error —
+ // stays fatal, because it is a signal (someone put an untrusted file
+ // there), not an absence.
+ Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => {
+ return Ok(Self::default())
+ }
+ Err(e) => return Err(e.into()),
+ };
let body = std::io::read_to_string(std::fs::File::from(file.into_owned_fd()))
.map_err(|_| LoadingError::Unreadable(path.clone()))?;
@@ -92,6 +252,11 @@ impl Config {
if !config.work_dir.is_absolute() {
return Err(LoadingError::Relative(path));
}
+
+ if let Some(r) = &config.uid_gid_range {
+ validate_uid_gid_range(r)?;
+ }
+
Ok(config)
}
}
diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs
index 617fd6d5..11b08319 100644
--- a/native/suidhelper/src/main.rs
+++ b/native/suidhelper/src/main.rs
@@ -15,6 +15,7 @@
use clap::{Parser, Subcommand};
use hyper_suidhelper::config;
+use hyper_suidhelper::tools::jailer::{self, JailerArgs};
use hyper_suidhelper::tools::Tool;
use hyper_suidhelper::util::setuid_privileged::{self, Privileged};
use serde::Serialize;
@@ -37,6 +38,9 @@ enum Command {
Tool(Tool),
/// Check the helper is correctly installed (can promote to root).
SysTest,
+ /// Validate the caller's args, become root, and `execve` the firecracker
+ /// jailer in our place. Prints nothing on success (the image is replaced).
+ Jailer(JailerArgs),
/// Print the build version and BLAKE3 checksum of this binary.
Version,
}
@@ -101,12 +105,21 @@ fn main() {
config::Config::init();
// Each command yields a serializable value (errors stringified to unify); we
- // render the final JSON line here.
+ // render the final JSON line here. The jailer is the exception: it `execve`s
+ // in place, so on success it never returns and never emits JSON, and on
+ // failure it reports to stderr and exits before reaching the JSON pipeline.
let output = match command {
Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()),
Command::SysTest => SysTest::perform()
.map(Output::SysTest)
.map_err(|e| e.to_string()),
+ Command::Jailer(args) => {
+ let e = jailer::run(args).expect_err("jailer::run only returns on error");
+ eprintln!("{e}");
+ // _exit bypasses atexit handlers; safe because we are permanently root
+ // at this point and must not run any cleanup registered before escalation.
+ unsafe { nix::libc::_exit(2) }
+ }
Command::Version => unreachable!("handled above"),
};
diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs
new file mode 100644
index 00000000..4ec94a21
--- /dev/null
+++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so
+//! the unprivileged controller can `connect()` to it.
+//!
+//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker
+//! then creates its API socket at `/root/api.socket` owned by that per-VM
+//! id. Connecting a unix socket needs *write* permission on the node, so the
+//! node user (a different uid) gets `EACCES`. This op chowns just that one
+//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the
+//! privileged scope are the real (caller) ids while euid is 0 — and chmods it
+//! `0660`. The node thus connects as owner, and humans added to the node's
+//! group connect via the group bit.
+//!
+//! That alone is not enough: the jailer leaves `/root` as `0700` owned by
+//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so
+//! the node cannot even traverse into `root` to reach the (now its own) socket.
+//! So this op also opens just that one directory to the caller's group: it keeps
+//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the
+//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not
+//! list), other none. Per-VM isolation is otherwise untouched: only this socket
+//! and its immediate parent's group/mode move, nothing else in the jail, and
+//! unrelated users stay locked out.
+//!
+//! Security: the socket path is validated as a `SafePath` and reached by an
+//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect
+//! the chown outside the jail, and every op is fd-relative on the pinned `root`
+//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket`
+//! `//root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must
+//! report a *socket* — a regular file or symlink planted at that name is an
+//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on
+//! the path) is `Pending`, not an error: firecracker has not created it yet, so
+//! the controller keeps probing.
+
+use crate::config::Config;
+use crate::tools::IsTool;
+use crate::util::safe_dir::{self, SafeDir};
+use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents};
+use clap::Args;
+use nix::errno::Errno;
+use nix::sys::stat::SFlag;
+use nix::unistd::{getgid, getuid};
+use serde::Serialize;
+use std::path::{Path, PathBuf};
+use thiserror::Error as ThisError;
+
+/// The fixed in-jail socket name firecracker opens (mirrors the Elixir
+/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`).
+const SOCKET_NAME: &str = "api.socket";
+
+/// The socket sits at `///root/api.socket`: three parent
+/// components (``, ``, `root`) before the leaf.
+const SOCKET_PARENT_DEPTH: usize = 3;
+
+/// Mode handed to the node: owner+group read/write, no world access.
+const SOCKET_MODE: u32 = 0o660;
+
+/// Mode set on the jail `root` dir so the node's group can *traverse* it to
+/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x`
+/// (traverse, not list), other none.
+const JAIL_ROOT_MODE: u32 = 0o710;
+
+type LexicalPath = SafePath;
+
+#[derive(Debug, ThisError)]
+pub enum Error {
+ #[error("--socket path: {0}")]
+ SocketPath(#[source] safe_path::ValidationError),
+ #[error("--socket must be exactly //root/api.socket below JAIL_BASE: {0:?}")]
+ SocketShape(PathBuf),
+ #[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")]
+ SocketName(PathBuf),
+ #[error("walking to the jail root: {0}")]
+ Walk(#[source] safe_dir::Error),
+ #[error("api.socket is not a socket (or is a symlink); refusing to touch it")]
+ NotASocket,
+ #[error("statting the socket: {0}")]
+ Stat(#[source] safe_dir::Error),
+ #[error("chowning the socket to the caller: {0}")]
+ Chown(#[source] safe_dir::Error),
+ #[error("chmoding the socket: {0}")]
+ Chmod(#[source] safe_dir::Error),
+ #[error("chgrp-ing the jail root dir to the caller: {0}")]
+ ChgrpRoot(#[source] safe_dir::Error),
+ #[error("chmoding the jail root dir for traversal: {0}")]
+ ChmodRoot(#[source] safe_dir::Error),
+}
+
+#[derive(Args)]
+pub struct GrantApiArgs {
+ /// Host path of the firecracker API socket, shape
+ /// ///root/api.socket.
+ #[arg(long)]
+ socket: PathBuf,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(tag = "result", rename_all = "snake_case")]
+pub enum GrantOut {
+ /// The socket was handed to the caller (chowned + chmoded).
+ Granted,
+ /// The socket does not exist yet; the caller should keep waiting.
+ Pending,
+}
+
+/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`).
+pub fn run(args: GrantApiArgs) -> Result {
+ GrantApi { args }.run()
+}
+
+struct GrantApi {
+ args: GrantApiArgs,
+}
+
+impl IsTool for GrantApi {
+ type Args = GrantApiArgs;
+ type Output = GrantOut;
+ type RunT = Result;
+
+ fn run_privileged(&self) -> Self::RunT {
+ grant_api_under(&Config::get().jail_base(), &self.args.socket)
+ }
+
+ fn parse(&self, res: Self::RunT) -> Result> {
+ Ok(res?)
+ }
+}
+
+/// Hand `socket` (`///root/api.socket`) to the helper's
+/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns
+/// `Pending` if any path component or the socket itself is not yet present.
+pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result {
+ let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?;
+ let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?;
+ if parents.len() != SOCKET_PARENT_DEPTH {
+ return Err(Error::SocketShape(socket.to_path_buf()));
+ }
+ if leaf != Path::new(SOCKET_NAME) {
+ return Err(Error::SocketName(socket.to_path_buf()));
+ }
+
+ let Some(root) = walk(jail_base.to_path_buf(), &parents)? else {
+ return Ok(GrantOut::Pending); // jail not fully created yet
+ };
+
+ let leaf = Path::new(SOCKET_NAME);
+ let stat = match root.stat(leaf) {
+ Ok(stat) => stat,
+ Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending),
+ Err(e) => return Err(Error::Stat(e)),
+ };
+ // `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails
+ // this check too: only a real socket is accepted, anything else is refused.
+ if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() {
+ return Err(Error::NotASocket);
+ }
+
+ root.chown(leaf, getuid().as_raw(), getgid().as_raw())
+ .map_err(Error::Chown)?;
+ root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?;
+
+ // Open `root` itself to the caller's group so the node can traverse into it
+ // to reach the socket (the jailer leaves it 0700 / per-VM uid). Owner stays
+ // the per-VM uid; only the group and mode move. Operate on the pinned `root`
+ // fd (already opened `O_NOFOLLOW`), never by name - TOCTOU-safe.
+ root.chgrp_self(getgid().as_raw())
+ .map_err(Error::ChgrpRoot)?;
+ root.chmod_self(JAIL_ROOT_MODE).map_err(Error::ChmodRoot)?;
+ Ok(GrantOut::Granted)
+}
+
+/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns
+/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the
+/// caller can treat a half-built jail as `Pending` rather than an error.
+fn walk(base: PathBuf, parents: &[PathBuf]) -> Result