From 349073d8c35351220301f9c54b7ecb02b2a7bcff Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 16 Jun 2026 16:32:48 +0200 Subject: [PATCH 01/10] fix(docker): honor configured supervisor image Signed-off-by: Evan Lezar --- crates/openshell-driver-docker/README.md | 9 ++- crates/openshell-driver-docker/src/lib.rs | 28 ++++--- docs/reference/gateway-config.mdx | 1 + e2e/with-docker-gateway.sh | 91 +++++------------------ 4 files changed, 41 insertions(+), 88 deletions(-) diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 71159fe66..754802cbe 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -79,10 +79,11 @@ The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into each sandbox container. Resolution order is: 1. `supervisor_bin` in `[openshell.drivers.docker]`. -2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. -3. A local Linux cargo target build for the Docker daemon architecture. -4. `supervisor_image` in `[openshell.drivers.docker]`, or the - release-matched default supervisor image, extracting `/openshell-sandbox`. +2. `supervisor_image` in `[openshell.drivers.docker]`, extracting + `/openshell-sandbox` from that image. +3. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. +4. A local Linux cargo target build for the Docker daemon architecture. +5. The release-matched default supervisor image, extracting `/openshell-sandbox`. Release and Docker-image gateway builds bake the matching supervisor image tag into the binary at compile time. 
The default Docker supervisor image is not diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 963e7a0f7..800f0b68c 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -79,7 +79,8 @@ const DOCKER_NETWORK_DRIVER: &str = "bridge"; /// Default image holding the Linux `openshell-sandbox` binary. The gateway /// pulls this image and extracts the binary to a host-side cache when no -/// explicit `supervisor_bin` override or local build is available. +/// explicit `supervisor_bin`, configured `supervisor_image`, sibling binary, +/// or local build is available. const DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO: &str = "ghcr.io/nvidia/openshell/supervisor"; /// Return the default `ghcr.io/nvidia/openshell/supervisor:` reference @@ -2960,7 +2961,14 @@ pub(crate) async fn resolve_supervisor_bin( return Ok(path); } - // Tier 2: sibling `openshell-sandbox` next to the running gateway + // Tier 2: explicit supervisor_image in [openshell.drivers.docker]. + // A configured image should be the source of truth even when a local + // developer build is present under target/. + if let Some(image) = docker_config.supervisor_image.clone() { + return extract_supervisor_bin_from_image(docker, &image).await; + } + + // Tier 3: sibling `openshell-sandbox` next to the running gateway // (release artifact layout). Linux-only because the sibling must be a // Linux ELF to bind-mount into a Linux container. if cfg!(target_os = "linux") { @@ -2977,9 +2985,9 @@ pub(crate) async fn resolve_supervisor_bin( } } - // Tier 3: local cargo target build (developer workflow). Preferred - // over a registry pull when available because it matches whatever the - // developer just built. + // Tier 4: local cargo target build (developer workflow). Preferred + // over the default registry image when available because it matches + // whatever the developer just built. 
let target_candidates = linux_supervisor_candidates(daemon_arch); for candidate in &target_candidates { if candidate.is_file() { @@ -2990,13 +2998,9 @@ pub(crate) async fn resolve_supervisor_bin( } } - // Tier 4: pull the supervisor image from a registry and extract the - // binary to a host-side cache keyed by image content digest. This is - // the default path for released gateway binaries. - let image = docker_config - .supervisor_image - .clone() - .unwrap_or_else(default_docker_supervisor_image); + // Tier 5: pull the release-matched default supervisor image and extract + // the binary to a host-side cache keyed by image content digest. + let image = default_docker_supervisor_image(); extract_supervisor_bin_from_image(docker, &image).await } diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index ff4542136..d820a131d 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -218,6 +218,7 @@ sandbox_namespace = "docker-dev" grpc_endpoint = "https://host.openshell.internal:17670" # Skip the image-pull-and-extract step by pointing at a locally built binary. supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" +# When supervisor_bin is omitted, Docker extracts /openshell-sandbox from this image. supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" guest_tls_ca = "/etc/openshell/certs/ca.pem" guest_tls_cert = "/etc/openshell/certs/client.pem" diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index f8e17661d..d2d809c18 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -81,7 +81,6 @@ DOCKER_NETWORK_NAME="" DOCKER_NETWORK_CONNECTED_CONTAINER="" DOCKER_NETWORK_MANAGED=0 GPU_MODE="${OPENSHELL_E2E_DOCKER_GPU:-0}" -DOCKER_SUPERVISOR_ARGS=() # Isolate CLI/SDK gateway metadata from the developer's real config. 
export XDG_CONFIG_HOME="${WORKDIR}/config" @@ -263,25 +262,6 @@ if [ "${GPU_MODE}" = "1" ]; then fi fi -normalize_arch() { - case "$1" in - x86_64|amd64) echo "amd64" ;; - aarch64|arm64) echo "arm64" ;; - *) echo "$1" ;; - esac -} - -linux_target_triple() { - case "$1" in - amd64) echo "x86_64-unknown-linux-gnu" ;; - arm64) echo "aarch64-unknown-linux-gnu" ;; - *) - echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 - exit 2 - ;; - esac -} - resolve_docker_supervisor_image() { if [ -n "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE:-}" ]; then printf '%s\n' "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE}" @@ -304,7 +284,7 @@ resolve_docker_supervisor_image() { return 0 fi - printf '%s\n' "" + printf '%s\n' "openshell/supervisor:dev" } docker_pull_with_retry() { @@ -336,6 +316,21 @@ docker_pull_with_retry() { ensure_docker_supervisor_image() { local image=$1 + if [ "${image}" = "openshell/supervisor:dev" ] \ + && [ -z "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE:-}" ] \ + && [ -z "${OPENSHELL_SUPERVISOR_IMAGE:-}" ] \ + && [ -z "${CI:-}" ]; then + echo "Building local Docker supervisor image ${image}..." + CONTAINER_ENGINE=docker IMAGE_TAG=dev \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor + if docker image inspect "${image}" >/dev/null 2>&1; then + return 0 + fi + + echo "ERROR: expected supervisor image '${image}' after local build." 
>&2 + exit 2 + fi + if docker image inspect "${image}" >/dev/null 2>&1; then return 0 fi @@ -385,47 +380,11 @@ ensure_sandbox_image_available() { docker_pull_with_retry "${image}" } -DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" -SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" -HOST_OS="$(uname -s)" -HOST_ARCH="$(normalize_arch "$(uname -m)")" -SUPERVISOR_OUT_DIR="${WORKDIR}/supervisor/${DAEMON_ARCH}" -SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" - -CARGO_BUILD_JOBS_ARG=() -if [ -n "${CARGO_BUILD_JOBS:-}" ]; then - CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") -fi - e2e_build_gateway_binaries "${ROOT}" TARGET_DIR GATEWAY_BIN CLI_BIN SUPERVISOR_IMAGE="$(resolve_docker_supervisor_image)" -if [ -n "${SUPERVISOR_IMAGE}" ]; then - ensure_docker_supervisor_image "${SUPERVISOR_IMAGE}" - echo "Using Docker supervisor image: ${SUPERVISOR_IMAGE}" - DOCKER_SUPERVISOR_ARGS=(--docker-supervisor-image "${SUPERVISOR_IMAGE}") -else - echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." - mkdir -p "${SUPERVISOR_OUT_DIR}" - if [ "${HOST_OS}" = "Linux" ] && [ "${HOST_ARCH}" = "${DAEMON_ARCH}" ]; then - rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true - cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ - --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" - cp "${TARGET_DIR}/${SUPERVISOR_TARGET}/release/openshell-sandbox" "${SUPERVISOR_BIN}" - else - CONTAINER_ENGINE=docker \ - DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ - DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ - bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output - fi - - if [ ! 
-f "${SUPERVISOR_BIN}" ]; then - echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 - exit 1 - fi - chmod +x "${SUPERVISOR_BIN}" - DOCKER_SUPERVISOR_ARGS=(--docker-supervisor-bin "${SUPERVISOR_BIN}") -fi +ensure_docker_supervisor_image "${SUPERVISOR_IMAGE}" +echo "Using Docker supervisor image: ${SUPERVISOR_IMAGE}" DEFAULT_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-${OPENSHELL_SANDBOX_IMAGE:-${DEFAULT_SANDBOX_IMAGE}}}" @@ -493,19 +452,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" printf 'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client/tls.crt")" printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client/tls.key")" printf 'enable_bind_mounts = true\n' - # DOCKER_SUPERVISOR_ARGS holds either ("--docker-supervisor-bin" "") - # or ("--docker-supervisor-image" ""); both map to TOML keys on - # the docker driver config. - for ((i=0; i<${#DOCKER_SUPERVISOR_ARGS[@]}; i+=2)); do - case "${DOCKER_SUPERVISOR_ARGS[$i]}" in - --docker-supervisor-bin) - printf 'supervisor_bin = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" - ;; - --docker-supervisor-image) - printf 'supervisor_image = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" - ;; - esac - done + printf 'supervisor_image = %s\n' "$(toml_string "${SUPERVISOR_IMAGE}")" if [ -n "${GATEWAY_HOST_ALIAS_IP}" ]; then printf 'host_gateway_ip = %s\n' "$(toml_string "${GATEWAY_HOST_ALIAS_IP}")" fi From 4f444d87fdb1e42746bd40307696d38e2f2e78fa Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 16 Jun 2026 20:22:05 +0200 Subject: [PATCH 02/10] fix(cli): isolate ssh from host linker environment Signed-off-by: Evan Lezar --- crates/openshell-cli/src/ssh.rs | 123 ++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index f5986a1d8..ab9bca2b1 100644 --- a/crates/openshell-cli/src/ssh.rs +++ 
b/crates/openshell-cli/src/ssh.rs @@ -29,6 +29,16 @@ use tokio::process::Command as TokioCommand; use tokio_stream::wrappers::ReceiverStream; const FOREGROUND_FORWARD_STARTUP_GRACE_PERIOD: Duration = Duration::from_secs(2); +const HOST_TOOL_LINKER_ENV: &[&str] = &[ + "DYLD_FALLBACK_LIBRARY_PATH", + "DYLD_INSERT_LIBRARIES", + "DYLD_LIBRARY_PATH", + "LD_AUDIT", + "LD_LIBRARY_PATH", + "LD_PRELOAD", + "LIBRARY_PATH", + "NIX_LD_LIBRARY_PATH", +]; #[derive(Clone, Copy, Debug)] pub enum Editor { @@ -121,6 +131,7 @@ async fn ssh_session_config( &session.token, gateway_name, ); + let proxy_command = proxy_command_with_preserved_environment(proxy_command); Ok(SshSessionConfig { proxy_command, @@ -137,6 +148,7 @@ fn ssh_base_command(proxy_command: &str) -> Command { std::env::var("OPENSHELL_SSH_LOG_LEVEL").unwrap_or_else(|_| "ERROR".to_string()); let mut command = Command::new("ssh"); + sanitize_host_tool_environment(&mut command); command .arg("-o") .arg(format!("ProxyCommand={proxy_command}")) @@ -159,6 +171,30 @@ fn ssh_base_command(proxy_command: &str) -> Command { command } +fn sanitize_host_tool_environment(command: &mut Command) { + for key in HOST_TOOL_LINKER_ENV { + command.env_remove(key); + } +} + +fn proxy_command_with_preserved_environment(proxy_command: String) -> String { + let assignments = HOST_TOOL_LINKER_ENV + .iter() + .filter_map(|key| { + std::env::var_os(key).map(|value| { + let value = value.to_string_lossy(); + format!("{key}={}", shell_escape(&value)) + }) + }) + .collect::>(); + + if assignments.is_empty() { + proxy_command + } else { + format!("env {} {proxy_command}", assignments.join(" ")) + } +} + #[cfg(unix)] const TRANSIENT_TTY_SIGNALS: &[Signal] = &[Signal::SIGINT, Signal::SIGQUIT, Signal::SIGTERM]; @@ -1508,6 +1544,93 @@ mod tests { use super::*; use crate::TEST_ENV_LOCK; + #[test] + fn ssh_base_command_removes_host_linker_environment() { + let command = ssh_base_command("openshell ssh-proxy"); + let removed_keys = command + .get_envs() + 
.filter(|(_, value)| value.is_none()) + .map(|(key, _)| key.to_string_lossy().into_owned()) + .collect::>(); + + for key in HOST_TOOL_LINKER_ENV { + assert!( + removed_keys.iter().any(|removed| removed == key), + "expected ssh command to remove {key}" + ); + } + } + + #[test] + #[allow(unsafe_code)] // Test-only: env vars require unsafe in Rust 2024. + fn proxy_command_preserves_linker_environment_for_proxy_child() { + let _guard = TEST_ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let old_env = HOST_TOOL_LINKER_ENV + .iter() + .map(|key| (*key, std::env::var_os(key))) + .collect::>(); + + unsafe { + for key in HOST_TOOL_LINKER_ENV { + std::env::remove_var(key); + } + std::env::set_var("LD_LIBRARY_PATH", "/nix/store/z3 lib:/opt/lib"); + } + + let proxy_command = + proxy_command_with_preserved_environment("openshell ssh-proxy".to_string()); + let has_assignment = proxy_command.contains("LD_LIBRARY_PATH='/nix/store/z3 lib:/opt/lib'"); + let has_env_prefix = proxy_command.starts_with("env "); + let has_command = proxy_command.ends_with(" openshell ssh-proxy"); + + unsafe { + for (key, value) in old_env { + match value { + Some(value) => std::env::set_var(key, value), + None => std::env::remove_var(key), + } + } + } + + assert!(has_assignment, "unexpected proxy command: {proxy_command}"); + assert!(has_env_prefix, "unexpected proxy command: {proxy_command}"); + assert!(has_command, "unexpected proxy command: {proxy_command}"); + } + + #[test] + #[allow(unsafe_code)] // Test-only: env vars require unsafe in Rust 2024. 
+ fn proxy_command_is_unchanged_without_linker_environment() { + let _guard = TEST_ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let old_env = HOST_TOOL_LINKER_ENV + .iter() + .map(|key| (*key, std::env::var_os(key))) + .collect::>(); + + unsafe { + for key in HOST_TOOL_LINKER_ENV { + std::env::remove_var(key); + } + } + + let proxy_command = + proxy_command_with_preserved_environment("openshell ssh-proxy".to_string()); + + unsafe { + for (key, value) in old_env { + match value { + Some(value) => std::env::set_var(key, value), + None => std::env::remove_var(key), + } + } + } + + assert_eq!(proxy_command, "openshell ssh-proxy"); + } + #[test] fn upsert_host_block_appends_when_missing() { let input = "Host existing\n HostName example.com\n"; From cd183e5279309f718e8f643bea9972b5b9e4820a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 15:12:37 +0200 Subject: [PATCH 03/10] test(e2e): add workload manifest build flow Signed-off-by: Evan Lezar --- e2e/gpu/README.md | 67 +++++++++++++++++++++------ tasks/scripts/e2e-gpu-build-images.sh | 1 - 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md index 8c796b444..0b671c136 100644 --- a/e2e/gpu/README.md +++ b/e2e/gpu/README.md @@ -3,7 +3,8 @@ # GPU workload images -This directory defines workload test images for OpenShell GPU validation. +This directory defines workload test images currently used by the OpenShell GPU +e2e suite. ## Contract @@ -25,8 +26,7 @@ OpenShell sandbox creation replaces the image entrypoint with the supervisor and does not run the OCI image `CMD`. When these images are used through OpenShell, the workload command from each manifest entry must be passed explicitly. -The image build task writes a local workload manifest. Each workload entry -carries: +The test harness is manifest-driven. 
Each workload entry carries: - `name` - `image` @@ -71,8 +71,7 @@ The task writes the latest build refs to: e2e/gpu/images/.build/latest.env ``` -The task also writes a local workload manifest for downstream tooling and -future workload-runner integration: +The task also writes the local workload manifest used by the Rust e2e runner: ```text e2e/gpu/images/.build/workloads.yaml @@ -90,8 +89,7 @@ source e2e/gpu/images/.build/latest.env ``` That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local -manifest. The current checked-in Rust GPU e2e target does not consume this -manifest yet. The per-image refs remain available as a convenience for direct +manifest. The per-image refs remain available as a convenience for direct container-engine validation. ## Direct Validation @@ -124,14 +122,57 @@ where Podman CDI is configured. Direct container-engine validation catches image, CDI, CUDA, and host GPU setup issues before OpenShell sandbox behavior is involved. -## OpenShell GPU E2E +## Manifest-Driven Validation -The current Rust GPU validation target is: +The Rust GPU validation target is: ```shell -mise run e2e:gpu +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture ``` -That target runs `gpu_device_selection`. It validates GPU request and device -selection behavior against a Docker-backed gateway. It does not run the -workload manifest generated by `mise run e2e:workloads:build`. +The workload validation path reads: + +```text +OPENSHELL_E2E_WORKLOAD_MANIFEST +``` + +When that variable is unset, the runner uses the default local manifest path: + +```text +e2e/gpu/images/.build/workloads.yaml +``` + +If neither path exists, the workload validation test prints a clear skip +message telling you to run: + +```shell +mise run e2e:workloads:build +``` + +or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest. + +Each manifest entry supplies the sandbox image and command. 
OpenShell runs that +command through `openshell sandbox create --gpu --from -- `. +The test runner iterates all GPU-tagged workload entries and enforces each +entry's declared expectation: + +- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS` +- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE` + +The current local manifest includes three workloads: + +- `smoke-pass` expected to pass +- `smoke-fail` expected to fail +- `cuda-basic` expected to pass + +## External Manifests + +External workload catalogs can use the same schema. Point the runner at one +with: + +```shell +export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml +``` + +That lets alternate workload manifests use the same test runner without +introducing per-workload env vars. diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh index 2a6a13b51..71626c30f 100644 --- a/tasks/scripts/e2e-gpu-build-images.sh +++ b/tasks/scripts/e2e-gpu-build-images.sh @@ -38,7 +38,6 @@ yaml_quote() { value=${value//$'\t'/\\t} printf '"%s"' "${value}" } - available_image_dirs() { local preferred From ddaecfdf28bfda6157a20d270eea5ac441f74494 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 4 Jun 2026 14:35:35 +0200 Subject: [PATCH 04/10] test(e2e): fix gpu workload README lint --- e2e/gpu/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md index 0b671c136..e3168d1dd 100644 --- a/e2e/gpu/README.md +++ b/e2e/gpu/README.md @@ -23,8 +23,8 @@ Each workload image must: command explicitly. OpenShell sandbox creation replaces the image entrypoint with the supervisor and -does not run the OCI image `CMD`. When these images are used through OpenShell, -the workload command from each manifest entry must be passed explicitly. +does not run the OCI image `CMD`. E2e tests that use these images through +OpenShell run the command from each manifest entry explicitly. The test harness is manifest-driven. 
Each workload entry carries: From c98de465d41a4639298e26fce59040b8c5ccec2e Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 13:48:33 +0200 Subject: [PATCH 05/10] test(e2e): add gpu workload validation tests --- e2e/rust/Cargo.toml | 4 +- e2e/rust/e2e-docker.sh | 4 ++ e2e/rust/tests/gpu.rs | 9 +++++ .../device_selection.rs} | 2 - e2e/rust/tests/gpu/workloads.rs | 39 +++++++++++++++++++ tasks/test.toml | 4 +- 6 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 e2e/rust/tests/gpu.rs rename e2e/rust/tests/{gpu_device_selection.rs => gpu/device_selection.rs} (99%) create mode 100644 e2e/rust/tests/gpu/workloads.rs diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 083c622df..141c7c69a 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -98,8 +98,8 @@ path = "tests/forward_proxy_graphql_l7.rs" required-features = ["e2e-host-gateway"] [[test]] -name = "gpu_device_selection" -path = "tests/gpu_device_selection.rs" +name = "gpu" +path = "tests/gpu.rs" required-features = ["e2e-gpu"] [dependencies] diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index a020f87c8..c38a265e4 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -14,6 +14,10 @@ E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" cargo build -p openshell-cli --features openshell-core/dev-settings +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE:-}" ]; then + echo "note: running GPU e2e without OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE; CUDA workload validation will log an explicit skip" +fi + exec "${ROOT}/e2e/with-docker-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features "${E2E_FEATURES}" \ diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs new file mode 100644 index 000000000..62930bb7d --- /dev/null +++ b/e2e/rust/tests/gpu.rs @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +#[path = "gpu/device_selection.rs"] +mod device_selection; +#[path = "gpu/workloads.rs"] +mod workloads; diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs similarity index 99% rename from e2e/rust/tests/gpu_device_selection.rs rename to e2e/rust/tests/gpu/device_selection.rs index 4fb0fa76a..6170a9181 100644 --- a/e2e/rust/tests/gpu_device_selection.rs +++ b/e2e/rust/tests/gpu/device_selection.rs @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -#![cfg(feature = "e2e-gpu")] - //! GPU device selection e2e tests. //! //! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs new file mode 100644 index 000000000..6aa7a9b99 --- /dev/null +++ b/e2e/rust/tests/gpu/workloads.rs @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! GPU workload validation e2e tests. 
+ +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; + +const CUDA_WORKLOAD_IMAGE_ENV: &str = "OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE"; +const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; + +fn cuda_workload_image() -> Option { + std::env::var(CUDA_WORKLOAD_IMAGE_ENV) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +#[tokio::test] +async fn cuda_gpu_workload_validation_runs_with_default_image_command() { + let Some(image) = cuda_workload_image() else { + eprintln!("skipping CUDA GPU workload validation: {CUDA_WORKLOAD_IMAGE_ENV} is not set"); + return; + }; + + let mut guard = SandboxGuard::create(&["--gpu", "--from", image.as_str()]) + .await + .unwrap_or_else(|err| { + panic!("CUDA GPU workload sandbox create failed for image {image}:\n{err}") + }); + + let clean_output = strip_ansi(&guard.create_output); + assert!( + clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for image {image} in sandbox output:\n{clean_output}" + ); + + guard.cleanup().await; +} diff --git a/tasks/test.toml b/tasks/test.toml index cf031bd6f..c9135bb31 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -93,7 +93,7 @@ run = "e2e/rust/e2e-podman-rootless.sh" ["e2e:podman:gpu"] description = "Run GPU e2e against a standalone gateway with the Podman compute driver" -env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } run = "e2e/rust/e2e-podman.sh" ["e2e:kubernetes"] @@ -122,7 +122,7 @@ run = [ ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", 
OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" ["e2e:openshift"] From effe113abe6f8f4d53051b1393da60194e380020 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 14:20:39 +0200 Subject: [PATCH 06/10] test(e2e): invoke gpu workload binary explicitly --- e2e/rust/tests/gpu/workloads.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs index 6aa7a9b99..4c766913a 100644 --- a/e2e/rust/tests/gpu/workloads.rs +++ b/e2e/rust/tests/gpu/workloads.rs @@ -8,6 +8,7 @@ use openshell_e2e::harness::sandbox::SandboxGuard; const CUDA_WORKLOAD_IMAGE_ENV: &str = "OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE"; const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; +const GPU_WORKLOAD_BINARY: &str = "/usr/local/bin/openshell-gpu-workload"; fn cuda_workload_image() -> Option { std::env::var(CUDA_WORKLOAD_IMAGE_ENV) @@ -17,17 +18,25 @@ fn cuda_workload_image() -> Option { } #[tokio::test] -async fn cuda_gpu_workload_validation_runs_with_default_image_command() { +async fn cuda_gpu_workload_validation_runs_explicit_workload_binary() { let Some(image) = cuda_workload_image() else { eprintln!("skipping CUDA GPU workload validation: {CUDA_WORKLOAD_IMAGE_ENV} is not set"); return; }; - let mut guard = SandboxGuard::create(&["--gpu", "--from", image.as_str()]) - .await - .unwrap_or_else(|err| { - panic!("CUDA GPU workload sandbox create failed for image {image}:\n{err}") - }); + let mut guard = SandboxGuard::create(&[ + "--gpu", + "--from", + image.as_str(), + "--", + GPU_WORKLOAD_BINARY, + ]) + .await + .unwrap_or_else(|err| { + panic!( + "CUDA GPU workload sandbox create failed for image {image} with binary {GPU_WORKLOAD_BINARY}:\n{err}" + ) + }); let clean_output = strip_ansi(&guard.create_output); 
assert!( From 20707d1bb795bfc4a59318df4f51807dd09c667a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 15:14:22 +0200 Subject: [PATCH 07/10] test(e2e): run gpu workloads from manifest Signed-off-by: Evan Lezar --- e2e/rust/Cargo.lock | 21 ++++ e2e/rust/Cargo.toml | 2 + e2e/rust/e2e-docker.sh | 5 +- e2e/rust/tests/gpu/workloads.rs | 173 ++++++++++++++++++++++++++++---- 4 files changed, 179 insertions(+), 22 deletions(-) diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock index 953449c57..ff5c3e389 100644 --- a/e2e/rust/Cargo.lock +++ b/e2e/rust/Cargo.lock @@ -614,7 +614,9 @@ dependencies = [ "hyper-util", "prost", "rand", + "serde", "serde_json", + "serde_yaml", "sha1", "sha2", "tempfile", @@ -872,6 +874,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1087,6 +1102,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "url" version = "2.5.8" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 141c7c69a..d950d68b1 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -117,7 +117,9 @@ sha1 = "0.10" sha2 = "0.10" hex = "0.4" rand = "0.9" +serde = { version = "1", features = ["derive"] } serde_json = "1" +serde_yaml = "0.9" [lints.rust] unsafe_code = "warn" diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index c38a265e4..7b21939fe 100755 --- a/e2e/rust/e2e-docker.sh 
+++ b/e2e/rust/e2e-docker.sh @@ -11,11 +11,12 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" +DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml" cargo build -p openshell-cli --features openshell-core/dev-settings -if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE:-}" ]; then - echo "note: running GPU e2e without OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE; CUDA workload validation will log an explicit skip" +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then + echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." fi exec "${ROOT}/e2e/with-docker-gateway.sh" \ diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs index 4c766913a..b8ccb0b2e 100644 --- a/e2e/rust/tests/gpu/workloads.rs +++ b/e2e/rust/tests/gpu/workloads.rs @@ -3,46 +3,179 @@ //! GPU workload validation e2e tests. 
+use std::fs; +use std::path::{Path, PathBuf}; + use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; +use serde::Deserialize; -const CUDA_WORKLOAD_IMAGE_ENV: &str = "OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE"; +const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST"; const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; -const GPU_WORKLOAD_BINARY: &str = "/usr/local/bin/openshell-gpu-workload"; +const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE"; + +#[derive(Debug, Deserialize)] +struct WorkloadManifest { + workloads: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct WorkloadDefinition { + name: String, + image: String, + command: Vec, + expect: WorkloadExpectation, + #[serde(default)] + requirements: WorkloadRequirements, +} -fn cuda_workload_image() -> Option { - std::env::var(CUDA_WORKLOAD_IMAGE_ENV) +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "lowercase")] +enum WorkloadExpectation { + Pass, + Fail, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct WorkloadRequirements { + #[serde(default)] + gpu: bool, +} + +fn default_workload_manifest_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml") +} + +fn workload_manifest_path() -> PathBuf { + std::env::var(WORKLOAD_MANIFEST_ENV) .ok() .map(|value| value.trim().to_string()) .filter(|value| !value.is_empty()) + .map(PathBuf::from) + .unwrap_or_else(default_workload_manifest_path) } -#[tokio::test] -async fn cuda_gpu_workload_validation_runs_explicit_workload_binary() { - let Some(image) = cuda_workload_image() else { - eprintln!("skipping CUDA GPU workload validation: {CUDA_WORKLOAD_IMAGE_ENV} is not set"); - return; +fn load_workload_manifest() -> Option { + let path = workload_manifest_path(); + let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .map(|value| !value.trim().is_empty()) + 
.unwrap_or(false); + + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => { + eprintln!( + "skipping GPU workload validation: no workload manifest at {}. \ + Run `mise run e2e:workloads:build` to create the local manifest \ + or set {WORKLOAD_MANIFEST_ENV} to an external manifest.", + path.display() + ); + return None; + } + Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()), }; - let mut guard = SandboxGuard::create(&[ - "--gpu", - "--from", - image.as_str(), - "--", - GPU_WORKLOAD_BINARY, - ]) - .await - .unwrap_or_else(|err| { + let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| { panic!( - "CUDA GPU workload sandbox create failed for image {image} with binary {GPU_WORKLOAD_BINARY}:\n{err}" + "failed to parse workload manifest {}: {err}", + path.display() + ) + }); + assert!( + !manifest.workloads.is_empty(), + "workload manifest {} contains no workloads", + path.display() + ); + Some(manifest) +} + +async fn assert_expected_pass(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| { + panic!( + "GPU workload '{}' expected success but sandbox create failed:\n{err}", + workload.name ) }); let clean_output = strip_ansi(&guard.create_output); assert!( clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), - "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for image {image} in sandbox output:\n{clean_output}" + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}", + workload.name, + workload.image, ); guard.cleanup().await; } + +async fn 
assert_expected_fail(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + match SandboxGuard::create(&arg_refs).await { + Ok(mut guard) => { + let clean_output = strip_ansi(&guard.create_output); + guard.cleanup().await; + panic!( + "GPU workload '{}' unexpectedly succeeded. Output:\n{clean_output}", + workload.name + ); + } + Err(err) => { + let clean_output = strip_ansi(&err); + assert!( + clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER), + "expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}", + workload.name, + workload.image, + ); + } + } +} + +#[tokio::test] +async fn gpu_workload_manifest_runs_expected_workloads() { + let Some(manifest) = load_workload_manifest() else { + return; + }; + + let gpu_workloads = manifest + .workloads + .into_iter() + .filter(|workload| workload.requirements.gpu) + .collect::>(); + + assert!( + !gpu_workloads.is_empty(), + "workload manifest contains no GPU-tagged workloads" + ); + + for workload in gpu_workloads { + assert!( + !workload.command.is_empty(), + "workload '{}' must declare a non-empty command", + workload.name + ); + + match workload.expect { + WorkloadExpectation::Pass => assert_expected_pass(&workload).await, + WorkloadExpectation::Fail => assert_expected_fail(&workload).await, + } + } +} From 3c036eb32f7cbdd1795fddbee5218320b9a63bb4 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 16 Jun 2026 13:54:17 +0200 Subject: [PATCH 08/10] test(e2e): serialize gpu workload tests Signed-off-by: Evan Lezar --- e2e/rust/Cargo.lock | 37 ++++++++++++++++++++++++++ e2e/rust/Cargo.toml | 3 +++ e2e/rust/tests/gpu.rs | 3 +++ e2e/rust/tests/gpu/device_selection.rs | 11 +++++--- e2e/rust/tests/gpu/workloads.rs | 8 +++--- 5 files changed, 55 insertions(+), 7 
deletions(-) diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock index ff5c3e389..e61c9a8c1 100644 --- a/e2e/rust/Cargo.lock +++ b/e2e/rust/Cargo.lock @@ -188,6 +188,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-macro" version = "0.3.32" @@ -617,6 +628,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "serial_test", "sha1", "sha2", "tempfile", @@ -887,6 +899,31 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "serial_test" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d" +dependencies = [ + "futures-executor", + "futures-util", + "log", + "once_cell", + "parking_lot", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha1" version = "0.10.6" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index d950d68b1..eae80734d 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -121,6 +121,9 @@ serde = { version = "1", features = ["derive"] } serde_json = "1" serde_yaml = "0.9" +[dev-dependencies] +serial_test = "3" + [lints.rust] unsafe_code = "warn" rust_2018_idioms = { level = "warn", priority = -1 } diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs index 62930bb7d..4a3f951f5 100644 --- a/e2e/rust/tests/gpu.rs +++ 
b/e2e/rust/tests/gpu.rs @@ -3,6 +3,9 @@ #![cfg(feature = "e2e-gpu")] +// GPU-consuming e2e tests use #[serial(gpu)] because common single-GPU hosts +// cannot reliably provision multiple GPU sandboxes at the same time. + #[path = "gpu/device_selection.rs"] mod device_selection; #[path = "gpu/workloads.rs"] diff --git a/e2e/rust/tests/gpu/device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs index 6170a9181..98a2194f9 100644 --- a/e2e/rust/tests/gpu/device_selection.rs +++ b/e2e/rust/tests/gpu/device_selection.rs @@ -13,6 +13,7 @@ use openshell_e2e::harness::container::{ContainerEngine, e2e_driver}; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; use serde_json::{Map, Value}; +use serial_test::serial; use tokio::time::timeout; const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600); @@ -214,8 +215,7 @@ fn default_cdi_gpu_device_id(device_ids: &[String], allow_all_devices: bool) -> let mut named = device_ids .iter() .filter(|device_id| { - device_id.starts_with(CDI_GPU_DEVICE_PREFIX) - && device_id.as_str() != CDI_GPU_DEVICE_ALL + device_id.starts_with(CDI_GPU_DEVICE_PREFIX) && device_id.as_str() != CDI_GPU_DEVICE_ALL }) .cloned() .collect::>(); @@ -335,9 +335,11 @@ async fn sandbox_create_output(args: &[&str]) -> String { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_without_device_matches_plain_default_gpu_container() { let device_ids = discovered_cdi_gpu_device_ids(); - let Some(default_gpu_device) = default_cdi_gpu_device_id(&device_ids, all_gpu_default_allowed()) + let Some(default_gpu_device) = + default_cdi_gpu_device_id(&device_ids, all_gpu_default_allowed()) else { eprintln!("skipping default GPU request test because no selectable GPU ID was discovered"); return; @@ -353,6 +355,7 @@ async fn gpu_request_without_device_matches_plain_default_gpu_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_for_each_discovered_device_matches_plain_container() { let device_ids: 
Vec<_> = discovered_cdi_gpu_device_ids() .into_iter() @@ -377,6 +380,7 @@ async fn gpu_request_for_each_discovered_device_matches_plain_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_all_device_request_matches_plain_all_gpu_container() { if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) { eprintln!( @@ -395,6 +399,7 @@ async fn gpu_all_device_request_matches_plain_all_gpu_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_invalid_device_request_fails() { let driver_config_json = cdi_devices_driver_config_json(&["nvidia.com/gpu=invalid"]); let args = vec![ diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs index b8ccb0b2e..d0d192650 100644 --- a/e2e/rust/tests/gpu/workloads.rs +++ b/e2e/rust/tests/gpu/workloads.rs @@ -9,6 +9,7 @@ use std::path::{Path, PathBuf}; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; use serde::Deserialize; +use serial_test::serial; const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST"; const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; @@ -51,16 +52,14 @@ fn workload_manifest_path() -> PathBuf { .ok() .map(|value| value.trim().to_string()) .filter(|value| !value.is_empty()) - .map(PathBuf::from) - .unwrap_or_else(default_workload_manifest_path) + .map_or_else(default_workload_manifest_path, PathBuf::from) } fn load_workload_manifest() -> Option { let path = workload_manifest_path(); let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) .ok() - .map(|value| !value.trim().is_empty()) - .unwrap_or(false); + .is_some_and(|value| !value.trim().is_empty()); let contents = match fs::read_to_string(&path) { Ok(contents) => contents, @@ -150,6 +149,7 @@ async fn assert_expected_fail(workload: &WorkloadDefinition) { } #[tokio::test] +#[serial(gpu)] async fn gpu_workload_manifest_runs_expected_workloads() { let Some(manifest) = load_workload_manifest() else { return; From 386d638d42fe4909131983bc1808400c5539f4c6 
Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Jun 2026 09:26:56 +0200 Subject: [PATCH 09/10] ci(e2e): build gpu workloads before gpu e2e Signed-off-by: Evan Lezar --- .github/workflows/e2e-gpu-test.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index a97e32896..8c8ef0919 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -65,5 +65,8 @@ jobs: docker info --format '{{json .CDISpecDirs}}' docker run --rm --device nvidia.com/gpu=all "${OPENSHELL_E2E_GPU_PROBE_IMAGE}" nvidia-smi -L + - name: Build GPU workload images + run: mise run --no-deps --skip-deps e2e:workloads:build + - name: Run tests run: mise run --no-deps --skip-deps e2e:docker:gpu From dbdc8134896c94f5f9babeae6fd8b5fb649494ab Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Jun 2026 15:50:51 +0200 Subject: [PATCH 10/10] test(e2e): fingerprint workload image tags without git Signed-off-by: Evan Lezar --- e2e/gpu/README.md | 6 ++--- tasks/scripts/e2e-gpu-build-images.sh | 37 +++++++++++++-------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md index e3168d1dd..10520a6bb 100644 --- a/e2e/gpu/README.md +++ b/e2e/gpu/README.md @@ -61,9 +61,9 @@ The build task uses `tasks/scripts/container-engine.sh`. Set `CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine explicitly. When unset, the helper uses its existing auto-detection behavior. -Local tags use the current commit short SHA plus a short fingerprint of the -external build inputs. Dirty local trees append `-dirty`. Set -`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag. +Local tags use a short SHA-256 fingerprint of the selected workload contexts +and external build inputs. Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to +override the tag.
The task writes the latest build refs to: diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh index 71626c30f..dc4f84b98 100644 --- a/tasks/scripts/e2e-gpu-build-images.sh +++ b/tasks/scripts/e2e-gpu-build-images.sh @@ -83,15 +83,29 @@ image_expectation() { workload_input_fingerprint() { local -a names=("$@") + local digest + local file + local name + local rel { + printf 'schema=openshell-gpu-workload-input-v1\n' printf 'OPENSHELL_SANDBOX_BASE_IMAGE=%s\n' "${BASE_IMAGE}" if contains_image cuda-basic "${names[@]}"; then printf 'CUDA_BUILD_IMAGE=%s\n' "${CUDA_BUILD_IMAGE}" printf 'CUDA_SAMPLES_REPO=%s\n' "${CUDA_SAMPLES_REPO}" printf 'CUDA_SAMPLES_REF=%s\n' "${CUDA_SAMPLES_REF}" fi - } | git -C "${ROOT}" hash-object --stdin | cut -c1-8 + for name in "${names[@]}"; do + printf 'WORKLOAD=%s\n' "${name}" + while IFS= read -r -d '' file; do + rel="${file#"${ROOT}/"}" + digest="$(sha256sum "${file}" | cut -d ' ' -f 1)" + printf 'FILE=%s\n' "${rel}" + printf 'SHA256=%s\n' "${digest}" + done < <(find "${IMAGES_ROOT}/${name}" -type f -print0 | sort -z) + done + } | sha256sum | cut -c1-12 } mapfile -t available < <(available_image_dirs) @@ -122,28 +136,18 @@ if [[ ${#selected[@]} -eq 0 ]]; then exit 1 fi -source_sha="$(git -C "${ROOT}" rev-parse HEAD)" -source_short_sha="$(git -C "${ROOT}" rev-parse --short HEAD)" -source_dirty=false -if [[ -n "$(git -C "${ROOT}" status --short)" ]]; then - source_dirty=true -fi +input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}" else - input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" - image_tag="${source_short_sha}-${input_fingerprint}" - if [[ "${source_dirty}" == "true" ]]; then - image_tag="${image_tag}-dirty" - fi + image_tag="${input_fingerprint}" fi -input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" declare -A image_refs=() echo "Building 
GPU workload images with ${CONTAINER_ENGINE}" -echo "Source: ${source_short_sha} (dirty: ${source_dirty})" +echo "Fingerprint: ${input_fingerprint}" echo "Tag: ${image_tag}" for name in "${selected[@]}"; do @@ -158,7 +162,6 @@ for name in "${selected[@]}"; do --label "com.nvidia.openshell.gpu-workload.source=${name}" --label "com.nvidia.openshell.gpu-workload.base-image=${BASE_IMAGE}" --label "com.nvidia.openshell.gpu-workload.input-fingerprint=${input_fingerprint}" - --label "org.opencontainers.image.revision=${source_sha}" ) if [[ "${name}" == "cuda-basic" ]]; then build_args+=( @@ -194,8 +197,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "# Source this file to use the most recently built GPU workload images." write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_TAG "${image_tag}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_INPUT_FINGERPRINT "${input_fingerprint}" write_env_var OPENSHELL_SANDBOX_BASE_IMAGE "${BASE_IMAGE}" write_env_var CUDA_BUILD_IMAGE "${CUDA_BUILD_IMAGE}" @@ -213,8 +214,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "generated_by: $(yaml_quote "mise run e2e:workloads:build")" echo "source:" echo " path: $(yaml_quote "${IMAGES_ROOT}")" - echo " revision: $(yaml_quote "${source_sha}")" - echo " dirty: ${source_dirty}" echo " input_fingerprint: $(yaml_quote "${input_fingerprint}")" echo " container_engine: $(yaml_quote "${CONTAINER_ENGINE}")" echo " inputs:"