From 6b2dc07efb0f885c0dc9b8d5293fe7c686793aba Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 15:12:37 +0200 Subject: [PATCH 1/3] test(e2e): add workload manifest build flow Signed-off-by: Evan Lezar --- e2e/gpu/README.md | 77 ++++++++++++++++++++------- tasks/scripts/e2e-gpu-build-images.sh | 46 +++++++++------- 2 files changed, 86 insertions(+), 37 deletions(-) diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md index 8c796b444..10520a6bb 100644 --- a/e2e/gpu/README.md +++ b/e2e/gpu/README.md @@ -3,7 +3,8 @@ # GPU workload images -This directory defines workload test images for OpenShell GPU validation. +This directory defines workload test images currently used by the OpenShell GPU +e2e suite. ## Contract @@ -22,11 +23,10 @@ Each workload image must: command explicitly. OpenShell sandbox creation replaces the image entrypoint with the supervisor and -does not run the OCI image `CMD`. When these images are used through OpenShell, -the workload command from each manifest entry must be passed explicitly. +does not run the OCI image `CMD`. E2e tests that use these images through +OpenShell run the command from each manifest entry explicitly. -The image build task writes a local workload manifest. Each workload entry -carries: +The test harness is manifest-driven. Each workload entry carries: - `name` - `image` @@ -61,9 +61,9 @@ The build task uses `tasks/scripts/container-engine.sh`. Set `CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine explicitly. When unset, the helper uses its existing auto-detection behavior. -Local tags use the current commit short SHA plus a short fingerprint of the -external build inputs. Dirty local trees append `-dirty`. Set -`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=` to override the tag. +Local tags use a short SHA-256 fingerprint of the selected workload contexts +and external build inputs. Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=` to +override the tag. 
The task writes the latest build refs to: @@ -71,8 +71,7 @@ The task writes the latest build refs to: e2e/gpu/images/.build/latest.env ``` -The task also writes a local workload manifest for downstream tooling and -future workload-runner integration: +The task also writes the local workload manifest used by the Rust e2e runner: ```text e2e/gpu/images/.build/workloads.yaml @@ -90,8 +89,7 @@ source e2e/gpu/images/.build/latest.env ``` That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local -manifest. The current checked-in Rust GPU e2e target does not consume this -manifest yet. The per-image refs remain available as a convenience for direct +manifest. The per-image refs remain available as a convenience for direct container-engine validation. ## Direct Validation @@ -124,14 +122,57 @@ where Podman CDI is configured. Direct container-engine validation catches image, CDI, CUDA, and host GPU setup issues before OpenShell sandbox behavior is involved. -## OpenShell GPU E2E +## Manifest-Driven Validation -The current Rust GPU validation target is: +The Rust GPU validation target is: ```shell -mise run e2e:gpu +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture ``` -That target runs `gpu_device_selection`. It validates GPU request and device -selection behavior against a Docker-backed gateway. It does not run the -workload manifest generated by `mise run e2e:workloads:build`. +The workload validation path reads: + +```text +OPENSHELL_E2E_WORKLOAD_MANIFEST +``` + +When that variable is unset, the runner uses the default local manifest path: + +```text +e2e/gpu/images/.build/workloads.yaml +``` + +If the variable is unset and the default manifest does not exist, the workload +validation test prints a clear skip message telling you to run: + +```shell +mise run e2e:workloads:build +``` + +or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest. When the variable is set, a missing or unreadable manifest fails the test instead of skipping it. + +Each manifest entry supplies the sandbox image and command. 
OpenShell runs that +command through `openshell sandbox create --gpu --from <image> -- <command>`. +The test runner iterates all GPU-tagged workload entries and enforces each +entry's declared expectation: + +- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS` +- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE` + +The current local manifest includes three workloads: + +- `smoke-pass` expected to pass +- `smoke-fail` expected to fail +- `cuda-basic` expected to pass + +## External Manifests + +External workload catalogs can use the same schema. Point the runner at one +with: + +```shell +export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml +``` + +That lets alternate workload manifests use the same test runner without +introducing per-workload env vars. diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh index 2a6a13b51..3a800c74f 100644 --- a/tasks/scripts/e2e-gpu-build-images.sh +++ b/tasks/scripts/e2e-gpu-build-images.sh @@ -39,6 +39,15 @@ yaml_quote() { printf '"%s"' "${value}" } +if command -v sha256sum >/dev/null 2>&1; then + SHA256_CMD=(sha256sum) +elif command -v shasum >/dev/null 2>&1; then + SHA256_CMD=(shasum -a 256) +else + echo "neither sha256sum nor shasum is available for hashing" >&2 + exit 1 +fi + available_image_dirs() { local preferred @@ -84,15 +93,29 @@ image_expectation() { workload_input_fingerprint() { local -a names=("$@") + local digest + local file + local name + local rel { + printf 'schema=openshell-gpu-workload-input-v1\n' printf 'OPENSHELL_SANDBOX_BASE_IMAGE=%s\n' "${BASE_IMAGE}" if contains_image cuda-basic "${names[@]}"; then printf 'CUDA_BUILD_IMAGE=%s\n' "${CUDA_BUILD_IMAGE}" printf 'CUDA_SAMPLES_REPO=%s\n' "${CUDA_SAMPLES_REPO}" printf 'CUDA_SAMPLES_REF=%s\n' "${CUDA_SAMPLES_REF}" fi - } | git -C "${ROOT}" hash-object --stdin | cut -c1-8 + for name in "${names[@]}"; do + printf 'WORKLOAD=%s\n' "${name}" + while IFS= read -r -d '' file; do + rel="${file#"${ROOT}/"}" + 
digest="$("${SHA256_CMD[@]}" "${file}" | awk '{print $1}')" + printf 'FILE=%s\n' "${rel}" + printf 'SHA256=%s\n' "${digest}" + done < <(find "${IMAGES_ROOT}/${name}" -type f -print0 | LC_ALL=C sort -z) # byte-order sort, so the fingerprint does not depend on the host locale + done + } | "${SHA256_CMD[@]}" | cut -c1-12 } mapfile -t available < <(available_image_dirs) @@ -123,28 +146,18 @@ if [[ ${#selected[@]} -eq 0 ]]; then exit 1 fi -source_sha="$(git -C "${ROOT}" rev-parse HEAD)" -source_short_sha="$(git -C "${ROOT}" rev-parse --short HEAD)" -source_dirty=false -if [[ -n "$(git -C "${ROOT}" status --short)" ]]; then - source_dirty=true -fi +input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}" else - input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" - image_tag="${source_short_sha}-${input_fingerprint}" - if [[ "${source_dirty}" == "true" ]]; then - image_tag="${image_tag}-dirty" - fi + image_tag="${input_fingerprint}" fi -input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" declare -A image_refs=() echo "Building GPU workload images with ${CONTAINER_ENGINE}" -echo "Source: ${source_short_sha} (dirty: ${source_dirty})" +echo "Fingerprint: ${input_fingerprint}" echo "Tag: ${image_tag}" for name in "${selected[@]}"; do @@ -159,7 +172,6 @@ for name in "${selected[@]}"; do --label "com.nvidia.openshell.gpu-workload.source=${name}" --label "com.nvidia.openshell.gpu-workload.base-image=${BASE_IMAGE}" --label "com.nvidia.openshell.gpu-workload.input-fingerprint=${input_fingerprint}" - --label "org.opencontainers.image.revision=${source_sha}" ) if [[ "${name}" == "cuda-basic" ]]; then build_args+=( @@ -195,8 +207,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "# Source this file to use the most recently built GPU workload images." 
write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_TAG "${image_tag}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_INPUT_FINGERPRINT "${input_fingerprint}" write_env_var OPENSHELL_SANDBOX_BASE_IMAGE "${BASE_IMAGE}" write_env_var CUDA_BUILD_IMAGE "${CUDA_BUILD_IMAGE}" @@ -214,8 +224,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "generated_by: $(yaml_quote "mise run e2e:workloads:build")" echo "source:" echo " path: $(yaml_quote "${IMAGES_ROOT}")" - echo " revision: $(yaml_quote "${source_sha}")" - echo " dirty: ${source_dirty}" echo " input_fingerprint: $(yaml_quote "${input_fingerprint}")" echo " container_engine: $(yaml_quote "${CONTAINER_ENGINE}")" echo " inputs:" From bb2375e566a85e46dd77c7c899dde9554f64cb66 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Jun 2026 13:48:33 +0200 Subject: [PATCH 2/3] test(e2e): add gpu workload validation tests Signed-off-by: Evan Lezar --- .github/workflows/e2e-gpu-test.yaml | 2 +- e2e/rust/Cargo.lock | 58 ++++++ e2e/rust/Cargo.toml | 9 +- e2e/rust/e2e-docker.sh | 5 + e2e/rust/tests/gpu.rs | 12 ++ .../device_selection.rs} | 7 +- e2e/rust/tests/gpu/workloads.rs | 181 ++++++++++++++++++ tasks/test.toml | 4 +- 8 files changed, 271 insertions(+), 7 deletions(-) create mode 100644 e2e/rust/tests/gpu.rs rename e2e/rust/tests/{gpu_device_selection.rs => gpu/device_selection.rs} (99%) create mode 100644 e2e/rust/tests/gpu/workloads.rs diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index d16277786..9e758994d 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -52,7 +52,7 @@ jobs: # NVIDIA-managed Ubuntu base used as the GPU probe target: it has the # filesystem layout CDI injection expects (ldconfig, populated /usr/bin) # which the 
distroless gateway runtime lacks. Consumed by the prereq - # probe below and by the e2e tests in e2e/rust/tests/gpu_device_selection.rs. + # probe below and by the e2e tests in e2e/rust/tests/gpu/device_selection.rs. OPENSHELL_E2E_GPU_PROBE_IMAGE: "nvcr.io/nvidia/base/ubuntu:noble-20251013" steps: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock index 953449c57..e61c9a8c1 100644 --- a/e2e/rust/Cargo.lock +++ b/e2e/rust/Cargo.lock @@ -188,6 +188,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-macro" version = "0.3.32" @@ -614,7 +625,10 @@ dependencies = [ "hyper-util", "prost", "rand", + "serde", "serde_json", + "serde_yaml", + "serial_test", "sha1", "sha2", "tempfile", @@ -872,6 +886,44 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "serial_test" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d" +dependencies = [ + "futures-executor", + "futures-util", + "log", + "once_cell", + "parking_lot", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1087,6 +1139,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "url" version = "2.5.8" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 083c622df..eae80734d 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -98,8 +98,8 @@ path = "tests/forward_proxy_graphql_l7.rs" required-features = ["e2e-host-gateway"] [[test]] -name = "gpu_device_selection" -path = "tests/gpu_device_selection.rs" +name = "gpu" +path = "tests/gpu.rs" required-features = ["e2e-gpu"] [dependencies] @@ -117,7 +117,12 @@ sha1 = "0.10" sha2 = "0.10" hex = "0.4" rand = "0.9" +serde = { version = "1", features = ["derive"] } serde_json = "1" +serde_yaml = "0.9" + +[dev-dependencies] +serial_test = "3" [lints.rust] unsafe_code = "warn" diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index 70e9835bd..99cd6daf7 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -11,9 +11,14 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" +DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml" cargo build -p openshell-cli +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then + echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. 
Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." +fi + exec "${ROOT}/e2e/with-docker-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features "${E2E_FEATURES}" \ diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs new file mode 100644 index 000000000..4a3f951f5 --- /dev/null +++ b/e2e/rust/tests/gpu.rs @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +// GPU-consuming e2e tests use #[serial(gpu)] because common single-GPU hosts +// cannot reliably provision multiple GPU sandboxes at the same time. + +#[path = "gpu/device_selection.rs"] +mod device_selection; +#[path = "gpu/workloads.rs"] +mod workloads; diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs similarity index 99% rename from e2e/rust/tests/gpu_device_selection.rs rename to e2e/rust/tests/gpu/device_selection.rs index 08e77ce2b..56f0019d4 100644 --- a/e2e/rust/tests/gpu_device_selection.rs +++ b/e2e/rust/tests/gpu/device_selection.rs @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -#![cfg(feature = "e2e-gpu")] - //! GPU device selection e2e tests. //! //! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. 
@@ -15,6 +13,7 @@ use openshell_e2e::harness::container::{ContainerEngine, e2e_driver}; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; use serde_json::{Map, Value}; +use serial_test::serial; use tokio::time::timeout; const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600); @@ -340,6 +339,7 @@ async fn sandbox_create_output(args: &[&str]) -> String { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_without_device_matches_plain_default_gpu_container() { let device_ids = discovered_cdi_gpu_device_ids(); let Some(default_gpu_device) = @@ -359,6 +359,7 @@ async fn gpu_request_without_device_matches_plain_default_gpu_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_for_each_discovered_device_matches_plain_container() { let device_ids: Vec<_> = discovered_cdi_gpu_device_ids() .into_iter() @@ -383,6 +384,7 @@ async fn gpu_request_for_each_discovered_device_matches_plain_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_all_device_request_matches_plain_all_gpu_container() { if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) { eprintln!( @@ -401,6 +403,7 @@ async fn gpu_all_device_request_matches_plain_all_gpu_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_invalid_device_request_fails() { let driver_config_json = cdi_devices_driver_config_json(&["nvidia.com/gpu=invalid"]); let args = vec![ diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs new file mode 100644 index 000000000..d0d192650 --- /dev/null +++ b/e2e/rust/tests/gpu/workloads.rs @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! GPU workload validation e2e tests. 
+ +use std::fs; +use std::path::{Path, PathBuf}; + +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; +use serde::Deserialize; +use serial_test::serial; + +const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST"; +const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; +const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE"; + +#[derive(Debug, Deserialize)] +struct WorkloadManifest { + workloads: Vec<WorkloadDefinition>, +} + +#[derive(Clone, Debug, Deserialize)] +struct WorkloadDefinition { + name: String, + image: String, + command: Vec<String>, // argv appended after `--` when the sandbox is created + expect: WorkloadExpectation, + #[serde(default)] + requirements: WorkloadRequirements, +} + +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "lowercase")] +enum WorkloadExpectation { + Pass, + Fail, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct WorkloadRequirements { + #[serde(default)] + gpu: bool, +} + +fn default_workload_manifest_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml") +} + +fn workload_manifest_path() -> PathBuf { + std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .map_or_else(default_workload_manifest_path, PathBuf::from) +} + +fn load_workload_manifest() -> Option<WorkloadManifest> { // None only when no override is set and the manifest file is not found (logged skip); every other failure panics + let path = workload_manifest_path(); + let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .is_some_and(|value| !value.trim().is_empty()); + + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => { + eprintln!( + "skipping GPU workload validation: no workload manifest at {}. 
\ + Run `mise run e2e:workloads:build` to create the local manifest \ + or set {WORKLOAD_MANIFEST_ENV} to an external manifest.", + path.display() + ); + return None; + } + Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()), + }; + + let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| { + panic!( + "failed to parse workload manifest {}: {err}", + path.display() + ) + }); + assert!( + !manifest.workloads.is_empty(), + "workload manifest {} contains no workloads", + path.display() + ); + Some(manifest) +} + +async fn assert_expected_pass(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>(); + + let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| { + panic!( + "GPU workload '{}' expected success but sandbox create failed:\n{err}", + workload.name + ) + }); + + let clean_output = strip_ansi(&guard.create_output); + assert!( + clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}", + workload.name, + workload.image, + ); + + guard.cleanup().await; +} + +async fn assert_expected_fail(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>(); + + match SandboxGuard::create(&arg_refs).await { + Ok(mut guard) => { + let clean_output = strip_ansi(&guard.create_output); + guard.cleanup().await; + panic!( + "GPU workload '{}' unexpectedly succeeded. 
Output:\n{clean_output}", + workload.name + ); + } + Err(err) => { + let clean_output = strip_ansi(&err); + assert!( + clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER), + "expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}", + workload.name, + workload.image, + ); + } + } +} + +#[tokio::test] +#[serial(gpu)] +async fn gpu_workload_manifest_runs_expected_workloads() { + let Some(manifest) = load_workload_manifest() else { + return; + }; + + let gpu_workloads = manifest + .workloads + .into_iter() + .filter(|workload| workload.requirements.gpu) + .collect::<Vec<_>>(); + + assert!( + !gpu_workloads.is_empty(), + "workload manifest contains no GPU-tagged workloads" + ); + + for workload in gpu_workloads { + assert!( + !workload.command.is_empty(), + "workload '{}' must declare a non-empty command", + workload.name + ); + + match workload.expect { + WorkloadExpectation::Pass => assert_expected_pass(&workload).await, + WorkloadExpectation::Fail => assert_expected_fail(&workload).await, + } + } +} diff --git a/tasks/test.toml b/tasks/test.toml index 444ea15e1..95036e041 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -87,7 +87,7 @@ run = "e2e/rust/e2e-podman-rootless.sh" ["e2e:podman:gpu"] description = "Run GPU e2e against a standalone gateway with the Podman compute driver" -env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } run = "e2e/rust/e2e-podman.sh" ["e2e:kubernetes"] @@ -116,7 +116,7 @@ run = [ ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", 
OPENSHELL_E2E_DOCKER_TEST = "gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" ["e2e:openshift"] From 87ea21e479ae5c52ba1c4278677f549ea548407a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Jun 2026 09:26:56 +0200 Subject: [PATCH 3/3] ci(e2e): build gpu workloads before gpu e2e Signed-off-by: Evan Lezar --- .github/workflows/e2e-gpu-test.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index 9e758994d..bcbd96bb4 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -48,6 +48,7 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + CONTAINER_ENGINE: docker OPENSHELL_E2E_DOCKER_GPU: "1" # NVIDIA-managed Ubuntu base used as the GPU probe target: it has the # filesystem layout CDI injection expects (ldconfig, populated /usr/bin) @@ -65,5 +66,8 @@ jobs: docker info --format '{{json .CDISpecDirs}}' docker run --rm --device nvidia.com/gpu=all "${OPENSHELL_E2E_GPU_PROBE_IMAGE}" nvidia-smi -L + - name: Build GPU workload images + run: mise run --no-deps --skip-deps e2e:workloads:build + - name: Run tests run: mise run --no-deps --skip-deps e2e:docker:gpu