From 6b2dc07efb0f885c0dc9b8d5293fe7c686793aba Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 3 Jun 2026 15:12:37 +0200
Subject: [PATCH 1/3] test(e2e): add workload manifest build flow

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 e2e/gpu/README.md                     | 77 ++++++++++++++++++++-------
 tasks/scripts/e2e-gpu-build-images.sh | 46 +++++++++-------
 2 files changed, 86 insertions(+), 37 deletions(-)
diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md
index 8c796b444..10520a6bb 100644
--- a/e2e/gpu/README.md
+++ b/e2e/gpu/README.md
@@ -3,7 +3,8 @@
 
 # GPU workload images
 
-This directory defines workload test images for OpenShell GPU validation.
+This directory defines workload test images currently used by the OpenShell GPU
+e2e suite.
 
 ## Contract
 
@@ -22,11 +23,10 @@ Each workload image must:
   command explicitly.
 
 OpenShell sandbox creation replaces the image entrypoint with the supervisor and
-does not run the OCI image `CMD`. When these images are used through OpenShell,
-the workload command from each manifest entry must be passed explicitly.
+does not run the OCI image `CMD`. E2e tests that use these images through
+OpenShell run the command from each manifest entry explicitly.
 
-The image build task writes a local workload manifest. Each workload entry
-carries:
+The test harness is manifest-driven. Each workload entry carries:
 
 - `name`
 - `image`
@@ -61,9 +61,9 @@ The build task uses `tasks/scripts/container-engine.sh`. Set
 `CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine
 explicitly. When unset, the helper uses its existing auto-detection behavior.
 
-Local tags use the current commit short SHA plus a short fingerprint of the
-external build inputs. Dirty local trees append `-dirty`. Set
-`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag.
+Local tags use a short SHA-256 fingerprint of the selected workload contexts
+and external build inputs. Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to
+override the tag.
 
 The task writes the latest build refs to:
 
@@ -71,8 +71,7 @@ The task writes the latest build refs to:
 e2e/gpu/images/.build/latest.env
 ```
 
-The task also writes a local workload manifest for downstream tooling and
-future workload-runner integration:
+The task also writes the local workload manifest used by the Rust e2e runner:
 
 ```text
 e2e/gpu/images/.build/workloads.yaml
@@ -90,8 +89,7 @@ source e2e/gpu/images/.build/latest.env
 ```
 
 That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local
-manifest. The current checked-in Rust GPU e2e target does not consume this
-manifest yet. The per-image refs remain available as a convenience for direct
+manifest. The per-image refs remain available as a convenience for direct
 container-engine validation.
 
 ## Direct Validation
@@ -124,14 +122,57 @@ where Podman CDI is configured.
 Direct container-engine validation catches image, CDI, CUDA, and host GPU setup
 issues before OpenShell sandbox behavior is involved.
 
-## OpenShell GPU E2E
+## Manifest-Driven Validation
 
-The current Rust GPU validation target is:
+The Rust GPU validation target is:
 
 ```shell
-mise run e2e:gpu
+cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture
 ```
 
-That target runs `gpu_device_selection`. It validates GPU request and device
-selection behavior against a Docker-backed gateway. It does not run the
-workload manifest generated by `mise run e2e:workloads:build`.
+The workload validation path reads:
+
+```text
+OPENSHELL_E2E_WORKLOAD_MANIFEST
+```
+
+When that variable is unset or empty, the runner uses the default local manifest:
+
+```text
+e2e/gpu/images/.build/workloads.yaml
+```
+
+If the variable is unset and the default manifest does not exist, the workload
+validation test prints a clear skip message telling you to run:
+
+```shell
+mise run e2e:workloads:build
+```
+
+or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest.
+
+Each manifest entry supplies the sandbox image and command. OpenShell runs that
+command through `openshell sandbox create --gpu --from <image> -- <command>`.
+The test runner iterates all GPU-tagged workload entries and enforces each
+entry's declared expectation:
+
+- `expect: pass`: sandbox create succeeds and prints `OPENSHELL_GPU_WORKLOAD_SUCCESS`
+- `expect: fail`: sandbox create fails and prints `OPENSHELL_GPU_WORKLOAD_FAILURE`
+
+The current local manifest includes three workloads:
+
+- `smoke-pass` expected to pass
+- `smoke-fail` expected to fail
+- `cuda-basic` expected to pass
+
+## External Manifests
+
+External workload catalogs can use the same schema. Point the runner at one
+with:
+
+```shell
+export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml
+```
+
+That lets alternate workload manifests use the same test runner without
+introducing per-workload env vars.
diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh
index 2a6a13b51..3a800c74f 100644
--- a/tasks/scripts/e2e-gpu-build-images.sh
+++ b/tasks/scripts/e2e-gpu-build-images.sh
@@ -39,6 +39,15 @@ yaml_quote() {
   printf '"%s"' "${value}"
 }
 
+if command -v sha256sum >/dev/null 2>&1; then
+  SHA256_CMD=(sha256sum)
+elif command -v shasum >/dev/null 2>&1; then
+  SHA256_CMD=(shasum -a 256)
+else
+  echo "neither sha256sum nor shasum is available for hashing" >&2
+  exit 1
+fi
+
 available_image_dirs() {
   local preferred
 
@@ -84,15 +93,29 @@ image_expectation() {
 
 workload_input_fingerprint() {
   local -a names=("$@")
+  local digest
+  local file
+  local name
+  local rel
 
   {
+    printf 'schema=openshell-gpu-workload-input-v1\n'
     printf 'OPENSHELL_SANDBOX_BASE_IMAGE=%s\n' "${BASE_IMAGE}"
     if contains_image cuda-basic "${names[@]}"; then
       printf 'CUDA_BUILD_IMAGE=%s\n' "${CUDA_BUILD_IMAGE}"
       printf 'CUDA_SAMPLES_REPO=%s\n' "${CUDA_SAMPLES_REPO}"
       printf 'CUDA_SAMPLES_REF=%s\n' "${CUDA_SAMPLES_REF}"
     fi
-  } | git -C "${ROOT}" hash-object --stdin | cut -c1-8
+    for name in "${names[@]}"; do
+      printf 'WORKLOAD=%s\n' "${name}"
+      while IFS= read -r -d '' file; do
+        rel="${file#"${ROOT}/"}"  # drop the leading ${ROOT}/ prefix, when present
+        digest="$("${SHA256_CMD[@]}" "${file}" | awk '{print $1}')"
+        printf 'FILE=%s\n' "${rel}"
+        printf 'SHA256=%s\n' "${digest}"
+      done < <(find "${IMAGES_ROOT}/${name}" -type f -print0 | LC_ALL=C sort -z)  # byte-order sort, independent of the caller's locale
+    done
+  } | "${SHA256_CMD[@]}" | cut -c1-12  # keep the first 12 hex chars of the SHA-256 digest
 }
 
 mapfile -t available < <(available_image_dirs)
@@ -123,28 +146,18 @@ if [[ ${#selected[@]} -eq 0 ]]; then
   exit 1
 fi
 
-source_sha="$(git -C "${ROOT}" rev-parse HEAD)"
-source_short_sha="$(git -C "${ROOT}" rev-parse --short HEAD)"
-source_dirty=false
-if [[ -n "$(git -C "${ROOT}" status --short)" ]]; then
-  source_dirty=true
-fi
+input_fingerprint="$(workload_input_fingerprint "${selected[@]}")"
 
 if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then
   image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}"
 else
-  input_fingerprint="$(workload_input_fingerprint "${selected[@]}")"
-  image_tag="${source_short_sha}-${input_fingerprint}"
-  if [[ "${source_dirty}" == "true" ]]; then
-    image_tag="${image_tag}-dirty"
-  fi
+  image_tag="${input_fingerprint}"
 fi
-input_fingerprint="$(workload_input_fingerprint "${selected[@]}")"
 
 declare -A image_refs=()
 
 echo "Building GPU workload images with ${CONTAINER_ENGINE}"
-echo "Source: ${source_short_sha} (dirty: ${source_dirty})"
+echo "Fingerprint: ${input_fingerprint}"
 echo "Tag: ${image_tag}"
 
 for name in "${selected[@]}"; do
@@ -159,7 +172,6 @@ for name in "${selected[@]}"; do
     --label "com.nvidia.openshell.gpu-workload.source=${name}"
     --label "com.nvidia.openshell.gpu-workload.base-image=${BASE_IMAGE}"
     --label "com.nvidia.openshell.gpu-workload.input-fingerprint=${input_fingerprint}"
-    --label "org.opencontainers.image.revision=${source_sha}"
   )
   if [[ "${name}" == "cuda-basic" ]]; then
     build_args+=(
@@ -195,8 +207,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml"
   echo "# Source this file to use the most recently built GPU workload images."
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_TAG "${image_tag}"
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}"
-  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}"
-  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}"
   write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_INPUT_FINGERPRINT "${input_fingerprint}"
   write_env_var OPENSHELL_SANDBOX_BASE_IMAGE "${BASE_IMAGE}"
   write_env_var CUDA_BUILD_IMAGE "${CUDA_BUILD_IMAGE}"
@@ -214,8 +224,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml"
   echo "generated_by: $(yaml_quote "mise run e2e:workloads:build")"
   echo "source:"
   echo "  path: $(yaml_quote "${IMAGES_ROOT}")"
-  echo "  revision: $(yaml_quote "${source_sha}")"
-  echo "  dirty: ${source_dirty}"
   echo "  input_fingerprint: $(yaml_quote "${input_fingerprint}")"
   echo "  container_engine: $(yaml_quote "${CONTAINER_ENGINE}")"
   echo "  inputs:"

From bb2375e566a85e46dd77c7c899dde9554f64cb66 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 3 Jun 2026 13:48:33 +0200
Subject: [PATCH 2/3] test(e2e): add gpu workload validation tests

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .github/workflows/e2e-gpu-test.yaml           |   2 +-
 e2e/rust/Cargo.lock                           |  58 ++++++
 e2e/rust/Cargo.toml                           |   9 +-
 e2e/rust/e2e-docker.sh                        |   5 +
 e2e/rust/tests/gpu.rs                         |  12 ++
 .../device_selection.rs}                      |   7 +-
 e2e/rust/tests/gpu/workloads.rs               | 181 ++++++++++++++++++
 tasks/test.toml                               |   4 +-
 8 files changed, 271 insertions(+), 7 deletions(-)
 create mode 100644 e2e/rust/tests/gpu.rs
 rename e2e/rust/tests/{gpu_device_selection.rs => gpu/device_selection.rs} (99%)
 create mode 100644 e2e/rust/tests/gpu/workloads.rs

diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml
index d16277786..9e758994d 100644
--- a/.github/workflows/e2e-gpu-test.yaml
+++ b/.github/workflows/e2e-gpu-test.yaml
@@ -52,7 +52,7 @@ jobs:
       # NVIDIA-managed Ubuntu base used as the GPU probe target: it has the
       # filesystem layout CDI injection expects (ldconfig, populated /usr/bin)
       # which the distroless gateway runtime lacks. Consumed by the prereq
-      # probe below and by the e2e tests in e2e/rust/tests/gpu_device_selection.rs.
+      # probe below and by the e2e tests in e2e/rust/tests/gpu/device_selection.rs.
       OPENSHELL_E2E_GPU_PROBE_IMAGE: "nvcr.io/nvidia/base/ubuntu:noble-20251013"
     steps:
       - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock
index 953449c57..e61c9a8c1 100644
--- a/e2e/rust/Cargo.lock
+++ b/e2e/rust/Cargo.lock
@@ -188,6 +188,17 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
 
+[[package]]
+name = "futures-executor"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-macro"
 version = "0.3.32"
@@ -614,7 +625,10 @@ dependencies = [
  "hyper-util",
  "prost",
  "rand",
+ "serde",
  "serde_json",
+ "serde_yaml",
+ "serial_test",
  "sha1",
  "sha2",
  "tempfile",
@@ -872,6 +886,44 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
+[[package]]
+name = "serial_test"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d"
+dependencies = [
+ "futures-executor",
+ "futures-util",
+ "log",
+ "once_cell",
+ "parking_lot",
+ "serial_test_derive",
+]
+
+[[package]]
+name = "serial_test_derive"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "sha1"
 version = "0.10.6"
@@ -1087,6 +1139,12 @@ version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
 
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+
 [[package]]
 name = "url"
 version = "2.5.8"
diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml
index 083c622df..eae80734d 100644
--- a/e2e/rust/Cargo.toml
+++ b/e2e/rust/Cargo.toml
@@ -98,8 +98,8 @@ path = "tests/forward_proxy_graphql_l7.rs"
 required-features = ["e2e-host-gateway"]
 
 [[test]]
-name = "gpu_device_selection"
-path = "tests/gpu_device_selection.rs"
+name = "gpu"
+path = "tests/gpu.rs"
 required-features = ["e2e-gpu"]
 
 [dependencies]
@@ -117,7 +117,12 @@ sha1 = "0.10"
 sha2 = "0.10"
 hex = "0.4"
 rand = "0.9"
+serde = { version = "1", features = ["derive"] }
 serde_json = "1"
+serde_yaml = "0.9"
+
+[dev-dependencies]
+serial_test = "3"
 
 [lints.rust]
 unsafe_code = "warn"
diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh
index 70e9835bd..99cd6daf7 100755
--- a/e2e/rust/e2e-docker.sh
+++ b/e2e/rust/e2e-docker.sh
@@ -11,9 +11,14 @@ set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}"
 E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}"
+DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml"
 
 cargo build -p openshell-cli
 
+if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then
+  echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." >&2  # advisory only; goes to stderr like other diagnostics
+fi
+
 exec "${ROOT}/e2e/with-docker-gateway.sh" \
   cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \
     --features "${E2E_FEATURES}" \
diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs
new file mode 100644
index 000000000..4a3f951f5
--- /dev/null
+++ b/e2e/rust/tests/gpu.rs
@@ -0,0 +1,12 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#![cfg(feature = "e2e-gpu")]
+
+// GPU-consuming e2e tests use #[serial(gpu)] because common single-GPU hosts
+// cannot reliably provision multiple GPU sandboxes at the same time.
+
+#[path = "gpu/device_selection.rs"]
+mod device_selection;
+#[path = "gpu/workloads.rs"]
+mod workloads;
diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs
similarity index 99%
rename from e2e/rust/tests/gpu_device_selection.rs
rename to e2e/rust/tests/gpu/device_selection.rs
index 08e77ce2b..56f0019d4 100644
--- a/e2e/rust/tests/gpu_device_selection.rs
+++ b/e2e/rust/tests/gpu/device_selection.rs
@@ -1,8 +1,6 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-#![cfg(feature = "e2e-gpu")]
-
 //! GPU device selection e2e tests.
 //!
 //! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`.
@@ -15,6 +13,7 @@ use openshell_e2e::harness::container::{ContainerEngine, e2e_driver};
 use openshell_e2e::harness::output::strip_ansi;
 use openshell_e2e::harness::sandbox::SandboxGuard;
 use serde_json::{Map, Value};
+use serial_test::serial;
 use tokio::time::timeout;
 
 const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600);
@@ -340,6 +339,7 @@ async fn sandbox_create_output(args: &[&str]) -> String {
 }
 
 #[tokio::test]
+#[serial(gpu)]
 async fn gpu_request_without_device_matches_plain_default_gpu_container() {
     let device_ids = discovered_cdi_gpu_device_ids();
     let Some(default_gpu_device) =
@@ -359,6 +359,7 @@ async fn gpu_request_without_device_matches_plain_default_gpu_container() {
 }
 
 #[tokio::test]
+#[serial(gpu)]
 async fn gpu_request_for_each_discovered_device_matches_plain_container() {
     let device_ids: Vec<_> = discovered_cdi_gpu_device_ids()
         .into_iter()
@@ -383,6 +384,7 @@ async fn gpu_request_for_each_discovered_device_matches_plain_container() {
 }
 
 #[tokio::test]
+#[serial(gpu)]
 async fn gpu_all_device_request_matches_plain_all_gpu_container() {
     if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) {
         eprintln!(
@@ -401,6 +403,7 @@ async fn gpu_all_device_request_matches_plain_all_gpu_container() {
 }
 
 #[tokio::test]
+#[serial(gpu)]
 async fn gpu_invalid_device_request_fails() {
     let driver_config_json = cdi_devices_driver_config_json(&["nvidia.com/gpu=invalid"]);
     let args = vec![
diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs
new file mode 100644
index 000000000..d0d192650
--- /dev/null
+++ b/e2e/rust/tests/gpu/workloads.rs
@@ -0,0 +1,181 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! GPU workload validation e2e tests.
+
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use openshell_e2e::harness::output::strip_ansi;
+use openshell_e2e::harness::sandbox::SandboxGuard;
+use serde::Deserialize;
+use serial_test::serial;
+
+const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST";
+const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS";
+const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE";
+
+#[derive(Debug, Deserialize)]
+struct WorkloadManifest {
+    workloads: Vec<WorkloadDefinition>,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+struct WorkloadDefinition {
+    name: String,
+    image: String,
+    command: Vec<String>,
+    expect: WorkloadExpectation,
+    #[serde(default)]
+    requirements: WorkloadRequirements,
+}
+
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
+#[serde(rename_all = "lowercase")]
+enum WorkloadExpectation {
+    Pass,
+    Fail,
+}
+
+#[derive(Clone, Debug, Default, Deserialize)]
+struct WorkloadRequirements {
+    #[serde(default)]
+    gpu: bool,
+}
+
+fn default_workload_manifest_path() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml")
+}
+
+fn workload_manifest_path() -> PathBuf {
+    std::env::var(WORKLOAD_MANIFEST_ENV)
+        .ok()
+        .map(|value| value.trim().to_string())
+        .filter(|value| !value.is_empty())
+        .map_or_else(default_workload_manifest_path, PathBuf::from)
+}
+
+fn load_workload_manifest() -> Option<WorkloadManifest> {
+    let path = workload_manifest_path();
+    let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV)
+        .ok()
+        .is_some_and(|value| !value.trim().is_empty());
+
+    let contents = match fs::read_to_string(&path) {
+        Ok(contents) => contents,
+        Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => {
+            eprintln!(
+                "skipping GPU workload validation: no workload manifest at {}. \
+                 Run `mise run e2e:workloads:build` to create the local manifest \
+                 or set {WORKLOAD_MANIFEST_ENV} to an external manifest.",
+                path.display()
+            );
+            return None;
+        }
+        Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()),
+    };
+
+    let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| {
+        panic!(
+            "failed to parse workload manifest {}: {err}",
+            path.display()
+        )
+    });
+    assert!(
+        !manifest.workloads.is_empty(),
+        "workload manifest {} contains no workloads",
+        path.display()
+    );
+    Some(manifest)
+}
+
+async fn assert_expected_pass(workload: &WorkloadDefinition) {
+    let mut args = vec![
+        "--gpu".to_string(),
+        "--from".to_string(),
+        workload.image.clone(),
+        "--".to_string(),
+    ];
+    args.extend(workload.command.clone());
+    let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>();
+
+    let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| {
+        panic!(
+            "GPU workload '{}' expected success but sandbox create failed:\n{err}",
+            workload.name
+        )
+    });
+
+    let clean_output = strip_ansi(&guard.create_output);
+    guard.cleanup().await; // clean up before asserting, as the expect-fail path does
+
+    assert!(
+        clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER),
+        "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}",
+        workload.name,
+        workload.image,
+    );
+}
+
+async fn assert_expected_fail(workload: &WorkloadDefinition) {
+    let mut args = vec![
+        "--gpu".to_string(),
+        "--from".to_string(),
+        workload.image.clone(),
+        "--".to_string(),
+    ];
+    args.extend(workload.command.clone());
+    let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>();
+
+    match SandboxGuard::create(&arg_refs).await {
+        Ok(mut guard) => {
+            let clean_output = strip_ansi(&guard.create_output);
+            guard.cleanup().await;
+            panic!(
+                "GPU workload '{}' unexpectedly succeeded. Output:\n{clean_output}",
+                workload.name
+            );
+        }
+        Err(err) => {
+            let clean_output = strip_ansi(&err);
+            assert!(
+                clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER),
+                "expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}",
+                workload.name,
+                workload.image,
+            );
+        }
+    }
+}
+
+#[tokio::test]
+#[serial(gpu)]
+async fn gpu_workload_manifest_runs_expected_workloads() {
+    let Some(manifest) = load_workload_manifest() else {
+        return;
+    };
+
+    let gpu_workloads = manifest
+        .workloads
+        .into_iter()
+        .filter(|workload| workload.requirements.gpu)
+        .collect::<Vec<_>>();
+
+    assert!(
+        !gpu_workloads.is_empty(),
+        "workload manifest contains no GPU-tagged workloads"
+    );
+
+    for workload in gpu_workloads {
+        assert!(
+            !workload.command.is_empty(),
+            "workload '{}' must declare a non-empty command",
+            workload.name
+        );
+
+        match workload.expect {
+            WorkloadExpectation::Pass => assert_expected_pass(&workload).await,
+            WorkloadExpectation::Fail => assert_expected_fail(&workload).await,
+        }
+    }
+}
diff --git a/tasks/test.toml b/tasks/test.toml
index 444ea15e1..95036e041 100644
--- a/tasks/test.toml
+++ b/tasks/test.toml
@@ -87,7 +87,7 @@ run = "e2e/rust/e2e-podman-rootless.sh"
 
 ["e2e:podman:gpu"]
 description = "Run GPU e2e against a standalone gateway with the Podman compute driver"
-env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" }
+env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" }
 run = "e2e/rust/e2e-podman.sh"
 
 ["e2e:kubernetes"]
@@ -116,7 +116,7 @@ run = [
 
 ["e2e:docker:gpu"]
 description = "Run GPU e2e against a standalone gateway with the Docker compute driver"
-env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" }
+env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" }
 run = "e2e/rust/e2e-docker.sh"
 
 ["e2e:openshift"]

From 87ea21e479ae5c52ba1c4278677f549ea548407a Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 17 Jun 2026 09:26:56 +0200
Subject: [PATCH 3/3] ci(e2e): build gpu workloads before gpu e2e

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .github/workflows/e2e-gpu-test.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml
index 9e758994d..bcbd96bb4 100644
--- a/.github/workflows/e2e-gpu-test.yaml
+++ b/.github/workflows/e2e-gpu-test.yaml
@@ -48,6 +48,7 @@ jobs:
       OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
       OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
       OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
+      CONTAINER_ENGINE: docker
       OPENSHELL_E2E_DOCKER_GPU: "1"
       # NVIDIA-managed Ubuntu base used as the GPU probe target: it has the
       # filesystem layout CDI injection expects (ldconfig, populated /usr/bin)
@@ -65,5 +66,8 @@ jobs:
           docker info --format '{{json .CDISpecDirs}}'
           docker run --rm --device nvidia.com/gpu=all "${OPENSHELL_E2E_GPU_PROBE_IMAGE}" nvidia-smi -L
 
+      - name: Build GPU workload images
+        run: mise run --no-deps --skip-deps e2e:workloads:build
+
       - name: Run tests
         run: mise run --no-deps --skip-deps e2e:docker:gpu