diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index d16277786..bcbd96bb4 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -48,11 +48,12 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + CONTAINER_ENGINE: docker OPENSHELL_E2E_DOCKER_GPU: "1" # NVIDIA-managed Ubuntu base used as the GPU probe target: it has the # filesystem layout CDI injection expects (ldconfig, populated /usr/bin) # which the distroless gateway runtime lacks. Consumed by the prereq - # probe below and by the e2e tests in e2e/rust/tests/gpu_device_selection.rs. + # probe below and by the e2e tests in e2e/rust/tests/gpu/device_selection.rs. OPENSHELL_E2E_GPU_PROBE_IMAGE: "nvcr.io/nvidia/base/ubuntu:noble-20251013" steps: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 @@ -65,5 +66,8 @@ jobs: docker info --format '{{json .CDISpecDirs}}' docker run --rm --device nvidia.com/gpu=all "${OPENSHELL_E2E_GPU_PROBE_IMAGE}" nvidia-smi -L + - name: Build GPU workload images + run: mise run --no-deps --skip-deps e2e:workloads:build + - name: Run tests run: mise run --no-deps --skip-deps e2e:docker:gpu diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md index 8c796b444..8462006cd 100644 --- a/e2e/gpu/README.md +++ b/e2e/gpu/README.md @@ -3,7 +3,8 @@ # GPU workload images -This directory defines workload test images for OpenShell GPU validation. +This directory defines workload test images currently used by the OpenShell GPU +e2e suite. ## Contract @@ -22,11 +23,10 @@ Each workload image must: command explicitly. OpenShell sandbox creation replaces the image entrypoint with the supervisor and -does not run the OCI image `CMD`. When these images are used through OpenShell, -the workload command from each manifest entry must be passed explicitly. +does not run the OCI image `CMD`. 
E2e tests that use these images through +OpenShell run the command from each manifest entry explicitly. -The image build task writes a local workload manifest. Each workload entry -carries: +The test harness is manifest-driven. Each workload entry carries: - `name` - `image` @@ -61,9 +61,9 @@ The build task uses `tasks/scripts/container-engine.sh`. Set `CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine explicitly. When unset, the helper uses its existing auto-detection behavior. -Local tags use the current commit short SHA plus a short fingerprint of the -external build inputs. Dirty local trees append `-dirty`. Set -`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag. +Local tags use a short (12-character) SHA-256 fingerprint of the selected workload contexts +and external build inputs. Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to +override the tag. The task writes the latest build refs to: @@ -71,8 +71,7 @@ The task writes the latest build refs to: e2e/gpu/images/.build/latest.env ``` -The task also writes a local workload manifest for downstream tooling and -future workload-runner integration: +The task also writes the local workload manifest used by the Rust e2e runner: ```text e2e/gpu/images/.build/workloads.yaml @@ -90,8 +89,7 @@ source e2e/gpu/images/.build/latest.env ``` That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local -manifest. The current checked-in Rust GPU e2e target does not consume this -manifest yet. The per-image refs remain available as a convenience for direct +manifest. The per-image refs remain available as a convenience for direct container-engine validation. ## Direct Validation @@ -124,14 +122,63 @@ where Podman CDI is configured. Direct container-engine validation catches image, CDI, CUDA, and host GPU setup issues before OpenShell sandbox behavior is involved. 
-## OpenShell GPU E2E +## Manifest-Driven Validation -The current Rust GPU validation target is: +Run manifest-driven GPU validation through the e2e tasks so the workload +images, manifest, gateway, and container-engine environment match CI: ```shell -mise run e2e:gpu +mise run e2e:workloads:build +mise run e2e:docker:gpu +``` + +For Podman GPU validation, build the manifest with +`CONTAINER_ENGINE=podman mise run e2e:workloads:build`, then run +`mise run e2e:podman:gpu`. + +The workload validation path reads: + +```text +OPENSHELL_E2E_WORKLOAD_MANIFEST +``` + +When that variable is unset, the runner uses the default local manifest path: + +```text +e2e/gpu/images/.build/workloads.yaml +``` + +If the variable is unset and that default manifest does not exist, the workload +validation test prints a clear skip message telling you to run: + +```shell +mise run e2e:workloads:build +``` + +or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest. An explicitly set manifest path that cannot be read fails the test instead of skipping. + +Each manifest entry supplies the sandbox image and command. OpenShell runs that +command through `openshell sandbox create --gpu --from <image> -- <command>`. +The test runner iterates all GPU-tagged workload entries and enforces each +entry's declared expectation: + +- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS` +- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE` + +The current local manifest includes three workloads: + +- `smoke-pass` expected to pass +- `smoke-fail` expected to fail +- `cuda-basic` expected to pass + +## External Manifests + +External workload catalogs can use the same schema. Point the runner at one +with: + +```shell +export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml ``` -That target runs `gpu_device_selection`. It validates GPU request and device -selection behavior against a Docker-backed gateway. It does not run the -workload manifest generated by `mise run e2e:workloads:build`. +That lets alternate workload manifests use the same test runner without +introducing per-workload env vars. 
diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock index 953449c57..aebec66c0 100644 --- a/e2e/rust/Cargo.lock +++ b/e2e/rust/Cargo.lock @@ -188,6 +188,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-macro" version = "0.3.32" @@ -550,6 +561,16 @@ version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +[[package]] +name = "libyml" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3302702afa434ffa30847a83305f0a69d6abd74293b6554c18ec85c7ef30c980" +dependencies = [ + "anyhow", + "version_check", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -614,7 +635,10 @@ dependencies = [ "hyper-util", "prost", "rand", + "serde", "serde_json", + "serde_yml", + "serial_test", "sha1", "sha2", "tempfile", @@ -872,6 +896,46 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yml" +version = "0.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd" +dependencies = [ + "indexmap", + "itoa", + "libyml", + "memchr", + "ryu", + "serde", + "version_check", +] + +[[package]] +name = "serial_test" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d" +dependencies = [ + "futures-executor", + "futures-util", + "log", + "once_cell", + "parking_lot", + "serial_test_derive", +] + +[[package]] 
+name = "serial_test_derive" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha1" version = "0.10.6" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 083c622df..8e014afb3 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -98,8 +98,8 @@ path = "tests/forward_proxy_graphql_l7.rs" required-features = ["e2e-host-gateway"] [[test]] -name = "gpu_device_selection" -path = "tests/gpu_device_selection.rs" +name = "gpu" +path = "tests/gpu.rs" required-features = ["e2e-gpu"] [dependencies] @@ -117,7 +117,12 @@ sha1 = "0.10" sha2 = "0.10" hex = "0.4" rand = "0.9" +serde = { version = "1", features = ["derive"] } serde_json = "1" +serde_yml = "0.0.12" + +[dev-dependencies] +serial_test = "3" [lints.rust] unsafe_code = "warn" diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index 70e9835bd..99cd6daf7 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -11,9 +11,14 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" +DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml" cargo build -p openshell-cli +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then + echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." 
+fi + exec "${ROOT}/e2e/with-docker-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features "${E2E_FEATURES}" \ diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 26843e128..39b6b523a 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -11,9 +11,14 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e-podman}" +DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml" cargo build -p openshell-cli +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then + echo "note: running Podman GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'CONTAINER_ENGINE=podman mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." +fi + TEST_ARGS=( cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" --features "${E2E_FEATURES}" diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs new file mode 100644 index 000000000..4a3f951f5 --- /dev/null +++ b/e2e/rust/tests/gpu.rs @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +// GPU-consuming e2e tests use #[serial(gpu)] because common single-GPU hosts +// cannot reliably provision multiple GPU sandboxes at the same time. 
+ +#[path = "gpu/device_selection.rs"] +mod device_selection; +#[path = "gpu/workloads.rs"] +mod workloads; diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs similarity index 99% rename from e2e/rust/tests/gpu_device_selection.rs rename to e2e/rust/tests/gpu/device_selection.rs index 08e77ce2b..56f0019d4 100644 --- a/e2e/rust/tests/gpu_device_selection.rs +++ b/e2e/rust/tests/gpu/device_selection.rs @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -#![cfg(feature = "e2e-gpu")] - //! GPU device selection e2e tests. //! //! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. @@ -15,6 +13,7 @@ use openshell_e2e::harness::container::{ContainerEngine, e2e_driver}; use openshell_e2e::harness::output::strip_ansi; use openshell_e2e::harness::sandbox::SandboxGuard; use serde_json::{Map, Value}; +use serial_test::serial; use tokio::time::timeout; const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600); @@ -340,6 +339,7 @@ async fn sandbox_create_output(args: &[&str]) -> String { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_without_device_matches_plain_default_gpu_container() { let device_ids = discovered_cdi_gpu_device_ids(); let Some(default_gpu_device) = @@ -359,6 +359,7 @@ async fn gpu_request_without_device_matches_plain_default_gpu_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_request_for_each_discovered_device_matches_plain_container() { let device_ids: Vec<_> = discovered_cdi_gpu_device_ids() .into_iter() @@ -383,6 +384,7 @@ async fn gpu_request_for_each_discovered_device_matches_plain_container() { } #[tokio::test] +#[serial(gpu)] async fn gpu_all_device_request_matches_plain_all_gpu_container() { if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) { eprintln!( @@ -401,6 +403,7 @@ async fn gpu_all_device_request_matches_plain_all_gpu_container() { } #[tokio::test] 
+#[serial(gpu)] async fn gpu_invalid_device_request_fails() { let driver_config_json = cdi_devices_driver_config_json(&["nvidia.com/gpu=invalid"]); let args = vec![ diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs new file mode 100644 index 000000000..1f4125118 --- /dev/null +++ b/e2e/rust/tests/gpu/workloads.rs @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! GPU workload validation e2e tests. + +use std::fs; +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; +use serde::Deserialize; +use serial_test::serial; +use tokio::time::timeout; + +const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST"; +const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; +const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE"; +const WORKLOAD_SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600); + +#[derive(Debug, Deserialize)] +struct WorkloadManifest { + workloads: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct WorkloadDefinition { + name: String, + image: String, + command: Vec, + expect: WorkloadExpectation, + #[serde(default)] + requirements: WorkloadRequirements, +} + +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "lowercase")] +enum WorkloadExpectation { + Pass, + Fail, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct WorkloadRequirements { + #[serde(default)] + gpu: bool, +} + +fn default_workload_manifest_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml") +} + +fn workload_manifest_path() -> PathBuf { + std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + 
.map_or_else(default_workload_manifest_path, PathBuf::from) +} + +fn load_workload_manifest() -> Option { + let path = workload_manifest_path(); + let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .is_some_and(|value| !value.trim().is_empty()); + + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => { + eprintln!( + "skipping GPU workload validation: no workload manifest at {}. \ + Run `mise run e2e:workloads:build` to create the local manifest \ + or set {WORKLOAD_MANIFEST_ENV} to an external manifest.", + path.display() + ); + return None; + } + Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()), + }; + + let manifest: WorkloadManifest = serde_yml::from_str(&contents).unwrap_or_else(|err| { + panic!( + "failed to parse workload manifest {}: {err}", + path.display() + ) + }); + assert!( + !manifest.workloads.is_empty(), + "workload manifest {} contains no workloads", + path.display() + ); + Some(manifest) +} + +async fn create_workload_sandbox(args: &[&str]) -> Result { + timeout(WORKLOAD_SANDBOX_CREATE_TIMEOUT, SandboxGuard::create(args)) + .await + .map_err(|_| { + format!( + "GPU workload sandbox create timed out after {WORKLOAD_SANDBOX_CREATE_TIMEOUT:?}" + ) + })? 
+} + +async fn assert_expected_pass(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + let mut guard = create_workload_sandbox(&arg_refs) + .await + .unwrap_or_else(|err| { + panic!( + "GPU workload '{}' expected success but sandbox create failed:\n{err}", + workload.name + ) + }); + + let clean_output = strip_ansi(&guard.create_output); + guard.cleanup().await; + + assert!( + clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}", + workload.name, + workload.image, + ); +} + +async fn assert_expected_fail(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + match create_workload_sandbox(&arg_refs).await { + Ok(mut guard) => { + let clean_output = strip_ansi(&guard.create_output); + guard.cleanup().await; + panic!( + "GPU workload '{}' unexpectedly succeeded. 
Output:\n{clean_output}", + workload.name + ); + } + Err(err) => { + let clean_output = strip_ansi(&err); + assert!( + clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER), + "expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}", + workload.name, + workload.image, + ); + } + } +} + +#[tokio::test] +#[serial(gpu)] +async fn gpu_workload_manifest_runs_expected_workloads() { + let Some(manifest) = load_workload_manifest() else { + return; + }; + + let gpu_workloads = manifest + .workloads + .into_iter() + .filter(|workload| workload.requirements.gpu) + .collect::>(); + + assert!( + !gpu_workloads.is_empty(), + "workload manifest contains no GPU-tagged workloads" + ); + + for workload in gpu_workloads { + assert!( + !workload.command.is_empty(), + "workload '{}' must declare a non-empty command", + workload.name + ); + + match workload.expect { + WorkloadExpectation::Pass => assert_expected_pass(&workload).await, + WorkloadExpectation::Fail => assert_expected_fail(&workload).await, + } + } +} diff --git a/tasks/scripts/e2e-gpu-build-images.sh b/tasks/scripts/e2e-gpu-build-images.sh index 2a6a13b51..efe3d8378 100644 --- a/tasks/scripts/e2e-gpu-build-images.sh +++ b/tasks/scripts/e2e-gpu-build-images.sh @@ -39,6 +39,15 @@ yaml_quote() { printf '"%s"' "${value}" } +if command -v sha256sum >/dev/null 2>&1; then + SHA256_CMD=(sha256sum) +elif command -v shasum >/dev/null 2>&1; then + SHA256_CMD=(shasum -a 256) +else + echo "neither sha256sum nor shasum is available for hashing" >&2 + exit 1 +fi + available_image_dirs() { local preferred @@ -59,6 +68,34 @@ contains_image() { return 1 } +find_sorted_files_null() { + local dir=$1 + local file + local key + local -a files=() + local i + local j + local LC_ALL=C + + while IFS= read -r -d '' file; do + files+=("${file}") + done < <(find "${dir}" -type f -print0) + + for ((i = 1; i < ${#files[@]}; i++)); do + key=${files[${i}]} + j=$((i - 1)) + while ((j >= 0)) && [[ 
${files[${j}]} > "${key}" ]]; do + files[$((j + 1))]=${files[${j}]} + j=$((j - 1)) + done + files[$((j + 1))]=${key} + done + + if [[ ${#files[@]} -gt 0 ]]; then + printf '%s\0' "${files[@]}" + fi +} + image_env_var() { case "$1" in smoke-pass) echo "OPENSHELL_E2E_GPU_SMOKE_PASS_IMAGE" ;; @@ -84,15 +121,29 @@ image_expectation() { workload_input_fingerprint() { local -a names=("$@") + local digest + local file + local name + local rel { + printf 'schema=openshell-gpu-workload-input-v1\n' printf 'OPENSHELL_SANDBOX_BASE_IMAGE=%s\n' "${BASE_IMAGE}" if contains_image cuda-basic "${names[@]}"; then printf 'CUDA_BUILD_IMAGE=%s\n' "${CUDA_BUILD_IMAGE}" printf 'CUDA_SAMPLES_REPO=%s\n' "${CUDA_SAMPLES_REPO}" printf 'CUDA_SAMPLES_REF=%s\n' "${CUDA_SAMPLES_REF}" fi - } | git -C "${ROOT}" hash-object --stdin | cut -c1-8 + for name in "${names[@]}"; do + printf 'WORKLOAD=%s\n' "${name}" + while IFS= read -r -d '' file; do + rel="${file#"${ROOT}/"}" + digest="$("${SHA256_CMD[@]}" "${file}" | awk '{print $1}')" + printf 'FILE=%s\n' "${rel}" + printf 'SHA256=%s\n' "${digest}" + done < <(find_sorted_files_null "${IMAGES_ROOT}/${name}") + done + } | "${SHA256_CMD[@]}" | cut -c1-12 } mapfile -t available < <(available_image_dirs) @@ -123,28 +174,18 @@ if [[ ${#selected[@]} -eq 0 ]]; then exit 1 fi -source_sha="$(git -C "${ROOT}" rev-parse HEAD)" -source_short_sha="$(git -C "${ROOT}" rev-parse --short HEAD)" -source_dirty=false -if [[ -n "$(git -C "${ROOT}" status --short)" ]]; then - source_dirty=true -fi +input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}" else - input_fingerprint="$(workload_input_fingerprint "${selected[@]}")" - image_tag="${source_short_sha}-${input_fingerprint}" - if [[ "${source_dirty}" == "true" ]]; then - image_tag="${image_tag}-dirty" - fi + image_tag="${input_fingerprint}" fi -input_fingerprint="$(workload_input_fingerprint 
"${selected[@]}")" declare -A image_refs=() echo "Building GPU workload images with ${CONTAINER_ENGINE}" -echo "Source: ${source_short_sha} (dirty: ${source_dirty})" +echo "Fingerprint: ${input_fingerprint}" echo "Tag: ${image_tag}" for name in "${selected[@]}"; do @@ -159,7 +200,6 @@ for name in "${selected[@]}"; do --label "com.nvidia.openshell.gpu-workload.source=${name}" --label "com.nvidia.openshell.gpu-workload.base-image=${BASE_IMAGE}" --label "com.nvidia.openshell.gpu-workload.input-fingerprint=${input_fingerprint}" - --label "org.opencontainers.image.revision=${source_sha}" ) if [[ "${name}" == "cuda-basic" ]]; then build_args+=( @@ -195,8 +235,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "# Source this file to use the most recently built GPU workload images." write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_TAG "${image_tag}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}" - write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}" write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_INPUT_FINGERPRINT "${input_fingerprint}" write_env_var OPENSHELL_SANDBOX_BASE_IMAGE "${BASE_IMAGE}" write_env_var CUDA_BUILD_IMAGE "${CUDA_BUILD_IMAGE}" @@ -214,8 +252,6 @@ manifest_path="${BUILD_DIR}/workloads.yaml" echo "generated_by: $(yaml_quote "mise run e2e:workloads:build")" echo "source:" echo " path: $(yaml_quote "${IMAGES_ROOT}")" - echo " revision: $(yaml_quote "${source_sha}")" - echo " dirty: ${source_dirty}" echo " input_fingerprint: $(yaml_quote "${input_fingerprint}")" echo " container_engine: $(yaml_quote "${CONTAINER_ENGINE}")" echo " inputs:" diff --git a/tasks/test.toml b/tasks/test.toml index 444ea15e1..95036e041 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -87,7 +87,7 @@ run = "e2e/rust/e2e-podman-rootless.sh" ["e2e:podman:gpu"] description = "Run GPU e2e against a standalone gateway with the Podman compute driver" -env = { 
OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } run = "e2e/rust/e2e-podman.sh" ["e2e:kubernetes"] @@ -116,7 +116,7 @@ run = [ ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" ["e2e:openshift"]