diff --git a/.github/scripts/stop_old_app_engine_versions.sh b/.github/scripts/stop_old_app_engine_versions.sh
new file mode 100755
index 000000000..baef1275b
--- /dev/null
+++ b/.github/scripts/stop_old_app_engine_versions.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# Stops zero-traffic App Engine versions beyond a retention window.
+#
+# Versions on the flexible environment keep their VMs running — and billing —
+# 24/7 while in SERVING state, even at a 0% traffic split. Every deploy leaves
+# one behind (staging and prod), so without cleanup the fleet grows by two
+# 4vCPU/24GB VMs (~$278/month each) per release: by July 2026 this had
+# accumulated 41 zero-traffic versions costing roughly $11k/month.
+#
+# Environment:
+#   APP_ENGINE_SERVICE  Service whose versions are pruned (default: "default").
+#   APP_ENGINE_PROJECT  Optional project ID, passed to gcloud as --project.
+#   KEEP_PER_PREFIX     Newest SERVING versions kept per prefix (default: 2).
+#
+# Exits non-zero if a listing fails or any version could not be stopped.
+
+set -euo pipefail
+
+APP_ENGINE_SERVICE="${APP_ENGINE_SERVICE:-default}"
+KEEP_PER_PREFIX="${KEEP_PER_PREFIX:-2}"
+
+# The retention count feeds shell arithmetic below, so accept digits only.
+if [[ ! "${KEEP_PER_PREFIX}" =~ ^[0-9]+$ ]]; then
+  echo "KEEP_PER_PREFIX must be a non-negative integer: '${KEEP_PER_PREFIX}'" >&2
+  exit 2
+fi
+
+project_args=()
+if [[ -n "${APP_ENGINE_PROJECT:-}" ]]; then
+  project_args+=("--project=${APP_ENGINE_PROJECT}")
+fi
+
+# Versions currently receiving traffic are never stopped, regardless of age.
+live_versions="$(gcloud app versions list \
+  --service="${APP_ENGINE_SERVICE}" \
+  --hide-no-traffic \
+  --format="value(version.id)" \
+  ${project_args[@]+"${project_args[@]}"})"
+
+# An empty live list would disable the traffic check below; stop nothing.
+if [[ -z "${live_versions}" ]]; then
+  echo "No version of ${APP_ENGINE_SERVICE} reports traffic; refusing to stop anything" >&2
+  exit 1
+fi
+
+failed=0
+for prefix in prod staging; do
+  # Newest first, so the first KEEP_PER_PREFIX lines are the ones to keep.
+  serving="$(gcloud app versions list \
+    --service="${APP_ENGINE_SERVICE}" \
+    --filter="version.servingStatus=SERVING AND version.id:${prefix}-*" \
+    --sort-by="~version.createTime" \
+    --format="value(version.id)" \
+    ${project_args[@]+"${project_args[@]}"})"
+
+  # 10# forces base 10 so a value such as "08" is not parsed as octal.
+  stale="$(tail -n +"$((10#${KEEP_PER_PREFIX} + 1))" <<<"${serving}")"
+
+  # Read the list on fd 3 so gcloud cannot consume it through stdin.
+  while IFS= read -r version <&3; do
+    [[ -n "${version}" ]] || continue
+    # -F -x: the ID must equal a whole line literally, not match as a regex.
+    if grep -qxF -- "${version}" <<<"${live_versions}"; then
+      echo "Skipping ${version}: currently receiving traffic"
+      continue
+    fi
+    echo "Stopping zero-traffic version ${version}"
+    # Keep going on failure so one bad version does not leave the rest running.
+    if ! gcloud app versions stop --quiet \
+      --service="${APP_ENGINE_SERVICE}" \
+      "${version}" \
+      ${project_args[@]+"${project_args[@]}"}; then
+      echo "Failed to stop ${version}" >&2
+      failed=1
+    fi
+  done 3<<<"${stale}"
+done
+
+exit "${failed}"
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 501fbd26e..f66e7544b 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -463,6 +463,30 @@ jobs:
         env:
           APP_ENGINE_VERSION: ${{ needs.deploy-production-candidate.outputs.version }}
 
+  stop-old-app-engine-versions:
+    name: Stop old App Engine versions
+    runs-on: ubuntu-latest
+    needs: promote-production
+    if: |
+      (github.repository == 'PolicyEngine/policyengine-api')
+      && (github.event.head_commit.message == 'Update PolicyEngine API')
+    environment: production
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+      - name: GCP authentication
+        uses: "google-github-actions/auth@v2"
+        with:
+          workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}"
+          service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}"
+      - name: Set up GCloud
+        uses: "google-github-actions/setup-gcloud@v2"
+      - name: Stop zero-traffic versions beyond retention window
+        run: bash .github/scripts/stop_old_app_engine_versions.sh
+
   deploy-cloud-run-candidate:
     name: Deploy production Cloud Run candidate
     runs-on: ubuntu-latest
diff --git a/changelog.d/stop-old-app-engine-versions.fixed.md b/changelog.d/stop-old-app-engine-versions.fixed.md
new file mode 100644
index 000000000..d8b93e58f
--- /dev/null
+++ b/changelog.d/stop-old-app-engine-versions.fixed.md
@@ -0,0 +1 @@
+Stop zero-traffic App Engine versions after each production deploy, keeping the two newest per environment. Idle flexible-environment versions each kept a 4vCPU/24GB VM running (~$278/month; ~$11k/month accumulated by July 2026).
diff --git a/uv.lock b/uv.lock
index 8f7c89585..a47b06e10 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2616,7 +2616,7 @@ models = [
 
 [[package]]
 name = "policyengine-api"
-version = "3.43.5"
+version = "3.43.6"
 source = { editable = "." }
 dependencies = [
     { name = "a2wsgi" },