diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1721af77f..717a5cc46 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -82,6 +82,108 @@ jobs: reconcile: "true" delete: "false" + - name: 💾 DR restore drill (Velero backup → delete → restore) + # Validates the full backup → data-loss → restore cycle against the + # in-cluster MinIO (the local R2 stand-in) on every k8s PR, so the + # Velero code path is regression-tested before changes reach prod. + # Reuses the cluster the System Test step just reconciled; adds ~2-3 + # minutes. See docs/dr/restore-drill.md for design + manual run. + run: | + set -euo pipefail + + # Resource names are fully qualified with the velero.io group + # throughout: CNPG also defines a `backups` resource, and kubectl + # resolves an unqualified `backup` to backups.postgresql.cnpg.io on + # this cluster — the drill's first run polled the wrong API group + # for its entire timeout while the actual Velero backup completed. + dump_velero_state() { + echo "::group::Velero state (drill failure)" + kubectl -n velero get backupstoragelocations.velero.io,backups.velero.io,restores.velero.io -o wide || true + kubectl -n velero describe backups.velero.io dr-drill || true + kubectl -n velero describe restores.velero.io dr-drill || true + kubectl -n velero logs deploy/velero --tail=200 || true + echo "::endgroup::" + } + trap dump_velero_state ERR + + wait_phase() { + # wait_phase KIND NAME TIMEOUT_SECONDS — poll a Velero CR + # until .status.phase is Completed; fail fast on a terminal + # failure phase instead of burning the whole timeout. 
+ local kind="$1" name="$2" timeout="$3" phase="" + local deadline=$((SECONDS + timeout)) + while [ "$SECONDS" -lt "$deadline" ]; do + phase=$(kubectl -n velero get "$kind" "$name" -o jsonpath='{.status.phase}' 2>/dev/null || true) + case "$phase" in + Completed) echo "$kind/$name: Completed"; return 0 ;; + Failed|PartiallyFailed|FailedValidation) + echo "::error::$kind/$name entered terminal phase $phase" + return 1 ;; + *) sleep 5 ;; + esac + done + echo "::error::$kind/$name did not complete within ${timeout}s (last phase: ${phase:-none})" + return 1 + } + + echo "::group::Wait for BackupStorageLocation default to be Available" + kubectl -n velero wait backupstoragelocations.velero.io/default \ + --for=jsonpath='{.status.phase}'=Available --timeout=10m + echo "::endgroup::" + + echo "::group::Create marker namespace + ConfigMap" + kubectl create namespace dr-drill + kubectl -n dr-drill create configmap dr-marker \ + --from-literal=run-id="${GITHUB_RUN_ID}" \ + --from-literal=sha="${GITHUB_SHA}" + echo "::endgroup::" + + echo "::group::Back up the marker namespace" + kubectl -n velero create -f - </dev/null 2>&1; then + echo "::error::namespace dr-drill still exists after deletion" + exit 1 + fi + echo "::endgroup::" + + echo "::group::Restore from the backup" + kubectl -n velero create -f - < Velero CRs are created with `kubectl` rather than the `velero` CLI so +> the drill needs no extra tool install and can never drift from the +> deployed Velero version. > **Why namespace deletion instead of full cluster rebuild?** MinIO runs > in-cluster with ephemeral storage, so destroying the cluster would also @@ -31,11 +37,13 @@ reach `prod`. ## Wall-clock budget -`timeout-minutes: 240` on the job — matches the **4 h RTO** documented -in [`runbook.md`](./runbook.md). In practice the drill runs in ~15 min. -The 4 h ceiling is the operator promise for the manual prod path; CI -keeps that promise honest by failing fast if the local round trip -explodes. 
+The drill itself is bounded: 10 min for the `BackupStorageLocation` to +go `Available`, then 5 min each for the backup and the restore to reach +`Completed` (terminal failure phases abort immediately). In practice the +whole sequence takes ~2-3 minutes on top of the system test. The **4 h +RTO** in [`runbook.md`](./runbook.md) is the operator promise for the +manual prod path; CI keeps that promise honest by failing fast if the +local round trip explodes. ## What this catches diff --git a/docs/dr/runbook.md b/docs/dr/runbook.md index 9743d4ae7..743a829c7 100644 --- a/docs/dr/runbook.md +++ b/docs/dr/runbook.md @@ -302,7 +302,10 @@ etcdctl --endpoints unix:///tmp/etcd.snapshot \ # Kubernetes Secret YAML, the EncryptionConfiguration was lost. ``` -This check is also asserted by the CI restore drill (see [restore-drill.md](./restore-drill.md)). +This check is deliberately **not** part of the CI restore drill — Talos +verifies the encryption key at install time, so a CI assertion would add +complexity for a structurally enforced property (see +[restore-drill.md](./restore-drill.md) for the full rationale). ---