From 31aacc60f5c62e5a5ed114e6dd709160c45f77bd Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm
Date: Thu, 11 Jun 2026 00:20:48 +0200
Subject: [PATCH 1/2] ci: implement the documented Velero restore drill in the system-test job

docs/dr/restore-drill.md has documented a CI restore drill since it was
written, but no workflow ever implemented it -- the backup -> data-loss ->
restore path was never regression-tested, so a Velero chart bump, RBAC
drift or MinIO credential break would only surface during a real disaster
(the worst possible time, as the 2026-06-10 vault incident demonstrated
for the adjacent snapshot path).

Implement the drill as steps inside the existing system-test job, reusing
the Talos+Docker cluster it just reconciled (a separate job would pay a
second 10-minute cluster bootstrap for no extra signal):

1. wait for BackupStorageLocation/default to be Available (Velero ->
   in-cluster MinIO, the local R2 stand-in)
2. create a dr-drill namespace + marker ConfigMap carrying run-id/sha
3. Backup CR scoped to the namespace, wait for Completed (fail fast on
   Failed/PartiallyFailed/FailedValidation)
4. delete the namespace and assert it is gone
5. Restore CR from the backup, wait for Completed
6. assert the restored ConfigMap's run-id matches GITHUB_RUN_ID

Velero CRs are created with kubectl (no velero CLI install, no version
drift). On any drill failure the step dumps BSL/Backup/Restore state and
the Velero server log before exiting.

Also true up the docs: restore-drill.md described a standalone job with
its own cluster and a timeout-minutes: 240 budget that never existed;
runbook.md claimed the drill asserts etcd encryption-at-rest, which
restore-drill.md itself explicitly scopes out.

Co-Authored-By: Claude Fable 5 --- .github/workflows/ci.yaml | 97 +++++++++++++++++++++++++++++++++++++++ docs/dr/restore-drill.md | 42 ++++++++++------- docs/dr/runbook.md | 5 +- 3 files changed, 126 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2443b98a2..1db7eb7ee 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -82,6 +82,103 @@ jobs: reconcile: "true" delete: "false" + - name: 💾 DR restore drill (Velero backup → delete → restore) + # Validates the full backup → data-loss → restore cycle against the + # in-cluster MinIO (the local R2 stand-in) on every k8s PR, so the + # Velero code path is regression-tested before changes reach prod. + # Reuses the cluster the System Test step just reconciled; adds ~2-3 + # minutes. See docs/dr/restore-drill.md for design + manual run. + run: | + set -euo pipefail + + dump_velero_state() { + echo "::group::Velero state (drill failure)" + kubectl -n velero get backupstoragelocations,backups,restores -o wide || true + kubectl -n velero describe backup dr-drill || true + kubectl -n velero describe restore dr-drill || true + kubectl -n velero logs deploy/velero --tail=200 || true + echo "::endgroup::" + } + trap dump_velero_state ERR + + wait_phase() { + # wait_phase — poll a Velero CR + # until .status.phase is Completed; fail fast on a terminal + # failure phase instead of burning the whole timeout. 
+ local kind="$1" name="$2" timeout="$3" phase="" + local deadline=$((SECONDS + timeout)) + while [ "$SECONDS" -lt "$deadline" ]; do + phase=$(kubectl -n velero get "$kind" "$name" -o jsonpath='{.status.phase}' 2>/dev/null || true) + case "$phase" in + Completed) echo "$kind/$name: Completed"; return 0 ;; + Failed|PartiallyFailed|FailedValidation) + echo "::error::$kind/$name entered terminal phase $phase" + return 1 ;; + *) sleep 5 ;; + esac + done + echo "::error::$kind/$name did not complete within ${timeout}s (last phase: ${phase:-})" + return 1 + } + + echo "::group::Wait for BackupStorageLocation default to be Available" + kubectl -n velero wait backupstoragelocation/default \ + --for=jsonpath='{.status.phase}'=Available --timeout=10m + echo "::endgroup::" + + echo "::group::Create marker namespace + ConfigMap" + kubectl create namespace dr-drill + kubectl -n dr-drill create configmap dr-marker \ + --from-literal=run-id="${GITHUB_RUN_ID}" \ + --from-literal=sha="${GITHUB_SHA}" + echo "::endgroup::" + + echo "::group::Back up the marker namespace" + kubectl -n velero create -f - </dev/null 2>&1; then + echo "::error::namespace dr-drill still exists after deletion" + exit 1 + fi + echo "::endgroup::" + + echo "::group::Restore from the backup" + kubectl -n velero create -f - < Velero CRs are created with `kubectl` rather than the `velero` CLI so +> the drill needs no extra tool install and can never drift from the +> deployed Velero version. > **Why namespace deletion instead of full cluster rebuild?** MinIO runs > in-cluster with ephemeral storage, so destroying the cluster would also @@ -31,11 +37,13 @@ reach `prod`. ## Wall-clock budget -`timeout-minutes: 240` on the job — matches the **4 h RTO** documented -in [`runbook.md`](./runbook.md). In practice the drill runs in ~15 min. -The 4 h ceiling is the operator promise for the manual prod path; CI -keeps that promise honest by failing fast if the local round trip -explodes. 
+The drill itself is bounded: 10 min for the `BackupStorageLocation` to
+go `Available`, then 5 min each for the backup and the restore to reach
+`Completed` (terminal failure phases abort immediately). In practice the
+whole sequence takes ~2-3 minutes on top of the system test. The **4 h
+RTO** in [`runbook.md`](./runbook.md) is the operator promise for the
+manual prod path; CI keeps that promise honest by failing fast if the
+local round trip explodes.
 
 ## What this catches
 
diff --git a/docs/dr/runbook.md b/docs/dr/runbook.md
index f8b812db4..b4a754004 100644
--- a/docs/dr/runbook.md
+++ b/docs/dr/runbook.md
@@ -287,7 +287,10 @@ etcdctl --endpoints unix:///tmp/etcd.snapshot \
 # Kubernetes Secret YAML, the EncryptionConfiguration was lost.
 ```
 
-This check is also asserted by the CI restore drill (see [restore-drill.md](./restore-drill.md)).
+This check is deliberately **not** part of the CI restore drill — Talos
+verifies the encryption key at install time, so a CI assertion would add
+complexity for a structurally-enforced property (see
+[restore-drill.md](./restore-drill.md) for the full rationale).
 
 ---
 
From 65815e7feee7362c09a0f2ce251eb0a4af9fd123 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm
Date: Thu, 11 Jun 2026 19:43:59 +0200
Subject: [PATCH 2/2] fix(ci): fully qualify Velero resource names in the restore drill

The drill's first real run (after main's reconcile wedge was fixed)
exposed a resource-name collision: CNPG also defines a 'backups'
resource, and kubectl resolves an unqualified 'backup' to
backups.postgresql.cnpg.io on this cluster -- so wait_phase polled the
wrong API group for its entire 300s timeout while the actual Velero
backup ran unobserved ('backups.postgresql.cnpg.io "dr-drill" not
found').

Qualify every get/describe/wait in the drill with the velero.io group so
the resolution can never be ambiguous, and note the collision in a
comment.

Co-Authored-By: Claude Fable 5
---
 .github/workflows/ci.yaml | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e95b462f7..717a5cc46 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -91,11 +91,16 @@ jobs:
         run: |
           set -euo pipefail
 
+          # Resource names are fully qualified with the velero.io group
+          # throughout: CNPG also defines a `backups` resource, and kubectl
+          # resolves an unqualified `backup` to backups.postgresql.cnpg.io on
+          # this cluster — the drill's first run polled the wrong API group
+          # for its entire timeout while the actual Velero backup completed.
           dump_velero_state() {
             echo "::group::Velero state (drill failure)"
-            kubectl -n velero get backupstoragelocations,backups,restores -o wide || true
-            kubectl -n velero describe backup dr-drill || true
-            kubectl -n velero describe restore dr-drill || true
+            kubectl -n velero get backupstoragelocations.velero.io,backups.velero.io,restores.velero.io -o wide || true
+            kubectl -n velero describe backups.velero.io dr-drill || true
+            kubectl -n velero describe restores.velero.io dr-drill || true
             kubectl -n velero logs deploy/velero --tail=200 || true
             echo "::endgroup::"
           }
@@ -122,7 +127,7 @@ jobs:
           }
 
           echo "::group::Wait for BackupStorageLocation default to be Available"
-          kubectl -n velero wait backupstoragelocation/default \
+          kubectl -n velero wait backupstoragelocations.velero.io/default \
             --for=jsonpath='{.status.phase}'=Available --timeout=10m
           echo "::endgroup::"
 
@@ -146,7 +151,7 @@ jobs:
             storageLocation: default
             ttl: 1h0m0s
           EOF
-          wait_phase backup dr-drill 300
+          wait_phase backups.velero.io dr-drill 300
           echo "::endgroup::"
 
           echo "::group::Simulate data loss (delete the namespace)"
@@ -167,7 +172,7 @@ jobs:
           spec:
             backupName: dr-drill
           EOF
-          wait_phase restore dr-drill 300
+          wait_phase restores.velero.io dr-drill 300
           echo "::endgroup::"
 
           echo "::group::Verify restored marker"