From 31aacc60f5c62e5a5ed114e6dd709160c45f77bd Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Thu, 11 Jun 2026 00:20:48 +0200
Subject: [PATCH 1/2] ci: implement the documented Velero restore drill in the
 system-test job

docs/dr/restore-drill.md has documented a CI restore drill since it was
written, but no workflow ever implemented it -- the backup -> data-loss ->
restore path was never regression-tested, so a Velero chart bump, RBAC
drift or MinIO credential break would only surface during a real
disaster (the worst possible time, as the 2026-06-10 vault incident
demonstrated for the adjacent snapshot path).

Implement the drill as steps inside the existing system-test job,
reusing the Talos+Docker cluster it just reconciled (a separate job
would pay a second 10-minute cluster bootstrap for no extra signal):

1. wait for BackupStorageLocation/default to be Available (Velero ->
   in-cluster MinIO, the local R2 stand-in)
2. create a dr-drill namespace + marker ConfigMap carrying run-id/sha
3. Backup CR scoped to the namespace, wait for Completed (fail fast on
   Failed/PartiallyFailed/FailedValidation)
4. delete the namespace and assert it is gone
5. Restore CR from the backup, wait for Completed
6. assert the restored ConfigMap's run-id matches GITHUB_RUN_ID

Velero CRs are created with kubectl (no velero CLI install, no version
drift). On any drill failure the step dumps BSL/Backup/Restore state and
the Velero server log before exiting.

Also truth up the docs: restore-drill.md described a standalone job with
its own cluster and a timeout-minutes: 240 budget that never existed;
runbook.md claimed the drill asserts etcd encryption-at-rest, which
restore-drill.md itself explicitly scopes out.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/workflows/ci.yaml | 97 +++++++++++++++++++++++++++++++++++++++
 docs/dr/restore-drill.md  | 42 ++++++++++-------
 docs/dr/runbook.md        |  5 +-
 3 files changed, 126 insertions(+), 18 deletions(-)
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 2443b98a2..1db7eb7ee 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -82,6 +82,103 @@ jobs:
           reconcile: "true"
           delete: "false"
 
+      - name: 💾 DR restore drill (Velero backup → delete → restore)
+        # Validates the full backup → data-loss → restore cycle against the
+        # in-cluster MinIO (the local R2 stand-in) on every k8s PR, so the
+        # Velero code path is regression-tested before changes reach prod.
+        # Reuses the cluster the System Test step just reconciled; adds ~2-3
+        # minutes. See docs/dr/restore-drill.md for design + manual run.
+        run: |
+          set -euo pipefail
+
+          dump_velero_state() {
+            echo "::group::Velero state (drill failure)"
+            kubectl -n velero get backupstoragelocations,backups,restores -o wide || true
+            kubectl -n velero describe backup dr-drill || true
+            kubectl -n velero describe restore dr-drill || true
+            kubectl -n velero logs deploy/velero --tail=200 || true
+            echo "::endgroup::"
+          }
+          trap 'rc=$?; [ "$rc" -eq 0 ] || dump_velero_state' EXIT  # EXIT, not ERR: the explicit "exit 1" paths never raise ERR
+
+          wait_phase() {
+            # wait_phase <kind> <name> <timeout-seconds> — poll a Velero CR every
+            # 5s until .status.phase is Completed (return 0); return 1 at once on
+            # a terminal failure phase, or once the timeout has elapsed.
+            local kind="$1" name="$2" timeout="$3" phase=""
+            local deadline=$((SECONDS + timeout))  # SECONDS: bash's elapsed-seconds counter
+            while [ "$SECONDS" -lt "$deadline" ]; do
+              phase=$(kubectl -n velero get "$kind" "$name" -o jsonpath='{.status.phase}' 2>/dev/null || true)
+              case "$phase" in  # NOTE(review): empty if the get failed; stderr is discarded above
+                Completed) echo "$kind/$name: Completed"; return 0 ;;
+                Failed|PartiallyFailed|FailedValidation)
+                  echo "::error::$kind/$name entered terminal phase $phase"
+                  return 1 ;;
+                *) sleep 5 ;;  # any other (or empty) phase: poll again
+              esac
+            done
+            echo "::error::$kind/$name did not complete within ${timeout}s (last phase: ${phase:-<none>})"
+            return 1
+          }
+
+          echo "::group::Wait for BackupStorageLocation default to be Available"
+          kubectl -n velero wait backupstoragelocation/default \
+            --for=jsonpath='{.status.phase}'=Available --timeout=10m
+          echo "::endgroup::"
+
+          echo "::group::Create marker namespace + ConfigMap"
+          kubectl create namespace dr-drill
+          kubectl -n dr-drill create configmap dr-marker \
+            --from-literal=run-id="${GITHUB_RUN_ID}" \
+            --from-literal=sha="${GITHUB_SHA}"
+          echo "::endgroup::"
+
+          echo "::group::Back up the marker namespace"
+          kubectl -n velero create -f - <<EOF
+          apiVersion: velero.io/v1
+          kind: Backup
+          metadata:
+            name: dr-drill
+            namespace: velero
+          spec:
+            includedNamespaces:
+              - dr-drill
+            storageLocation: default
+            ttl: 1h0m0s
+          EOF
+          wait_phase backup dr-drill 300
+          echo "::endgroup::"
+
+          echo "::group::Simulate data loss (delete the namespace)"
+          kubectl delete namespace dr-drill --wait=true --timeout=2m
+          if kubectl get namespace dr-drill >/dev/null 2>&1; then
+            echo "::error::namespace dr-drill still exists after deletion"
+            exit 1
+          fi
+          echo "::endgroup::"
+
+          echo "::group::Restore from the backup"
+          kubectl -n velero create -f - <<EOF
+          apiVersion: velero.io/v1
+          kind: Restore
+          metadata:
+            name: dr-drill
+            namespace: velero
+          spec:
+            backupName: dr-drill
+          EOF
+          wait_phase restore dr-drill 300
+          echo "::endgroup::"
+
+          echo "::group::Verify restored marker"
+          restored=$(kubectl -n dr-drill get configmap dr-marker -o jsonpath='{.data.run-id}')
+          if [ "${restored}" != "${GITHUB_RUN_ID}" ]; then
+            echo "::error::restored run-id '${restored}' does not match expected '${GITHUB_RUN_ID}'"
+            exit 1
+          fi
+          echo "✅ Restore drill passed: marker ConfigMap restored with matching run-id."
+          echo "::endgroup::"
+
       - name: 🩺 Diagnose Flux on failure
         if: failure()
         run: |
diff --git a/docs/dr/restore-drill.md b/docs/dr/restore-drill.md
index 5230d718e..ddb026c68 100644
--- a/docs/dr/restore-drill.md
+++ b/docs/dr/restore-drill.md
@@ -1,27 +1,33 @@
 # DR restore drill (CI)
 
-`.github/workflows/ci.yaml` runs a `restore-drill` job on every PR that
-touches `k8s/**` or the cluster configs. The job validates the full
-backup → data-loss → restore cycle end-to-end on a local Talos+Docker
-cluster, so the Velero code path is regression-tested **before** changes
-reach `prod`.
+`.github/workflows/ci.yaml` runs restore-drill steps inside the
+`system-test` job on every PR that touches `k8s/**` or the cluster
+configs. The drill validates the full backup → data-loss → restore cycle
+end-to-end on the local Talos+Docker cluster the job just reconciled, so
+the Velero code path is regression-tested **before** changes reach
+`prod`. (Reusing the system-test cluster instead of creating a second
+one keeps the added wall-clock to ~2-3 minutes.)
 
 ## What it does
 
-1. `ksail cluster create` and reconcile all workloads.
-2. Wait for **Velero** + **MinIO** (the local R2 stand-in) to be ready
-   and `BackupStorageLocation/default` `Available`.
+1. Reuse the cluster the `system-test` job created and reconciled.
+2. Wait for `BackupStorageLocation/default` to report `Available`
+   (Velero validates against **MinIO**, the local R2 stand-in).
 3. Create a marker `Namespace`/`ConfigMap` carrying the GitHub
    `run-id` and `sha` (so identity can be proved later).
-4. `velero backup create` against the marker namespace, `--wait` for
-   `Completed`.
+4. Create a `Backup` CR scoped to the marker namespace and wait for phase
+   `Completed` (failing fast on `Failed`/`PartiallyFailed`/`FailedValidation`).
 5. **Simulate data loss**: delete the marker namespace (`kubectl delete
    namespace`).
 6. Assert the marker namespace does **not** exist after deletion.
-7. `velero restore create --from-backup ... --wait` for `Completed`.
+7. Create a `Restore` CR from the backup and wait for `Completed`.
 8. Assert the marker `ConfigMap` is back and `data.run-id` matches the
    current `GITHUB_RUN_ID`.
-9. Tear down the cluster (`if: always()`).
+9. The job tears down the cluster (`if: always()`) as usual.
+
+> Velero CRs are created with `kubectl` rather than the `velero` CLI so
+> the drill needs no extra tool install and can never drift from the
+> deployed Velero version.
 
 > **Why namespace deletion instead of full cluster rebuild?** MinIO runs
 > in-cluster with ephemeral storage, so destroying the cluster would also
@@ -31,11 +37,13 @@ reach `prod`.
 
 ## Wall-clock budget
 
-`timeout-minutes: 240` on the job — matches the **4 h RTO** documented
-in [`runbook.md`](./runbook.md). In practice the drill runs in ~15 min.
-The 4 h ceiling is the operator promise for the manual prod path; CI
-keeps that promise honest by failing fast if the local round trip
-explodes.
+The drill itself is bounded: 10 min for the `BackupStorageLocation` to
+go `Available`, 5 min each for the backup and the restore to reach
+`Completed` (terminal failure phases abort immediately), and 2 min for
+the namespace deletion. In practice the whole sequence takes ~2-3 min on
+top of the system test. The **4 h RTO** in [`runbook.md`](./runbook.md)
+is the operator promise for the manual prod path; CI keeps that promise
+honest by failing fast if the local round trip fails or stalls.
 
 ## What this catches
 
diff --git a/docs/dr/runbook.md b/docs/dr/runbook.md
index f8b812db4..b4a754004 100644
--- a/docs/dr/runbook.md
+++ b/docs/dr/runbook.md
@@ -287,7 +287,10 @@ etcdctl --endpoints unix:///tmp/etcd.snapshot \
 # Kubernetes Secret YAML, the EncryptionConfiguration was lost.
 ```
 
-This check is also asserted by the CI restore drill (see [restore-drill.md](./restore-drill.md)).
+This check is deliberately **not** part of the CI restore drill — Talos
+verifies the encryption key at install time, so a CI assertion would add
+complexity for a structurally-enforced property (see
+[restore-drill.md](./restore-drill.md) for the full rationale).
 
 ---
 

From 65815e7feee7362c09a0f2ce251eb0a4af9fd123 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Thu, 11 Jun 2026 19:43:59 +0200
Subject: [PATCH 2/2] fix(ci): fully qualify Velero resource names in the
 restore drill

The drill's first real run (after main's reconcile wedge was fixed)
exposed a resource-name collision: CNPG also defines a 'backups'
resource, and kubectl resolves an unqualified 'backup' to
backups.postgresql.cnpg.io on this cluster -- so wait_phase polled the
wrong API group for its entire 300s timeout while the actual Velero
backup ran unobserved ('backups.postgresql.cnpg.io "dr-drill" not
found'). Qualify every get/describe/wait in the drill with the
velero.io group so the resolution can never be ambiguous, and note the
collision in a comment.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/workflows/ci.yaml | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e95b462f7..717a5cc46 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -91,11 +91,16 @@ jobs:
         run: |
           set -euo pipefail
 
+          # Resource names are fully qualified with the velero.io group
+          # throughout: CNPG also defines a `backups` resource, and kubectl
+          # resolves an unqualified `backup` to backups.postgresql.cnpg.io on
+          # this cluster — the drill's first run polled the wrong API group
+          # for its entire timeout while the actual Velero backup completed.
           dump_velero_state() {
             echo "::group::Velero state (drill failure)"
-            kubectl -n velero get backupstoragelocations,backups,restores -o wide || true
-            kubectl -n velero describe backup dr-drill || true
-            kubectl -n velero describe restore dr-drill || true
+            kubectl -n velero get backupstoragelocations.velero.io,backups.velero.io,restores.velero.io -o wide || true
+            kubectl -n velero describe backups.velero.io dr-drill || true
+            kubectl -n velero describe restores.velero.io dr-drill || true
             kubectl -n velero logs deploy/velero --tail=200 || true
             echo "::endgroup::"
           }
@@ -122,7 +127,7 @@ jobs:
           }
 
           echo "::group::Wait for BackupStorageLocation default to be Available"
-          kubectl -n velero wait backupstoragelocation/default \
+          kubectl -n velero wait backupstoragelocations.velero.io/default \
             --for=jsonpath='{.status.phase}'=Available --timeout=10m
           echo "::endgroup::"
 
@@ -146,7 +151,7 @@ jobs:
             storageLocation: default
             ttl: 1h0m0s
           EOF
-          wait_phase backup dr-drill 300
+          wait_phase backups.velero.io dr-drill 300
           echo "::endgroup::"
 
           echo "::group::Simulate data loss (delete the namespace)"
@@ -167,7 +172,7 @@ jobs:
           spec:
             backupName: dr-drill
           EOF
-          wait_phase restore dr-drill 300
+          wait_phase restores.velero.io dr-drill 300
           echo "::endgroup::"
 
           echo "::group::Verify restored marker"