diff --git a/.github/workflows/dr-rebuild.yaml b/.github/workflows/dr-rebuild.yaml new file mode 100644 index 000000000..b8a772836 --- /dev/null +++ b/.github/workflows/dr-rebuild.yaml @@ -0,0 +1,391 @@ +# DR — Rebuild Prod: the executable form of docs/dr/runbook.md Scenario 4 +# ("full cluster rebuild from zero"), runnable as one button press when the +# production cluster is gone. +# +# What it does, in order: +# 1. ksail cluster create — fresh Hetzner servers, Talos, CCM, CSI +# 2. workload push + reconcile — Flux converges the platform (fresh, empty +# OpenBao; SOPS-sourced secrets re-seed automatically) +# 3. (restore=true) Velero resource restore from the newest Completed +# backup synced from R2 +# 4. (restore=true) OpenBao data recovery: fetch the newest raft snapshot +# from the R2 openbao-snapshots/ mirror onto the vault-snapshots PVC, +# restore the pre-incident openbao-unseal Secret from the Velero backup, +# reset the fresh vault (scale down + delete data PVCs), and let the +# vault-config Job's automated snapshot-restore path bring the old vault +# back (docs/dr/openbao.md scenario 2/3) +# 5. Optionally refresh the KUBE_CONFIG / TALOS_CONFIG environment secrets +# (requires a DR_GH_ADMIN_TOKEN secret with environment-secrets write; +# without it the step prints the manual Scenario 9 instructions) +# +# Known limits (documented in the runbook): +# * Per-app PVC data (headlamp, actual-budget) is NOT rehydrated into +# already-running pods — Velero skips existing resources. Recover an app's +# data with the per-app reset dance (runbook Scenario 5). +# * CNPG databases (umami-db) recover from their own barman backups — +# see runbook Scenario 5 for the cnpg restore command. +# * DNS needs no manual step: external-dns (policy: sync) repoints the +# Cloudflare records at the new load balancer once HTTPRoutes are Ready. 
+name: DR - Rebuild Prod + +on: + workflow_dispatch: + inputs: + confirm: + description: 'Type REBUILD-PROD to confirm a from-zero rebuild of the production cluster' + required: true + type: string + restore: + description: 'Restore data after the rebuild (Velero resources + OpenBao raft snapshot)' + required: false + type: boolean + default: true + +permissions: {} + +# Shared with ci.yaml's merge-queue deploy and cd.yaml's tag deploy so a DR +# rebuild can never race a regular deploy against the prod cluster. +concurrency: + group: prod-deploy + cancel-in-progress: false + +jobs: + rebuild: + name: 🚑 Rebuild prod from zero + runs-on: ubuntu-latest + environment: prod + permissions: + contents: read # checkout repository + packages: write # push OCI artifacts to GHCR + steps: + - name: 🛑 Verify confirmation phrase + env: + CONFIRM: ${{ inputs.confirm }} + run: | + if [ "${CONFIRM}" != "REBUILD-PROD" ]; then + echo "::error::Confirmation phrase mismatch — type REBUILD-PROD (exactly) to run this workflow." + exit 1 + fi + echo "Confirmation accepted. Rebuilding the production cluster from zero." + + - name: 📑 Checkout + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + + - name: ⚙️ Setup KSail + # Same install path as ci.yaml's deploy-prod job; renovate keeps the + # pin current in both places. 
+ env: + # renovate: datasource=github-releases depName=devantler-tech/ksail extractVersion=^v(?<version>.+)$ + KSAIL_VERSION: "7.54.0" + run: | + curl -fsSL "https://github.com/devantler-tech/ksail/releases/download/v${KSAIL_VERSION}/ksail_${KSAIL_VERSION}_linux_amd64.tar.gz" -o /tmp/ksail.tar.gz + tar -xzf /tmp/ksail.tar.gz -C /tmp + sudo install /tmp/ksail /usr/local/bin/ksail + ksail --version + + - name: 🔐 Create SOPS Age key + env: + SOPS_AGE_KEY: ${{ secrets.SOPS_AGE_KEY }} + run: | + # Tighten the umask BEFORE creating anything, so the key directory + # itself (not just keys.txt) is created private to the runner user. + umask 077 + mkdir -p ~/.config/sops/age + echo "${SOPS_AGE_KEY}" > ~/.config/sops/age/keys.txt + chmod 600 ~/.config/sops/age/keys.txt + + - name: 🏗️ Create cluster + # From-zero provisioning: Hetzner servers, Talos boot, CCM, CSI. Writes + # a fresh kubeconfig (~/.kube/config, context admin@prod) and + # talosconfig (~/.talos/config) on this runner — every later step uses + # those, so this workflow does NOT depend on the (now stale) + # KUBE_CONFIG / TALOS_CONFIG environment secrets. + run: ksail --config ksail.prod.yaml cluster create + env: + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} + HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} + + - name: 📦 Push manifests to GHCR + run: ksail --config ksail.prod.yaml workload push + env: + GITHUB_ACTOR: ${{ github.actor }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} + HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} + + - name: 🔁 Trigger Flux reconciliation + run: ksail --config ksail.prod.yaml workload reconcile + env: + GITHUB_ACTOR: ${{ github.actor }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} + HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} + + - name: ⏳ Wait for Flux to settle + run: | + for k in bootstrap infrastructure-controllers infrastructure apps; do + echo "Waiting for Kustomization ${k}..." + kubectl -n flux-system wait "kustomization/${k}" \ + --for=condition=Ready --timeout=20m + done + echo "✅ All Flux Kustomizations Ready — fresh platform converged."
+ + - name: 💾 Velero resource restore (newest Completed backup) + if: ${{ inputs.restore }} + run: | + set -euo pipefail + echo "Waiting for the BackupStorageLocation to be Available (Velero syncs old backups from R2)..." + kubectl -n velero wait backupstoragelocation/default \ + --for=jsonpath='{.status.phase}'=Available --timeout=15m + + # Backup CRs are synced from the R2 bucket by Velero's backup-sync + # controller; give it a moment to populate after the BSL goes ready. + # "Newest" is decided by status.startTimestamp (when the backup ran), + # NOT metadata.creationTimestamp: on a rebuilt cluster every Backup CR + # was just re-created by the sync, so creationTimestamp is the sync + # time and says nothing about the backup's age. + BACKUP="" + for i in $(seq 1 30); do + BACKUP=$(kubectl -n velero get backups -o json | jq -r \ + '[.items[] | select(.status.phase=="Completed")] | sort_by(.status.startTimestamp) | last | .metadata.name // empty') + [ -n "${BACKUP}" ] && break + echo " ...no Completed backups synced yet (${i}/30)" + sleep 10 + done + if [ -z "${BACKUP}" ]; then + echo "::error::No Completed Velero backup found in the BSL — cannot restore. The rebuilt (empty) platform is still up." + exit 1 + fi + echo "Restoring from backup: ${BACKUP}" + echo "backup_name=${BACKUP}" >> "${GITHUB_ENV}" + + kubectl -n velero create -f - </dev/null || true) + case "${phase}" in + Completed) break ;; + Failed|PartiallyFailed) break ;; + *) sleep 10 ;; + esac + done + echo "Restore phase: ${phase:-}" + # PartiallyFailed is expected here: most resources already exist on + # the freshly-converged cluster and are skipped/conflicted. The + # restore's job is to bring back resources Flux does NOT own (the + # old openbao-unseal comes back separately below). + if [ "${phase}" != "Completed" ] && [ "${phase}" != "PartiallyFailed" ]; then + kubectl -n velero describe restore "dr-rebuild-${GITHUB_RUN_ID}" || true + echo "::error::Velero restore did not finish (phase: ${phase:-})."
+ exit 1 + fi + + - name: 🔐 Restore OpenBao from the R2 snapshot mirror + if: ${{ inputs.restore }} + run: | + set -euo pipefail + + echo "::group::Fetch the newest raft snapshot from R2 onto the vault-snapshots PVC" + # The fresh vault has already re-seeded the R2 credentials from SOPS, + # so the vault-snapshot-r2 Secret (vault-backup/external-secret.yaml) + # is the in-cluster source — no credentials leave the cluster. + kubectl -n openbao wait externalsecret/vault-snapshot-r2 \ + --for=condition=Ready --timeout=15m + + R2_ENDPOINT=$(kubectl -n flux-system get configmap variables-cluster -o jsonpath='{.data.r2_endpoint}' 2>/dev/null || true) + [ -n "${R2_ENDPOINT}" ] || R2_ENDPOINT=$(kubectl -n flux-system get configmap variables-base -o jsonpath='{.data.r2_endpoint}') + R2_BUCKET=$(kubectl -n flux-system get configmap variables-cluster -o jsonpath='{.data.r2_bucket}' 2>/dev/null || true) + [ -n "${R2_BUCKET}" ] || R2_BUCKET=$(kubectl -n flux-system get configmap variables-base -o jsonpath='{.data.r2_bucket}') + kubectl -n openbao create configmap dr-r2-target \ + --from-literal=endpoint="${R2_ENDPOINT}" \ + --from-literal=bucket="${R2_BUCKET}" \ + --dry-run=client -o yaml | kubectl apply -f - + + kubectl -n openbao delete pod dr-snapshot-fetch --ignore-not-found + # Labelled app: vault-snapshot so the pod reuses the CronJob's + # CiliumNetworkPolicy (egress to R2/MinIO + DNS). 
+ kubectl -n openbao apply -f - <<'POD' + apiVersion: v1 + kind: Pod + metadata: + name: dr-snapshot-fetch + namespace: openbao + labels: + app: vault-snapshot + spec: + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 100 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + volumes: + - name: snapshots + persistentVolumeClaim: + claimName: vault-snapshots + - name: r2-credentials + secret: + secretName: vault-snapshot-r2 + - name: mc-config + emptyDir: {} + containers: + - name: fetch + image: quay.io/minio/mc:RELEASE.2025-04-08T15-39-49Z@sha256:7e3efb09c22c0882fbf341b9d99f61f94ae6c4c20a06f2f1a2b20ea8993d8952 + securityContext: + runAsNonRoot: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + env: + - name: MC_CONFIG_DIR + value: /tmp/.mc + - name: R2_ENDPOINT + valueFrom: + configMapKeyRef: + name: dr-r2-target + key: endpoint + - name: R2_BUCKET + valueFrom: + configMapKeyRef: + name: dr-r2-target + key: bucket + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + memory: 128Mi + volumeMounts: + - name: snapshots + mountPath: /snapshots + - name: r2-credentials + mountPath: /r2 + readOnly: true + - name: mc-config + mountPath: /tmp/.mc + command: + - /bin/sh + - -ec + - | + mc alias set backup "$R2_ENDPOINT" \ + "$(cat /r2/access_key_id)" "$(cat /r2/secret_access_key)" + NEWEST=$(mc ls "backup/$R2_BUCKET/openbao-snapshots/" \ + | awk '{print $NF}' | grep '\.snap$' | sort | tail -n 1) + if [ -z "$NEWEST" ]; then + echo "ERROR: no snapshots found in openbao-snapshots/ — nothing to restore." + exit 1 + fi + echo "Fetching $NEWEST..." + mc cp "backup/$R2_BUCKET/openbao-snapshots/$NEWEST" "/snapshots/$NEWEST" + ls -l /snapshots/ + POD + if ! kubectl -n openbao wait pod/dr-snapshot-fetch \ + --for=jsonpath='{.status.phase}'=Succeeded --timeout=10m; then + kubectl -n openbao logs pod/dr-snapshot-fetch || true + echo "::error::Snapshot fetch from R2 failed." 
+ exit 1 + fi + kubectl -n openbao logs pod/dr-snapshot-fetch + kubectl -n openbao delete pod dr-snapshot-fetch --ignore-not-found + kubectl -n openbao delete configmap dr-r2-target --ignore-not-found + echo "::endgroup::" + + echo "::group::Restore the pre-incident openbao-unseal Secret from the Velero backup" + # Suspend the HelmRelease first so drift detection cannot fight the + # scale-down below. + kubectl -n openbao patch helmrelease openbao --type merge -p '{"spec":{"suspend":true}}' + kubectl -n openbao scale statefulset openbao --replicas=0 + kubectl -n openbao wait pod -l app.kubernetes.io/name=openbao \ + --for=delete --timeout=5m || true + # The fresh vault's keys are useless; deleting them lets the Velero + # restore bring back the pair that matches the snapshot. + kubectl -n openbao delete secret openbao-unseal --ignore-not-found + kubectl -n velero create -f - </dev/null || true) + case "${phase}" in + Completed|PartiallyFailed) break ;; + Failed) break ;; + *) sleep 5 ;; + esac + done + if ! kubectl -n openbao get secret openbao-unseal >/dev/null 2>&1; then + echo "::error::openbao-unseal was not restored (restore phase: ${phase:-}) — aborting before touching the data PVCs." + exit 1 + fi + echo "::endgroup::" + + echo "::group::Reset the fresh vault and trigger the automated snapshot restore" + # Empty data volumes + surviving keys + available snapshot = the + # vault-config Job's automated restore path (docs/dr/openbao.md). + kubectl -n openbao get pvc -o name | grep '/data-openbao-' \ + | xargs -r kubectl -n openbao delete + kubectl -n openbao patch helmrelease openbao --type merge -p '{"spec":{"suspend":false}}' + # Force the (already-completed) vault-config Job to re-run: delete it + # and let the infrastructure Kustomization re-apply it. 
+ kubectl -n openbao delete job vault-config --ignore-not-found + kubectl -n flux-system annotate kustomization infrastructure \ + "reconcile.fluxcd.io/requestedAt=$(date -u +%Y-%m-%dT%H:%M:%SZ)" --overwrite + for i in $(seq 1 90); do + kubectl -n openbao get job vault-config >/dev/null 2>&1 && break + echo " ...waiting for Flux to recreate the vault-config Job (${i}/90)" + sleep 10 + done + kubectl -n openbao wait job/vault-config --for=condition=Complete --timeout=30m + echo "::endgroup::" + + echo "::group::Force ExternalSecrets + PushSecrets to resync against the restored vault" + NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ) + kubectl annotate externalsecrets --all -A "force-sync=${NOW}" --overwrite || true + kubectl annotate pushsecrets --all -A "force-sync=${NOW}" --overwrite || true + echo "::endgroup::" + echo "✅ OpenBao restored from the raft snapshot mirror." + + - name: 🔑 Refresh CI deploy credentials (KUBE_CONFIG / TALOS_CONFIG) + # Runs even when an earlier step failed: once the cluster has been + # created, the only copies of its kubeconfig / talosconfig live on this + # runner, so a failed restore must not also discard the credentials of + # the (still running) rebuilt cluster. + if: always() + env: + DR_GH_ADMIN_TOKEN: ${{ secrets.DR_GH_ADMIN_TOKEN }} + run: | + # Nothing to publish if the run never got as far as writing fresh + # configs (e.g. the confirmation phrase was rejected). + if [ ! -s ~/.kube/config ] || [ ! -s ~/.talos/config ]; then + echo "No fresh kubeconfig / talosconfig on this runner — skipping the credential refresh." + exit 0 + fi + if [ -z "${DR_GH_ADMIN_TOKEN}" ]; then + echo "::warning::DR_GH_ADMIN_TOKEN is not configured — the prod environment's KUBE_CONFIG / TALOS_CONFIG secrets are now STALE. Refresh them manually per docs/dr/runbook.md Scenario 9, or the next deploy will fail its reachability preflight." + exit 0 + fi + export GH_TOKEN="${DR_GH_ADMIN_TOKEN}" + gh secret set KUBE_CONFIG --env prod --repo "${GITHUB_REPOSITORY}" < ~/.kube/config + gh secret set TALOS_CONFIG --env prod --repo "${GITHUB_REPOSITORY}" < ~/.talos/config + echo "✅ prod environment secrets refreshed from this rebuild's fresh configs."
+ + - name: 📋 Post-rebuild summary + if: always() + run: | + set +e + IP=$(kubectl -n kube-system get svc cilium-gateway-platform -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) + echo "## DR rebuild summary" + echo "- Load balancer IP: ${IP:-}" + echo "- DNS: external-dns (policy: sync) repoints the Cloudflare records automatically once HTTPRoutes are Ready — verify with: kubectl -n external-dns logs deploy/external-dns" + echo "- CNPG databases (umami-db): recover from barman/R2 per docs/dr/runbook.md Scenario 5 (kubectl cnpg restore)." + echo "- Per-app PVC data (headlamp, actual-budget): already-running pods keep their fresh volumes; use the per-app reset dance from runbook Scenario 5 if old data is needed." + kubectl get kustomizations.kustomize.toolkit.fluxcd.io -A -o wide + kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null | head -30 diff --git a/docs/dr/runbook.md b/docs/dr/runbook.md index f8b812db4..9743d4ae7 100644 --- a/docs/dr/runbook.md +++ b/docs/dr/runbook.md @@ -20,7 +20,7 @@ these simultaneously and you cannot recover. 
| Artifact | Where it lives | Recovery if lost | | --------------------------------------- | ------------------------------------ | ------------------------------------ | | **SOPS Age private keys** (one per env) | Secure vault + offline backup | Re-encrypt all `*.enc.yaml` (below) | -| **OpenBao unseal key + root token** | `openbao-unseal` Secret (Velero-backed) + operator vault | Restore the `openbao-unseal` Secret + OpenBao PVC from the most recent Velero snapshot ([openbao.md](openbao.md) scenario 3); only if every copy is gone, re-initialize OpenBao and re-seed KV — existing encrypted data is then unrecoverable | +| **OpenBao unseal key + root token** | `openbao-unseal` Secret (Velero-backed) + operator vault | Restore the `openbao-unseal` Secret from the most recent Velero backup; the paired raft snapshot lives on the `vault-snapshots` PVC and in the R2 `openbao-snapshots/` mirror, and the `vault-config` Job restores it automatically ([openbao.md](openbao.md) scenarios 2-3); only if every copy is gone, re-initialize OpenBao and re-seed KV — existing encrypted data is then unrecoverable | | **Cloudflare R2 access keys** | Secure vault | Mint new in Cloudflare; SOPS-update | | **Hetzner Cloud API token** | Secure vault | Mint new in Hetzner Cloud console | | **Cloudflare API token** | Secure vault | Mint new in Cloudflare dashboard | @@ -103,6 +103,16 @@ plane is a cattle resource that ksail can re-provision in < 15 min. The "everything is gone" path. ~10 min of Hetzner provisioning + ~15 min of Flux reconciliation. +> **One-button path:** run the **`DR - Rebuild Prod`** workflow +> (`.github/workflows/dr-rebuild.yaml`, `workflow_dispatch`, confirmation +> phrase `REBUILD-PROD`). 
It executes every step below from the CI runner — +> cluster create, Flux convergence, the Velero resource restore, and the +> OpenBao raft-snapshot recovery ([openbao.md](openbao.md) scenario 3) — and +> needs none of the (stale-after-rebuild) `KUBE_CONFIG`/`TALOS_CONFIG` +> secrets, because `ksail cluster create` writes fresh configs on the runner. +> The manual procedure below is the fallback when GitHub Actions itself is +> unavailable. + ```bash # 1. Set credentials locally export HCLOUD_TOKEN= @@ -120,10 +130,15 @@ ksail --config ksail.prod.yaml workload reconcile # Flux pulls and applies flux get kustomizations -A # Re-run if any are NotReady; expect convergence in 10-15 minutes -# 5. Point public DNS at the new Hetzner Cloud Load Balancer +# 5. DNS — normally NO manual step: external-dns (hetzner overlay, +# policy: sync, gateway-httproute source) repoints the Cloudflare +# records at the new load balancer automatically once the HTTPRoutes +# are Ready and its Cloudflare token has re-synced from the vault. +# Verify, and only intervene if external-dns itself is broken: +kubectl -n external-dns logs deploy/external-dns | tail -20 kubectl -n kube-system get svc cilium-gateway-platform \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' -# Update A/AAAA records for ${domain} and *.${domain} at your DNS provider. +# Fallback only: update A/AAAA records for ${domain} at your DNS provider. # 6. Restore Velero backups (apps + PVCs) kubectl -n velero create -f - < The `DR - Rebuild Prod` workflow refreshes both secrets automatically at the +> end of a rebuild **if** a `DR_GH_ADMIN_TOKEN` secret (a fine-grained PAT +> with environment-secrets write on this repo) is configured; without it the +> workflow prints a warning and the manual procedure below applies. + The prod deploy pipeline (`.github/workflows/cd.yaml` on a `v*` tag, and the merge-queue `deploy-prod` job in `ci.yaml`) authenticates to the cluster with two GitHub `prod` environment secrets: