Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
391 changes: 391 additions & 0 deletions .github/workflows/dr-rebuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,391 @@
# DR — Rebuild Prod: the executable form of docs/dr/runbook.md Scenario 4
# ("full cluster rebuild from zero"), runnable as one button press when the
# production cluster is gone.
#
# What it does, in order:
#   1. ksail cluster create — fresh Hetzner servers, Talos, CCM, CSI
#   2. workload push + reconcile — Flux converges the platform (fresh, empty
#      OpenBao; SOPS-sourced secrets re-seed automatically)
#   3. (restore=true) Velero resource restore from the newest Completed
#      backup synced from R2
#   4. (restore=true) OpenBao data recovery: fetch the newest raft snapshot
#      from the R2 openbao-snapshots/ mirror onto the vault-snapshots PVC,
#      restore the pre-incident openbao-unseal Secret from the Velero backup,
#      reset the fresh vault (scale down + delete data PVCs), and let the
#      vault-config Job's automated snapshot-restore path bring the old vault
#      back (docs/dr/openbao.md scenario 2/3)
#   5. Optionally refresh the KUBE_CONFIG / TALOS_CONFIG environment secrets
#      (requires a DR_GH_ADMIN_TOKEN secret with environment-secrets write;
#      without it the step prints the manual Scenario 9 instructions)
#
# Known limits (documented in the runbook):
#   * Per-app PVC data (headlamp, actual-budget) is NOT rehydrated into
#     already-running pods — Velero skips existing resources. Recover an app's
#     data with the per-app reset dance (runbook Scenario 5).
#   * CNPG databases (umami-db) recover from their own barman backups —
#     see runbook Scenario 5 for the cnpg restore command.
#   * DNS needs no manual step: external-dns (policy: sync) repoints the
#     Cloudflare records at the new load balancer once HTTPRoutes are Ready.
name: DR - Rebuild Prod

# Manual trigger only. The operator must type the confirmation phrase and may
# opt out of the data-restore phase.
on:
  workflow_dispatch:
    inputs:
      confirm:
        type: string
        required: true
        description: "Type REBUILD-PROD to confirm a from-zero rebuild of the production cluster"
      restore:
        type: boolean
        required: false
        default: true
        description: "Restore data after the rebuild (Velero resources + OpenBao raft snapshot)"

# Shared with ci.yaml's merge-queue deploy and cd.yaml's tag deploy so a DR
# rebuild can never race a regular deploy against the prod cluster.
concurrency:
  group: prod-deploy
  cancel-in-progress: false

# Nothing is granted at workflow level; the job below declares its own scopes.
permissions: {}

jobs:
  rebuild:
    name: πŸš‘ Rebuild prod from zero
    environment: prod
    runs-on: ubuntu-latest
    permissions:
      packages: write # push OCI artifacts to GHCR
      contents: read # checkout repository
    steps:
- name: πŸ›‘ Verify confirmation phrase
  # Safety gate: nothing destructive runs unless the dispatch input matches
  # the phrase exactly. The input is passed through env rather than being
  # interpolated into the script text.
  env:
    CONFIRM: ${{ inputs.confirm }}
  run: |
    if [ "${CONFIRM}" = "REBUILD-PROD" ]; then
      echo "Confirmation accepted. Rebuilding the production cluster from zero."
      exit 0
    fi
    echo "::error::Confirmation phrase mismatch β€” type REBUILD-PROD (exactly) to run this workflow."
    exit 1

- name: πŸ“‘ Checkout
  # Pinned to a full commit SHA; the trailing comment records the release tag.
  uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
  with:
    # Do not leave the checkout token in the local git config for later steps.
    persist-credentials: false

- name: βš™οΈ Setup KSail
  # Same install path as ci.yaml's deploy-prod job; renovate keeps the
  # pin current in both places.
  env:
    # renovate: datasource=github-releases depName=devantler-tech/ksail extractVersion=^v(?<version>.+)$
    KSAIL_VERSION: "7.54.0"
  run: |
    set -euo pipefail
    # Fetch the release archive from github.com — the same GitHub release the
    # renovate hint above tracks. This binary is installed with sudo and later
    # runs with the prod Hetzner/GHCR secrets in its environment, so it must
    # not come from a third-party mirror host.
    curl -fsSL "https://github.com/devantler-tech/ksail/releases/download/v${KSAIL_VERSION}/ksail_${KSAIL_VERSION}_linux_amd64.tar.gz" -o /tmp/ksail.tar.gz
    tar -xzf /tmp/ksail.tar.gz -C /tmp
    sudo install /tmp/ksail /usr/local/bin/ksail
    # Smoke-test that the installed binary runs.
    ksail --version

- name: πŸ” Create SOPS Age key
  # Writes the Age private key from the environment secret to the path used
  # below, readable by the runner user only.
  env:
    SOPS_AGE_KEY: ${{ secrets.SOPS_AGE_KEY }}
  run: |
    age_dir="${HOME}/.config/sops/age"
    mkdir -p "${age_dir}"
    # Restrict permissions before the key file is created.
    umask 077
    echo "${SOPS_AGE_KEY}" > "${age_dir}/keys.txt"
    chmod 600 "${age_dir}/keys.txt"

- name: πŸ—οΈ Create cluster
  # Provisions everything from nothing: Hetzner servers, Talos boot, CCM and
  # CSI. As a side effect it writes a new kubeconfig (~/.kube/config, context
  # admin@prod) and talosconfig (~/.talos/config) onto this runner. All later
  # steps talk to the cluster through those files, which is why this workflow
  # does NOT rely on the (by now stale) KUBE_CONFIG / TALOS_CONFIG
  # environment secrets.
  env:
    HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
    GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
  run: ksail --config ksail.prod.yaml cluster create

- name: πŸ“¦ Push manifests to GHCR
  # Runs `ksail workload push` against the prod config with the registry
  # credentials in the environment.
  env:
    HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
    GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
    GITHUB_ACTOR: ${{ github.actor }}
  run: ksail --config ksail.prod.yaml workload push

- name: πŸ” Trigger Flux reconciliation
  # Runs `ksail workload reconcile` so Flux picks up the artifact pushed by
  # the previous step.
  env:
    HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
    GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
    GITHUB_ACTOR: ${{ github.actor }}
  run: ksail --config ksail.prod.yaml workload reconcile

- name: ⏳ Wait for Flux to settle
  # Blocks until each listed Flux Kustomization reports Ready, checked one
  # after another with a 20-minute budget apiece. Any timeout fails the step.
  run: |
    kustomizations=(bootstrap infrastructure-controllers infrastructure apps)
    for name in "${kustomizations[@]}"; do
      echo "Waiting for Kustomization ${name}..."
      kubectl -n flux-system wait "kustomization/${name}" \
        --timeout=20m --for=condition=Ready
    done
    echo "βœ… All Flux Kustomizations Ready β€” fresh platform converged."

- name: πŸ’Ύ Velero resource restore (newest Completed backup)
  # Picks the most recent Completed Velero backup, exports its name as
  # `backup_name` (via GITHUB_ENV) for later steps, and runs a cluster-wide
  # Restore from it, excluding the kube-system and velero namespaces.
  if: ${{ inputs.restore }}
  run: |
    set -euo pipefail
    echo "Waiting for the BackupStorageLocation to be Available (Velero syncs old backups from R2)..."
    kubectl -n velero wait backupstoragelocation/default \
      --for=jsonpath='{.status.phase}'=Available --timeout=15m

    # Backup CRs are synced from the R2 bucket by Velero's backup-sync
    # controller; give it a moment to populate after the BSL goes ready.
    #
    # "Newest" is decided by the backup's own status timestamps, NOT by
    # metadata.creationTimestamp: on this rebuilt cluster every synced
    # Backup CR was created moments ago by the sync, so the API-server
    # creation time says nothing about when the backup was actually taken.
    # creationTimestamp remains only as a last-resort fallback for objects
    # missing both status timestamps.
    BACKUP=""
    for i in $(seq 1 30); do
      BACKUP=$(kubectl -n velero get backups -o json | jq -r \
        '[.items[] | select(.status.phase=="Completed")]
         | sort_by(.status.completionTimestamp // .status.startTimestamp // .metadata.creationTimestamp)
         | last | .metadata.name // empty')
      [ -n "${BACKUP}" ] && break
      echo "  ...no Completed backups synced yet (${i}/30)"
      sleep 10
    done
    if [ -z "${BACKUP}" ]; then
      echo "::error::No Completed Velero backup found in the BSL β€” cannot restore. The rebuilt (empty) platform is still up."
      exit 1
    fi
    echo "Restoring from backup: ${BACKUP}"
    # Hand the chosen backup name to the OpenBao restore step.
    echo "backup_name=${BACKUP}" >> "${GITHUB_ENV}"

    kubectl -n velero create -f - <<EOF
    apiVersion: velero.io/v1
    kind: Restore
    metadata:
      name: dr-rebuild-${GITHUB_RUN_ID}
      namespace: velero
    spec:
      backupName: ${BACKUP}
      includedNamespaces:
        - "*"
      excludedNamespaces:
        - kube-system
        - velero
    EOF
    # Poll for a terminal phase: 180 x 10s = up to 30 minutes.
    phase=""
    for i in $(seq 1 180); do
      phase=$(kubectl -n velero get restore "dr-rebuild-${GITHUB_RUN_ID}" -o jsonpath='{.status.phase}' 2>/dev/null || true)
      case "${phase}" in
        Completed) break ;;
        Failed|PartiallyFailed) break ;;
        *) sleep 10 ;;
      esac
    done
    echo "Restore phase: ${phase:-<none>}"
    # PartiallyFailed is expected here: most resources already exist on
    # the freshly-converged cluster and are skipped/conflicted. The
    # restore's job is to bring back resources Flux does NOT own (the
    # old openbao-unseal comes back separately below).
    if [ "${phase}" != "Completed" ] && [ "${phase}" != "PartiallyFailed" ]; then
      kubectl -n velero describe restore "dr-rebuild-${GITHUB_RUN_ID}" || true
      echo "::error::Velero restore did not finish (phase: ${phase:-<none>})."
      exit 1
    fi

- name: πŸ” Restore OpenBao from the R2 snapshot mirror
  # Four phases, each in its own log group:
  #   1. copy the newest raft snapshot from R2 onto the vault-snapshots PVC
  #   2. scale the fresh vault down and restore the old openbao-unseal Secret
  #      from the Velero backup chosen by the previous step (`backup_name`)
  #   3. delete the fresh data PVCs and re-run the vault-config Job
  #   4. nudge ExternalSecrets / PushSecrets to resync
  if: ${{ inputs.restore }}
  run: |
    set -euo pipefail

    # Fail before anything destructive happens if the previous step did not
    # export the backup name; otherwise the unset variable would only be hit
    # after the vault is scaled down and openbao-unseal is deleted.
    : "${backup_name:?backup_name is not set; the Velero resource restore step must run first}"

    echo "::group::Fetch the newest raft snapshot from R2 onto the vault-snapshots PVC"
    # The fresh vault has already re-seeded the R2 credentials from SOPS,
    # so the vault-snapshot-r2 Secret (vault-backup/external-secret.yaml)
    # is the in-cluster source — no credentials leave the cluster.
    kubectl -n openbao wait externalsecret/vault-snapshot-r2 \
      --for=condition=Ready --timeout=15m

    # Endpoint and bucket: prefer variables-cluster, fall back to variables-base.
    R2_ENDPOINT=$(kubectl -n flux-system get configmap variables-cluster -o jsonpath='{.data.r2_endpoint}' 2>/dev/null || true)
    [ -n "${R2_ENDPOINT}" ] || R2_ENDPOINT=$(kubectl -n flux-system get configmap variables-base -o jsonpath='{.data.r2_endpoint}')
    R2_BUCKET=$(kubectl -n flux-system get configmap variables-cluster -o jsonpath='{.data.r2_bucket}' 2>/dev/null || true)
    [ -n "${R2_BUCKET}" ] || R2_BUCKET=$(kubectl -n flux-system get configmap variables-base -o jsonpath='{.data.r2_bucket}')
    # Temporary ConfigMap so the fetch pod can read both values as env vars.
    kubectl -n openbao create configmap dr-r2-target \
      --from-literal=endpoint="${R2_ENDPOINT}" \
      --from-literal=bucket="${R2_BUCKET}" \
      --dry-run=client -o yaml | kubectl apply -f -

    kubectl -n openbao delete pod dr-snapshot-fetch --ignore-not-found
    # Labelled app: vault-snapshot so the pod reuses the CronJob's
    # CiliumNetworkPolicy (egress to R2/MinIO + DNS).
    kubectl -n openbao apply -f - <<'POD'
    apiVersion: v1
    kind: Pod
    metadata:
      name: dr-snapshot-fetch
      namespace: openbao
      labels:
        app: vault-snapshot
    spec:
      restartPolicy: Never
      securityContext:
        runAsNonRoot: true
        runAsUser: 100
        runAsGroup: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      volumes:
        - name: snapshots
          persistentVolumeClaim:
            claimName: vault-snapshots
        - name: r2-credentials
          secret:
            secretName: vault-snapshot-r2
        - name: mc-config
          emptyDir: {}
      containers:
        - name: fetch
          image: quay.io/minio/mc:RELEASE.2025-04-08T15-39-49Z@sha256:7e3efb09c22c0882fbf341b9d99f61f94ae6c4c20a06f2f1a2b20ea8993d8952
          securityContext:
            runAsNonRoot: true
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            readOnlyRootFilesystem: true
          env:
            - name: MC_CONFIG_DIR
              value: /tmp/.mc
            - name: R2_ENDPOINT
              valueFrom:
                configMapKeyRef:
                  name: dr-r2-target
                  key: endpoint
            - name: R2_BUCKET
              valueFrom:
                configMapKeyRef:
                  name: dr-r2-target
                  key: bucket
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              memory: 128Mi
          volumeMounts:
            - name: snapshots
              mountPath: /snapshots
            - name: r2-credentials
              mountPath: /r2
              readOnly: true
            - name: mc-config
              mountPath: /tmp/.mc
          command:
            - /bin/sh
            - -ec
            - |
              mc alias set backup "$R2_ENDPOINT" \
                "$(cat /r2/access_key_id)" "$(cat /r2/secret_access_key)"
              NEWEST=$(mc ls "backup/$R2_BUCKET/openbao-snapshots/" \
                | awk '{print $NF}' | grep '\.snap$' | sort | tail -n 1)
              if [ -z "$NEWEST" ]; then
                echo "ERROR: no snapshots found in openbao-snapshots/ β€” nothing to restore."
                exit 1
              fi
              echo "Fetching $NEWEST..."
              mc cp "backup/$R2_BUCKET/openbao-snapshots/$NEWEST" "/snapshots/$NEWEST"
              ls -l /snapshots/
    POD
    if ! kubectl -n openbao wait pod/dr-snapshot-fetch \
      --for=jsonpath='{.status.phase}'=Succeeded --timeout=10m; then
      kubectl -n openbao logs pod/dr-snapshot-fetch || true
      echo "::error::Snapshot fetch from R2 failed."
      exit 1
    fi
    kubectl -n openbao logs pod/dr-snapshot-fetch
    kubectl -n openbao delete pod dr-snapshot-fetch --ignore-not-found
    kubectl -n openbao delete configmap dr-r2-target --ignore-not-found
    echo "::endgroup::"

    echo "::group::Restore the pre-incident openbao-unseal Secret from the Velero backup"
    # Suspend the HelmRelease first so drift detection cannot fight the
    # scale-down below.
    kubectl -n openbao patch helmrelease openbao --type merge -p '{"spec":{"suspend":true}}'
    kubectl -n openbao scale statefulset openbao --replicas=0
    kubectl -n openbao wait pod -l app.kubernetes.io/name=openbao \
      --for=delete --timeout=5m || true
    # The fresh vault's keys are useless; deleting them lets the Velero
    # restore bring back the pair that matches the snapshot.
    kubectl -n openbao delete secret openbao-unseal --ignore-not-found
    kubectl -n velero create -f - <<EOF
    apiVersion: velero.io/v1
    kind: Restore
    metadata:
      name: dr-openbao-secrets-${GITHUB_RUN_ID}
      namespace: velero
    spec:
      backupName: ${backup_name}
      includedNamespaces:
        - openbao
      includedResources:
        - secrets
    EOF
    # Poll for a terminal phase: 60 x 5s = up to 5 minutes.
    phase=""
    for i in $(seq 1 60); do
      phase=$(kubectl -n velero get restore "dr-openbao-secrets-${GITHUB_RUN_ID}" -o jsonpath='{.status.phase}' 2>/dev/null || true)
      case "${phase}" in
        Completed|PartiallyFailed) break ;;
        Failed) break ;;
        *) sleep 5 ;;
      esac
    done
    # The Secret's presence, not the restore phase, is the success criterion.
    if ! kubectl -n openbao get secret openbao-unseal >/dev/null 2>&1; then
      echo "::error::openbao-unseal was not restored (restore phase: ${phase:-<none>}) β€” aborting before touching the data PVCs."
      exit 1
    fi
    echo "::endgroup::"

    echo "::group::Reset the fresh vault and trigger the automated snapshot restore"
    # Empty data volumes + surviving keys + available snapshot = the
    # vault-config Job's automated restore path (docs/dr/openbao.md).
    #
    # grep exits 1 when no PVC matches, which under `set -o pipefail` would
    # abort the step here with the HelmRelease still suspended. Treat
    # "nothing to delete" as success; `xargs -r` then simply runs nothing.
    kubectl -n openbao get pvc -o name | { grep '/data-openbao-' || true; } \
      | xargs -r kubectl -n openbao delete
    kubectl -n openbao patch helmrelease openbao --type merge -p '{"spec":{"suspend":false}}'
    # Force the (already-completed) vault-config Job to re-run: delete it
    # and let the infrastructure Kustomization re-apply it.
    kubectl -n openbao delete job vault-config --ignore-not-found
    kubectl -n flux-system annotate kustomization infrastructure \
      "reconcile.fluxcd.io/requestedAt=$(date -u +%Y-%m-%dT%H:%M:%SZ)" --overwrite
    for i in $(seq 1 90); do
      kubectl -n openbao get job vault-config >/dev/null 2>&1 && break
      echo "  ...waiting for Flux to recreate the vault-config Job (${i}/90)"
      sleep 10
    done
    kubectl -n openbao wait job/vault-config --for=condition=Complete --timeout=30m
    echo "::endgroup::"

    echo "::group::Force ExternalSecrets + PushSecrets to resync against the restored vault"
    # Best-effort: a failed annotate must not fail the whole restore.
    NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    kubectl annotate externalsecrets --all -A "force-sync=${NOW}" --overwrite || true
    kubectl annotate pushsecrets --all -A "force-sync=${NOW}" --overwrite || true
    echo "::endgroup::"
    echo "βœ… OpenBao restored from the raft snapshot mirror."

- name: πŸ”‘ Refresh CI deploy credentials (KUBE_CONFIG / TALOS_CONFIG)
  # Copies this runner's fresh kubeconfig / talosconfig into the prod
  # environment secrets.
  #
  # Runs even when an earlier step failed: once "Create cluster" has written
  # the new configs, the old secrets are stale no matter how the later
  # convergence/restore steps went, and the only copy of the new configs
  # lives on this ephemeral runner. Skipping the refresh on failure would
  # lose them together with the runner.
  if: ${{ always() }}
  env:
    DR_GH_ADMIN_TOKEN: ${{ secrets.DR_GH_ADMIN_TOKEN }}
  run: |
    set -euo pipefail
    # Nothing to publish if the run never got as far as writing both configs
    # (e.g. the confirmation gate or cluster creation failed).
    if [ ! -s ~/.kube/config ] || [ ! -s ~/.talos/config ]; then
      echo "::notice::No fresh kubeconfig/talosconfig pair on this runner; leaving the prod environment secrets untouched."
      exit 0
    fi
    if [ -z "${DR_GH_ADMIN_TOKEN}" ]; then
      echo "::warning::DR_GH_ADMIN_TOKEN is not configured β€” the prod environment's KUBE_CONFIG / TALOS_CONFIG secrets are now STALE. Refresh them manually per docs/dr/runbook.md Scenario 9, or the next deploy will fail its reachability preflight."
      exit 0
    fi
    # gh reads its token from GH_TOKEN and the secret value from stdin.
    export GH_TOKEN="${DR_GH_ADMIN_TOKEN}"
    gh secret set KUBE_CONFIG --env prod --repo "${GITHUB_REPOSITORY}" < ~/.kube/config
    gh secret set TALOS_CONFIG --env prod --repo "${GITHUB_REPOSITORY}" < ~/.talos/config
    echo "βœ… prod environment secrets refreshed from this rebuild's fresh configs."

- name: πŸ“‹ Post-rebuild summary
  # Always runs, whatever happened earlier, and prints a short status report
  # to the step log.
  if: always()
  run: |
    # Purely informational: never let a failing kubectl fail this step.
    set +e
    IP=$(kubectl -n kube-system get svc cilium-gateway-platform -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null)
    printf '%s\n' \
      "## DR rebuild summary" \
      "- Load balancer IP: ${IP:-<pending>}" \
      "- DNS: external-dns (policy: sync) repoints the Cloudflare records automatically once HTTPRoutes are Ready β€” verify with: kubectl -n external-dns logs deploy/external-dns" \
      "- CNPG databases (umami-db): recover from barman/R2 per docs/dr/runbook.md Scenario 5 (kubectl cnpg restore)." \
      "- Per-app PVC data (headlamp, actual-budget): already-running pods keep their fresh volumes; use the per-app reset dance from runbook Scenario 5 if old data is needed."
    # Flux state, then up to 30 lines of pods that are neither Running nor Succeeded.
    kubectl get kustomizations.kustomize.toolkit.fluxcd.io -A -o wide
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null | head -30
Loading