From 29daee6f3e77c8d183f2dfb0a28852a00915b639 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm
Date: Thu, 11 Jun 2026 01:44:10 +0200
Subject: [PATCH 1/2] fix(openbao): take an initial snapshot at deploy time so
 the PVC binds everywhere
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vault-snapshots PVC (#1983) has exactly one consumer: the 03:30
CronJob. On clusters whose default StorageClass binds
WaitForFirstConsumer (the Talos+Docker CI cluster), a consumer-less PVC
stays Pending, kstatus reports it InProgress, and the wait-enabled
infrastructure Kustomization health-gates on it for the whole run — the
last remaining system-test blocker after #1999 (prod binds instantly
because longhorn is Immediate, so this never showed there).

Add a one-shot vault-snapshot-init Job (vault-config retry pattern:
force-recreate annotation, OnFailure, backoffLimit 30) that takes a
baseline snapshot immediately after deploy unless one from today
already exists. That both binds the PVC on WaitForFirstConsumer
providers and closes the exposure window between 'snapshots configured'
and 'first snapshot taken' that the 2026-06-10 wipe fell into.

Co-Authored-By: Claude Fable 5
---
 .../infrastructure/vault-backup/init-job.yaml | 99 +++++++++++++++++++
 .../vault-backup/kustomization.yaml           |  1 +
 2 files changed, 100 insertions(+)
 create mode 100644 k8s/bases/infrastructure/vault-backup/init-job.yaml

diff --git a/k8s/bases/infrastructure/vault-backup/init-job.yaml b/k8s/bases/infrastructure/vault-backup/init-job.yaml
new file mode 100644
index 000000000..4d731ebf2
--- /dev/null
+++ b/k8s/bases/infrastructure/vault-backup/init-job.yaml
@@ -0,0 +1,99 @@
+# One-shot initial snapshot at deploy time. Two jobs in one:
+#
+#   1. A baseline raft snapshot exists immediately after every vault-backup
+#      change instead of waiting for the nightly 03:30 CronJob — after the
+#      2026-06-10 KV wipe the gap between "snapshots configured" and "first
+#      snapshot taken" was the whole exposure window.
+#   2. It is the vault-snapshots PVC's first consumer. On clusters whose
+#      default StorageClass uses volumeBindingMode: WaitForFirstConsumer
+#      (the Talos+Docker CI cluster), a consumer-less PVC stays Pending,
+#      kstatus reports it InProgress, and the wait-enabled `infrastructure`
+#      Kustomization health-gates on it forever. Prod (longhorn, Immediate)
+#      binds without this, but the base must work on both providers.
+#
+# Same retry pattern as the vault-config Job: the script fails until
+# vault-config has created the vault-snapshot auth role and unsealed the
+# vault, and OnFailure + backoffLimit keep retrying until then.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vault-snapshot-init
+  namespace: openbao
+  annotations:
+    # Jobs are immutable; let Flux delete-and-recreate on spec change.
+    kustomize.toolkit.fluxcd.io/force: enabled
+spec:
+  ttlSecondsAfterFinished: 600
+  backoffLimit: 30
+  activeDeadlineSeconds: 3600
+  template:
+    metadata:
+      labels:
+        app: vault-snapshot
+    spec:
+      serviceAccountName: vault-snapshot
+      restartPolicy: OnFailure
+      securityContext:
+        runAsUser: 100
+        runAsGroup: 1000
+        fsGroup: 1000
+      volumes:
+        - name: snapshots
+          persistentVolumeClaim:
+            claimName: vault-snapshots
+      containers:
+        - name: snapshot
+          image: quay.io/openbao/openbao:2.5.3@sha256:fdc6da21ca6963560c32336fd7feb9cf2d5e52668f1a1647205a4b41171f0806
+          env:
+            # openbao-active = unsealed Raft leader only (same rationale as
+            # the CronJob in cronjob.yaml).
+            - name: BAO_ADDR
+              value: http://openbao-active.openbao.svc.cluster.local:8200
+          volumeMounts:
+            - name: snapshots
+              mountPath: /snapshots
+          command:
+            - /bin/sh
+            - -ec
+            - |
+              # Authenticate via Kubernetes auth
+              JWT=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
+              VAULT_TOKEN=$(bao write -field=token auth/kubernetes/login \
+                role=vault-snapshot \
+                jwt="$JWT")
+              export BAO_TOKEN="$VAULT_TOKEN"
+
+              echo "Checking OpenBao health..."
+              STATUS=$(bao status -format=json)
+              echo "$STATUS"
+
+              if echo "$STATUS" | grep -q '"sealed": true'; then
+                echo "ERROR: Vault is sealed!"
+                exit 1
+              fi
+              echo "Vault is unsealed and healthy."
+
+              # Skip if the nightly CronJob already produced a snapshot today —
+              # this job only guarantees a baseline exists.
+              if ls /snapshots/openbao-"$(date -u +%Y%m%d)"-*.snap >/dev/null 2>&1; then
+                echo "A snapshot from today already exists — nothing to do."
+                exit 0
+              fi
+
+              SNAP="/snapshots/openbao-$(date -u +%Y%m%d-%H%M%S).snap"
+              # Save under a temporary name and rename only on success. A save
+              # that dies mid-write may leave a truncated file; were it already
+              # named *.snap, the skip check above would match it on the next
+              # retry and the Job would exit 0 without a usable baseline. The
+              # .partial name never matches that check.
+              TMP="/snapshots/.openbao-init.snap.partial"
+              rm -f "$TMP"
+              echo "Saving initial raft snapshot to $SNAP..."
+              bao operator raft snapshot save "$TMP"
+              if [ ! -s "$TMP" ]; then
+                echo "ERROR: snapshot file is empty!"
+                exit 1
+              fi
+              mv "$TMP" "$SNAP"
+              ls -l "$SNAP"
+              echo "Initial snapshot complete."
diff --git a/k8s/bases/infrastructure/vault-backup/kustomization.yaml b/k8s/bases/infrastructure/vault-backup/kustomization.yaml
index b96984d07..5a1892935 100644
--- a/k8s/bases/infrastructure/vault-backup/kustomization.yaml
+++ b/k8s/bases/infrastructure/vault-backup/kustomization.yaml
@@ -4,5 +4,6 @@ kind: Kustomization
 resources:
   - serviceaccount.yaml
   - pvc.yaml
+  - init-job.yaml
   - cronjob.yaml
   - networkpolicy.yaml

From 653ce8de55ab60b764f990bbe5a0ab5d8acd0efa Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm
Date: Thu, 11 Jun 2026 00:46:35 +0200
Subject: [PATCH 2/2] fix(cluster-policies): mutate pod security contexts on
 CREATE only

The add-security-context ClusterPolicy matched Pods without an
operations scope, so Kyverno also applied the securityContext mutation
on every pod UPDATE.
Pod spec is immutable: for any pod created while the policy/webhook was
not yet active (exactly what fresh-cluster bring-up ordering produces),
every later update gets the mutation bolted on and the apiserver
rejects the whole request with HTTP 422 'pod updates may not change
fields other than image...'.

That bricked OpenBao's Kubernetes service registration in CI: the
label-state updates (openbao-active/sealed/initialized) 422'd forever,
the openbao-active Service never gained endpoints, the entire vault
seeding chain timed out, and every system test since the 2026-06-10
active-service cutover failed. Probe evidence in #1990's run: the 422
diff shows the webhook's own securityContext injection, not the label
patch. Prod was unaffected only because its pods happened to be
recreated while the policy was live (mutation already in the spec ->
no-op on update).

Scope both rules to operations: [CREATE]. Pods created before the
policy stay unmutated until their next natural recreation, which is
strictly better than being permanently un-updatable.

Co-Authored-By: Claude Fable 5
---
 .../best-practices/add-security-context.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/k8s/bases/infrastructure/cluster-policies/best-practices/add-security-context.yaml b/k8s/bases/infrastructure/cluster-policies/best-practices/add-security-context.yaml
index 0d8f3e91d..30c58d3fc 100644
--- a/k8s/bases/infrastructure/cluster-policies/best-practices/add-security-context.yaml
+++ b/k8s/bases/infrastructure/cluster-policies/best-practices/add-security-context.yaml
@@ -31,6 +31,16 @@ spec:
           - resources:
               kinds:
                 - Pod
+              # CREATE only. Without this, Kyverno also mutates pod UPDATEs;
+              # pod spec is immutable, so for any pod created while the
+              # policy/webhook wasn't active yet (fresh-cluster bring-up
+              # ordering) EVERY later update — including OpenBao's
+              # service-registration label patches — is rejected with
+              # HTTP 422 "pod updates may not change fields other than
+              # image...". That froze openbao-active at zero endpoints and
+              # broke every system test from 2026-06-10 (#1990 probe output).
+              operations:
+                - CREATE
       exclude:
         any:
           - resources:
@@ -59,6 +69,9 @@ spec:
           - resources:
               kinds:
                 - Pod
+              # CREATE only — pod spec is immutable (same rationale as above).
+              operations:
+                - CREATE
       exclude:
         any:
           - resources: