From e2bfb3f79bf941b1c4e86eb5f8ee9c54e3df7cd2 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Mon, 18 May 2026 21:32:21 -0400 Subject: [PATCH 1/2] feat(helm): add opt-in PDB and probes for CNPG plugin sidecars PodDisruptionBudget (M9 from #381) ================================== Add an optional PodDisruptionBudget for the operator, gated by podDisruptionBudget.enabled (default: false). Disabled by default because the operator currently ships with replicaCount: 1 and a PDB on a single-replica deployment blocks node drains rather than helping availability. Users running multi-replica with leader election should enable it. Plugin probes (M12 from #381) ============================= The sidecar-injector and wal-replica deployments are gRPC servers on port 9090 that previously had no probes; pods were marked Ready as soon as the container started, regardless of whether the gRPC endpoint was actually serving. Add tcpSocket readiness + liveness probes on port 9090, gated by pluginProbes.enabled (default: true) with tunable initialDelaySeconds, periodSeconds, and failureThreshold. A TCP socket probe is used because the plugins do not expose an HTTP health endpoint. The probe verifies the gRPC server is bound and accepting connections. Verified locally on kind: - helm template renders PDB only when enabled; renders probes by default. - helm upgrade applies PDB; second upgrade with --set ... enabled=false removes it as expected. - sidecar-injector pod becomes Ready (TCP probe passes against the actually-running gRPC server). - helm lint clean. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../02_documentdb_sidecar_injector.yaml | 14 +++++++++++++ .../templates/03_documentdb_wal_replica.yaml | 14 +++++++++++++ .../templates/11_pdb.yaml | 21 +++++++++++++++++++ operator/documentdb-helm-chart/values.yaml | 20 ++++++++++++++++++ 4 files changed, 69 insertions(+) create mode 100644 operator/documentdb-helm-chart/templates/11_pdb.yaml diff --git a/operator/documentdb-helm-chart/templates/02_documentdb_sidecar_injector.yaml b/operator/documentdb-helm-chart/templates/02_documentdb_sidecar_injector.yaml index 1dafcded..569173e2 100644 --- a/operator/documentdb-helm-chart/templates/02_documentdb_sidecar_injector.yaml +++ b/operator/documentdb-helm-chart/templates/02_documentdb_sidecar_injector.yaml @@ -85,6 +85,20 @@ spec: ports: - containerPort: 9090 protocol: TCP + {{- if .Values.pluginProbes.enabled }} + readinessProbe: + tcpSocket: + port: 9090 + initialDelaySeconds: {{ .Values.pluginProbes.initialDelaySeconds }} + periodSeconds: {{ .Values.pluginProbes.periodSeconds }} + failureThreshold: {{ .Values.pluginProbes.failureThreshold }} + livenessProbe: + tcpSocket: + port: 9090 + initialDelaySeconds: {{ .Values.pluginProbes.initialDelaySeconds }} + periodSeconds: {{ .Values.pluginProbes.periodSeconds }} + failureThreshold: {{ .Values.pluginProbes.failureThreshold }} + {{- end }} {{- with .Values.sidecarInjector.resources }} resources: {{- toYaml . 
| nindent 10 }} diff --git a/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml b/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml index f1693345..7e49ca53 100644 --- a/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml +++ b/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml @@ -125,6 +125,20 @@ spec: ports: - containerPort: 9090 protocol: TCP + {{- if .Values.pluginProbes.enabled }} + readinessProbe: + tcpSocket: + port: 9090 + initialDelaySeconds: {{ .Values.pluginProbes.initialDelaySeconds }} + periodSeconds: {{ .Values.pluginProbes.periodSeconds }} + failureThreshold: {{ .Values.pluginProbes.failureThreshold }} + livenessProbe: + tcpSocket: + port: 9090 + initialDelaySeconds: {{ .Values.pluginProbes.initialDelaySeconds }} + periodSeconds: {{ .Values.pluginProbes.periodSeconds }} + failureThreshold: {{ .Values.pluginProbes.failureThreshold }} + {{- end }} args: - receivewal - --server-cert=/server/tls.crt diff --git a/operator/documentdb-helm-chart/templates/11_pdb.yaml b/operator/documentdb-helm-chart/templates/11_pdb.yaml new file mode 100644 index 00000000..3790b1e6 --- /dev/null +++ b/operator/documentdb-helm-chart/templates/11_pdb.yaml @@ -0,0 +1,21 @@ +{{- if .Values.podDisruptionBudget.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: documentdb-operator + namespace: {{ .Values.namespace | default .Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "documentdb-chart.name" . }} + app.kubernetes.io/component: operator + app.kubernetes.io/managed-by: "Helm" +spec: + selector: + matchLabels: + app: {{ .Release.Name }} + {{- /* Kubernetes rejects a PDB that sets both fields, so minAvailable wins when both are given. */}} + {{- if .Values.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.podDisruptionBudget.minAvailable }} + {{- else if .Values.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable 
}} + {{- end }} +{{- end }} diff --git a/operator/documentdb-helm-chart/values.yaml b/operator/documentdb-helm-chart/values.yaml index 27128577..bc833abc 100644 --- a/operator/documentdb-helm-chart/values.yaml +++ b/operator/documentdb-helm-chart/values.yaml @@ -74,6 +74,26 @@ certManager: # unreliable. Disabling the check does NOT remove the dependency. preflightCheck: true +# PodDisruptionBudget for the operator. Disabled by default because the +# operator ships with replicaCount: 1 and a PDB on a single-replica deployment +# blocks node drains. Enable when running multi-replica with leader election. +# Set exactly one of minAvailable or maxUnavailable; if both are set, only +# minAvailable is honored. +podDisruptionBudget: + enabled: false + minAvailable: 1 + maxUnavailable: "" + +# Probes for the CNPG plugin sidecars (sidecar-injector, wal-replica). +# Both are gRPC servers on port 9090; TCP socket probes are used because +# the plugins do not expose an HTTP health endpoint. Set pluginProbes.enabled=false +# to omit the probe (e.g., if you supply your own via a patch). +pluginProbes: + enabled: true + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 3 + # Per-component pod-level configuration: resources, security contexts, and scheduling. # Defaults are conservative and aim to be compatible with Pod Security Admission's # `restricted` profile. Override any field per component as needed. From a33d54b735b7c36a528873269da9ab65207788d2 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 12 Jun 2026 15:02:34 -0400 Subject: [PATCH 2/2] test(helm): add unit tests for PDB and plugin probes Add helm-unittest test coverage for two new features: - PDB (11_pdb_test.yaml): 10 tests covering feature gate toggle, metadata/namespace, label selector, and minAvailable/maxUnavailable configurations including percentage support. 
- Plugin probes (02_sidecar_injector_test.yaml, 03_wal_replica_test.yaml): 6 tests (3 per template) verifying default TCP readiness/liveness probes, disabled probes, and custom probe settings. Signed-off-by: Wenting Wu --- .../tests/02_sidecar_injector_test.yaml | 58 ++++++++ .../tests/03_wal_replica_test.yaml | 53 ++++++++ .../tests/11_pdb_test.yaml | 124 ++++++++++++++++++ 3 files changed, 235 insertions(+) create mode 100644 operator/documentdb-helm-chart/tests/11_pdb_test.yaml diff --git a/operator/documentdb-helm-chart/tests/02_sidecar_injector_test.yaml b/operator/documentdb-helm-chart/tests/02_sidecar_injector_test.yaml index c8f4c711..2e874f72 100644 --- a/operator/documentdb-helm-chart/tests/02_sidecar_injector_test.yaml +++ b/operator/documentdb-helm-chart/tests/02_sidecar_injector_test.yaml @@ -155,3 +155,61 @@ tests: value: cnpg-system - isNotNull: path: spec.selfSigned + + # ------------------------------------------------------------------- + # Plugin probes + # ------------------------------------------------------------------- + - it: should render TCP readiness and liveness probes by default + documentIndex: 1 + asserts: + - equal: + path: spec.template.spec.containers[0].readinessProbe.tcpSocket.port + value: 9090 + - equal: + path: spec.template.spec.containers[0].readinessProbe.initialDelaySeconds + value: 5 + - equal: + path: spec.template.spec.containers[0].readinessProbe.periodSeconds + value: 10 + - equal: + path: spec.template.spec.containers[0].readinessProbe.failureThreshold + value: 3 + - equal: + path: spec.template.spec.containers[0].livenessProbe.tcpSocket.port + value: 9090 + - equal: + path: spec.template.spec.containers[0].livenessProbe.initialDelaySeconds + value: 5 + - equal: + path: spec.template.spec.containers[0].livenessProbe.periodSeconds + value: 10 + - equal: + path: spec.template.spec.containers[0].livenessProbe.failureThreshold + value: 3 + + - it: should omit probes when pluginProbes.enabled is false + set: + 
pluginProbes.enabled: false + documentIndex: 1 + asserts: + - notExists: + path: spec.template.spec.containers[0].readinessProbe + - notExists: + path: spec.template.spec.containers[0].livenessProbe + + - it: should use custom probe settings when overridden + set: + pluginProbes.initialDelaySeconds: 15 + pluginProbes.periodSeconds: 30 + pluginProbes.failureThreshold: 5 + documentIndex: 1 + asserts: + - equal: + path: spec.template.spec.containers[0].readinessProbe.initialDelaySeconds + value: 15 + - equal: + path: spec.template.spec.containers[0].readinessProbe.periodSeconds + value: 30 + - equal: + path: spec.template.spec.containers[0].livenessProbe.failureThreshold + value: 5 diff --git a/operator/documentdb-helm-chart/tests/03_wal_replica_test.yaml b/operator/documentdb-helm-chart/tests/03_wal_replica_test.yaml index a7369d23..bbe6d6e7 100644 --- a/operator/documentdb-helm-chart/tests/03_wal_replica_test.yaml +++ b/operator/documentdb-helm-chart/tests/03_wal_replica_test.yaml @@ -145,3 +145,56 @@ tests: - equal: path: metadata.namespace value: cnpg-system + + # ------------------------------------------------------------------- + # Plugin probes + # ------------------------------------------------------------------- + - it: should render TCP readiness and liveness probes by default + set: + walReplica: true + documentIndex: 3 + asserts: + - equal: + path: spec.template.spec.containers[0].readinessProbe.tcpSocket.port + value: 9090 + - equal: + path: spec.template.spec.containers[0].readinessProbe.initialDelaySeconds + value: 5 + - equal: + path: spec.template.spec.containers[0].readinessProbe.periodSeconds + value: 10 + - equal: + path: spec.template.spec.containers[0].readinessProbe.failureThreshold + value: 3 + - equal: + path: spec.template.spec.containers[0].livenessProbe.tcpSocket.port + value: 9090 + + - it: should omit probes when pluginProbes.enabled is false + set: + walReplica: true + pluginProbes.enabled: false + documentIndex: 3 + asserts: + - 
notExists: + path: spec.template.spec.containers[0].readinessProbe + - notExists: + path: spec.template.spec.containers[0].livenessProbe + + - it: should use custom probe settings when overridden + set: + walReplica: true + pluginProbes.initialDelaySeconds: 20 + pluginProbes.periodSeconds: 60 + pluginProbes.failureThreshold: 10 + documentIndex: 3 + asserts: + - equal: + path: spec.template.spec.containers[0].readinessProbe.initialDelaySeconds + value: 20 + - equal: + path: spec.template.spec.containers[0].livenessProbe.periodSeconds + value: 60 + - equal: + path: spec.template.spec.containers[0].livenessProbe.failureThreshold + value: 10 diff --git a/operator/documentdb-helm-chart/tests/11_pdb_test.yaml b/operator/documentdb-helm-chart/tests/11_pdb_test.yaml new file mode 100644 index 00000000..46eff464 --- /dev/null +++ b/operator/documentdb-helm-chart/tests/11_pdb_test.yaml @@ -0,0 +1,124 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/helm-unittest/helm-unittest/main/schema/helm-testsuite.json +suite: pod disruption budget +templates: + - 11_pdb.yaml + +capabilities: + apiVersions: + - cert-manager.io/v1/Certificate + +tests: + # ------------------------------------------------------------------- + # Feature gate (podDisruptionBudget.enabled) + # ------------------------------------------------------------------- + - it: should not render PDB when disabled (default) + asserts: + - hasDocuments: + count: 0 + + - it: should render PDB when enabled + set: + podDisruptionBudget.enabled: true + asserts: + - hasDocuments: + count: 1 + + # ------------------------------------------------------------------- + # Metadata + # ------------------------------------------------------------------- + - it: should create a PodDisruptionBudget with correct metadata + set: + podDisruptionBudget.enabled: true + asserts: + - isKind: + of: PodDisruptionBudget + - isAPIVersion: + of: policy/v1 + - equal: + path: metadata.name + value: documentdb-operator + - 
equal: + path: metadata.labels["app.kubernetes.io/component"] + value: operator + - equal: + path: metadata.labels["app.kubernetes.io/managed-by"] + value: Helm + + - it: should use release namespace when values.namespace is empty + set: + podDisruptionBudget.enabled: true + namespace: "" + release: + namespace: my-ns + asserts: + - equal: + path: metadata.namespace + value: my-ns + + - it: should use custom namespace when set + set: + podDisruptionBudget.enabled: true + namespace: custom-ns + asserts: + - equal: + path: metadata.namespace + value: custom-ns + + # ------------------------------------------------------------------- + # Selector + # ------------------------------------------------------------------- + - it: should select pods by release name + set: + podDisruptionBudget.enabled: true + release: + name: my-release + asserts: + - equal: + path: spec.selector.matchLabels.app + value: my-release + + # ------------------------------------------------------------------- + # minAvailable / maxUnavailable + # ------------------------------------------------------------------- + - it: should set minAvailable by default + set: + podDisruptionBudget.enabled: true + asserts: + - equal: + path: spec.minAvailable + value: 1 + - notExists: + path: spec.maxUnavailable + + - it: should use maxUnavailable when set (and minAvailable cleared) + set: + podDisruptionBudget.enabled: true + podDisruptionBudget.minAvailable: "" + podDisruptionBudget.maxUnavailable: 1 + asserts: + - equal: + path: spec.maxUnavailable + value: 1 + - notExists: + path: spec.minAvailable + + - it: should support percentage for minAvailable + set: + podDisruptionBudget.enabled: true + podDisruptionBudget.minAvailable: "50%" + asserts: + - equal: + path: spec.minAvailable + value: "50%" + + - it: should render only minAvailable when both fields are set + set: + podDisruptionBudget.enabled: true + podDisruptionBudget.minAvailable: 1 + podDisruptionBudget.maxUnavailable: 1 + asserts: + - equal: + path: spec.minAvailable + value: 1 + - notExists: + path: spec.maxUnavailable