From f60c69c3d6af1d71a1bbf61dcb5a2ee655d1e1a4 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 28 May 2026 13:34:22 -0700 Subject: [PATCH] test(e2e): add expired backup cleanup coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #370. Adds an e2e spec that verifies the operator garbage-collects Backup CRs once status.expiredAt is in the past, plus two reusable helpers: - MarkExpired — status-subresource patch fast-forwarding expiration (avoids the silent no-op when patching through c.Patch on a +kubebuilder:subresource:status type). - WaitForBackupDeleted — polls until the Backup CR is gone, treating NotFound as success. The spec orchestrates: cluster ready → on-demand Backup → wait Completed → MarkExpired(now-1h) → WaitForBackupDeleted. Labels: [backup, needs-csi-snapshots, level:medium]. Validation: - go vet ./... clean - go test ./pkg/e2eutils/backup/... 11/11 pass - ginkgo --focus 'expired backup cleanup' ./tests/backup PASS in 67s on kind (CSI hostpath + external-snapshotter) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: German --- test/e2e/pkg/e2eutils/backup/backup.go | 68 ++++++++++++ test/e2e/pkg/e2eutils/backup/backup_test.go | 70 ++++++++++++ .../tests/backup/backup_expiration_test.go | 101 ++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 test/e2e/tests/backup/backup_expiration_test.go diff --git a/test/e2e/pkg/e2eutils/backup/backup.go b/test/e2e/pkg/e2eutils/backup/backup.go index d7d6424d..36752700 100644 --- a/test/e2e/pkg/e2eutils/backup/backup.go +++ b/test/e2e/pkg/e2eutils/backup/backup.go @@ -23,6 +23,7 @@ import ( cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" "github.com/cloudnative-pg/cloudnative-pg/tests/utils/envsubst" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" @@ -205,6 +206,73 @@ func WaitForCompleted(ctx context.Context, c client.Client, ns, name string, tim } } +// MarkExpired patches status.expiredAt on the Backup CR via the +// status subresource so the operator's reconcile loop treats it as +// expired and garbage-collects it. Used by the expired-backup +// cleanup spec to fast-forward expiration without waiting out the +// real retentionDays window. +// +// Trigger semantics: the status patch fires a watch event that the +// Backup controller picks up within seconds, so deletion is +// event-driven rather than bound to the 1-minute periodic requeue in +// backup_controller.go. +// +// Implementation detail: this must use c.Status().Patch — patching +// the same field through c.Patch is a silent no-op because Backup +// declares status as a subresource (see operator/src/api/preview/ +// backup_types.go +kubebuilder:subresource:status). +// +// A fresh Get is performed inside the helper so callers can pass a +// stable (ns, name) without juggling resourceVersion conflicts with +// the controller's own status writers. +func MarkExpired(ctx context.Context, c client.Client, ns, name string, when time.Time) error { + if c == nil { + return errors.New("backup.MarkExpired: client must not be nil") + } + var current previewv1.Backup + if err := c.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, ¤t); err != nil { + return fmt.Errorf("get Backup %s/%s: %w", ns, name, err) + } + patch := client.MergeFrom(current.DeepCopy()) + expired := metav1.NewTime(when) + current.Status.ExpiredAt = &expired + if err := c.Status().Patch(ctx, ¤t, patch); err != nil { + return fmt.Errorf("patch status.expiredAt on Backup %s/%s: %w", ns, name, err) + } + return nil +} + +// WaitForBackupDeleted polls until the named Backup no longer exists +// in ns or timeout elapses. Mirrors WaitForPVCDeleted in shape and +// is used by the expired-backup cleanup spec to assert the operator +// removed the Backup CR after MarkExpired flipped status.expiredAt +// into the past. +func WaitForBackupDeleted(ctx context.Context, c client.Client, ns, name string, timeout time.Duration) error { + if c == nil { + return errors.New("backup.WaitForBackupDeleted: client must not be nil") + } + deadline := time.Now().Add(timeout) + for { + var b previewv1.Backup + err := c.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, &b) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return fmt.Errorf("polling Backup %s/%s: %w", ns, name, err) + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for Backup %s/%s to be deleted (last phase=%q)", + timeout, ns, name, b.Status.Phase) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(DefaultPollInterval): + } + } +} + // Delete issues a foreground delete on the Backup CR and returns when // the object is gone or timeout elapses. Safe to call on a missing // object (IsNotFound is treated as success). diff --git a/test/e2e/pkg/e2eutils/backup/backup_test.go b/test/e2e/pkg/e2eutils/backup/backup_test.go index 369e14b6..0f20caf4 100644 --- a/test/e2e/pkg/e2eutils/backup/backup_test.go +++ b/test/e2e/pkg/e2eutils/backup/backup_test.go @@ -215,6 +215,76 @@ func TestPVBelongsToClusterMatchesLabelsOrClaimPrefix(t *testing.T) { } } +func TestMarkExpiredPatchesStatusViaSubresource(t *testing.T) { + t.Parallel() + s := newScheme(t) + bkp := &previewv1.Backup{ + ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "ns"}, + Spec: previewv1.BackupSpec{Cluster: cnpgv1.LocalObjectReference{Name: "c"}}, + } + c := fakeclient.NewClientBuilder(). + WithScheme(s). + WithObjects(bkp). + WithStatusSubresource(&previewv1.Backup{}). + Build() + + when := time.Date(2000, 1, 2, 3, 4, 5, 0, time.UTC) + if err := MarkExpired(context.Background(), c, "ns", "b", when); err != nil { + t.Fatalf("MarkExpired: %v", err) + } + + var got previewv1.Backup + if err := c.Get(context.Background(), client.ObjectKey{Namespace: "ns", Name: "b"}, &got); err != nil { + t.Fatalf("Get after MarkExpired: %v", err) + } + if got.Status.ExpiredAt == nil { + t.Fatalf("expected status.expiredAt to be set; got nil") + } + if !got.Status.ExpiredAt.Time.Equal(when) { + t.Fatalf("status.expiredAt=%v want %v", got.Status.ExpiredAt.Time, when) + } +} + +func TestMarkExpiredErrorsOnNotFound(t *testing.T) { + t.Parallel() + s := newScheme(t) + c := fakeclient.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&previewv1.Backup{}). + Build() + + err := MarkExpired(context.Background(), c, "ns", "missing", time.Unix(0, 0)) + if err == nil { + t.Fatal("expected error for missing Backup, got nil") + } + if !strings.Contains(err.Error(), "missing") { + t.Fatalf("expected error to mention name; got %v", err) + } +} + +func TestWaitForBackupDeletedReturnsWhenMissing(t *testing.T) { + t.Parallel() + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + if err := WaitForBackupDeleted(context.Background(), c, "ns", "nope", 500*time.Millisecond); err != nil { + t.Fatalf("expected nil for missing Backup, got %v", err) + } +} + +func TestWaitForBackupDeletedTimesOutWhenPresent(t *testing.T) { + t.Parallel() + s := newScheme(t) + bkp := &previewv1.Backup{ + ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "ns"}, + Status: previewv1.BackupStatus{Phase: cnpgv1.BackupPhaseCompleted}, + } + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(bkp).Build() + err := WaitForBackupDeleted(context.Background(), c, "ns", "b", 500*time.Millisecond) + if err == nil || !strings.Contains(err.Error(), "timed out") { + t.Fatalf("expected timeout error, got %v", err) + } +} + func TestWaitForPVCDeletedReturnsWhenMissing(t *testing.T) { t.Parallel() s := newScheme(t) diff --git a/test/e2e/tests/backup/backup_expiration_test.go b/test/e2e/tests/backup/backup_expiration_test.go new file mode 100644 index 00000000..20164995 --- /dev/null +++ b/test/e2e/tests/backup/backup_expiration_test.go @@ -0,0 +1,101 @@ +package backup + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + bkp "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/backup" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB backup — expired backup cleanup", + Label(e2e.BackupLabel, e2e.NeedsCSISnapshotsLabel), e2e.MediumLevelLabel, + func() { + const ( + clusterName = "backup-expire" + backupName = "backup-expire-1" + ) + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + skipUnlessCSISnapshotsUsable(ctx, c) + ns = namespaces.NamespaceForSpec(e2e.BackupLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns) + }) + + It("deletes the Backup CR once status.expiredAt is in the past", func() { + dd, err := documentdb.Create(ctx, c, ns, clusterName, documentdb.CreateOptions{ + Base: "documentdb", + Vars: baseVars(clusterName, ns, "2Gi"), + ManifestsRoot: manifestsRoot(), + }) + Expect(err).NotTo(HaveOccurred()) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + // 1. Source cluster healthy so the backup can actually run. + key := types.NamespacedName{Namespace: ns, Name: clusterName} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // 2. Create an on-demand Backup CR and wait for Completed + // before fast-forwarding expiration. Marking an in-flight + // backup expired would race the snapshot controller and + // leak CNPG state, and the retentionDays contract is + // defined against completed backups anyway. + _, err = bkp.Create(ctx, c, bkp.BackupVars{ + Name: backupName, + Namespace: ns, + ClusterName: clusterName, + RetentionDays: 1, + }) + Expect(err).NotTo(HaveOccurred(), "create Backup CR %s/%s", ns, backupName) + // The expired-cleanup path makes the Backup disappear on + // its own; bkp.Delete already swallows IsNotFound so this + // cleanup stays correct even after the spec succeeds. + DeferCleanup(func(ctx SpecContext) { + _ = bkp.Delete(ctx, c, ns, backupName, 1*time.Minute) + }) + _, err = bkp.WaitForCompleted(ctx, c, ns, backupName, + timeouts.For(timeouts.BackupComplete)) + Expect(err).NotTo(HaveOccurred(), + "Backup %s/%s did not reach Completed", ns, backupName) + + // 3. Fast-forward expiration via the status subresource. + // The next reconcile sees IsExpired() == true and deletes + // the CR — this is the behaviour the retired + // test-backup-and-restore workflow used to cover. + Expect(bkp.MarkExpired(ctx, c, ns, backupName, time.Now().Add(-1*time.Hour))). + To(Succeed(), "patch status.expiredAt on Backup %s/%s", ns, backupName) + + // 4. Operator must garbage-collect the Backup CR. The + // status patch from MarkExpired fires a watch event that + // the controller picks up within seconds, so 3 minutes is + // comfortable headroom even on a loaded CI worker — and + // well clear of the 1-minute periodic requeue cadence in + // backup_controller.go. + Expect(bkp.WaitForBackupDeleted(ctx, c, ns, backupName, 3*time.Minute)). + To(Succeed(), "Backup %s/%s was not garbage-collected after expiration", ns, backupName) + }) + })