diff --git a/Makefile b/Makefile
index c7d9ea5cf..bbf3c1d5f 100644
--- a/Makefile
+++ b/Makefile
@@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
 $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
 $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
 $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
 HELM_SETTINGS ?=
 .PHONY: $(MANIFESTS)
 $(MANIFESTS): $(HELM)
@@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
 CATD_NAMESPACE := olmv1-system
 .PHONY: wait
 wait:
-	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

 .PHONY: docker-build
 docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
diff --git a/hack/test/pre-upgrade-setup.sh b/hack/test/pre-upgrade-setup.sh
index 669f9da37..e52c88b46 100755
--- a/hack/test/pre-upgrade-setup.sh
+++ b/hack/test/pre-upgrade-setup.sh
@@ -155,5 +155,5 @@ spec:
     version: 1.0.0
 EOF

-kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
-kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
diff --git a/helm/high-availability.yaml b/helm/high-availability.yaml
new file mode 100644
index 000000000..a8de38aac
--- /dev/null
+++ b/helm/high-availability.yaml
@@ -0,0 +1,33 @@
+# High Availability (HA) configuration for OLMv1
+# Sets replicas to 2 for both operator-controller and catalogd to enable HA setup
+# This is used in experimental-e2e.yaml to test multi-replica deployments
+#
+# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
+# - In multi-node clusters: replicas are scheduled on different nodes for better availability
+# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
+options:
+  operatorController:
+    deployment:
+      replicas: 2
+  catalogd:
+    deployment:
+      replicas: 2
+
+# Pod anti-affinity configuration to prefer spreading replicas across different nodes
+# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
+# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
+deployments:
+  templateSpec:
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+        - weight: 100
+          podAffinityTerm:
+            labelSelector:
+              matchExpressions:
+              - key: control-plane
+                operator: In
+                values:
+                - operator-controller-controller-manager
+                - catalogd-controller-manager
+            topologyKey: kubernetes.io/hostname
diff --git a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml
index 092cb7a24..ac69bce6a 100644
--- a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml
+++ b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml
@@ -12,11 +12,11 @@ metadata:
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: {{ .Values.options.catalogd.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml
index 249610244..bea8f0404 100644
--- a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml
+++ b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml
@@ -11,11 +11,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.options.operatorController.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/helm/olmv1/values.yaml b/helm/olmv1/values.yaml
index cb454f625..c5845b9a1 100644
--- a/helm/olmv1/values.yaml
+++ b/helm/olmv1/values.yaml
@@ -8,6 +8,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/operator-controller:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
@@ -19,6 +20,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/catalogd:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
diff --git a/manifests/experimental-e2e.yaml b/manifests/experimental-e2e.yaml
index e536cd72a..ef0200aaf 100644
--- a/manifests/experimental-e2e.yaml
+++ b/manifests/experimental-e2e.yaml
@@ -2107,11 +2107,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2224,6 +2224,18 @@ spec:
                 operator: In
                 values:
                 - linux
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: control-plane
+                  operator: In
+                  values:
+                  - operator-controller-controller-manager
+                  - catalogd-controller-manager
+              topologyKey: kubernetes.io/hostname
+            weight: 100
       nodeSelector:
         kubernetes.io/os: linux
         node-role.kubernetes.io/control-plane: ""
@@ -2258,11 +2270,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2383,6 +2395,18 @@ spec:
                 operator: In
                 values:
                 - linux
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: control-plane
+                  operator: In
+                  values:
+                  - operator-controller-controller-manager
+                  - catalogd-controller-manager
+              topologyKey: kubernetes.io/hostname
+            weight: 100
       nodeSelector:
         kubernetes.io/os: linux
         node-role.kubernetes.io/control-plane: ""
diff --git a/manifests/experimental.yaml b/manifests/experimental.yaml
index f88debab0..9e9d611f0 100644
--- a/manifests/experimental.yaml
+++ b/manifests/experimental.yaml
@@ -2036,7 +2036,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2174,7 +2174,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/manifests/standard-e2e.yaml b/manifests/standard-e2e.yaml
index 1aed38ba9..36cb9971c 100644
--- a/manifests/standard-e2e.yaml
+++ b/manifests/standard-e2e.yaml
@@ -1799,7 +1799,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1949,7 +1949,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/manifests/standard.yaml b/manifests/standard.yaml
index 34cc57918..933ff5d81 100644
--- a/manifests/standard.yaml
+++ b/manifests/standard.yaml
@@ -1724,7 +1724,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1861,7 +1861,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/test/e2e/cluster_extension_install_test.go b/test/e2e/cluster_extension_install_test.go
index b3380ff0f..bc1eb1e68 100644
--- a/test/e2e/cluster_extension_install_test.go
+++ b/test/e2e/cluster_extension_install_test.go
@@ -29,6 +29,8 @@ import (
 const (
     artifactName = "operator-controller-e2e"
     pollDuration = time.Minute
+    catalogPollDuration = 3 * time.Minute
+    extendedPollDuration = 5 * time.Minute
     pollInterval = time.Second
     testCatalogRefEnvVar = "CATALOG_IMG"
     testCatalogName = "test-catalog"
@@ -167,12 +169,11 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`,
bundle path") require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) - }, 2*time.Minute, pollInterval) + }, pollDuration, pollInterval) - // Give the check 2 minutes instead of the typical 1 for the pod's - // files to update from the configmap change. + // Give the check extra time for the pod's files to update from the configmap change. // The theoretical max time is the kubelet sync period of 1 minute + - // ConfigMap cache TTL of 1 minute = 2 minutes + // ConfigMap cache TTL of 1 minute = 2 minutes, plus buffer for reconciliation. t.Log("By eventually reporting progressing as True") require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) @@ -180,7 +181,7 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) - }, 2*time.Minute, pollInterval) + }, extendedPollDuration, pollInterval) t.Log("By eventually installing the package successfully") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -333,13 +334,14 @@ func TestClusterExtensionForceInstallNonSuccessorVersion(t *testing.T) { } require.NoError(t, c.Create(context.Background(), clusterExtension)) t.Log("By eventually reporting a successful resolution") + // Use catalogPollDuration for initial catalog resolution require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing) require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) - }, pollDuration, pollInterval) + }, catalogPollDuration, pollInterval) t.Log("It allows to upgrade the ClusterExtension to a non-successor version") t.Log("By updating the ClusterExtension resource to a non-successor version") @@ -380,13 +382,14 @@ func TestClusterExtensionInstallSuccessorVersion(t *testing.T) { } require.NoError(t, c.Create(context.Background(), clusterExtension)) t.Log("By eventually reporting a successful resolution") + // Use catalogPollDuration for initial catalog resolution require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing) require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) - }, pollDuration, pollInterval) + }, catalogPollDuration, pollInterval) t.Log("It does allow to upgrade the ClusterExtension to any of the successor versions within non-zero major version") t.Log("By updating the ClusterExtension resource by skipping versions") @@ -436,13 +439,14 @@ func TestClusterExtensionInstallReResolvesWhenCatalogIsPatched(t *testing.T) { require.NoError(t, c.Create(context.Background(), clusterExtension)) t.Log("By reporting a successful resolution and bundle path") + // Use catalogPollDuration since this test waits for the catalog to be unpacked and served require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, 
         cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-    }, pollDuration, pollInterval)
+    }, catalogPollDuration, pollInterval)

     // patch imageRef tag on test-catalog image with v2 image
     t.Log("By patching the catalog ImageRef to point to the v2 catalog")
@@ -517,26 +521,28 @@ func TestClusterExtensionInstallReResolvesWhenNewCatalog(t *testing.T) {
     require.NoError(t, c.Create(context.Background(), clusterExtension))

     t.Log("By reporting a successful resolution and bundle path")
+    // Use catalogPollDuration since this test waits for the catalog to be unpacked and served
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
         require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
         cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-    }, pollDuration, pollInterval)
+    }, catalogPollDuration, pollInterval)

     // update tag on test-catalog image with v2 image
     t.Log("By updating the catalog tag to point to the v2 catalog")
     v2Image := fmt.Sprintf("%s/%s", os.Getenv("LOCAL_REGISTRY_HOST"), os.Getenv("E2E_TEST_CATALOG_V2"))
     err = crane.Tag(v2Image, latestImageTag, crane.Insecure)
     require.NoError(t, err)
+    // Use catalogPollDuration for waiting on catalog re-unpacking after tag update
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
         require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: extensionCatalog.Name}, extensionCatalog))
         cond := apimeta.FindStatusCondition(extensionCatalog.Status.Conditions, ocv1.TypeServing)
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
-    }, pollDuration, pollInterval)
+    }, catalogPollDuration, pollInterval)

     t.Log("By eventually reporting a successful resolution and bundle path")
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -655,6 +661,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
     // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
     // after creating int the Namespace and ServiceAccount.
t.Log("By eventually installing the package successfully") + // Use extendedPollDuration for recovery tests to account for exponential backoff after repeated failures require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -663,7 +670,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, extendedPollDuration, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -777,6 +784,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension // after deleting the Deployment. t.Log("By eventually installing the package successfully") + // Use extendedPollDuration for recovery tests to account for exponential backoff after repeated failures require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -785,7 +793,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, extendedPollDuration, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { diff --git a/test/e2e/webhook_support_test.go b/test/e2e/webhook_support_test.go index 1c80c615b..a3ae361a7 100644 --- a/test/e2e/webhook_support_test.go +++ b/test/e2e/webhook_support_test.go @@ -103,13 +103,14 @@ func TestWebhookSupport(t *testing.T) { }) t.Log("By waiting for the catalog to serve its metadata") + // Use catalogPollDuration since catalog unpacking can take time require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: extensionCatalog.GetName()}, extensionCatalog)) cond := apimeta.FindStatusCondition(extensionCatalog.Status.Conditions, ocv1.TypeServing) require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonAvailable, cond.Reason) - }, pollDuration, pollInterval) + }, catalogPollDuration, pollInterval) t.Log("By installing the webhook-operator ClusterExtension") clusterExtension := &ocv1.ClusterExtension{ @@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) { }) t.Log("By waiting for webhook-operator extension to be installed successfully") + // Use extendedPollDuration for webhook installation as it requires webhook cert generation via + // cert-manager, which can take significant time. 
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
         require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
         cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -147,7 +150,7 @@ func TestWebhookSupport(t *testing.T) {
         require.Contains(ct, cond.Message, "Installed bundle")
         require.NotNil(ct, clusterExtension.Status.Install)
         require.NotEmpty(ct, clusterExtension.Status.Install.Bundle)
-    }, pollDuration, pollInterval)
+    }, extendedPollDuration, pollInterval)

     t.Log("By waiting for webhook-operator deployment to be available")
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
diff --git a/test/helpers/helpers.go b/test/helpers/helpers.go
index af142c6e3..e646cbc05 100644
--- a/test/helpers/helpers.go
+++ b/test/helpers/helpers.go
@@ -34,7 +34,15 @@ var (
 )

 const (
-    pollDuration = time.Minute
+    pollDuration = time.Minute
+    // catalogPollDuration is used for catalog operations (unpacking, serving) which involve
+    // I/O-bound operations like pulling OCI images and unpacking catalog content.
+    catalogPollDuration = 3 * time.Minute
+    // extendedPollDuration is used for operations that involve pod restarts (like upgrades)
+    // or webhook installations with cert-manager. In the worst case of a pod crash during upgrade,
+    // leader election can take up to 163 seconds (LeaseDuration: 137s + RetryPeriod: 26s).
+    // With LeaderElectionReleaseOnCancel: true, graceful shutdowns only take ~26s (RetryPeriod).
+    extendedPollDuration = 5 * time.Minute
     pollInterval = time.Second
     testCatalogName = "test-catalog"
     testCatalogRefEnvVar = "CATALOG_IMG"
@@ -268,7 +276,7 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-    }, pollDuration, pollInterval)
+    }, catalogPollDuration, pollInterval)

     t.Log("Checking that catalog has the expected metadata label")
     require.NotNil(t, catalog.Labels)
@@ -283,13 +291,17 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
-    }, pollDuration, pollInterval)
+    }, catalogPollDuration, pollInterval)
 }

 func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
     ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName}

-    // CRDs may take an extra long time to be deleted, and may run into the following error:
+    // Use a 2 minute timeout for cleanup operations to ensure they complete within the test timeout.
+    // This is shorter than catalogPollDuration (3 min) to leave buffer for the overall test suite.
+    cleanupTimeout := 2 * time.Minute
+
+    // CRDs may take extra time to be deleted, and may run into the following error:
     // Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing"
     t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName)
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -297,7 +309,7 @@ func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
         err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
         require.NoError(ct, err)
         require.Empty(ct, list.Items)
-    }, 5*pollDuration, pollInterval)
+    }, cleanupTimeout, pollInterval)

     t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName)
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -305,7 +317,7 @@ func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
         err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
         require.NoError(ct, err)
         require.Empty(ct, list.Items)
-    }, 2*pollDuration, pollInterval)
+    }, cleanupTimeout, pollInterval)

     t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName)
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -313,7 +325,7 @@ func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
         err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
         require.NoError(ct, err)
         require.Empty(ct, list.Items)
-    }, 2*pollDuration, pollInterval)
+    }, cleanupTimeout, pollInterval)
 }

 func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) {
@@ -348,10 +360,12 @@ func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.
     if ns != nil {
         t.Logf("By deleting Namespace %q", ns.Name)
         require.NoError(t, c.Delete(context.Background(), ns))
+        // Namespace deletion may take longer as it needs to delete all resources within it.
+        // Use extendedPollDuration to allow sufficient time for graceful cleanup.
         require.Eventually(t, func() bool {
             err := c.Get(context.Background(), types.NamespacedName{Name: ns.Name}, &corev1.Namespace{})
             return errors.IsNotFound(err)
-        }, pollDuration, pollInterval)
+        }, extendedPollDuration, pollInterval)
     }
 }
diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go
index 785d91ea3..802e9c33c 100644
--- a/test/upgrade-e2e/post_upgrade_test.go
+++ b/test/upgrade-e2e/post_upgrade_test.go
@@ -11,11 +11,13 @@ import (
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
     appsv1 "k8s.io/api/apps/v1"
+    coordinationv1 "k8s.io/api/coordination/v1"
     corev1 "k8s.io/api/core/v1"
     apimeta "k8s.io/apimachinery/pkg/api/meta"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/apimachinery/pkg/util/wait"
     "sigs.k8s.io/controller-runtime/pkg/client"

     ocv1 "github.com/operator-framework/operator-controller/api/v1"
@@ -25,6 +27,11 @@ import (
 const (
     artifactName = "operator-controller-upgrade-e2e"
     container    = "manager"
+    // pollDuration is set to 3 minutes for upgrade tests because upgrades cause pod restarts.
+    // With LeaderElectionReleaseOnCancel: true, graceful shutdowns take ~26s (RetryPeriod).
+    // In the worst case (pod crash), leader election can take up to 163s (LeaseDuration: 137s + RetryPeriod: 26s).
+    pollDuration = 3 * time.Minute
+    pollInterval = time.Second
 )

 func TestClusterCatalogUnpacking(t *testing.T) {
@@ -45,23 +52,22 @@ func TestClusterCatalogUnpacking(t *testing.T) {
         require.Equal(ct, *managerDeployment.Spec.Replicas, managerDeployment.Status.ReadyReplicas)
     }, time.Minute, time.Second)

-    var managerPod corev1.Pod
-    t.Log("Waiting for only one controller-manager pod to remain")
+    t.Log("Waiting for controller-manager pods to match the desired replica count")
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
         var managerPods corev1.PodList
         err := c.List(ctx, &managerPods, client.MatchingLabels(managerLabelSelector))
         require.NoError(ct, err)
-        require.Len(ct, managerPods.Items, 1)
-        managerPod = managerPods.Items[0]
+        require.Len(ct, managerPods.Items, int(*managerDeployment.Spec.Replicas))
     }, time.Minute, time.Second)

     t.Log("Waiting for acquired leader election")
-    leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
+    leaderCtx, leaderCancel := context.WithTimeout(ctx, pollDuration)
     defer leaderCancel()
-    leaderSubstrings := []string{"successfully acquired lease"}
-    leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...)
+
+    // When there are multiple replicas, find the leader pod
+    managerPod, err := findLeaderPod(leaderCtx, "catalogd")
     require.NoError(t, err)
-    require.True(t, leaderElected)
+    require.NotNil(t, managerPod)

     t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1")
     logCtx, cancel := context.WithTimeout(ctx, time.Minute)
@@ -70,7 +76,7 @@ func TestClusterCatalogUnpacking(t *testing.T) {
         "reconcile ending",
         fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName),
     }
-    found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...)
+    found, err := watchPodLogsForSubstring(logCtx, managerPod, substrings...)
     require.NoError(t, err)
     require.True(t, found)

@@ -83,7 +89,7 @@ func TestClusterCatalogUnpacking(t *testing.T) {
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-    }, time.Minute, time.Second)
+    }, pollDuration, pollInterval)

     t.Log("Ensuring ClusterCatalog has Status.Condition of Serving with a status == True, reason == Available")
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -93,7 +99,7 @@ func TestClusterCatalogUnpacking(t *testing.T) {
         require.NotNil(ct, cond)
         require.Equal(ct, metav1.ConditionTrue, cond.Status)
         require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
-    }, time.Minute, time.Second)
+    }, pollDuration, pollInterval)
 }

 func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
@@ -103,22 +109,30 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {

     // wait for catalogd deployment to finish
     t.Log("Wait for catalogd deployment to be ready")
-    catalogdManagerPod := waitForDeployment(t, ctx, "catalogd")
+    waitForDeployment(t, ctx, "catalogd")
+
+    // Find the catalogd leader pod
+    catalogdLeaderCtx, catalogdLeaderCancel := context.WithTimeout(ctx, pollDuration)
+    defer catalogdLeaderCancel()
+    catalogdManagerPod, err := findLeaderPod(catalogdLeaderCtx, "catalogd")
+    require.NoError(t, err)
+    require.NotNil(t, catalogdManagerPod)

     // wait for operator-controller deployment to finish
     t.Log("Wait for operator-controller deployment to be ready")
-    managerPod := waitForDeployment(t, ctx, "operator-controller")
+    waitForDeployment(t, ctx, "operator-controller")

     t.Log("Wait for acquired leader election")
     // Average case is under 1 minute but in the worst case: (previous leader crashed)
     // we could have LeaseDuration (137s) + RetryPeriod (26s) +/- 163s
-    leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
+    leaderCtx, leaderCancel := context.WithTimeout(ctx, pollDuration)
     defer leaderCancel()
-    leaderSubstrings := []string{"successfully acquired lease"}
-    leaderElected, err := watchPodLogsForSubstring(leaderCtx, managerPod, leaderSubstrings...)
+
+    // When there are multiple replicas, find the leader pod
+    var managerPod *corev1.Pod
+    managerPod, err = findLeaderPod(leaderCtx, "operator-controller")
     require.NoError(t, err)
-    require.True(t, leaderElected)
+    require.NotNil(t, managerPod)

     t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it")
     // Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes
@@ -154,7 +168,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
         require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)

         require.True(ct, clusterCatalog.Status.LastUnpacked.After(catalogdManagerPod.CreationTimestamp.Time))
-    }, time.Minute, time.Second)
+    }, pollDuration, pollInterval)

     // TODO: if we change the underlying revision storage mechanism, the new version
     // will not detect any installed versions, we need to make sure that the upgrade
@@ -172,7 +186,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
         require.Contains(ct, cond.Message, "Installed bundle")
         require.NotNil(ct, clusterExtension.Status.Install)
         require.NotEmpty(ct, clusterExtension.Status.Install.Bundle.Version)
-    }, time.Minute, time.Second)
+    }, pollDuration, pollInterval)

     previousVersion := clusterExtension.Status.Install.Bundle.Version

@@ -182,6 +196,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
     require.NoError(t, c.Update(ctx, &clusterExtension))

     t.Log("Checking that the ClusterExtension installs successfully")
+    // Use 10 minutes for post-OLM-upgrade extension upgrade operations.
+    // After upgrading OLM itself, the system needs time to:
+    // - Stabilize after operator-controller pods restart (leader election: up to 163s)
+    // - Process the ClusterExtension spec change
+    // - Resolve and unpack the new bundle (1.0.1)
+    // - Apply manifests and wait for rollout
+    // In multi-replica deployments after a recent OLM upgrade, this can take significant time.
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
         require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: testClusterExtensionName}, &clusterExtension))
         cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -190,14 +211,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
         require.Contains(ct, cond.Message, "Installed bundle")
         require.Equal(ct, ocv1.BundleMetadata{Name: "test-operator.1.0.1", Version: "1.0.1"}, clusterExtension.Status.Install.Bundle)
         require.NotEqual(ct, previousVersion, clusterExtension.Status.Install.Bundle.Version)
-    }, time.Minute, time.Second)
+    }, 10*time.Minute, pollInterval)
 }

 // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label
 // has reached the desired number of replicas and that the number pods matches that number
-// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary
-// to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626
-func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod {
+// i.e. no old pods remain.
+func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) {
     deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()

     t.Log("Checking that the deployment is updated")
@@ -217,13 +237,82 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
         desiredNumReplicas = *managerDeployment.Spec.Replicas
     }, time.Minute, time.Second)

-    var managerPods corev1.PodList
     t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
     require.EventuallyWithT(t, func(ct *assert.CollectT) {
+        var managerPods corev1.PodList
         require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
-        require.Len(ct, managerPods.Items, 1)
+        require.Len(ct, managerPods.Items, int(desiredNumReplicas))
     }, time.Minute, time.Second)
-
-    return &managerPods.Items[0]
+}
+
+// findLeaderPod finds the pod that has acquired the leader lease by inspecting the Lease resource.
+// This is more reliable than checking logs as it directly queries the Kubernetes API for the lease holder.
+func findLeaderPod(ctx context.Context, controlPlaneLabel string) (*corev1.Pod, error) {
+    deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()
+
+    // Map component name to its LeaderElectionID
+    leaseNameMap := map[string]string{
+        "operator-controller": "9c4404e7.operatorframework.io",
+        "catalogd":            "catalogd-operator-lock",
+    }
+
+    leaseName, ok := leaseNameMap[controlPlaneLabel]
+    if !ok {
+        return nil, fmt.Errorf("unknown control plane label: %s", controlPlaneLabel)
+    }
+
+    var leaderPod *corev1.Pod
+
+    // Poll until the leader lease can be matched to a running pod, or the context times out
+    err := wait.PollUntilContextTimeout(ctx, 2*time.Second, pollDuration, true, func(ctx context.Context) (bool, error) {
+        // Fetch lease first to avoid race condition where pod count changes between list and lease check
+        var lease coordinationv1.Lease
+        if err := c.Get(ctx, types.NamespacedName{
+            Name:      leaseName,
+            Namespace: "olmv1-system",
+        }, &lease); err != nil {
+            // Lease might not exist yet, retry
+            return false, nil
+        }
+
+        if lease.Spec.HolderIdentity == nil {
+            // No leader elected yet, retry
+            return false, nil
+        }
+
+        // The HolderIdentity is typically in the format "pod-name_hash"
+        holderIdentity := *lease.Spec.HolderIdentity
+
+        // Now fetch pods and find the one matching the lease holder
+        var managerPods corev1.PodList
+        if err := c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}); err != nil {
+            return false, nil
+        }
+
+        if len(managerPods.Items) == 0 {
+            return false, nil
+        }
+
+        // Find the pod that matches the holder identity
+        for i := range managerPods.Items {
+            pod := &managerPods.Items[i]
+            // The holder identity contains the pod name as a prefix
+            if strings.HasPrefix(holderIdentity, pod.Name+"_") {
+                leaderPod = pod
+                return true, nil
+            }
+        }
+
+        // If we couldn't match the holder identity to a pod, retry
+        // This can happen during pod rotation
+        return false, nil
+    })
+
+    if err != nil {
+        return nil, fmt.Errorf("timeout waiting for leader election: %w", err)
+    }
+
+    return leaderPod, nil
 }

 func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {
diff --git a/testdata/build-test-registry.sh b/testdata/build-test-registry.sh
index e2dcc0914..7dee9e3c1 100755
--- a/testdata/build-test-registry.sh
+++ b/testdata/build-test-registry.sh
@@ -103,7 +103,7 @@ spec:
   type: NodePort
 EOF

-kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=60s
+kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=3m

 kubectl apply -f - << EOF
 apiVersion: batch/v1
@@ -135,4 +135,4 @@ spec:
       secretName: ${namespace}-registry
 EOF

-kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=60s
+kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=3m