6 changes: 3 additions & 3 deletions Makefile
@@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
HELM_SETTINGS ?=
.PHONY: $(MANIFESTS)
$(MANIFESTS): $(HELM)
@@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
CATD_NAMESPACE := olmv1-system
.PHONY: wait
wait:
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

.PHONY: docker-build
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
4 changes: 2 additions & 2 deletions hack/test/pre-upgrade-setup.sh
@@ -155,5 +155,5 @@ spec:
version: 1.0.0
EOF

kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
10 changes: 10 additions & 0 deletions helm/high-availability.yaml
@@ -0,0 +1,10 @@
# High Availability (HA) configuration for OLMv1
# Sets replicas to 2 for both operator-controller and catalogd to enable HA setup
# This is used in experimental-e2e.yaml to test multi-replica deployments
options:
operatorController:
deployment:
replicas: 2
catalogd:
deployment:
replicas: 2
@@ -12,11 +12,11 @@ metadata:
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
minReadySeconds: 5
replicas: 1
replicas: {{ .Values.options.catalogd.deployment.replicas }}
Member:
Do we already have node anti-affinity configured to make sure these replicas do not end up on the same node? If not, we need that as well (but only when replicas > 1).

@tmshort (Contributor) commented on Dec 10, 2025:
However, I will point out that this may cause an issue on our single-node kind experimental-e2e tests, where we have two replicas (such that we are validating that two replicas do not cause issues with the e2e tests).

strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
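To make the first review comment above concrete: below is a minimal sketch of a replica-gated pod anti-affinity rule for the catalogd Deployment template. This is not part of the PR; the `control-plane` pod label is an assumption (use whatever selector labels the chart actually sets), and it reuses the `replicas` value introduced in this change.

```yaml
# Hypothetical sketch, not in this PR: spread catalogd replicas across nodes,
# but only when more than one replica is requested.
{{- if gt (int .Values.options.catalogd.deployment.replicas) 1 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname   # at most one replica per node, best effort
                labelSelector:
                  matchLabels:
                    control-plane: catalogd-controller-manager   # assumed pod label
{{- end }}
```

Using the `preferred` rather than the `required` variant would also address the single-node experimental-e2e concern raised above: the second replica still schedules onto the same node instead of staying Pending.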
@@ -11,11 +11,11 @@ metadata:
name: operator-controller-controller-manager
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
replicas: 1
replicas: {{ .Values.options.operatorController.deployment.replicas }}
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
@@ -8,6 +8,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/operator-controller:devel
replicas: 1
extraArguments: []
features:
enabled: []
@@ -19,6 +20,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/catalogd:devel
replicas: 1
extraArguments: []
features:
enabled: []
8 changes: 4 additions & 4 deletions manifests/experimental-e2e.yaml
@@ -2107,11 +2107,11 @@ metadata:
namespace: olmv1-system
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2258,11 +2258,11 @@ metadata:
name: operator-controller-controller-manager
namespace: olmv1-system
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
@@ -2036,7 +2036,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -2174,7 +2174,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
@@ -1799,7 +1799,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1949,7 +1949,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
4 changes: 2 additions & 2 deletions manifests/standard.yaml
@@ -1724,7 +1724,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
@@ -1861,7 +1861,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
32 changes: 20 additions & 12 deletions test/e2e/cluster_extension_install_test.go
@@ -29,6 +29,8 @@ import (
const (
artifactName = "operator-controller-e2e"
pollDuration = time.Minute
catalogPollDuration = 3 * time.Minute
extendedPollDuration = 5 * time.Minute
pollInterval = time.Second
testCatalogRefEnvVar = "CATALOG_IMG"
testCatalogName = "test-catalog"
@@ -167,20 +169,19 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`,
t.Log("By eventually reporting a successful resolution and bundle path")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
}, 2*time.Minute, pollInterval)
}, pollDuration, pollInterval)

// Give the check 2 minutes instead of the typical 1 for the pod's
// files to update from the configmap change.
// Give the check extra time for the pod's files to update from the configmap change.
// The theoretical max time is the kubelet sync period of 1 minute +
// ConfigMap cache TTL of 1 minute = 2 minutes
// ConfigMap cache TTL of 1 minute = 2 minutes, plus buffer for reconciliation.
t.Log("By eventually reporting progressing as True")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
}, 2*time.Minute, pollInterval)
}, extendedPollDuration, pollInterval)

t.Log("By eventually installing the package successfully")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -333,13 +334,14 @@ func TestClusterExtensionForceInstallNonSuccessorVersion(t *testing.T) {
}
require.NoError(t, c.Create(context.Background(), clusterExtension))
t.Log("By eventually reporting a successful resolution")
// Use catalogPollDuration for initial catalog resolution
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

t.Log("It allows to upgrade the ClusterExtension to a non-successor version")
t.Log("By updating the ClusterExtension resource to a non-successor version")
@@ -380,13 +382,14 @@ func TestClusterExtensionInstallSuccessorVersion(t *testing.T) {
}
require.NoError(t, c.Create(context.Background(), clusterExtension))
t.Log("By eventually reporting a successful resolution")
// Use catalogPollDuration for initial catalog resolution
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

t.Log("It does allow to upgrade the ClusterExtension to any of the successor versions within non-zero major version")
t.Log("By updating the ClusterExtension resource by skipping versions")
@@ -436,13 +439,14 @@ func TestClusterExtensionInstallReResolvesWhenCatalogIsPatched(t *testing.T) {
require.NoError(t, c.Create(context.Background(), clusterExtension))

t.Log("By reporting a successful resolution and bundle path")
// Use catalogPollDuration since this test waits for the catalog to be unpacked and served
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

// patch imageRef tag on test-catalog image with v2 image
t.Log("By patching the catalog ImageRef to point to the v2 catalog")
@@ -517,26 +521,28 @@ func TestClusterExtensionInstallReResolvesWhenNewCatalog(t *testing.T) {
require.NoError(t, c.Create(context.Background(), clusterExtension))

t.Log("By reporting a successful resolution and bundle path")
// Use catalogPollDuration since this test waits for the catalog to be unpacked and served
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

// update tag on test-catalog image with v2 image
t.Log("By updating the catalog tag to point to the v2 catalog")
v2Image := fmt.Sprintf("%s/%s", os.Getenv("LOCAL_REGISTRY_HOST"), os.Getenv("E2E_TEST_CATALOG_V2"))
err = crane.Tag(v2Image, latestImageTag, crane.Insecure)
require.NoError(t, err)
// Use catalogPollDuration for waiting on catalog re-unpacking after tag update
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: extensionCatalog.Name}, extensionCatalog))
cond := apimeta.FindStatusCondition(extensionCatalog.Status.Conditions, ocv1.TypeServing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

t.Log("By eventually reporting a successful resolution and bundle path")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -655,6 +661,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
// after creating int the Namespace and ServiceAccount.
t.Log("By eventually installing the package successfully")
// Use extendedPollDuration for recovery tests to account for exponential backoff after repeated failures
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -663,7 +670,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
require.Contains(ct, cond.Message, "Installed bundle")
require.NotEmpty(ct, clusterExtension.Status.Install)
}, pollDuration, pollInterval)
}, extendedPollDuration, pollInterval)

t.Log("By eventually reporting Progressing == True with Reason Success")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -777,6 +784,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi
// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
// after deleting the Deployment.
t.Log("By eventually installing the package successfully")
// Use extendedPollDuration for recovery tests to account for exponential backoff after repeated failures
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -785,7 +793,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi
require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
require.Contains(ct, cond.Message, "Installed bundle")
require.NotEmpty(ct, clusterExtension.Status.Install)
}, pollDuration, pollInterval)
}, extendedPollDuration, pollInterval)
Member:
Same as above:
Why do we need to account for leader election time here? This test should not cause a restart of our controllers. The original 1m duration seems like it has been sufficient for exponential backoff concerns?


t.Log("By eventually reporting Progressing == True with Reason Success")
require.EventuallyWithT(t, func(ct *assert.CollectT) {
7 changes: 5 additions & 2 deletions test/e2e/webhook_support_test.go
@@ -103,13 +103,14 @@ func TestWebhookSupport(t *testing.T) {
})

t.Log("By waiting for the catalog to serve its metadata")
// Use catalogPollDuration since catalog unpacking can take time
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: extensionCatalog.GetName()}, extensionCatalog))
cond := apimeta.FindStatusCondition(extensionCatalog.Status.Conditions, ocv1.TypeServing)
require.NotNil(ct, cond)
require.Equal(ct, metav1.ConditionTrue, cond.Status)
require.Equal(ct, ocv1.ReasonAvailable, cond.Reason)
}, pollDuration, pollInterval)
}, catalogPollDuration, pollInterval)

t.Log("By installing the webhook-operator ClusterExtension")
clusterExtension := &ocv1.ClusterExtension{
@@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) {
})

t.Log("By waiting for webhook-operator extension to be installed successfully")
// Use extendedPollDuration for webhook installation as it requires webhook cert generation via
// cert-manager, which can take significant time.
require.EventuallyWithT(t, func(ct *assert.CollectT) {
require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -147,7 +150,7 @@ func TestWebhookSupport(t *testing.T) {
require.Contains(ct, cond.Message, "Installed bundle")
require.NotNil(ct, clusterExtension.Status.Install)
require.NotEmpty(ct, clusterExtension.Status.Install.Bundle)
}, pollDuration, pollInterval)
}, extendedPollDuration, pollInterval)

t.Log("By waiting for webhook-operator deployment to be available")
require.EventuallyWithT(t, func(ct *assert.CollectT) {