From 7efac25a10b5dedd04d9eba04b6cd133aaa1c244 Mon Sep 17 00:00:00 2001 From: Gianluca Mardente Date: Fri, 6 Mar 2026 16:01:25 +0100 Subject: [PATCH] (bug) Stale ResourceSummary instances in agentless mode In agentless mode, when a profile in mode `ContinuousWithDriftDetection` matches a managed cluster, Sveltos creates the following resources in the management cluster: 1. A ResourceSummary instance (in their cluster namespace) 2. A drift-detection-manager deployment (in the projectsveltos namespace) dedicated to that managed cluster. The `drift-detection-manager` adds a finalizer to `ResourceSummary` and is responsible for removing it during the deletion reconciliation process. In agentless mode, Sveltos also starts a goroutine that periodically (every minute) scans for stale ResourceSummary/drift-detection-manager deployments. Stale here means resources referring to a managed cluster that has been deleted. When a managed cluster is deleted, the cleanup goroutine attempts to delete both the ResourceSummary and the drift-detection-manager deployment. This leads to two potential outcomes: 1. Graceful Cleanup: The drift-detection-manager reconciles the ResourceSummary deletion, removes the finalizer, and the resource is cleaned up before the manager itself is terminated. 2. Orphaned Resources: The drift-detection-manager pod is terminated before it can process the ResourceSummary deletion. Consequently, the ResourceSummary remains stuck in a "Terminating" state indefinitely because the controller responsible for removing its finalizer no longer exists. This PR ensures the drift-detection-manager deployment is only deleted after all associated ResourceSummary instances for that cluster have been fully removed. By enforcing this order, we guarantee the manager remains active long enough to clear its finalizers. 
--- controllers/export_test.go | 9 +-- controllers/utils.go | 65 +++++++++++++------- controllers/utils_test.go | 121 +++++++++++++++++++++++++++++++++++-- 3 files changed, 164 insertions(+), 31 deletions(-) diff --git a/controllers/export_test.go b/controllers/export_test.go index 2c3fd2fa..7269be67 100644 --- a/controllers/export_test.go +++ b/controllers/export_test.go @@ -147,10 +147,11 @@ var ( InstantiateTemplateValues = instantiateTemplateValues FetchClusterObjects = fetchClusterObjects - IsCluterSummaryProvisioned = isCluterSummaryProvisioned - IsNamespaced = isNamespaced - StringifyMap = stringifyMap - ParseMapFromString = parseMapFromString + IsCluterSummaryProvisioned = isCluterSummaryProvisioned + IsNamespaced = isNamespaced + RemoveStaleDriftDetectionResources = removeStaleDriftDetectionResources + StringifyMap = stringifyMap + ParseMapFromString = parseMapFromString GetTemplateResourceRefHash = getTemplateResourceRefHash ) diff --git a/controllers/utils.go b/controllers/utils.go index e46e014c..c77c8b83 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -351,32 +351,41 @@ func removeStaleDriftDetectionResources(ctx context.Context, logger logr.Logger) }, } - for { - time.Sleep(time.Minute) - - c := getManagementClusterClient() - driftDetectionDeployments := &appsv1.DeploymentList{} - err := c.List(ctx, driftDetectionDeployments, listOptions...) 
- if err != nil { - logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to collect driftDetection deployment: %v", err)) - continue - } + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() - for i := range driftDetectionDeployments.Items { - depl := &driftDetectionDeployments.Items[i] + for { + select { + case <-ctx.Done(): + logger.V(logs.LogInfo).Info("stopping stale drift detection resource cleanup: context canceled") + return + case <-ticker.C: + c := getManagementClusterClient() + driftDetectionDeployments := &appsv1.DeploymentList{} + err := c.List(ctx, driftDetectionDeployments, listOptions...) + if err != nil { + logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to collect driftDetection deployment: %v", err)) + continue + } - exist, clusterNs, clusterName, clusterType := deplAssociatedClusterExist(ctx, c, depl, logger) - if !exist { - // find resourceSummaries from this cluster and remove those. - // Remove deployment only after this one succeed - err = removeStaleResourceSummary(ctx, clusterNs, clusterName, clusterType, logger) - if err != nil { - continue + for i := range driftDetectionDeployments.Items { + depl := &driftDetectionDeployments.Items[i] + + exist, clusterNs, clusterName, clusterType := deplAssociatedClusterExist(ctx, c, depl, logger) + if !exist { + // find resourceSummaries from this cluster and remove those. + // Remove deployment only after this one succeed + // Do not change the order. First delete ResourceSummary instances. 
Till ResourceSummary + // instances are found, do not delete the drift-detection-manager deployment + err = removeStaleResourceSummary(ctx, clusterNs, clusterName, clusterType, logger) + if err != nil { + continue + } + + logger.V(logs.LogInfo).Info(fmt.Sprintf("deleting driftDetection deployment %s/%s", + depl.Namespace, depl.Name)) + _ = c.Delete(ctx, depl) } - - logger.V(logs.LogInfo).Info(fmt.Sprintf("deleting driftDetection deployment %s/%s", - depl.Namespace, depl.Name)) - _ = c.Delete(ctx, depl) } } } @@ -421,6 +430,16 @@ func removeStaleResourceSummary(ctx context.Context, clusterNamespace, clusterNa } } + if len(resourceSummaries.Items) > 0 { + // The drift-detection-manager adds a finalizer to ResourceSummary instances and removes + // it only after a successful deletion. If ResourceSummary instances still exist, we return + // an error to prevent the drift-detection-manager deployment from being deleted, ensuring + // it has sufficient time to complete the cleanup. Otherwise, we risk leaving behind stale + // ResourceSummaries or, worse, deleted instances stuck with a finalizer that have no manager + // left to clean them up. 
+ return fmt.Errorf("resourceSummary instances still present") + } + return nil } diff --git a/controllers/utils_test.go b/controllers/utils_test.go index a91300c4..d70775d5 100644 --- a/controllers/utils_test.go +++ b/controllers/utils_test.go @@ -45,6 +45,7 @@ import ( "github.com/projectsveltos/addon-controller/lib/clusterops" libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" "github.com/projectsveltos/libsveltos/lib/k8s_utils" + "github.com/projectsveltos/libsveltos/lib/sveltos_upgrade" ) const ( @@ -55,7 +56,6 @@ const ( const ( upstreamClusterNamePrefix = "upstream-cluster" - upstreamMachineNamePrefix = "upstream-machine" clusterProfileNamePrefix = "cluster-profile" ) @@ -500,14 +500,127 @@ metadata: currentRS := &libsveltosv1beta1.ResourceSummary{} - Expect(controllers.RemoveStaleResourceSummary(context.TODO(), clusterNamespace, clusterName, - clusterType, logger)).To(Succeed()) + err = controllers.RemoveStaleResourceSummary(context.TODO(), clusterNamespace, clusterName, + clusterType, logger) + Expect(err).ToNot(BeNil()) + Expect(err.Error()).To(ContainSubstring("resourceSummary instances still present")) Eventually(func() bool { err = testEnv.Get(context.TODO(), types.NamespacedName{Namespace: u.GetNamespace(), Name: u.GetName()}, currentRS) if err == nil { - return false + return !currentRS.DeletionTimestamp.IsZero() + } + return apierrors.IsNotFound(err) + }, timeout, pollingInterval).Should(BeTrue()) + }) + + It("removeStaleDriftDetectionResources deletes drift-detection-manager after all ResourceSummaries are gone", + func(ctx SpecContext) { + namespace := randomString() + clusterName := randomString() + + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + Expect(testEnv.Create(context.TODO(), ns)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, ns)).To(Succeed()) + + resourceSummary := &libsveltosv1beta1.ResourceSummary{ + ObjectMeta: metav1.ObjectMeta{ + Name: randomString(), + 
Namespace: namespace, + Labels: map[string]string{ + sveltos_upgrade.ClusterNameLabel: clusterName, + sveltos_upgrade.ClusterTypeLabel: strings.ToLower(string(libsveltosv1beta1.ClusterTypeSveltos)), + }, + }, + } + + Expect(testEnv.Create(context.TODO(), resourceSummary)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, resourceSummary)).To(Succeed()) + + lbls := map[string]string{ + "cluster-namespace": namespace, + "cluster-name": clusterName, + "cluster-type": strings.ToLower(string(libsveltosv1beta1.ClusterTypeSveltos)), + "feature": "drift-detection", + } + + var replicas int32 = 1 + driftDetectionManager := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: randomString(), + Namespace: "projectsveltos", + Labels: lbls, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: lbls, + }, + Replicas: &replicas, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: lbls, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: randomString(), + Image: randomString(), + ImagePullPolicy: corev1.PullAlways, + }, + }, + ServiceAccountName: randomString(), + }, + }, + }, + } + Expect(testEnv.Create(context.TODO(), driftDetectionManager)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, driftDetectionManager)).To(Succeed()) + + logger := textlogger.NewLogger(textlogger.NewConfig()) + go controllers.RemoveStaleDriftDetectionResources(ctx, logger) + + // RemoveStaleDriftDetectionResources sleeps for a minute + time.Sleep(time.Minute) + + // Because ResourceSummary exists, deployment is not deleted + Consistently(func() bool { + currentDeployment := &appsv1.Deployment{} + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: driftDetectionManager.GetNamespace(), Name: driftDetectionManager.GetName()}, + currentDeployment) + return err == nil + }, timeout, pollingInterval).Should(BeTrue()) + + currentResourceSummary := 
&libsveltosv1beta1.ResourceSummary{} + Expect(testEnv.Get(context.TODO(), types.NamespacedName{Namespace: resourceSummary.Namespace, Name: resourceSummary.Name}, + currentResourceSummary)).To(Succeed()) + currentResourceSummary.Finalizers = nil + Expect(testEnv.Delete(context.TODO(), currentResourceSummary)).To(Succeed()) + + Eventually(func() bool { + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: resourceSummary.Namespace, Name: resourceSummary.Name}, + currentResourceSummary) + return err != nil && + apierrors.IsNotFound(err) + }, timeout, pollingInterval).Should(BeTrue()) + + // RemoveStaleDriftDetectionResources sleeps for a minute + time.Sleep(time.Minute) + + // Because ResourceSummary is gone, deployment is deleted + Eventually(func() bool { + currentDeployment := &appsv1.Deployment{} + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: driftDetectionManager.GetNamespace(), Name: driftDetectionManager.GetName()}, + currentDeployment) + if err == nil { + return !currentDeployment.DeletionTimestamp.IsZero() } return apierrors.IsNotFound(err) }, timeout, pollingInterval).Should(BeTrue())