diff --git a/controllers/export_test.go b/controllers/export_test.go index 2c3fd2fa..7269be67 100644 --- a/controllers/export_test.go +++ b/controllers/export_test.go @@ -147,10 +147,11 @@ var ( InstantiateTemplateValues = instantiateTemplateValues FetchClusterObjects = fetchClusterObjects - IsCluterSummaryProvisioned = isCluterSummaryProvisioned - IsNamespaced = isNamespaced - StringifyMap = stringifyMap - ParseMapFromString = parseMapFromString + IsCluterSummaryProvisioned = isCluterSummaryProvisioned + IsNamespaced = isNamespaced + RemoveStaleDriftDetectionResources = removeStaleDriftDetectionResources + StringifyMap = stringifyMap + ParseMapFromString = parseMapFromString GetTemplateResourceRefHash = getTemplateResourceRefHash ) diff --git a/controllers/utils.go b/controllers/utils.go index e46e014c..c77c8b83 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -351,32 +351,41 @@ func removeStaleDriftDetectionResources(ctx context.Context, logger logr.Logger) }, } - for { - time.Sleep(time.Minute) - - c := getManagementClusterClient() - driftDetectionDeployments := &appsv1.DeploymentList{} - err := c.List(ctx, driftDetectionDeployments, listOptions...) - if err != nil { - logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to collect driftDetection deployment: %v", err)) - continue - } + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() - for i := range driftDetectionDeployments.Items { - depl := &driftDetectionDeployments.Items[i] + for { + select { + case <-ctx.Done(): + logger.V(logs.LogInfo).Info("stopping stale drift detection resource cleanup: context canceled") + return + case <-ticker.C: + c := getManagementClusterClient() + driftDetectionDeployments := &appsv1.DeploymentList{} + err := c.List(ctx, driftDetectionDeployments, listOptions...) + if err != nil { + logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to collect driftDetection deployment: %v", err)) + continue + } - exist, clusterNs, clusterName, clusterType := deplAssociatedClusterExist(ctx, c, depl, logger) - if !exist { - // find resourceSummaries from this cluster and remove those. - // Remove deployment only after this one succeed - err = removeStaleResourceSummary(ctx, clusterNs, clusterName, clusterType, logger) - if err != nil { - continue + for i := range driftDetectionDeployments.Items { + depl := &driftDetectionDeployments.Items[i] + + exist, clusterNs, clusterName, clusterType := deplAssociatedClusterExist(ctx, c, depl, logger) + if !exist { + // find resourceSummaries from this cluster and remove those. + // Remove deployment only after this one succeed + // Do not change the order. First delete ResourceSummary instances. Till ResourceSummary + // instances are found, do not delete the drift-detection-manager deployment + err = removeStaleResourceSummary(ctx, clusterNs, clusterName, clusterType, logger) + if err != nil { + continue + } + + logger.V(logs.LogInfo).Info(fmt.Sprintf("deleting driftDetection deployment %s/%s", + depl.Namespace, depl.Name)) + _ = c.Delete(ctx, depl) } - - logger.V(logs.LogInfo).Info(fmt.Sprintf("deleting driftDetection deployment %s/%s", - depl.Namespace, depl.Name)) - _ = c.Delete(ctx, depl) } } } @@ -421,6 +430,16 @@ func removeStaleResourceSummary(ctx context.Context, clusterNamespace, clusterNa } } + if len(resourceSummaries.Items) > 0 { + // The drift-detection-manager adds a finalizer to ResourceSummary instances and removes + // it only after a successful deletion. If ResourceSummary instances still exist, we return + // an error to prevent the drift-detection-manager deployment from being deleted, ensuring + // it has sufficient time to complete the cleanup. Otherwise, we risk leaving behind stale + // ResourceSummaries or, worse, deleted instances stuck with a finalizer that have no manager + // left to clean them up. + return fmt.Errorf("resourceSummary instances still present") + } + return nil } diff --git a/controllers/utils_test.go b/controllers/utils_test.go index a91300c4..d70775d5 100644 --- a/controllers/utils_test.go +++ b/controllers/utils_test.go @@ -45,6 +45,7 @@ import ( "github.com/projectsveltos/addon-controller/lib/clusterops" libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" "github.com/projectsveltos/libsveltos/lib/k8s_utils" + "github.com/projectsveltos/libsveltos/lib/sveltos_upgrade" ) const ( @@ -55,7 +56,6 @@ const ( const ( upstreamClusterNamePrefix = "upstream-cluster" - upstreamMachineNamePrefix = "upstream-machine" clusterProfileNamePrefix = "cluster-profile" ) @@ -500,14 +500,127 @@ metadata: currentRS := &libsveltosv1beta1.ResourceSummary{} - Expect(controllers.RemoveStaleResourceSummary(context.TODO(), clusterNamespace, clusterName, - clusterType, logger)).To(Succeed()) + err = controllers.RemoveStaleResourceSummary(context.TODO(), clusterNamespace, clusterName, + clusterType, logger) + Expect(err).ToNot(BeNil()) + Expect(err.Error()).To(ContainSubstring("resourceSummary instances still present")) Eventually(func() bool { err = testEnv.Get(context.TODO(), types.NamespacedName{Namespace: u.GetNamespace(), Name: u.GetName()}, currentRS) if err == nil { - return false + return !currentRS.DeletionTimestamp.IsZero() + } + return apierrors.IsNotFound(err) + }, timeout, pollingInterval).Should(BeTrue()) + }) + + It("removeStaleDriftDetectionResources deletes drift-detection-manager after all ResourceSummaries are gone", + func(ctx SpecContext) { + namespace := randomString() + clusterName := randomString() + + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + Expect(testEnv.Create(context.TODO(), ns)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, ns)).To(Succeed()) + + resourceSummary := &libsveltosv1beta1.ResourceSummary{ + ObjectMeta: metav1.ObjectMeta{ + Name: randomString(), + Namespace: namespace, + Labels: map[string]string{ + sveltos_upgrade.ClusterNameLabel: clusterName, + sveltos_upgrade.ClusterTypeLabel: strings.ToLower(string(libsveltosv1beta1.ClusterTypeSveltos)), + }, + }, + } + + Expect(testEnv.Create(context.TODO(), resourceSummary)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, resourceSummary)).To(Succeed()) + + lbls := map[string]string{ + "cluster-namespace": namespace, + "cluster-name": clusterName, + "cluster-type": strings.ToLower(string(libsveltosv1beta1.ClusterTypeSveltos)), + "feature": "drift-detection", + } + + var replicas int32 = 1 + driftDetectionManager := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: randomString(), + Namespace: "projectsveltos", + Labels: lbls, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: lbls, + }, + Replicas: &replicas, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: lbls, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: randomString(), + Image: randomString(), + ImagePullPolicy: corev1.PullAlways, + }, + }, + ServiceAccountName: randomString(), + }, + }, + }, + } + Expect(testEnv.Create(context.TODO(), driftDetectionManager)).To(Succeed()) + Expect(waitForObject(context.TODO(), testEnv, driftDetectionManager)).To(Succeed()) + + logger := textlogger.NewLogger(textlogger.NewConfig()) + go controllers.RemoveStaleDriftDetectionResources(ctx, logger) + + // RemoveStaleDriftDetectionResources sleeps for a minute + time.Sleep(time.Minute) + + // Because ResourceSummary exists, deployment is not deleted + Consistently(func() bool { + currentDeployment := &appsv1.Deployment{} + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: driftDetectionManager.GetNamespace(), Name: driftDetectionManager.GetName()}, + currentDeployment) + return err == nil + }, timeout, pollingInterval).Should(BeTrue()) + + currentResourceSummary := &libsveltosv1beta1.ResourceSummary{} + Expect(testEnv.Get(context.TODO(), types.NamespacedName{Namespace: resourceSummary.Namespace, Name: resourceSummary.Name}, + currentResourceSummary)).To(Succeed()) + currentResourceSummary.Finalizers = nil + Expect(testEnv.Delete(context.TODO(), currentResourceSummary)).To(Succeed()) + + Eventually(func() bool { + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: resourceSummary.Namespace, Name: resourceSummary.Name}, + currentResourceSummary) + return err != nil && + apierrors.IsNotFound(err) + }, timeout, pollingInterval).Should(BeTrue()) + + // RemoveStaleDriftDetectionResources sleeps for a minute + time.Sleep(time.Minute) + + // Because ResourceSummary is gone, deployment is deleted + Eventually(func() bool { + currentDeployment := &appsv1.Deployment{} + err := testEnv.Get(context.TODO(), + types.NamespacedName{Namespace: driftDetectionManager.GetNamespace(), Name: driftDetectionManager.GetName()}, + currentDeployment) + if err == nil { + return !currentDeployment.DeletionTimestamp.IsZero() } return apierrors.IsNotFound(err) }, timeout, pollingInterval).Should(BeTrue())