diff --git a/.evergreen-tasks.yml b/.evergreen-tasks.yml index cae1ce3da..eef39e348 100644 --- a/.evergreen-tasks.yml +++ b/.evergreen-tasks.yml @@ -783,6 +783,11 @@ tasks: commands: - func: "e2e_test" + - name: e2e_om_appdb_tls_disable + tags: [ "patch-run" ] + commands: + - func: "e2e_test" + - name: e2e_om_appdb_multi_change tags: [ "patch-run" ] commands: diff --git a/.evergreen.yml b/.evergreen.yml index f39693d58..1f5042670 100644 --- a/.evergreen.yml +++ b/.evergreen.yml @@ -968,6 +968,7 @@ task_groups: - e2e_om_appdb_flags_and_config - e2e_om_appdb_upgrade - e2e_om_appdb_monitoring_tls + - e2e_om_appdb_tls_disable - e2e_om_ops_manager_backup - e2e_om_ops_manager_backup_light - e2e_om_ops_manager_backup_liveness_probe @@ -1022,6 +1023,7 @@ task_groups: - e2e_om_appdb_flags_and_config - e2e_om_appdb_upgrade - e2e_om_appdb_monitoring_tls + - e2e_om_appdb_tls_disable - e2e_om_ops_manager_backup - e2e_om_ops_manager_backup_light - e2e_om_ops_manager_backup_liveness_probe @@ -1076,6 +1078,7 @@ task_groups: - e2e_om_appdb_external_connectivity - e2e_om_appdb_flags_and_config - e2e_om_appdb_monitoring_tls + - e2e_om_appdb_tls_disable - e2e_om_appdb_multi_change - e2e_om_appdb_scale_up_down - e2e_om_appdb_upgrade @@ -1120,6 +1123,7 @@ task_groups: - e2e_om_appdb_external_connectivity - e2e_om_appdb_flags_and_config - e2e_om_appdb_monitoring_tls + - e2e_om_appdb_tls_disable - e2e_om_appdb_multi_change - e2e_om_appdb_scale_up_down - e2e_om_appdb_upgrade @@ -1439,7 +1443,21 @@ buildvariants: tags: [ "pr_patch", "staging", "e2e_test_suite", "static" ] run_on: - ubuntu2404-large - <<: *base_om8_dependency + depends_on: + - name: build_om_images + variant: build_om80_images + - name: build_operator_ubi + variant: init_test_run + - name: build_init_database_image_ubi + variant: init_test_run + - name: build_test_image + variant: init_test_run + - name: build_init_appdb_images_ubi + variant: init_test_run + - name: build_init_om_images_ubi + variant: init_test_run + - name: publish_helm_chart + variant: init_test_run tasks: - name: e2e_static_ops_manager_kind_only_task_group - name: e2e_static_ops_manager_kind_6_0_only_task_group diff --git a/changelog/20251216_fix_tls_monitoring.md b/changelog/20251216_fix_tls_monitoring.md new file mode 100644 index 000000000..ec63eeb2e --- /dev/null +++ b/changelog/20251216_fix_tls_monitoring.md @@ -0,0 +1,6 @@ +--- +kind: fix +date: 2025-12-16 +--- + +* Fixed an issue where monitoring agents would fail after disabling TLS on a MongoDB deployment. diff --git a/controllers/om/deployment.go b/controllers/om/deployment.go index 3b75e19fe..4d31f92c5 100644 --- a/controllers/om/deployment.go +++ b/controllers/om/deployment.go @@ -317,6 +317,10 @@ func (d Deployment) AddMonitoring(log *zap.SugaredLogger, tls bool, caFilePath s } monitoringVersion["additionalParams"] = additionalParams + } else { + // Clear TLS params when TLS is disabled to prevent monitoring from + // trying to use certificate files that no longer exist + delete(monitoringVersion, "additionalParams") } } diff --git a/controllers/om/deployment_test.go b/controllers/om/deployment_test.go index 281731187..5b07086ee 100644 --- a/controllers/om/deployment_test.go +++ b/controllers/om/deployment_test.go @@ -528,10 +528,39 @@ func TestAddMonitoringTls(t *testing.T) { assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) // adding again - nothing changes - d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer) + d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer) assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) } +func TestAddMonitoringTLSDisable(t *testing.T) { + d := NewDeployment() + + rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs")) + d.MergeReplicaSet(rs0, nil, nil, zap.S()) + d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer) + + // verify TLS is present in additionalParams + expectedAdditionalParams := map[string]string{ + "useSslForAllConnections": "true", + "sslTrustedServerCertificates": util.CAFilePathInContainer, + } + expectedMonitoringVersionsWithTls := []interface{}{ + map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + } + assert.Equal(t, expectedMonitoringVersionsWithTls, d.getMonitoringVersions()) + + // disabling TLS should clear additionalParams (CLOUDP-351614) + d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer) + expectedMonitoringVersionsWithoutTls := []interface{}{ + map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion}, + map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion}, + map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion}, + } + assert.Equal(t, expectedMonitoringVersionsWithoutTls, d.getMonitoringVersions()) +} + func TestAddBackup(t *testing.T) { d := NewDeployment() diff --git a/controllers/operator/appdbreplicaset_controller.go b/controllers/operator/appdbreplicaset_controller.go index 86f61b52b..a20a5ed2d 100644 --- a/controllers/operator/appdbreplicaset_controller.go +++ b/controllers/operator/appdbreplicaset_controller.go @@ -1430,9 +1430,14 @@ func addMonitoring(ac *automationconfig.AutomationConfig, log *zap.SugaredLogger monitoringVersions := ac.MonitoringVersions for _, p := range ac.Processes { found := false - for _, m := range monitoringVersions { + for i, m := range monitoringVersions { if m.Hostname == p.HostName { found = true + if !tls { + // Clear TLS params when TLS is disabled to prevent monitoring from + // trying to use certificate files that no longer exist + monitoringVersions[i].AdditionalParams = nil + } break } } diff --git a/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_appdb_tls_disable.yaml b/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_appdb_tls_disable.yaml new file mode 100644 index 000000000..bf5230739 --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_appdb_tls_disable.yaml @@ -0,0 +1,35 @@ +apiVersion: mongodb.com/v1 +kind: MongoDBOpsManager +metadata: + name: om-tls-disable-test +spec: + replicas: 1 + version: 4.4.1 + adminCredentials: ops-manager-admin-secret + security: + tls: + secretRef: + name: certs-for-ops-manager + ca: issuer-ca + backup: + enabled: false + applicationDatabase: + members: 3 + version: 5.0.14-ent + security: + certsSecretPrefix: appdb + tls: + ca: issuer-ca + + # adding this just to avoid wizard when opening OM UI + configuration: + automation.versions.source: mongodb + mms.adminEmailAddr: cloud-manager-support@mongodb.com + mms.fromEmailAddr: cloud-manager-support@mongodb.com + mms.ignoreInitialUiSetup: "true" + mms.mail.hostname: email-smtp.us-east-1.amazonaws.com + mms.mail.port: "465" + mms.mail.ssl: "true" + mms.mail.transport: smtp + mms.minimumTLSVersion: TLSv1.2 + mms.replyToEmailAddr: cloud-manager-support@mongodb.com diff --git a/docker/mongodb-kubernetes-tests/tests/opsmanager/withMonitoredAppDB/om_appdb_tls_disable.py b/docker/mongodb-kubernetes-tests/tests/opsmanager/withMonitoredAppDB/om_appdb_tls_disable.py new file mode 100644 index 000000000..bb7423733 --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/opsmanager/withMonitoredAppDB/om_appdb_tls_disable.py @@ -0,0 +1,184 @@ +""" +CLOUDP-351614: Test that TLS can be disabled on AppDB without breaking monitoring. + +This is a dedicated test file to verify that when TLS is disabled on AppDB, +the operator correctly clears stale TLS params from the monitoring configuration. +""" +from typing import Optional + +from kubetester import create_or_update_secret, try_load +from kubetester.certs import create_ops_manager_tls_certs +from kubetester.kubetester import fixture as yaml_fixture +from kubetester.opsmanager import MongoDBOpsManager +from kubetester.phase import Phase +from pytest import fixture, mark +from tests.common.cert.cert_issuer import create_appdb_certs +from tests.conftest import is_multi_cluster +from tests.opsmanager.withMonitoredAppDB.conftest import enable_multi_cluster_deployment + + +def get_monitoring_tls_params(ops_manager: MongoDBOpsManager) -> dict: + """ + Extract TLS-related additionalParams from monitoring config via OM API. + + Returns a dict with the additionalParams for all monitoring agents, + keyed by hostname. An empty dict means no TLS params are present. + + Note: We use get_om_tester() to access the full automation config from + Ops Manager API, which includes monitoringVersions. The K8s secret-based + get_automation_config_tester() only has the cluster config for agents. + """ + om_tester = ops_manager.get_om_tester(ops_manager.app_db_name()) + ac = om_tester.get_automation_config_tester() + monitoring_versions = ac.automation_config.get("monitoringVersions", []) + + tls_params_by_host = {} + for mv in monitoring_versions: + hostname = mv.get("hostname", "unknown") + additional_params = mv.get("additionalParams", {}) + if additional_params: + tls_params_by_host[hostname] = additional_params + + return tls_params_by_host + +OM_NAME = "om-tls-disable-test" +APPDB_NAME = f"{OM_NAME}-db" + + +@fixture(scope="module") +def ops_manager_certs(namespace: str, issuer: str): + return create_ops_manager_tls_certs(issuer, namespace, OM_NAME) + + +@fixture(scope="module") +def appdb_certs(namespace: str, issuer: str): + return create_appdb_certs(namespace, issuer, APPDB_NAME) + + +@fixture(scope="module") +@mark.usefixtures("appdb_certs", "ops_manager_certs", "issuer_ca_configmap") +def ops_manager( + namespace: str, + issuer_ca_configmap: str, + appdb_certs: str, + ops_manager_certs: str, + custom_version: Optional[str], + custom_appdb_version: str, +) -> MongoDBOpsManager: + """Create an Ops Manager with TLS-enabled AppDB for testing TLS disable.""" + om = MongoDBOpsManager.from_yaml(yaml_fixture("om_appdb_tls_disable.yaml"), namespace=namespace) + om.set_version(custom_version) + om.set_appdb_version(custom_appdb_version) + + if try_load(om): + return om + + if is_multi_cluster(): + enable_multi_cluster_deployment(om) + + om.update() + return om + + +@mark.e2e_om_appdb_tls_disable +def test_om_created_with_tls(ops_manager: MongoDBOpsManager): + """Verify OM and AppDB are running with TLS enabled.""" + ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=900) + ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600) + + +@mark.e2e_om_appdb_tls_disable +def test_appdb_monitoring_works_with_tls(ops_manager: MongoDBOpsManager): + """Verify monitoring works with TLS enabled before we disable it.""" + ops_manager.assert_appdb_monitoring_group_was_created() + ops_manager.assert_monitoring_data_exists(timeout=600, all_hosts=False) + + +@mark.e2e_om_appdb_tls_disable +def test_monitoring_config_has_tls_params_when_tls_enabled(ops_manager: MongoDBOpsManager): + """ + CLOUDP-351614: Verify that monitoring config contains TLS params when TLS is enabled. + + When TLS is enabled on AppDB, the monitoring agents should be configured with + TLS parameters in additionalParams, including: + - useSslForAllConnections: "true" + - sslTrustedServerCertificates: path to CA file + - sslClientCertificate: path to client certificate (for x509 auth) + """ + tls_params = get_monitoring_tls_params(ops_manager) + + # Monitoring agents should have TLS params configured + assert len(tls_params) > 0, "Expected TLS params in monitoring config when TLS is enabled" + + # Verify TLS params contain expected keys + for hostname, params in tls_params.items(): + assert params.get("useSslForAllConnections") == "true", ( + f"Expected useSslForAllConnections=true for {hostname}, got {params}" + ) + assert "sslTrustedServerCertificates" in params, ( + f"Expected sslTrustedServerCertificates in params for {hostname}" + ) + + +@mark.e2e_om_appdb_tls_disable +def test_disable_tls_on_appdb(ops_manager: MongoDBOpsManager): + """ + CLOUDP-351614: Disable TLS on AppDB and verify the operator correctly handles + the transition without leaving stale TLS params in monitoring config. + + This test disables TLS by setting tls.enabled = False. The operator handles + the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled. + + Note: We keep the certsSecretPrefix in place because: + 1. The cert files are needed during the TLS transition + 2. Removing certsSecretPrefix while TLS mode is still transitioning causes failures + 3. The certs are harmless to keep after TLS is disabled (just unused) + """ + ops_manager.load() + + # Disable TLS mode (keeping certs for the transition) + # The operator will handle the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled + ops_manager["spec"]["applicationDatabase"]["security"]["tls"]["enabled"] = False + ops_manager.update() + + # Wait for AppDB to reach Running state after TLS mode is fully disabled + # Use a longer timeout as the transition goes through multiple TLS modes + ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=1800) + + +@mark.e2e_om_appdb_tls_disable +def test_monitoring_config_tls_params_cleared_after_tls_disabled(ops_manager: MongoDBOpsManager): + """ + CLOUDP-351614: Verify that TLS params are CLEARED from monitoring config after TLS is disabled. + + THIS IS THE KEY TEST FOR THE BUG FIX: + Before the fix in CLOUDP-351614, the operator would leave stale TLS params + (useSslForAllConnections, sslClientCertificate, etc.) in the monitoring config + even after TLS was disabled. This caused monitoring agents to fail because they + would try to use TLS certificate files that are no longer valid for authentication. + + After the fix, the operator correctly clears these params by calling: + delete(monitoringVersion, "additionalParams") + """ + tls_params = get_monitoring_tls_params(ops_manager) + + # After TLS is disabled, monitoring config should NOT have TLS params + # This assertion would have FAILED before the fix because stale TLS params remained + assert len(tls_params) == 0, ( + f"CLOUDP-351614 BUG: TLS params should be cleared from monitoring config after " + f"TLS is disabled, but found stale params: {tls_params}" + ) + + +@mark.e2e_om_appdb_tls_disable +def test_monitoring_works_after_tls_disable(ops_manager: MongoDBOpsManager): + """ + CLOUDP-351614: Verify monitoring data can still be collected after TLS disable. + + After TLS is disabled, monitoring agents switch from x509 to SCRAM authentication. + This test verifies that: + 1. The operator correctly cleared stale TLS params from monitoring config + 2. Monitoring agents can reconnect and collect data + """ + # Use a longer timeout as monitoring agents need time to reconnect with SCRAM auth + ops_manager.assert_monitoring_data_exists(timeout=1200, all_hosts=False)