diff --git a/src/charm.py b/src/charm.py index a6896a7..fde3846 100755 --- a/src/charm.py +++ b/src/charm.py @@ -129,6 +129,9 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None: # check whether restart is needed for all `*_changed` events self.on[self.restart.name].acquire_lock.emit() + if self.tls.upgrading and len(self.cluster.peer_units) == 1: + event.defer() + def _restart(self, event: EventBase) -> None: """Handler for emitted restart events.""" # this can cause issues if ran before `init_server()` @@ -156,7 +159,14 @@ def _restart(self, event: EventBase) -> None: # flag to update that this unit is running `portUnification` during ssl<->no-ssl upgrade # in case restart was manual, also remove self.cluster.relation.data[self.unit].update( - {"unified": "true" if self.tls.upgrading else "", "manual-restart": ""} + { + # flag to declare unit running `portUnification` during ssl<->no-ssl upgrade + "unified": "true" if self.tls.upgrading else "", + # in case restart was manual + "manual-restart": "", + # flag to declare unit restarted with new quorum encryption + "quorum": self.cluster.quorum or "", + } ) def init_server(self): @@ -196,10 +206,13 @@ def init_server(self): # unit flags itself as 'started' so it can be retrieved by the leader logger.info(f"Server.{self.cluster.get_unit_id(self.unit)} started") - # flag to update that this unit is running `portUnification` during ssl<->no-ssl upgrade # added here in case a `restart` was missed self.cluster.relation.data[self.unit].update( - {"state": "started", "unified": "true" if self.tls.upgrading else ""} + { + "state": "started", + "unified": "true" if self.tls.upgrading else "", + "quorum": self.cluster.quorum or "", + } ) def config_changed(self): @@ -268,24 +281,38 @@ def update_quorum(self, event: EventBase) -> None: # set first unit to "added" asap to get the units starting sooner self.add_init_leader() - if self.cluster.stale_quorum or isinstance( - # ensure these events always run without delay to maintain quorum on scale down - event, - (RelationDepartedEvent, LeaderElectedEvent), + if ( + self.cluster.stale_quorum # in case of scale-up + or isinstance( # to run without delay to maintain quorum on scale down + event, + (RelationDepartedEvent, LeaderElectedEvent), + ) ): updated_servers = self.cluster.update_cluster() # triggers a `cluster_relation_changed` to wake up following units self.cluster.relation.data[self.app].update(updated_servers) + # default startup without ssl relation + if not self.cluster.stale_quorum and not self.tls.enabled and not self.tls.upgrading: + if not self.cluster.quorum: # avoids multiple loglines + logger.info("ZooKeeper cluster running with non-SSL quorum") + + self.cluster.relation.data[self.app].update({"quorum": "non-ssl"}) + # declare upgrade complete only when all peer units have started # triggers `cluster_relation_changed` to rolling-restart without `portUnification` if self.tls.all_units_unified: if self.tls.enabled: - logger.info("ZooKeeper cluster running with quorum encryption") - self.cluster.relation.data[self.app].update({"quorum": "ssl", "upgrading": ""}) + self.cluster.relation.data[self.app].update({"quorum": "ssl"}) else: - logger.info("ZooKeeper cluster running without quorum encryption") - self.cluster.relation.data[self.app].update({"quorum": "non-ssl", "upgrading": ""}) + self.cluster.relation.data[self.app].update({"quorum": "non-ssl"}) + + if self.cluster.all_units_quorum: + self.cluster.relation.data[self.app].update({"upgrading": ""}) + logger.debug(f"ZooKeeper cluster switching to {self.cluster.quorum} quorum") + + # attempt update of client relation data in case port updated + self.provider.apply_relation_data(event) def add_init_leader(self) -> None: """Adds the first leader server to the relation data for other units to ack.""" diff --git a/src/cluster.py b/src/cluster.py index dd250cb..33635a1 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -315,7 +315,7 @@ def update_cluster(self) -> Dict[str, str]: UnitNotFoundError, BadArgumentsError, ) as e: - logger.debug(str(e)) + logger.warning(str(e)) return {} def is_unit_turn(self, unit: Optional[Unit] = None) -> bool: @@ -461,3 +461,21 @@ def manual_restart(self) -> bool: True if manual-restart flag is set. Otherwise False """ return bool(self.relation.data[self.charm.app].get("manual-restart", None)) + + @property + def all_units_quorum(self) -> bool: + """Checks if all units are running with the cluster quorum encryption. + + Returns: + True if all units are running the quorum encryption in app data. + Otherwise False. + """ + unit_quorums = set() + for unit in self.peer_units: + unit_quorum = self.relation.data[unit].get("quorum", None) + if unit_quorum != self.quorum: + return False + + unit_quorums.add(unit_quorum) + + return len(unit_quorums) == 1 diff --git a/src/config.py b/src/config.py index 93bdfa8..28a809f 100644 --- a/src/config.py +++ b/src/config.py @@ -8,9 +8,10 @@ from typing import List from ops.model import Relation +from ops.pebble import PathError from literals import CONTAINER, PEER, REL_NAME -from utils import push +from utils import pull, push logger = logging.getLogger(__name__) @@ -146,7 +147,7 @@ def zookeeper_properties(self) -> List[str]: + [ f"dataDir={self.default_config_path}/data", f"dataLogDir={self.default_config_path}/log", - f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties", + f"{self.current_dynamic_config_file}", ] ) @@ -188,6 +189,34 @@ def zookeeper_properties(self) -> List[str]: return properties + @property + def current_dynamic_config_file(self) -> str: + """Gets current dynamicConfigFile property from live unit. + + When setting config dynamically, ZK creates a new properties file + that keeps track of the current dynamic config version. + When setting our config, we overwrite the file, losing the tracked version, + so we can re-set it with this. + + Returns: + String of current `dynamicConfigFile=` for the running server + """ + try: + current_properties = pull( + container=self.container, path=self.properties_filepath + ).splitlines() + except PathError: + logger.debug("zookeeper.properties file not found - using default dynamic path") + return f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties" + + for current_property in current_properties: + if "dynamicConfigFile" in current_property: + return current_property + + logger.debug("dynamicConfigFile property missing - using default dynamic path") + + return f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties" + @property def static_properties(self) -> List[str]: """Build the zookeeper.properties content, without dynamic options. @@ -232,7 +261,7 @@ def build_static_properties(properties: List[str]) -> List[str]: Running ZooKeeper cluster with `reconfigEnabled` moves dynamic options to a dedicated dynamic file - These options are `dynamicConfigFile`, `clientPort` and `secureClientPort` + These options are `clientPort` and `secureClientPort` Args: properties: the properties to make static @@ -243,11 +272,7 @@ def build_static_properties(properties: List[str]) -> List[str]: return [ prop for prop in properties - if ( - "dynamicConfigFile" not in prop - and "clientPort" not in prop - and "secureClientPort" not in prop - ) + if ("clientPort" not in prop and "secureClientPort" not in prop) ] @property diff --git a/src/provider.py b/src/provider.py index 3509de2..36f3571 100644 --- a/src/provider.py +++ b/src/provider.py @@ -17,7 +17,7 @@ from kazoo.security import ACL, make_acl from ops.charm import RelationBrokenEvent, RelationEvent from ops.framework import EventBase, Object -from ops.model import MaintenanceStatus, Relation +from ops.model import Relation from cluster import UnitNotFoundError from literals import PEER, REL_NAME @@ -252,9 +252,16 @@ def apply_relation_data(self, event: Optional[EventBase] = None) -> None: relation_data["password"] = config["password"] or generate_password() relation_data["chroot"] = config["chroot"] relation_data["endpoints"] = ",".join(list(hosts)) + + if self.charm.cluster.quorum == "ssl": + relation_data["tls"] = "enabled" + port = self.charm.cluster.secure_client_port + else: + relation_data["tls"] = "disabled" + port = self.charm.cluster.client_port + relation_data["uris"] = ( - ",".join([f"{host}:{self.charm.cluster.client_port}" for host in hosts]) - + config["chroot"] + ",".join([f"{host}:{port}" for host in hosts]) + config["chroot"] ) self.app_relation.data[self.charm.app].update( @@ -271,6 +278,11 @@ def _on_client_relation_updated(self, event: RelationEvent) -> None: Args: event (optional): used for checking `RelationBrokenEvent` """ + # avoids failure from early relation + if not self.charm.cluster.quorum: + event.defer() + return + if self.charm.unit.is_leader(): try: self.update_acls(event=event) @@ -281,8 +293,7 @@ def _on_client_relation_updated(self, event: RelationEvent) -> None: KazooTimeoutError, UnitNotFoundError, ) as e: - logger.debug(str(e)) - self.charm.unit.status = MaintenanceStatus(str(e)) + logger.warning(str(e)) event.defer() return diff --git a/src/tls.py b/src/tls.py index 54e8820..987668b 100644 --- a/src/tls.py +++ b/src/tls.py @@ -305,7 +305,7 @@ def set_certificate(self) -> None: def set_truststore(self) -> None: """Adds CA to JKS truststore.""" try: - self.container.exec( + proc = self.container.exec( [ "keytool", "-import", @@ -314,24 +314,28 @@ def set_truststore(self) -> None: "ca", "-file", "ca.pem", - "-keystore truststore.jks", + "-keystore", + "truststore.jks", "-storepass", f"{self.keystore_password}", "-noprompt", ], working_dir=self.charm.zookeeper_config.default_config_path, ) + logger.debug(str(proc.wait_output()[1])) except ExecError as e: - # in case this reruns and fails - if "already exists" in (str(e.stderr) or str(e.stdout)): + expected_error_string = "alias already exists" + if expected_error_string in str(e.stdout): + logger.debug(expected_error_string) return + logger.error(e.stdout) raise e def set_p12_keystore(self) -> None: """Creates and adds unit cert and private-key to a PCKS12 keystore.""" try: - self.container.exec( + proc = self.container.exec( [ "openssl", "pkcs12", @@ -351,8 +355,9 @@ def set_p12_keystore(self) -> None: ], working_dir=self.charm.zookeeper_config.default_config_path, ) + logger.debug(str(proc.wait_output()[1])) except ExecError as e: - logger.error(e.stdout) + logger.error(str(e.stdout)) raise e def remove_stores(self) -> None: diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index ab32312..14abf27 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -43,7 +43,7 @@ async def test_deploy_ssl_quorum(ops_test: OpsTest): assert ops_test.model.applications["tls-certificates-operator"].status == "active" await ops_test.model.add_relation(APP_NAME, "tls-certificates-operator") await ops_test.model.wait_for_idle( - apps=[APP_NAME, "tls-certificates-operator"], status="active", timeout=1000 + apps=[APP_NAME, "tls-certificates-operator"], status="active", timeout=1000, idle_period=30 ) assert ops_test.model.applications[APP_NAME].status == "active" assert ops_test.model.applications["tls-certificates-operator"].status == "active" @@ -54,6 +54,9 @@ async def test_deploy_ssl_quorum(ops_test: OpsTest): assert "sslQuorum=true" in check_properties( model_full_name=ops_test.model_full_name, unit=unit.name ) + assert "portUnification=true" not in check_properties( + model_full_name=ops_test.model_full_name, unit=unit.name + ) @pytest.mark.abort_on_fail @@ -109,3 +112,16 @@ async def test_scale_up_tls(ops_test: OpsTest): await ops_test.model.block_until(lambda: len(ops_test.model.applications[APP_NAME].units) == 4) await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) assert ping_servers(ops_test) + + +@pytest.mark.abort_on_fail +async def test_client_relate_maintains_quorum(ops_test: OpsTest): + dummy_name = "app" + app_charm = await ops_test.build_charm("tests/integration/app-charm") + await ops_test.model.deploy(app_charm, application_name=dummy_name, num_units=1) + await ops_test.model.wait_for_idle([APP_NAME, dummy_name], status="active", timeout=1000) + await ops_test.model.add_relation(APP_NAME, dummy_name) + await ops_test.model.wait_for_idle([APP_NAME, dummy_name], status="active", timeout=1000) + assert ops_test.model.applications[APP_NAME].status == "active" + assert ops_test.model.applications[dummy_name].status == "active" + assert ping_servers(ops_test) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 21cc4ae..b850766 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -3,7 +3,9 @@ # See LICENSE file for licensing details. from pathlib import Path +from unittest.mock import patch +import ops.testing import pytest import yaml from ops.testing import Harness @@ -12,6 +14,15 @@ from config import ZooKeeperConfig from literals import CHARM_KEY +ops.testing.SIMULATE_CAN_CONNECT = True + + +@pytest.fixture(autouse=True) +def patched_pull(): + with patch("ops.model.Container.pull"): + yield + + CONFIG = str(yaml.safe_load(Path("./config.yaml").read_text())) ACTIONS = str(yaml.safe_load(Path("./actions.yaml").read_text())) METADATA = str(yaml.safe_load(Path("./metadata.yaml").read_text())) @@ -29,7 +40,6 @@ def test_build_static_properties_removes_necessary_rows(): "clientPort=2181", "authProvider.sasl=org.apache.zookeeper.server.auth.SASLAuthenticationProvider", "maxClientCnxns=60", - "dynamicConfigFile=/data/zookeeper/zookeeper.properties.dynamic.100000041", ] static = ZooKeeperConfig.build_static_properties(properties=properties) diff --git a/tests/unit/test_provider.py b/tests/unit/test_provider.py index 2d0ac4e..7631b20 100644 --- a/tests/unit/test_provider.py +++ b/tests/unit/test_provider.py @@ -7,8 +7,8 @@ import unittest from collections import namedtuple from pathlib import Path +from unittest.mock import patch -import ops.testing import yaml from ops.charm import RelationBrokenEvent from ops.testing import Harness @@ -16,8 +16,6 @@ from charm import ZooKeeperK8sCharm from literals import CHARM_KEY, PEER, REL_NAME -ops.testing.SIMULATE_CAN_CONNECT = True - logger = logging.getLogger(__name__) METADATA = str(yaml.safe_load(Path("./metadata.yaml").read_text())) @@ -31,8 +29,8 @@ class TestProvider(unittest.TestCase): def setUp(self): self.harness = Harness(ZooKeeperK8sCharm, meta=METADATA, config=CONFIG, actions=ACTIONS) self.addCleanup(self.harness.cleanup) - self.harness.add_relation(REL_NAME, "application") - self.harness.add_relation(PEER, CHARM_KEY) + self.client_rel_id = self.harness.add_relation(REL_NAME, "application") + self.peer_rel_id = self.harness.add_relation(PEER, CHARM_KEY) self.harness.begin() @property @@ -225,6 +223,60 @@ def test_is_child_of_not(self): self.assertFalse(self.harness.charm.provider._is_child_of(path=chroot, chroots=chroots)) + def test_port_updates_if_tls(self): + self.harness.set_leader(True) + self.harness.update_relation_data( + self.provider.client_relations[0].id, "application", {"chroot": "app"} + ) + # checking if ssl port and ssl flag are passed + self.harness.update_relation_data( + self.provider.app_relation.id, + f"{CHARM_KEY}/0", + {"state": "started"}, + ) + self.harness.update_relation_data( + self.provider.app_relation.id, + CHARM_KEY, + {"quorum": "ssl"}, + ) + self.harness.charm.provider.apply_relation_data() + + for relation in self.provider.client_relations: + uris = relation.data[self.harness.charm.app].get("uris", "") + ssl = relation.data[self.harness.charm.app].get("tls", "") + + self.assertIn(str(self.harness.charm.cluster.secure_client_port), uris) + self.assertEqual(ssl, "enabled") + + self.harness.update_relation_data( + self.provider.app_relation.id, + CHARM_KEY, + {"quorum": "non-ssl"}, + ) + self.harness.charm.provider.apply_relation_data() + + for relation in self.provider.client_relations: + uris = relation.data[self.harness.charm.app].get("uris", "") + ssl = relation.data[self.harness.charm.app].get("tls", "") + + self.assertIn(str(self.harness.charm.cluster.client_port), uris) + self.assertEqual(ssl, "disabled") + + @patch("ops.model.Container.can_connect") + @patch( + "charms.rolling_ops.v0.rollingops.RollingOpsManager._on_acquire_lock", return_value=None + ) + def test_provider_relation_data_updates_port(self, *_): + with patch("provider.ZooKeeperProvider.apply_relation_data", return_value=None) as patched: + self.harness.set_leader(True) + self.harness.update_relation_data( + self.peer_rel_id, + CHARM_KEY, + {"quorum": "non-ssl"}, + ) + + patched.assert_called_once() + def test_apply_relation_data(self): self.harness.set_leader(True) self.harness.add_relation("zookeeper", "new_application") @@ -236,22 +288,22 @@ def test_apply_relation_data(self): "new_application", {"chroot": "new_app", "chroot-acl": "rw"}, ) - self.harness.add_relation_unit(self.provider.app_relation.id, "{CHARM_KEY}/0") + self.harness.add_relation_unit(self.provider.app_relation.id, f"{CHARM_KEY}/0") self.harness.update_relation_data( self.provider.app_relation.id, - "{CHARM_KEY}/0", + f"{CHARM_KEY}/0", {"state": "started"}, ) - self.harness.add_relation_unit(self.provider.app_relation.id, "{CHARM_KEY}/1") + self.harness.add_relation_unit(self.provider.app_relation.id, f"{CHARM_KEY}/1") self.harness.update_relation_data( self.provider.app_relation.id, - "{CHARM_KEY}/1", + f"{CHARM_KEY}/1", {"state": "ready"}, ) - self.harness.add_relation_unit(self.provider.app_relation.id, "{CHARM_KEY}/2") + self.harness.add_relation_unit(self.provider.app_relation.id, f"{CHARM_KEY}/2") self.harness.update_relation_data( self.provider.app_relation.id, - "{CHARM_KEY}/2", + f"{CHARM_KEY}/2", {"state": "started"}, ) @@ -275,7 +327,7 @@ def test_apply_relation_data(self): # checking existence of all necessary keys self.assertEqual( sorted(relation.data[self.harness.charm.app].keys()), - sorted(["chroot", "endpoints", "password", "uris", "username"]), + sorted(["chroot", "endpoints", "password", "tls", "uris", "username"]), ) username = relation.data[self.harness.charm.app]["username"] diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index c29376a..b1385d0 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -22,7 +22,7 @@ def harness(): harness = Harness(ZooKeeperK8sCharm, meta=METADATA, config=CONFIG, actions=ACTIONS) peer_rel_id = harness.add_relation(PEER, CHARM_KEY) harness.add_relation_unit(peer_rel_id, f"{CHARM_KEY}/0") - harness._update_config({"init-limit": "5", "sync-limit": "2", "tick-time": "2000"}) + harness._update_config({"init-limit": 5, "sync-limit": 2, "tick-time": 2000}) harness.begin() return harness diff --git a/tox.ini b/tox.ini index 4a747bf..60f6134 100644 --- a/tox.ini +++ b/tox.ini @@ -76,7 +76,7 @@ deps = -r{toxinidir}/requirements.txt commands = coverage run --source={[vars]src_path} \ - -m pytest --ignore={[vars]tst_path}integration -v --tb native -s {posargs} + -m pytest --ignore={[vars]tst_path}integration -vv --tb native -s {posargs} coverage report [testenv:integration]