Commit 4098fe0

Merge pull request #18 from canonical/update_secure_port

marcoppenheimer authored Oct 5, 2022
2 parents c8841c0 + a20b7be
Showing 10 changed files with 211 additions and 47 deletions.
49 changes: 38 additions & 11 deletions src/charm.py
@@ -129,6 +129,9 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None:
         # check whether restart is needed for all `*_changed` events
         self.on[self.restart.name].acquire_lock.emit()
 
+        if self.tls.upgrading and len(self.cluster.peer_units) == 1:
+            event.defer()
+
     def _restart(self, event: EventBase) -> None:
         """Handler for emitted restart events."""
         # this can cause issues if ran before `init_server()`
@@ -156,7 +159,14 @@ def _restart(self, event: EventBase) -> None:
-        # flag to update that this unit is running `portUnification` during ssl<->no-ssl upgrade
-        # in case restart was manual, also remove
         self.cluster.relation.data[self.unit].update(
-            {"unified": "true" if self.tls.upgrading else "", "manual-restart": ""}
+            {
+                # flag to declare unit running `portUnification` during ssl<->no-ssl upgrade
+                "unified": "true" if self.tls.upgrading else "",
+                # in case restart was manual
+                "manual-restart": "",
+                # flag to declare unit restarted with new quorum encryption
+                "quorum": self.cluster.quorum or "",
+            }
         )
 
     def init_server(self):
@@ -196,10 +206,13 @@ def init_server(self):
         # unit flags itself as 'started' so it can be retrieved by the leader
         logger.info(f"Server.{self.cluster.get_unit_id(self.unit)} started")
 
         # flag to update that this unit is running `portUnification` during ssl<->no-ssl upgrade
         # added here in case a `restart` was missed
         self.cluster.relation.data[self.unit].update(
-            {"state": "started", "unified": "true" if self.tls.upgrading else ""}
+            {
+                "state": "started",
+                "unified": "true" if self.tls.upgrading else "",
+                "quorum": self.cluster.quorum or "",
+            }
         )
 
     def config_changed(self):
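For orientation, these two hunks mean each unit now publishes three coordination flags to its peer databag. A sketch of one unit's databag mid-upgrade (illustrative values, not captured output):

    # illustrative peer-databag snapshot for a single unit
    unit_data = {
        "state": "started",    # unit is up; the leader can count it
        "unified": "true",     # serving plaintext and TLS ports via portUnification
        "manual-restart": "",  # cleared whenever the charm itself performed the restart
        "quorum": "ssl",       # encryption mode the unit last restarted with
    }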
@@ -268,24 +281,38 @@ def update_quorum(self, event: EventBase) -> None:
         # set first unit to "added" asap to get the units starting sooner
         self.add_init_leader()
 
-        if self.cluster.stale_quorum or isinstance(
-            # ensure these events always run without delay to maintain quorum on scale down
-            event,
-            (RelationDepartedEvent, LeaderElectedEvent),
+        if (
+            self.cluster.stale_quorum  # in case of scale-up
+            or isinstance(  # to run without delay to maintain quorum on scale down
+                event,
+                (RelationDepartedEvent, LeaderElectedEvent),
+            )
         ):
             updated_servers = self.cluster.update_cluster()
             # triggers a `cluster_relation_changed` to wake up following units
             self.cluster.relation.data[self.app].update(updated_servers)
 
+        # default startup without ssl relation
+        if not self.cluster.stale_quorum and not self.tls.enabled and not self.tls.upgrading:
+            if not self.cluster.quorum:  # avoids multiple loglines
+                logger.info("ZooKeeper cluster running with non-SSL quorum")
+
+            self.cluster.relation.data[self.app].update({"quorum": "non-ssl"})
+
         # declare upgrade complete only when all peer units have started
         # triggers `cluster_relation_changed` to rolling-restart without `portUnification`
         if self.tls.all_units_unified:
             if self.tls.enabled:
                 logger.info("ZooKeeper cluster running with quorum encryption")
-                self.cluster.relation.data[self.app].update({"quorum": "ssl", "upgrading": ""})
+                self.cluster.relation.data[self.app].update({"quorum": "ssl"})
             else:
                 logger.info("ZooKeeper cluster running without quorum encryption")
-                self.cluster.relation.data[self.app].update({"quorum": "non-ssl", "upgrading": ""})
+                self.cluster.relation.data[self.app].update({"quorum": "non-ssl"})
+
+        if self.cluster.all_units_quorum:
+            self.cluster.relation.data[self.app].update({"upgrading": ""})
+            logger.debug(f"ZooKeeper cluster switching to {self.cluster.quorum} quorum")
 
+        # attempt update of client relation data in case port updated
+        self.provider.apply_relation_data(event)
+
     def add_init_leader(self) -> None:
         """Adds the first leader server to the relation data for other units to ack."""
20 changes: 19 additions & 1 deletion src/cluster.py
@@ -315,7 +315,7 @@ def update_cluster(self) -> Dict[str, str]:
             UnitNotFoundError,
             BadArgumentsError,
         ) as e:
-            logger.debug(str(e))
+            logger.warning(str(e))
             return {}
 
     def is_unit_turn(self, unit: Optional[Unit] = None) -> bool:
@@ -461,3 +461,21 @@ def manual_restart(self) -> bool:
             True if manual-restart flag is set. Otherwise False
         """
         return bool(self.relation.data[self.charm.app].get("manual-restart", None))
+
+    @property
+    def all_units_quorum(self) -> bool:
+        """Checks if all units are running with the cluster quorum encryption.
+
+        Returns:
+            True if all units are running the quorum encryption in app data.
+                Otherwise False.
+        """
+        unit_quorums = set()
+        for unit in self.peer_units:
+            unit_quorum = self.relation.data[unit].get("quorum", None)
+            if unit_quorum != self.quorum:
+                return False
+
+            unit_quorums.add(unit_quorum)
+
+        return len(unit_quorums) == 1
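A runnable toy version of the gate this property implements: every peer unit must report the same `quorum` value as the cluster-level one before the check passes (sketch, not charm code):

    app_quorum = "ssl"
    peer_data = [{"quorum": "ssl"}, {"quorum": "ssl"}, {"quorum": "non-ssl"}]

    def all_units_quorum(units, quorum):
        return all(unit.get("quorum") == quorum for unit in units)

    print(all_units_quorum(peer_data, app_quorum))  # False until the last unit restarts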
41 changes: 33 additions & 8 deletions src/config.py
@@ -8,9 +8,10 @@
 from typing import List
 
 from ops.model import Relation
+from ops.pebble import PathError
 
 from literals import CONTAINER, PEER, REL_NAME
-from utils import push
+from utils import pull, push
 
 logger = logging.getLogger(__name__)
 
@@ -146,7 +147,7 @@ def zookeeper_properties(self) -> List[str]:
             + [
                 f"dataDir={self.default_config_path}/data",
                 f"dataLogDir={self.default_config_path}/log",
-                f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties",
+                f"{self.current_dynamic_config_file}",
             ]
         )
 
@@ -188,6 +189,34 @@ def zookeeper_properties(self) -> List[str]:
 
         return properties
 
+    @property
+    def current_dynamic_config_file(self) -> str:
+        """Gets current dynamicConfigFile property from live unit.
+
+        When setting config dynamically, ZK creates a new properties file
+        that keeps track of the current dynamic config version.
+        When setting our config, we overwrite the file, losing the tracked version,
+        so we can re-set it with this.
+
+        Returns:
+            String of current `dynamicConfigFile=<value>` for the running server
+        """
+        try:
+            current_properties = pull(
+                container=self.container, path=self.properties_filepath
+            ).splitlines()
+        except PathError:
+            logger.debug("zookeeper.properties file not found - using default dynamic path")
+            return f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties"
+
+        for current_property in current_properties:
+            if "dynamicConfigFile" in current_property:
+                return current_property
+
+        logger.debug("dynamicConfigFile property missing - using default dynamic path")
+
+        return f"dynamicConfigFile={self.default_config_path}/zookeeper-dynamic.properties"
+
     @property
     def static_properties(self) -> List[str]:
         """Build the zookeeper.properties content, without dynamic options.
@@ -232,7 +261,7 @@ def build_static_properties(properties: List[str]) -> List[str]:
         Running ZooKeeper cluster with `reconfigEnabled` moves dynamic options
         to a dedicated dynamic file
 
-        These options are `dynamicConfigFile`, `clientPort` and `secureClientPort`
+        These options are `clientPort` and `secureClientPort`
 
         Args:
             properties: the properties to make static
@@ -243,11 +272,7 @@
         return [
             prop
             for prop in properties
-            if (
-                "dynamicConfigFile" not in prop
-                and "clientPort" not in prop
-                and "secureClientPort" not in prop
-            )
+            if ("clientPort" not in prop and "secureClientPort" not in prop)
         ]
 
     @property
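A quick runnable check of the new filter: `dynamicConfigFile` now survives into the static properties, while both port options are still stripped:

    properties = [
        "clientPort=2181",
        "secureClientPort=2182",
        "dynamicConfigFile=/data/zookeeper/zookeeper-dynamic.properties",
    ]
    static = [
        prop
        for prop in properties
        if ("clientPort" not in prop and "secureClientPort" not in prop)
    ]
    assert static == ["dynamicConfigFile=/data/zookeeper/zookeeper-dynamic.properties"]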
21 changes: 16 additions & 5 deletions src/provider.py
@@ -17,7 +17,7 @@
 from kazoo.security import ACL, make_acl
 from ops.charm import RelationBrokenEvent, RelationEvent
 from ops.framework import EventBase, Object
-from ops.model import MaintenanceStatus, Relation
+from ops.model import Relation
 
 from cluster import UnitNotFoundError
 from literals import PEER, REL_NAME
@@ -252,9 +262,16 @@ def apply_relation_data(self, event: Optional[EventBase] = None) -> None:
             relation_data["password"] = config["password"] or generate_password()
             relation_data["chroot"] = config["chroot"]
             relation_data["endpoints"] = ",".join(list(hosts))
+
+            if self.charm.cluster.quorum == "ssl":
+                relation_data["tls"] = "enabled"
+                port = self.charm.cluster.secure_client_port
+            else:
+                relation_data["tls"] = "disabled"
+                port = self.charm.cluster.client_port
+
             relation_data["uris"] = (
-                ",".join([f"{host}:{self.charm.cluster.client_port}" for host in hosts])
-                + config["chroot"]
+                ",".join([f"{host}:{port}" for host in hosts]) + config["chroot"]
             )
 
             self.app_relation.data[self.charm.app].update(
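The resulting client databag, sketched for a three-unit cluster whose quorum is "ssl" (hosts and chroot are made up; 2181 and 2182 are ZooKeeper's conventional plaintext and TLS client ports):

    # illustrative relation data, not captured output
    relation_data = {
        "tls": "enabled",
        "endpoints": "10.0.0.1,10.0.0.2,10.0.0.3",
        "uris": "10.0.0.1:2182,10.0.0.2:2182,10.0.0.3:2182/myapp",  # chroot appended once
    }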
@@ -271,6 +278,11 @@ def _on_client_relation_updated(self, event: RelationEvent) -> None:
         Args:
             event (optional): used for checking `RelationBrokenEvent`
         """
+        # avoids failure from early relation
+        if not self.charm.cluster.quorum:
+            event.defer()
+            return
+
         if self.charm.unit.is_leader():
             try:
                 self.update_acls(event=event)
@@ -281,8 +293,7 @@
                 KazooTimeoutError,
                 UnitNotFoundError,
             ) as e:
-                logger.debug(str(e))
-                self.charm.unit.status = MaintenanceStatus(str(e))
+                logger.warning(str(e))
                 event.defer()
                 return
 
17 changes: 11 additions & 6 deletions src/tls.py
@@ -305,7 +305,7 @@ def set_certificate(self) -> None:
     def set_truststore(self) -> None:
         """Adds CA to JKS truststore."""
         try:
-            self.container.exec(
+            proc = self.container.exec(
                 [
                     "keytool",
                     "-import",
@@ -314,24 +314,28 @@ def set_truststore(self) -> None:
                     "ca",
                     "-file",
                     "ca.pem",
-                    "-keystore truststore.jks",
+                    "-keystore",
+                    "truststore.jks",
                     "-storepass",
                     f"{self.keystore_password}",
                     "-noprompt",
                 ],
                 working_dir=self.charm.zookeeper_config.default_config_path,
             )
+            logger.debug(str(proc.wait_output()[1]))
         except ExecError as e:
-            # in case this reruns and fails
-            if "already exists" in (str(e.stderr) or str(e.stdout)):
+            expected_error_string = "alias <ca> already exists"
+            if expected_error_string in str(e.stdout):
+                logger.debug(expected_error_string)
                 return
 
             logger.error(e.stdout)
             raise e
 
     def set_p12_keystore(self) -> None:
         """Creates and adds unit cert and private-key to a PKCS12 keystore."""
         try:
-            self.container.exec(
+            proc = self.container.exec(
                 [
                     "openssl",
                     "pkcs12",
@@ -351,8 +355,9 @@
                 ],
                 working_dir=self.charm.zookeeper_config.default_config_path,
             )
+            logger.debug(str(proc.wait_output()[1]))
         except ExecError as e:
-            logger.error(e.stdout)
+            logger.error(str(e.stdout))
             raise e
 
     def remove_stores(self) -> None:
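Two details in these hunks are easy to miss. Pebble's exec maps each list element to exactly one process argument, which is why the single "-keystore truststore.jks" element had to be split in two. And Container.exec() only starts the process: ExecError is raised from wait_output(), so capturing the handle and waiting on it is what makes the except clauses reachable. A minimal sketch, assuming `container` is a connectable ops.model.Container:

    from ops.pebble import ExecError

    try:
        proc = container.exec(
            # one list element per argument: flag and value stay separate
            ["keytool", "-list", "-keystore", "truststore.jks", "-storepass", "changeit"]
        )
        stdout, _ = proc.wait_output()  # blocks; raises ExecError on non-zero exit
    except ExecError as e:
        print(e.stdout)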
18 changes: 17 additions & 1 deletion tests/integration/test_tls.py
@@ -43,7 +43,7 @@ async def test_deploy_ssl_quorum(ops_test: OpsTest):
     assert ops_test.model.applications["tls-certificates-operator"].status == "active"
     await ops_test.model.add_relation(APP_NAME, "tls-certificates-operator")
     await ops_test.model.wait_for_idle(
-        apps=[APP_NAME, "tls-certificates-operator"], status="active", timeout=1000
+        apps=[APP_NAME, "tls-certificates-operator"], status="active", timeout=1000, idle_period=30
     )
     assert ops_test.model.applications[APP_NAME].status == "active"
     assert ops_test.model.applications["tls-certificates-operator"].status == "active"
@@ -54,6 +54,9 @@ async def test_deploy_ssl_quorum(ops_test: OpsTest):
     assert "sslQuorum=true" in check_properties(
         model_full_name=ops_test.model_full_name, unit=unit.name
     )
+    assert "portUnification=true" not in check_properties(
+        model_full_name=ops_test.model_full_name, unit=unit.name
+    )
 
 
 @pytest.mark.abort_on_fail
@@ -109,3 +112,16 @@ async def test_scale_up_tls(ops_test: OpsTest):
     await ops_test.model.block_until(lambda: len(ops_test.model.applications[APP_NAME].units) == 4)
     await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000)
     assert ping_servers(ops_test)
+
+
+@pytest.mark.abort_on_fail
+async def test_client_relate_maintains_quorum(ops_test: OpsTest):
+    dummy_name = "app"
+    app_charm = await ops_test.build_charm("tests/integration/app-charm")
+    await ops_test.model.deploy(app_charm, application_name=dummy_name, num_units=1)
+    await ops_test.model.wait_for_idle([APP_NAME, dummy_name], status="active", timeout=1000)
+    await ops_test.model.add_relation(APP_NAME, dummy_name)
+    await ops_test.model.wait_for_idle([APP_NAME, dummy_name], status="active", timeout=1000)
+    assert ops_test.model.applications[APP_NAME].status == "active"
+    assert ops_test.model.applications[dummy_name].status == "active"
+    assert ping_servers(ops_test)
12 changes: 11 additions & 1 deletion tests/unit/test_config.py
@@ -3,7 +3,9 @@
 # See LICENSE file for licensing details.
 
 from pathlib import Path
+from unittest.mock import patch
 
+import ops.testing
 import pytest
 import yaml
 from ops.testing import Harness
@@ -12,6 +14,15 @@
 from config import ZooKeeperConfig
 from literals import CHARM_KEY
 
+ops.testing.SIMULATE_CAN_CONNECT = True
+
+
+@pytest.fixture(autouse=True)
+def patched_pull():
+    with patch("ops.model.Container.pull"):
+        yield
+
+
 CONFIG = str(yaml.safe_load(Path("./config.yaml").read_text()))
 ACTIONS = str(yaml.safe_load(Path("./actions.yaml").read_text()))
 METADATA = str(yaml.safe_load(Path("./metadata.yaml").read_text()))
@@ -29,7 +40,6 @@ def test_build_static_properties_removes_necessary_rows():
         "clientPort=2181",
         "authProvider.sasl=org.apache.zookeeper.server.auth.SASLAuthenticationProvider",
         "maxClientCnxns=60",
-        "dynamicConfigFile=/data/zookeeper/zookeeper.properties.dynamic.100000041",
     ]
 
     static = ZooKeeperConfig.build_static_properties(properties=properties)