From 3c6df215502c8855ffa939e3674a82d02cad21f0 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 26 Sep 2024 15:19:14 +0200 Subject: [PATCH 1/4] Cover large deployments with more upgrade testing --- tests/integration/upgrades/helpers.py | 32 +++ .../test_manual_large_deployment_upgrades.py | 193 +++++++++++++++--- .../test_small_deployment_upgrades.py | 47 ++--- 3 files changed, 205 insertions(+), 67 deletions(-) diff --git a/tests/integration/upgrades/helpers.py b/tests/integration/upgrades/helpers.py index 77cbecbb1..c5f489c2f 100644 --- a/tests/integration/upgrades/helpers.py +++ b/tests/integration/upgrades/helpers.py @@ -6,6 +6,7 @@ import subprocess from typing import Optional +import pytest from pytest_operator.plugin import OpsTest from tenacity import Retrying, stop_after_attempt, wait_fixed @@ -17,12 +18,43 @@ from ..helpers import APP_NAME, IDLE_PERIOD, app_name, run_action from ..helpers_deployments import get_application_units, wait_until +OPENSEARCH_ORIGINAL_CHARM_NAME = "opensearch" OPENSEARCH_SERVICE_PATH = "/etc/systemd/system/snap.opensearch.daemon.service" ORIGINAL_RESTART_DELAY = 20 SECOND_APP_NAME = "second-opensearch" RESTART_DELAY = 360 +OPENSEARCH_CHANNEL = "2/edge" + + +STARTING_VERSION = "2.15.0" + + +VERSION_TO_REVISION = { + STARTING_VERSION: 144, + "2.16.0": 160, +} + + +CHANNELS = ["edge", "beta", "2/stable"] + + +FROM_VERSION_PREFIX = "from_v{}_to_local" + + +UPGRADE_INITIAL_VERSION = [ + ( + pytest.param( + version, + id=FROM_VERSION_PREFIX.format(version), + marks=pytest.mark.group(FROM_VERSION_PREFIX.format(version)), + ) + ) + for version in VERSION_TO_REVISION.keys() +] + + logger = logging.getLogger(__name__) diff --git a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py index 5aa9e85a8..e964d54e3 100644 --- a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py +++ 
b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py @@ -10,15 +10,30 @@ from ..ha.continuous_writes import ContinuousWrites from ..ha.helpers import assert_continuous_writes_consistency -from ..helpers import APP_NAME, IDLE_PERIOD, MODEL_CONFIG, SERIES, run_action -from ..helpers_deployments import get_application_units, wait_until +from ..helpers import ( + APP_NAME, + IDLE_PERIOD, + MODEL_CONFIG, + SERIES, + get_leader_unit_id, + run_action, + set_watermark, +) +from ..helpers_deployments import wait_until from ..tls.test_tls import TLS_CERTIFICATES_APP_NAME +from .helpers import ( + OPENSEARCH_CHANNEL, + OPENSEARCH_ORIGINAL_CHARM_NAME, + STARTING_VERSION, + UPGRADE_INITIAL_VERSION, + VERSION_TO_REVISION, + assert_upgrade_to_local, + refresh, +) logger = logging.getLogger(__name__) -OPENSEARCH_ORIGINAL_CHARM_NAME = "opensearch" -OPENSEARCH_INITIAL_CHANNEL = "2/edge" OPENSEARCH_MAIN_APP_NAME = "main" OPENSEARCH_FAILOVER_APP_NAME = "failover" @@ -27,19 +42,19 @@ WORKLOAD = { - APP_NAME: 3, - OPENSEARCH_FAILOVER_APP_NAME: 2, - OPENSEARCH_MAIN_APP_NAME: 1, + APP_NAME: 2, + OPENSEARCH_FAILOVER_APP_NAME: 1, + OPENSEARCH_MAIN_APP_NAME: 3, } -@pytest.mark.skip(reason="Fix with DPE-4528") -@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) -@pytest.mark.group(1) -@pytest.mark.abort_on_fail -@pytest.mark.skip_if_deployed -async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None: - """Build and deploy the charm for large deployment tests.""" +####################################################################### +# +# Auxiliary functions +# +####################################################################### +async def _build_env(ops_test: OpsTest, version: str) -> None: + """Deploy OpenSearch cluster from a given revision.""" await ops_test.model.set_config(MODEL_CONFIG) # Deploy TLS Certificates operator. 
tls_config = {"ca-common-name": "CN_CA"} @@ -52,7 +67,7 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None failover_orchestrator_conf = { "cluster_name": "backup-test", "init_hold": True, - "roles": "cluster_manager", + "roles": "voting_only", } data_hot_conf = {"cluster_name": "backup-test", "init_hold": True, "roles": "data.hot"} @@ -63,7 +78,7 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None application_name=OPENSEARCH_MAIN_APP_NAME, num_units=WORKLOAD[OPENSEARCH_MAIN_APP_NAME], series=SERIES, - channel=OPENSEARCH_INITIAL_CHANNEL, + channel=OPENSEARCH_CHANNEL, config=main_orchestrator_conf, ), ops_test.model.deploy( @@ -71,7 +86,7 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None application_name=OPENSEARCH_FAILOVER_APP_NAME, num_units=WORKLOAD[OPENSEARCH_FAILOVER_APP_NAME], series=SERIES, - channel=OPENSEARCH_INITIAL_CHANNEL, + channel=OPENSEARCH_CHANNEL, config=failover_orchestrator_conf, ), ops_test.model.deploy( @@ -79,7 +94,7 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None application_name=APP_NAME, num_units=WORKLOAD[APP_NAME], series=SERIES, - channel=OPENSEARCH_INITIAL_CHANNEL, + channel=OPENSEARCH_CHANNEL, config=data_hot_conf, ), ) @@ -117,18 +132,12 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None timeout=3600, ) + await set_watermark(ops_test, APP_NAME) -@pytest.mark.skip(reason="Fix with DPE-4528") -@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) -@pytest.mark.group(1) -@pytest.mark.abort_on_fail -async def test_manually_upgrade_to_local( - ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner -) -> None: - """Test upgrade from usptream to currently locally built version.""" - units = await get_application_units(ops_test, OPENSEARCH_MAIN_APP_NAME) - leader_id = [u.id for u in units if u.is_leader][0] +async def _upgrade(ops_test: OpsTest, 
local_build: bool = False, revision: str = None) -> None: + app = OPENSEARCH_MAIN_APP_NAME + leader_id = await get_leader_unit_id(ops_test, app) action = await run_action( ops_test, leader_id, @@ -144,13 +153,14 @@ async def test_manually_upgrade_to_local( async with ops_test.fast_forward(): for app, unit_count in WORKLOAD.items(): - application = ops_test.model.applications[app] - units = await get_application_units(ops_test, app) - leader_id = [u.id for u in units if u.is_leader][0] + leader_id = get_leader_unit_id(ops_test, app) logger.info(f"Refresh app {app}, leader {leader_id}") - await application.refresh(path=charm) + if local_build: + await refresh(ops_test, app, path=charm) + else: + await refresh(ops_test, app, revision=revision) logger.info("Refresh is over, waiting for the charm to settle") if unit_count == 1: @@ -166,10 +176,13 @@ async def test_manually_upgrade_to_local( logger.info(f"Upgrade of app {app} finished") continue + # Wait until we are set in an idle state and can rollback the revision. 
+ # app status blocked: that will happen if we are jumping N-2 versions in our test + # app status active: that will happen if we are jumping N-1 in our test await wait_until( ops_test, apps=[app], - apps_statuses=["blocked"], + apps_statuses=["active", "blocked"], units_statuses=["active"], wait_for_exact_units={ app: unit_count, @@ -197,9 +210,123 @@ async def test_manually_upgrade_to_local( ) logger.info(f"Upgrade of app {app} finished") + +####################################################################### +# +# Tests +# +####################################################################### +@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) +@pytest.mark.group("happy_path_upgrade") +@pytest.mark.abort_on_fail +@pytest.mark.skip_if_deployed +async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None: + """Deploy OpenSearch.""" + await _build_env(ops_test, STARTING_VERSION) + + +@pytest.mark.group("happy_path_upgrade") +@pytest.mark.abort_on_fail +async def test_upgrade_between_versions( + ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner +) -> None: + """Test upgrade from upstream to currently locally built version.""" + for version, rev in VERSION_TO_REVISION.items(): + if version == STARTING_VERSION: + # We're starting in this version + continue + + logger.info(f"Upgrading to version {version}") + await _upgrade(ops_test, revision=rev) + + await _upgrade(ops_test, local_build=True) # continuous writes checks await assert_continuous_writes_consistency( ops_test, c_writes, [APP_NAME, OPENSEARCH_MAIN_APP_NAME], ) + + +################################################################################## +# +# test scenarios from each version: +# Start with each version, moving to local and then rolling back mid-upgrade +# Once this test passes, the 2nd test will rerun the upgrade, this time to +# its end. 
+# +################################################################################## +@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) +@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION) +@pytest.mark.abort_on_fail +@pytest.mark.skip_if_deployed +async def test_deploy_from_version(ops_test: OpsTest, version) -> None: + """Deploy OpenSearch.""" + await _build_env(ops_test, version) + + +@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION) +@pytest.mark.abort_on_fail +async def test_upgrade_rollback_from_local( + ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner, version +) -> None: + """Test upgrade and rollback to each version available.""" + app = OPENSEARCH_MAIN_APP_NAME + leader_id = await get_leader_unit_id(ops_test, app) + action = await run_action( + ops_test, + leader_id, + "pre-upgrade-check", + app=OPENSEARCH_MAIN_APP_NAME, + ) + assert action.status == "completed" + + logger.info("Build charm locally") + global charm + if not charm: + charm = await ops_test.build_charm(".") + + async with ops_test.fast_forward(): + logger.info(f"Refresh app {app}, leader {leader_id}") + + async with ops_test.fast_forward(): + for app, unit_count in WORKLOAD.items(): + leader_id = get_leader_unit_id(ops_test, app) + + await refresh(ops_test, app, path=charm) + logger.info("Refresh is over, waiting for the charm to settle") + + # Wait until we are set in an idle state and can rollback the revision. 
+ # app status blocked: that will happen if we are jumping N-2 versions in our test + # app status active: that will happen if we are jumping N-1 in our test + await wait_until( + ops_test, + apps=[app], + apps_statuses=["active", "blocked"], + units_statuses=["active"], + wait_for_exact_units={ + app: unit_count, + }, + idle_period=120, + timeout=3600, + ) + + # continuous writes checks + await assert_continuous_writes_consistency( + ops_test, + c_writes, + [APP_NAME, OPENSEARCH_MAIN_APP_NAME], + ) + + +@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION) +@pytest.mark.abort_on_fail +async def test_upgrade_from_version_to_local( + ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner, version +) -> None: + """Test upgrade from upstream to currently locally built version.""" + logger.info("Build charm locally") + global charm + if not charm: + charm = await ops_test.build_charm(".") + await assert_upgrade_to_local(ops_test, c_writes, charm) diff --git a/tests/integration/upgrades/test_small_deployment_upgrades.py b/tests/integration/upgrades/test_small_deployment_upgrades.py index cee4e6853..69a92d556 100644 --- a/tests/integration/upgrades/test_small_deployment_upgrades.py +++ b/tests/integration/upgrades/test_small_deployment_upgrades.py @@ -14,44 +14,25 @@ IDLE_PERIOD, MODEL_CONFIG, SERIES, + get_leader_unit_id, run_action, set_watermark, ) -from ..helpers_deployments import get_application_units, wait_until +from ..helpers_deployments import wait_until from ..tls.test_tls import TLS_CERTIFICATES_APP_NAME -from .helpers import assert_upgrade_to_local, refresh +from .helpers import ( + OPENSEARCH_CHANNEL, + OPENSEARCH_ORIGINAL_CHARM_NAME, + STARTING_VERSION, + UPGRADE_INITIAL_VERSION, + VERSION_TO_REVISION, + assert_upgrade_to_local, + refresh, +) logger = logging.getLogger(__name__) -OPENSEARCH_ORIGINAL_CHARM_NAME = "opensearch" -OPENSEARCH_CHANNEL = "2/edge" - - -STARTING_VERSION = "2.15.0" - - -VERSION_TO_REVISION = { - STARTING_VERSION: 144,
- "2.16.0": 160, -} - - -FROM_VERSION_PREFIX = "from_v{}_to_local" - - -UPGRADE_INITIAL_VERSION = [ - ( - pytest.param( - version, - id=FROM_VERSION_PREFIX.format(version), - marks=pytest.mark.group(FROM_VERSION_PREFIX.format(version)), - ) - ) - for version in VERSION_TO_REVISION.keys() -] - - charm = None @@ -114,8 +95,7 @@ async def test_upgrade_between_versions( ) -> None: """Test upgrade from upstream to currently locally built version.""" app = (await app_name(ops_test)) or APP_NAME - units = await get_application_units(ops_test, app) - leader_id = [u.id for u in units if u.is_leader][0] + leader_id = get_leader_unit_id(ops_test, app) for version, rev in VERSION_TO_REVISION.items(): if version == STARTING_VERSION: @@ -212,8 +192,7 @@ async def test_upgrade_rollback_from_local( ) -> None: """Test upgrade and rollback to each version available.""" app = (await app_name(ops_test)) or APP_NAME - units = await get_application_units(ops_test, app) - leader_id = [u.id for u in units if u.is_leader][0] + leader_id = await get_leader_unit_id(ops_test, app) action = await run_action( ops_test, From ab1598bf5c0a0ad459e2ec9848f8a7fea5fd4118 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 26 Sep 2024 19:19:43 +0200 Subject: [PATCH 2/4] Update roles --- .../upgrades/test_manual_large_deployment_upgrades.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py index e964d54e3..28ea95399 100644 --- a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py +++ b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py @@ -67,9 +67,9 @@ async def _build_env(ops_test: OpsTest, version: str) -> None: failover_orchestrator_conf = { "cluster_name": "backup-test", "init_hold": True, - "roles": "voting_only", + "roles": "cluster_manager", } - data_hot_conf = {"cluster_name": "backup-test", 
"init_hold": True, "roles": "data.hot"} + data_conf = {"cluster_name": "backup-test", "init_hold": True, "roles": "data"} await asyncio.gather( ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", config=tls_config), @@ -95,7 +95,7 @@ async def _build_env(ops_test: OpsTest, version: str) -> None: num_units=WORKLOAD[APP_NAME], series=SERIES, channel=OPENSEARCH_CHANNEL, - config=data_hot_conf, + config=data_conf, ), ) From 304629a271d5a6f368dabcea27030d20717c40b8 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 26 Sep 2024 20:06:55 +0200 Subject: [PATCH 3/4] Update large deployments --- .../test_manual_large_deployment_upgrades.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py index 28ea95399..e5d98e0e0 100644 --- a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py +++ b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py @@ -36,6 +36,8 @@ OPENSEARCH_MAIN_APP_NAME = "main" OPENSEARCH_FAILOVER_APP_NAME = "failover" +REL_ORCHESTRATOR = "peer-cluster-orchestrator" +REL_PEER = "peer-cluster" charm = None @@ -60,16 +62,16 @@ async def _build_env(ops_test: OpsTest, version: str) -> None: tls_config = {"ca-common-name": "CN_CA"} main_orchestrator_conf = { - "cluster_name": "backup-test", + "cluster_name": "upgrade-test", "init_hold": False, - "roles": "cluster_manager", + "roles": "cluster_manager,data", } failover_orchestrator_conf = { - "cluster_name": "backup-test", + "cluster_name": "upgrade-test", "init_hold": True, - "roles": "cluster_manager", + "roles": "cluster_manager,data", } - data_conf = {"cluster_name": "backup-test", "init_hold": True, "roles": "data"} + data_conf = {"cluster_name": "upgrade-test", "init_hold": True, "roles": "data"} await asyncio.gather( ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", 
config=tls_config), @@ -99,6 +101,22 @@ async def _build_env(ops_test: OpsTest, version: str) -> None: ), ) + # integrate TLS to all applications + for app in [OPENSEARCH_MAIN_APP_NAME, OPENSEARCH_FAILOVER_APP_NAME, APP_NAME]: + await ops_test.model.integrate(app, TLS_CERTIFICATES_APP_NAME) + + # create the peer-cluster-relation + await ops_test.model.integrate( + f"{APP_NAME}:{REL_PEER}", f"{OPENSEARCH_MAIN_APP_NAME}:{REL_ORCHESTRATOR}" + ) + await ops_test.model.integrate( + f"{OPENSEARCH_FAILOVER_APP_NAME}:{REL_PEER}", + f"{OPENSEARCH_MAIN_APP_NAME}:{REL_ORCHESTRATOR}", + ) + await ops_test.model.integrate( + f"{APP_NAME}:{REL_PEER}", f"{OPENSEARCH_FAILOVER_APP_NAME}:{REL_ORCHESTRATOR}" + ) + # Large deployment setup await ops_test.model.integrate("main:peer-cluster-orchestrator", "failover:peer-cluster") await ops_test.model.integrate("main:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster") @@ -106,11 +124,6 @@ async def _build_env(ops_test: OpsTest, version: str) -> None: "failover:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster" ) - # TLS setup - await ops_test.model.integrate("main", TLS_CERTIFICATES_APP_NAME) - await ops_test.model.integrate("failover", TLS_CERTIFICATES_APP_NAME) - await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME) - # Charms except s3-integrator should be active await wait_until( ops_test, From 0d1462bef20410e82a8e75d8e25268269f4991d1 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 26 Sep 2024 20:12:19 +0200 Subject: [PATCH 4/4] Remove doubled integration --- .../upgrades/test_manual_large_deployment_upgrades.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py index e5d98e0e0..15892095b 100644 --- a/tests/integration/upgrades/test_manual_large_deployment_upgrades.py +++ b/tests/integration/upgrades/test_manual_large_deployment_upgrades.py @@ -117,13 +117,6 @@ 
async def _build_env(ops_test: OpsTest, version: str) -> None: f"{APP_NAME}:{REL_PEER}", f"{OPENSEARCH_FAILOVER_APP_NAME}:{REL_ORCHESTRATOR}" ) - # Large deployment setup - await ops_test.model.integrate("main:peer-cluster-orchestrator", "failover:peer-cluster") - await ops_test.model.integrate("main:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster") - await ops_test.model.integrate( - "failover:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster" - ) - # Charms except s3-integrator should be active await wait_until( ops_test,