diff --git a/README.md b/README.md
index 4bc28cf..6dc5e3b 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,49 @@ To make changes to the RAMCloud code simply make changes to the code in the
 Once RAMCloud is rebuilt, you can run the unit and standalone tests again to
 run the updated code.
 
+# Bringing up your own RAMCloud test cluster
+
+There's a script that simplifies bringing up, tearing down, and resetting your RAMCloud
+cluster, which is especially handy if you want to debug RAMCloud from the python3
+interpreter (arguably a really nice way to troubleshoot RAMCloud). The script is
+testing/ramcloud_test_cluster.py, and it is run from within the development environment
+(brought up via ./config/dev-env, as mentioned in the previous section). From there, run:
+
+    python3 testing/ramcloud_test_cluster.py
+
+This shows the status of your RAMCloud cluster (nifty, aye?), i.e. whether a cluster is
+currently up. You can bring one up, or clear out all RAMCloud tables in an existing
+cluster, by running:
+
+    python3 testing/ramcloud_test_cluster.py -a reset
+
+The nice thing about this command is that it clears out all tables without wasting time
+bringing RAMCloud down and back up. When you're done, bring the cluster down with:
+
+    python3 testing/ramcloud_test_cluster.py -a stop
+
+The -a option also supports start and status, in addition to reset and stop. start
+hard-resets the cluster if one is already up (slower); if no cluster is up, it brings
+one up. status shows whether a cluster is up (equivalent to omitting the -a option).
+
+There's also the -n option, which controls the number of nodes to bring up (each node
+runs zk + rc-coordinator + rc-server). When -n is omitted, it defaults to 3. You should
+RARELY need to change this default: 3 is arguably the minimum number of nodes needed for
+"good behavior" in zk (its consensus algorithm needs a tie-breaker) and in rc-server
+(one instance holds the master copy, one holds the backup, and one remains on
+"probation" until it is trusted by the other rc-servers and elected rc-coordinator).
+
+In the event you do need to change -n (say you want it at 4), note that you WILL need to
+hard-reset the cluster (i.e. -a reset will NOT work). Something like this achieves the
+effect you want:
+
+    python3 testing/ramcloud_test_cluster.py -a start -n 4
+
+After this point, you can continue to soft-reset the cluster and it keeps the same
+number of nodes, i.e. this command works again at this point:
+
+    python3 testing/ramcloud_test_cluster.py -a reset
+
 # Obtaining the Patched Code
 First, install `stgit` through your package manager, e.g.
 `apt-get install stgit`
diff --git a/config/Dockerfile.node b/config/Dockerfile.node
index 7d333b1..5427962 100644
--- a/config/Dockerfile.node
+++ b/config/Dockerfile.node
@@ -49,8 +49,8 @@ ARG DISTRO_NAME=apache-zookeeper-3.5.8-bin
 
 # Download Apache Zookeeper, verify its PGP signature, untar and clean up
 RUN set -eux; \
-    wget -q "https://www.apache.org/dist/zookeeper/$SHORT_DISTRO_NAME/$DISTRO_NAME.tar.gz"; \
-    wget -q "https://www.apache.org/dist/zookeeper/$SHORT_DISTRO_NAME/$DISTRO_NAME.tar.gz.asc"; \
+    wget -q "http://archive.apache.org/dist/zookeeper/$SHORT_DISTRO_NAME/$DISTRO_NAME.tar.gz"; \
+    wget -q "http://archive.apache.org/dist/zookeeper/$SHORT_DISTRO_NAME/$DISTRO_NAME.tar.gz.asc"; \
     export GNUPGHOME="$(mktemp -d)"; \
 # Removing these checks because the GPG_KEY value above is no longer correct for the 3.5.7 ZK package
 #    gpg --keyserver ha.pool.sks-keyservers.net --recv-key "$GPG_KEY" || \
diff --git a/testing/cluster_test_utils.py b/testing/cluster_test_utils.py
index 9e7b82a..48ef5ed 100644
--- a/testing/cluster_test_utils.py
+++ b/testing/cluster_test_utils.py
@@ -1,6 +1,7 @@
 import copy
 import docker
 import kazoo.client
+import kazoo.exceptions
 import logging
 import logging.config
 import os
@@ -102,6 +103,52 @@ def launch_node(cluster_name, hostname, zk_servers, external_storage, zkid, ip,
     logger.info('Launching node container %s with IP address %s...successful', hostname, ip)
     return docker_client.containers.get(container_id)
 
+def get_status():
+    docker_containers = docker_client.containers.list(all=True, filters={"name":"ramcloud-node-*"})
+    docker_network = False
+    try:
+        docker_network = docker_client.networks.get("ramcloud-net")
+    except docker.errors.NotFound as nf:
+        pass
+    if not docker_containers:
+        logger.info('No ramcloud nodes found')
+    else:
+        logger.info('Found %s ramcloud nodes', len(docker_containers))
+    if not docker_network:
+        logger.info('ramcloud network not found')
+    else:
+        logger.info('Found ramcloud network')
+    return (docker_network, docker_containers)
+
+def destroy_network_and_containers(docker_network, docker_containers):
+    try:
+        for dc in docker_containers:
+            print("removing container:", dc.name)
+            dc.remove(force=True)
+        if docker_network:
+            print("removing network:", docker_network)
+            docker_network.remove()
+    except docker.errors.NotFound as nf:
+        print("unable to destroy containers and/or network")
+
+def get_ensemble(num_nodes = 3):
+    return {i: '10.0.1.{}'.format(i) for i in range(1, num_nodes + 1)}
+
+def get_table_names(ensemble):
+    try:
+        zkc = get_zookeeper_client(ensemble)
+        return zkc.get_children('/ramcloud/main/tables')
+    except kazoo.exceptions.NoNodeError as nne:
+        # Thrown when the tables node exists in zk but wasn't initialized, so return an empty list
+        return []
+
+def drop_tables(ensemble, table_names):
+    r = ramcloud.RAMCloud()
+    external_storage = 'zk:' + external_storage_string(ensemble)
+    r.connect(external_storage, 'main')
+    for table_name in table_names:
+        r.drop_table(table_name)
+
 # ClusterTest Usage in Python interpreter:
 # >>> import cluster_test_utils as ctu
 # >>> x = ctu.ClusterTest()
diff --git a/testing/ramcloud_test_cluster.py b/testing/ramcloud_test_cluster.py
new file mode 100644
index 0000000..44843a8
--- /dev/null
+++ b/testing/ramcloud_test_cluster.py
@@ -0,0 +1,48 @@
+import cluster_test_utils as ctu
+import argparse
+import sys
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--action', '-a', metavar='A', type=str, default="status",
+                        help="Defines the action to take: status, reset, start, stop")
+    parser.add_argument('--nodes', '-n', type=int, default=3,
+                        help="Number of zk, rc-coordinator, and rc-server instances to bring up. Only relevant when there's no cluster up yet. Default is 3")
+
+    args = parser.parse_args()
+
+    print("action =", args.action)
+    print("nodes =", args.nodes)
+    if (args.action == "start"):
+        x = ctu.ClusterTest()
+        x.setUp(num_nodes = args.nodes)
+    elif (args.action == "status"):
+        ctu.get_status()
+    elif (args.action == "stop"):
+        docker_network, docker_containers = ctu.get_status()
+        ctu.destroy_network_and_containers(docker_network, docker_containers)
+    elif (args.action == "reset"):
+        docker_network, docker_containers = ctu.get_status()
+        if (not docker_network):
+            # No network (and hence no containers) means there is no cluster, so bring up a new one
+            print("Bringing up new cluster with", args.nodes, "nodes")
+            x = ctu.ClusterTest()
+            x.setUp(num_nodes = args.nodes)
+        elif (not docker_containers):
+            # A network but no containers means no data, so take it down and bring it back up
+            print("Inconsistent state")
+            print("Bringing up new cluster with", args.nodes, "nodes")
+            ctu.destroy_network_and_containers(docker_network, [])
+            x = ctu.ClusterTest()
+            x.setUp(num_nodes = args.nodes)
+        else:
+            # We have a network and containers. Get the ensemble and table names, then drop all tables.
+            print("Found a cluster with", len(docker_containers), "nodes")
+            print("Identifying tables")
+            ensemble = ctu.get_ensemble(len(docker_containers))
+            table_names = ctu.get_table_names(ensemble)
+            print("Table names =", table_names)
+            print("Dropping all tables")
+            ctu.drop_tables(ensemble, table_names)
+    else:
+        parser.print_help()
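
As a rough sketch of the python3-interpreter workflow the README section above alludes to, the helpers this patch adds to testing/cluster_test_utils.py can be driven by hand to do a manual "soft reset". The function names below are exactly those introduced in this diff; the one assumption is that you are inside the dev environment with the testing/ directory on the Python path (e.g. running python3 from within testing/).

    import cluster_test_utils as ctu

    # get_status() logs what it finds and returns (network, containers); network may be False.
    network, containers = ctu.get_status()
    if network and containers:
        # Map the node count to the 10.0.1.x ensemble, then drop every table
        # registered under /ramcloud/main/tables in zk.
        ensemble = ctu.get_ensemble(len(containers))
        table_names = ctu.get_table_names(ensemble)
        ctu.drop_tables(ensemble, table_names)
    else:
        # Nothing usable is up (or only a half-built cluster): tear down whatever exists.
        ctu.destroy_network_and_containers(network, containers)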