From c6b31d497054180965d1d1c24e0acfed16a41a9d Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 26 Aug 2024 22:53:24 -0600 Subject: [PATCH] trimming for public repo --- .gitattributes | 2 - README.md | 69 -- models/bodega/calc_wan_delays.py | 244 ----- models/bodega/plot_wan_quorums.py | 100 -- models/crossword/motiv_profile_cdf.py | 254 ----- models/crossword/plot_cstr_bounds.py | 269 ----- models/crossword/prob_calculation.py | 279 ----- publish/crossword/ARTIFACT.md | 42 - publish/crossword/archive_results.sh | 49 - publish/crossword/crop_bench_figs.sh | 77 -- publish/crossword/crop_slide_figs.sh | 67 -- publish/public_repo_trim.py | 174 ---- scripts/crossword/bench_adaptive.py | 476 --------- scripts/crossword/bench_breakdown.py | 463 --------- scripts/crossword/bench_critical.py | 835 --------------- scripts/crossword/bench_failover.py | 505 --------- scripts/crossword/bench_rs_coding.py | 192 ---- scripts/crossword/bench_staleness.py | 521 ---------- scripts/crossword/bench_unbalanced.py | 422 -------- scripts/crossword/bench_ycsb_trace.py | 640 ------------ scripts/crossword/distr_chainapp.py | 314 ------ scripts/crossword/distr_chaincli.py | 248 ----- scripts/crossword/gen_ycsb_a_trace.py | 51 - scripts/crossword/install_devdeps.sh | 35 - scripts/crossword/kill_chain_procs.sh | 19 - scripts/distr_cluster.py | 1 - scripts/local_cluster.py | 1 - scripts/remote_hosts.toml | 28 +- scripts/remote_killall.py | 1 - src/lib.rs | 1 - src/protocols/crossword/adaptive.rs | 316 ------ src/protocols/crossword/control.rs | 118 --- src/protocols/crossword/durability.rs | 247 ----- src/protocols/crossword/execution.rs | 77 -- src/protocols/crossword/gossiping.rs | 190 ---- src/protocols/crossword/leadership.rs | 579 ----------- src/protocols/crossword/messages.rs | 760 -------------- src/protocols/crossword/mod.rs | 1357 ------------------------- src/protocols/crossword/recovery.rs | 188 ---- src/protocols/crossword/request.rs | 239 ----- src/protocols/crossword/snapshot.rs | 308 ------ src/protocols/mod.rs | 20 - src/utils/linreg.rs | 280 ----- src/utils/mod.rs | 4 - src/utils/qdisc.rs | 234 ----- tla+/bodega/.gitignore | 1 - tla+/bodega/Bodega.tla | 848 --------------- tla+/bodega/Bodega_MC.cfg | 16 - tla+/bodega/Bodega_MC.tla | 78 -- tla+/crossword/.gitignore | 9 - tla+/crossword/Crossword.tla | 739 -------------- tla+/crossword/Crossword_MC.cfg | 17 - tla+/crossword/Crossword_MC.tla | 78 -- 53 files changed, 14 insertions(+), 13068 deletions(-) delete mode 100644 models/bodega/calc_wan_delays.py delete mode 100644 models/bodega/plot_wan_quorums.py delete mode 100644 models/crossword/motiv_profile_cdf.py delete mode 100644 models/crossword/plot_cstr_bounds.py delete mode 100644 models/crossword/prob_calculation.py delete mode 100644 publish/crossword/ARTIFACT.md delete mode 100755 publish/crossword/archive_results.sh delete mode 100755 publish/crossword/crop_bench_figs.sh delete mode 100755 publish/crossword/crop_slide_figs.sh delete mode 100644 publish/public_repo_trim.py delete mode 100644 scripts/crossword/bench_adaptive.py delete mode 100644 scripts/crossword/bench_breakdown.py delete mode 100644 scripts/crossword/bench_critical.py delete mode 100644 scripts/crossword/bench_failover.py delete mode 100644 scripts/crossword/bench_rs_coding.py delete mode 100644 scripts/crossword/bench_staleness.py delete mode 100644 scripts/crossword/bench_unbalanced.py delete mode 100644 scripts/crossword/bench_ycsb_trace.py delete mode 100644 scripts/crossword/distr_chainapp.py delete mode 100644 scripts/crossword/distr_chaincli.py delete mode 100644 scripts/crossword/gen_ycsb_a_trace.py delete mode 100755 scripts/crossword/install_devdeps.sh delete mode 100755 scripts/crossword/kill_chain_procs.sh delete mode 100644 src/protocols/crossword/adaptive.rs delete mode 100644 src/protocols/crossword/control.rs delete mode 100644 src/protocols/crossword/durability.rs delete mode 100644 src/protocols/crossword/execution.rs delete mode 100644 src/protocols/crossword/gossiping.rs delete mode 100644 src/protocols/crossword/leadership.rs delete mode 100644 src/protocols/crossword/messages.rs delete mode 100644 src/protocols/crossword/mod.rs delete mode 100644 src/protocols/crossword/recovery.rs delete mode 100644 src/protocols/crossword/request.rs delete mode 100644 src/protocols/crossword/snapshot.rs delete mode 100644 src/utils/linreg.rs delete mode 100644 src/utils/qdisc.rs delete mode 100644 tla+/bodega/.gitignore delete mode 100644 tla+/bodega/Bodega.tla delete mode 100644 tla+/bodega/Bodega_MC.cfg delete mode 100644 tla+/bodega/Bodega_MC.tla delete mode 100644 tla+/crossword/.gitignore delete mode 100644 tla+/crossword/Crossword.tla delete mode 100644 tla+/crossword/Crossword_MC.cfg delete mode 100644 tla+/crossword/Crossword_MC.tla diff --git a/.gitattributes b/.gitattributes index 364c58ac..5ab316f0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,2 @@ models/** linguist-vendored publish/** linguist-vendored -scripts/crossword/** linguist-vendored -scripts/bodega/** linguist-vendored diff --git a/README.md b/README.md index 872ba2da..1f68ee83 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,3 @@ -This is a private mirror of [Summerset](https://github.com/josehu07/summerset). - -[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Aformat) -[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Abuild) -[![Unit tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_unit.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_unit) -[![Proc tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_proc) -[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) - -## Private-Public Sync Commands - -To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`: - -```bash -# in the private repo: -git remote add public git@github.com:josehu07/summerset.git -git config --add --local checkout.defaultRemote origin -git checkout -b public-main -git branch --set-upstream-to=public/main public-main -git checkout main -# skip the above for later times -git pull public -git merge public-main -git push -``` - -To create a pull request on the public repo to make batched contributions from private repo `main`: - -```bash -# in the public repo: -git remote add private git@github.com:josehu07/summerset-private.git -git config --add --local checkout.defaultRemote origin -# skip the above for later times -git checkout -b private/main -git merge -s ours main -git pull private -python3 publish/public_repo_trim.py -# double check the trim and commit -git push origin -# then, on GitHub, make a squashing PR from branch to main -``` - # Summerset [![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) @@ -166,34 +125,6 @@ python3 scripts/distr_clients.py -h Note that these scripts use `sudo` and assume specific ranges of available ports, so a Linux server machine environment is recommended. -## TODO List - -- [x] async event-loop foundation -- [x] implementation of Chain Replication - - [ ] failure detection & recovery - - [ ] TLA+ spec -- [x] implementation of MultiPaxos - - [x] TLA+ spec -- [x] implementation of RS-Paxos -- [x] implementation of Raft - - [ ] TLA+ spec -- [x] implementation of CRaft -- [x] implementation of Crossword - - [x] TLA+ spec -- [ ] long-term planned improvements - - [ ] use a sophisticated storage backend - - [ ] efficient state-transfer snapshotting - - [ ] more robust TCP msg infrastructure - - [ ] membership discovery & view change - - [ ] multi-versioning & stale reads - - [ ] partitioned groups service structure -- [ ] client-side utilities - - [x] interactive REPL - - [x] benchmarking client - - [x] unit tester - - [ ] linearizability fuzzer -- [ ] better README & documentation - --- **Lore**: [Summerset Isles](https://en.uesp.net/wiki/Online:Summerset) is the name of an elvish archipelagic province in the Elder Scrolls series. diff --git a/models/bodega/calc_wan_delays.py b/models/bodega/calc_wan_delays.py deleted file mode 100644 index c2964d9a..00000000 --- a/models/bodega/calc_wan_delays.py +++ /dev/null @@ -1,244 +0,0 @@ -import math - - -class RingWorld: - def __init__(self): - self.ticks = 24 - self.servers = [3, 0, 18, 14, 12] - self.clients = list(range(4)) + list(range(11, 20)) - self.leader_id = 4 - self.leader = self.servers[self.leader_id] - - def distance(self, a, b): - assert a >= 0 and a < self.ticks - assert b >= 0 and b < self.ticks - d = abs(a - b) - return min(d, self.ticks - d) - - def nearest_server_among(self, origin, servers): - nearest, min_dist = None, self.ticks + 1 - for s in servers: - dist = self.distance(origin, s) - if dist < min_dist: - min_dist = dist - nearest = s - return nearest - - def nearest_server(self, origin): - return self.nearest_server_among(origin, self.servers) - - def farthest_server_among(self, origin, servers): - farthest, max_dist = None, 0 - for s in servers: - dist = self.distance(origin, s) - if dist > max_dist: - max_dist = dist - farthest = s - return farthest - - def farthest_server(self, origin): - return self.farthest_server_among(origin, self.servers) - - def quorum_max_from(self, origin, size): - ds = [self.distance(origin, s) for s in self.servers] - ds.sort() - return ds[size - 1] - - def quorum_incl_max_from(self, origin, size, includes): - max_noni = self.quorum_max_from(origin, size) - max_incl = 0 - for s in includes: - dist = self.distance(origin, s) - if dist > max_incl: - max_incl = dist - return max(max_noni, max_incl) - - -class Protocol: - def __init__(self, world, name): - self.name = name - self.world = world - assert len(self.world.servers) % 2 == 1 - - def write_from(self, client): - raise RuntimeError("base method called") - - def read_idle_from(self, client): - raise RuntimeError("base method called") - - def read_busy_from(self, client): - raise RuntimeError("base method called") - - def window_guess(self, client): - """Only consider one farthest in-flight conflicting write here.""" - raise RuntimeError("base method called") - - def write_avg(self): - delays = [self.write_from(c) for c in self.world.clients] - return sum(delays) / len(delays) - - def read_idle_avg(self): - delays = [self.read_idle_from(c) for c in self.world.clients] - return sum(delays) / len(delays) - - def read_busy_avg(self): - delays = [self.read_busy_from(c) for c in self.world.clients] - return sum(delays) / len(delays) - - def window_avg(self): - windows = [self.window_guess(c) for c in self.world.clients] - return sum(windows) / len(windows) - - -class MultiPaxos(Protocol): - def __init__(self, world): - super().__init__(world, "MP") - self.majority = (len(world.servers) + 1) // 2 - - def write_from(self, client): - return 2 * ( - self.world.distance(client, self.world.leader) - + self.world.quorum_max_from(self.world.leader, self.majority) - ) - - def read_idle_from(self, client): - return 2 * self.world.distance(client, self.world.leader) - - def read_busy_from(self, client): - return 2 * self.world.distance(client, self.world.leader) - - def window_guess(self, client): - return 0 - - -class EPaxos(Protocol): - def __init__(self, world): - super().__init__(world, "EP") - self.majority = (len(world.servers) + 1) // 2 - self.supermajority = math.floor((3 * len(world.servers) - 1) / 4) - - def write_from(self, client): - nearest = self.world.nearest_server(client) - return 2 * ( - self.world.distance(client, nearest) - + self.world.quorum_max_from(nearest, self.supermajority) - ) - - def read_idle_from(self, client): - nearest = self.world.nearest_server(client) - return 2 * ( - self.world.distance(client, nearest) - + self.world.quorum_max_from(nearest, self.supermajority) - ) - - def read_busy_from(self, client): - nearest = self.world.nearest_server(client) - return 2 * ( - self.world.distance(client, nearest) - + self.world.quorum_max_from(nearest, self.supermajority) - + self.world.quorum_max_from(nearest, self.majority) - ) - - def window_guess(self, client): - farthest = self.world.farthest_server(client) - return 2 * self.world.quorum_max_from(farthest, self.supermajority) - - -class QuorumLease(Protocol): - def __init__(self, world, lessees): - super().__init__(world, f"QL-{len(lessees)}") - self.majority = (len(world.servers) + 1) // 2 - self.lessees = [self.world.servers[l] for l in lessees] - - def write_from(self, client): - return 2 * ( - self.world.distance(client, self.world.leader) - + self.world.quorum_incl_max_from( - self.world.leader, self.majority, self.lessees - ) - ) - - def read_idle_from(self, client): - nearest = self.world.nearest_server_among(client, self.lessees) - return 2 * min( - self.world.distance(client, nearest), - self.world.distance(client, self.world.leader), - ) - - def read_busy_from(self, client): - nearest = self.world.nearest_server_among(client, self.lessees) - return ( - self.world.distance(client, nearest) - + self.world.distance(nearest, self.world.leader) - + self.world.distance(self.world.leader, client) - ) - - def window_guess(self, client): - return 4 * self.world.quorum_incl_max_from( - self.world.leader, self.majority, self.lessees - ) - - -class NearbyRead(Protocol): - def __init__(self, world, read_qsize): - super().__init__(world, f"NR-{read_qsize}") - self.majority = (len(world.servers) + 1) // 2 - self.read_qsize = read_qsize - self.write_qsize = max(len(world.servers) + 1 - read_qsize, self.majority) - - def write_from(self, client): - return 2 * ( - self.world.distance(client, self.world.leader) - + self.world.quorum_max_from(self.world.leader, self.write_qsize) - ) - - def read_idle_from(self, client): - return 2 * min( - self.world.quorum_max_from(client, self.read_qsize), - self.world.distance(client, self.world.leader), - ) - - def read_busy_from(self, client): - return 2 * self.world.distance(client, self.world.leader) - - def window_guess(self, client): - return 2 * self.world.quorum_max_from(self.world.leader, self.write_qsize) - - -def ring_world_model(): - world = RingWorld() - protocols = [ - MultiPaxos(world), - EPaxos(world), - QuorumLease(world, list(range(5))), - QuorumLease(world, [0, 2, 4]), - NearbyRead(world, 1), - NearbyRead(world, 2), - NearbyRead(world, 3), - ] - - results = { - "write": [], - "read-idle": [], - "read-busy": [], - "window": [], - } - for p in protocols: - results["write"].append(p.write_avg()) - results["read-idle"].append(p.read_idle_avg()) - results["read-busy"].append(p.read_busy_avg()) - results["window"].append(p.window_avg()) - - print(f"{' ':9s}", end="") - for p in protocols: - print(f" {p.name:>4s}", end="") - print() - for w in results: - print(f"{w:>9s}", end="") - for i in range(len(protocols)): - print(f" {results[w][i]:4.1f}", end="") - print() - - -if __name__ == "__main__": - ring_world_model() diff --git a/models/bodega/plot_wan_quorums.py b/models/bodega/plot_wan_quorums.py deleted file mode 100644 index 7dfa8719..00000000 --- a/models/bodega/plot_wan_quorums.py +++ /dev/null @@ -1,100 +0,0 @@ -import math - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -class World: - def plot_quorum_delays(self): - sorted_dists = [] - for c in self.clients: - c_dists = sorted([self.distance_func(c, s) for s in self.servers]) - sorted_dists.append(c_dists) - - quorum_sizes = list(range(1, len(self.servers) + 1)) - quorum_delays = dict() - for qs in quorum_sizes: - delays = [sorted_dists[ci][qs - 1] for ci in range(len(self.clients))] - quorum_delays[qs] = delays - - for qs in quorum_sizes: - # print(f" {qs}: {quorum_delays[qs]}") - plt.plot(list(range(len(self.clients))), quorum_delays[qs], label=str(qs)) - plt.legend() - plt.tight_layout() - plt.savefig(f"models/bodega/{self.name}.png") - plt.close() - - -class World1DArray(World): - def __init__(self, width, num_servers): - assert num_servers > 1 - assert width >= num_servers - - self.name = f"1DArray.{width}.{num_servers}" - self.width = width - self.clients = list(range(self.width)) - - self.servers = [0] - for i in range(1, num_servers - 1): - self.servers.append((self.width * i) // (num_servers - 1)) - self.servers.append(self.width - 1) - - def distance_func(self, p1, p2): - assert p1 >= 0 and p1 < self.width - assert p2 >= 0 and p2 < self.width - return abs(p1 - p2) - - -class World1DRing(World): - def __init__(self, length, num_servers): - assert num_servers > 1 - assert length >= num_servers - - self.name = f"1DRing.{length}.{num_servers}" - self.length = length - self.clients = list(range(self.length)) - - self.servers = [] - for i in range(num_servers): - self.servers.append((self.length * i) // num_servers) - - def distance_func(self, p1, p2): - assert p1 >= 0 and p1 < self.length - assert p2 >= 0 and p2 < self.length - return min(abs(p1 - p2), p1 + self.length - p2, p2 + self.length - p1) - - -class World2DRect(World): - def __init__(self, width, height, num_servers): - assert num_servers == 5 # TODO: complete this - - self.name = f"2DRect.{width}x{height}.{num_servers}" - self.width = width - self.height = height - self.clients = [(w, h) for w in range(width) for h in range(height)] - - self.servers = [ - (0, 0), - (0, height - 1), - (width // 2, height // 2), - (width - 1, 0), - (width - 1, height - 1), - ] - - def distance_func(self, p1, p2): - return math.sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) - - -if __name__ == "__main__": - world_1d_array = World1DArray(100, 5) - world_1d_array.plot_quorum_delays() - - world_1d_ring = World1DRing(100, 5) - world_1d_ring.plot_quorum_delays() - - world_2d_rect = World2DRect(100, 50, 5) - world_2d_rect.plot_quorum_delays() diff --git a/models/crossword/motiv_profile_cdf.py b/models/crossword/motiv_profile_cdf.py deleted file mode 100644 index e12854b5..00000000 --- a/models/crossword/motiv_profile_cdf.py +++ /dev/null @@ -1,254 +0,0 @@ -import pickle - -import matplotlib # type: ignore - -matplotlib.use("Agg") - -# from brokenaxes import brokenaxes # type: ignore -import matplotlib.pyplot as plt # type: ignore -from matplotlib.lines import Line2D # type: ignore - - -TIDB_DATA_SIZE = 73 -CRDB_DATA_SIZE = 16 - - -def preprocess_len_cnts(len_cnts, excludes=5): - print(" Distinct lengths:", len(len_cnts)) - print(" Min & Max:", min(len_cnts.keys()), max(len_cnts.keys())) - - sorted_len_cnts = sorted(len_cnts.items(), key=lambda t: t[1]) - - print(" Top 10:") - tops = reversed(sorted_len_cnts[-10:]) - for l, c in tops: - print(f"{l:10d} {c:7d}") - - print(" Bottom 10:") - bottoms = sorted_len_cnts[:10] - for l, c in bottoms: - print(f"{l:10d} {c:7d}") - - for l, _ in sorted_len_cnts[-excludes:]: - del len_cnts[l] - - total, large = 0, 0 - for l, c in len_cnts.items(): - total += c - if l >= 4 * 1024: - large += c - print(f" Fraction >= 4K: {large / total:.2f}") - - -def plot_len_cnts_cdfs(len_cnts_tidb, len_cnts_crdb): - matplotlib.rcParams.update( - { - "figure.figsize": (3.6, 1.05), - "font.size": 10, - } - ) - fig = plt.figure("cdf") - - DBS_DATA_COLOR_ZORDER_ENDX = { - "TiDB": (len_cnts_tidb, "steelblue", 10, 116 * 1024), - "CockroachDB": (len_cnts_crdb, "lightcoral", 5, 148 * 1024), - } - - append_xticks, append_xticklabels = [], [] - for db, (len_cnts, color, zorder, endx) in DBS_DATA_COLOR_ZORDER_ENDX.items(): - x, xmax, xmin = [], 0, float("inf") - for l, c in len_cnts.items(): - x += [l for _ in range(c)] - if l > xmax: - xmax = l - if l < xmin: - xmin = l - - xright = 150 * 1024 - step = int(endx / 4096) - bins = [i * step for i in range(4096)] + [float("inf")] - - plt.hist( - x, - bins=bins, - range=(0, endx), - density=True, - cumulative=True, - histtype="step", - linewidth=2, - color=color, - label=db, - zorder=zorder, - ) - - adjusted_endx = endx if db == "TiDB" else endx - 2000 # hardcoded - endx_label = "290KB" if db == "TiDB" else "53MB" # hardcoded - plt.vlines( - adjusted_endx, - ymin=0.9, - ymax=1.05, - colors=color, - linestyles="solid", - zorder=zorder, - linewidth=2, - ) - plt.vlines( - adjusted_endx, - ymin=0, - ymax=0.9, - colors="gray", - linestyles="dashed", - linewidth=1, - ) - - append_xticks.append(adjusted_endx) - append_xticklabels.append(endx_label) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.xlim(0, xright) - plt.xticks( - [4096, 32 * 1024, 64 * 1024] + append_xticks, - ["4KB", "32KB", "64KB"] + append_xticklabels, - ) - - plt.ylim(0, 1.05) - plt.yticks([0.5, 1.0]) - - plt.vlines( - 4096, ymin=0, ymax=1.05, colors="dimgray", linestyles="dashed", zorder=20 - ) - # plt.arrow( - # 7500, - # 0.84, - # 0, - # 0.17, - # color="dimgray", - # width=220, - # length_includes_head=True, - # head_width=1500, - # head_length=0.07, - # zorder=20, - # ) - # plt.arrow( - # 7500, - # 0.84, - # 0, - # -0.16, - # color="dimgray", - # width=220, - # length_includes_head=True, - # head_width=1500, - # head_length=0.07, - # zorder=20, - # ) - plt.text(7000, 0.25, "45% & 17% are ≥ 4KB, resp.", color="dimgray", fontsize=9.5) - - # arrow = patches.FancyArrowPatch( - # (96 * 1024, 0.45), - # (xright, 0.25), - # connectionstyle="arc3,rad=.12", - # arrowstyle="Simple, tail_width=0.1, head_width=2.5, head_length=4", - # color="dimgray", - # ) - # ax.add_patch(arrow) - # plt.text(89000, 0.55, "max 290KB", color="dimgray") - - def draw_xaxis_break(xloc): - xpl, xpr = xloc - 3.5, xloc + 3.5 - xs = [xpl * 1024, xpl * 1024, xpr * 1024, xpr * 1024] - ys = [-0.1, 0.1, 0.1, -0.1] - plt.fill(xs, ys, "w", fill=True, linewidth=0, zorder=10, clip_on=False) - plt.plot( - [(xpl - 1) * 1024, (xpl + 1) * 1024], - [-0.1, 0.1], - color="k", - linewidth=1, - zorder=20, - clip_on=False, - ) - plt.plot( - [(xpr - 1) * 1024, (xpr + 1) * 1024], - [-0.1, 0.1], - color="k", - linewidth=1, - zorder=20, - clip_on=False, - ) - plt.text( - xloc * 1024, - 0, - "~", - fontsize=8, - zorder=30, - clip_on=False, - ha="center", - va="center", - ) - - draw_xaxis_break(100) - draw_xaxis_break(131) - - plt.ylabel("CDF") - - plt.tight_layout() - - pdf_name = "results/intros/motiv_profile_cdf.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_len_cnts_legend(handles, labels): - matplotlib.rcParams.update( - { - "figure.figsize": (1.8, 1.3), - "font.size": 10, - } - ) - plt.figure("Legend") - - plt.axis("off") - - line_handles = [ - Line2D([0], [0], linewidth=2, color=h.get_edgecolor()) for h in handles - ] - - lgd = plt.legend( - line_handles, - labels, - handlelength=0.6, - handletextpad=0.4, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - - pdf_name = "results/intros/legend-motiv_profile.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - len_cnts_tidb, len_cnts_crdb = None, None - with open("results/intros/length_counts-tidb.pkl", "rb") as fpkl: - len_cnts_tidb = pickle.load(fpkl) - print("TiDB --") - preprocess_len_cnts(len_cnts_tidb) - print() - - with open("results/intros/length_counts-crdb.pkl", "rb") as fpkl: - len_cnts_crdb = pickle.load(fpkl) - len_cnts_crdb = { - int(l * TIDB_DATA_SIZE / CRDB_DATA_SIZE): c for l, c in len_cnts_crdb.items() - } - print("CockroachDB --") - preprocess_len_cnts(len_cnts_crdb) - print() - - handles, labels = plot_len_cnts_cdfs(len_cnts_tidb, len_cnts_crdb) - plot_len_cnts_legend(handles, labels) diff --git a/models/crossword/plot_cstr_bounds.py b/models/crossword/plot_cstr_bounds.py deleted file mode 100644 index 15fc32b6..00000000 --- a/models/crossword/plot_cstr_bounds.py +++ /dev/null @@ -1,269 +0,0 @@ -import argparse -import math - -import matplotlib # type: ignore - -matplotlib.use("Agg") - -import numpy as np # type: ignore -import matplotlib.pyplot as plt # type: ignore -import matplotlib.patches as mpatches # type: ignore -from matplotlib.legend_handler import HandlerPatch # type: ignore - - -SUBPLOT_ARG = lambda idx: 141 + idx - -CLUSTER_SIZES = [3, 5, 7, 9] -SIZE_COLOR_MAP = { - 3: ("seagreen", "palegreen"), - 5: ("orange", "bisque"), - 7: ("steelblue", "powderblue"), - 9: ("chocolate", "mistyrose"), -} - -X_TICKS = list(range(1, 10)) -Y_TICKS = list(range(1, 6)) - - -def plot_cstr_bound(idx, cluster_size): - ax = plt.subplot(SUBPLOT_ARG(idx)) - - n = cluster_size - f = n // 2 - m = n - f - - line_color, fill_color = SIZE_COLOR_MAP[cluster_size] - - # Classic Paxos/Raft point - plt.scatter( - m, m, marker="s", s=100, color="black", label="Classic Paxos/Raft", zorder=10 - ) - - # CRaft point - craft_q = math.ceil((n + m) / 2) - plt.scatter( - craft_q, - 1, - marker="X", - s=110, - color="lightcoral", - label="RSPaxos/CRaft", - zorder=10, - ) - - # boundary lines - xs = [x for x in range(m, n + 1)] - ys = [x for x in range(m, 0, -1)] - plt.plot( - xs, - ys, - linewidth=2, - marker="o", - markersize=7, - color=line_color, - label="Crossword configs", - zorder=20, - ) - if n <= 5: - plt.vlines(m, ymin=m, ymax=n, linestyles="-", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=n, linestyles="-", color=line_color, zorder=20) - plt.hlines( - n, xmin=m - 0.05, xmax=n + 0.05, linestyles="-", color=line_color, zorder=20 - ) - else: - plt.vlines(m, ymin=m, ymax=m + 1.4, linestyles="-", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=m + 1.4, linestyles="-", color=line_color, zorder=20) - - # correct region - xs = [m, m, n, n] - ys = [m, n, n, 1] if n <= 5 else [m, m + 1.4, m + 1.4, 1] - plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) - - # unused x-axis ranges - xs = [0.4, m - 0.55, m - 0.85, 0.1] - ys = [0.3, 0.3, 0, 0] - plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) - if cluster_size < CLUSTER_SIZES[-1]: - xs = [n + 1, X_TICKS[-1] + 0.4, X_TICKS[-1] + 0.1, n + 0.7] - plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) - - # unused y-axis range for 3 - if cluster_size == CLUSTER_SIZES[0]: - xs = [0, 0.3, 0.3, 0] - ys = [n + 0.55, n + 0.85, Y_TICKS[-1] + 0.4, Y_TICKS[-1] + 0.1] - plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) - - # environment tradeoff arrows - plt.arrow( - m + 0.1 if n <= 3 else m + 0.6, - n + 1.1 if n <= 3 else m + 2.6 if n <= 5 else m + 2.1, - -1.3, - 0, - linewidth=1, - color="dimgray", - length_includes_head=True, - head_width=0.2, - head_length=0.25, - overhang=0.5, - clip_on=False, - label="Tradeoff decisions", - ) - plt.text( - m + 0.3 if n <= 3 else m + 0.8 if n <= 5 else m + 0.8, - n + 1.1 if n <= 3 else m + 2.6 if n <= 5 else m + 2.1, - "if high\njitter" if n <= 3 else "if high jitter", - horizontalalignment="left", - verticalalignment="center", - color="dimgray", - ) - plt.arrow( - n + 1, - 2, - 0, - -1.3, - linewidth=1, - color="dimgray", - length_includes_head=True, - head_width=0.2, - head_length=0.25, - overhang=0.5, - clip_on=False, - ) - plt.text( - n + 1.3 if n < 7 else n + 0.4, - 1 + 1.1 if n < 7 else 1 + 2.1, - "if bw\nlimited", - horizontalalignment="left", - verticalalignment="center", - color="dimgray", - ) - - plt.axis("scaled") - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.xlim((0, X_TICKS[-1] + 0.7)) - plt.ylim((0, Y_TICKS[-1] + 2.7)) - plt.xticks( - X_TICKS[m - 1 : cluster_size], list(map(str, X_TICKS))[m - 1 : cluster_size] - ) - plt.yticks(Y_TICKS[:cluster_size], list(map(str, Y_TICKS))[:cluster_size]) - - if idx < 2: - plt.xlabel("|Quorum| (q)", loc="right") - ax.xaxis.set_label_coords(1.15, -0.06) - else: - plt.xlabel("q") - ax.xaxis.set_label_coords(1.06, 0.06) - plt.ylabel( - "Shards per\nserver (c)", - loc="top", - rotation=0, - backgroundcolor="white", - ) - ax.yaxis.set_label_coords(0.19, 0.76) - - # plt.title( - # f"|Cluster|={n} f={f}", - # x=0.5, - # y=-0.48, - # fontsize=11, - # # fontweight="bold", - # # backgroundcolor=fill_color, - # ) - plt.text(5.4, -2.4, f"n={n}, f={f}", fontsize=11, ha="center", va="center") - plt.text(2.8, -2.4, "▬", fontsize=11, color=line_color, ha="center", va="center") - - return ax - - -def make_legend(fig, handles, labels): - def make_legend_arrow( - legend, orig_handle, xdescent, ydescent, width, height, fontsize - ): - return mpatches.FancyArrow( - 0, - 0.5 * height, - width, - 0, - linewidth=1, - color="dimgray", - length_includes_head=True, - head_width=0.6 * height, - overhang=0.2, - ) - - def make_legend_polygon( - legend, orig_handle, xdescent, ydescent, width, height, fontsize - ): - return mpatches.Polygon( - xy=np.array( - [ - [0.2 * width, 0.5 * height], - [0.2 * width, 1.2 * height], - [0.8 * width, 1.2 * height], - [0.8 * width, -0.2 * height], - ] - ), - closed=True, - color="dimgray", - ) - - order = [] - for s in ("Classic", "RSPaxos", "Crossword", "Region", "Tradeoff"): - for i, l in enumerate(labels): - if s in l: - order.append(i) - break - sorted_handles = [handles[i] for i in order] - sorted_labels = [labels[i] for i in order] - - leg = fig.legend( - sorted_handles, - sorted_labels, - loc="lower center", - bbox_to_anchor=(0.5, 0.72), - ncol=len(handles), - handlelength=1.5, - handletextpad=0.5, - handler_map={ - mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow), - mpatches.Polygon: HandlerPatch(patch_func=make_legend_polygon), - }, - ) - for h in leg.legend_handles[2:]: - h.set_color("dimgray") - - -def plot_all_cstr_bounds(output_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (10, 3), - "font.size": 10, - "axes.axisbelow": False, - "pdf.fonttype": 42, - } - ) - fig = plt.figure() - - handles, labels = None, None - for idx, cluster_size in enumerate(CLUSTER_SIZES): - ax = plot_cstr_bound(idx, cluster_size) - if idx == len(CLUSTER_SIZES) - 1: - handles, labels = ax.get_legend_handles_labels() - - # single legend group on top - make_legend(fig, handles, labels) - - plt.tight_layout(pad=1.0) - plt.savefig(f"{output_dir}/cstr_bounds.pdf", bbox_inches=0) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", "--output_dir", type=str, default="./results", help="output folder" - ) - args = parser.parse_args() - - plot_all_cstr_bounds(args.output_dir) diff --git a/models/crossword/prob_calculation.py b/models/crossword/prob_calculation.py deleted file mode 100644 index 1aa777d3..00000000 --- a/models/crossword/prob_calculation.py +++ /dev/null @@ -1,279 +0,0 @@ -import random -import statistics -import argparse -import pickle - -import matplotlib # type: ignore - -matplotlib.use("Agg") - -import matplotlib.pyplot as plt # type: ignore - - -CLUSTER = 5 - -# instance size in KBs -SIZES = [2**i for i in range(3, 10)] -SIZES += [1024 * i for i in range(1, 101)] - -# tuples of (min_delay in ms, max_bandwidth in Gbps) -POWERS = [(10, 100), (50, 10), (120, 1)] - -# delay jitter value (in percentage of min_delay) -JITTERS = [10, 25, 50] - -PARETO_ALPHA = 1.16 # log_4(5) -NUM_TRIALS = 10000 - -QUORUM_COLOR_WIDTH = { - 5: ("red", 1), - 4: ("steelblue", 1.25), - 3: ("dimgray", 1.5), -} - - -def rand_individual_time(c, s, d, b, jitter): - pareto = random.paretovariate(PARETO_ALPHA) - while pareto > 10: - pareto = random.paretovariate(PARETO_ALPHA) - t = d + d * (jitter / 100) * (pareto - 1) - t += (s * c) / (b * 1024 / 8) - return t - - -def response_time_sample(n, q, c, s, d, b, jitter): - ts = [rand_individual_time(c, s, d, b, jitter) for _ in range(n - 1)] - ts.sort() - # diffs = [ts[i] - ts[i - 1] for i in range(1, len(ts))] - # print([int(t) for t in ts], [int(diff) for diff in diffs]) - return ts[q - 2] # assuming leader itself must have accepted - - -def response_time_mean_stdev(n, q, c, s, d, b, jitter): - rts = [] - for _ in range(NUM_TRIALS): - rts.append(response_time_sample(n, q, c, s, d, b, jitter)) - mean = sum(rts) / len(rts) - stdev = statistics.stdev(rts) - return mean, stdev - - -def calc_fixed_env_result(n, d, b, jitter): - m = n // 2 + 1 - result = dict() - for q in range(m, n + 1): - c = n + 1 - q - result[(q, c)] = [] - for v in SIZES: - s = v / m - mean, stdev = response_time_mean_stdev(n, q, c, s, d, b, jitter) - result[(q, c)].append((mean, stdev)) - return result - - -def calc_all_env_results(n): - results = dict() - for i, (d, b) in enumerate(POWERS): - for j, jitter in enumerate(JITTERS): - result = calc_fixed_env_result(n, d, b, jitter) - results[(i, j)] = result - print(f"calculated {d} {b} {jitter}") - return results - - -def print_all_env_results(results): - for i, (d, b) in enumerate(POWERS): - for j, jitter in enumerate(JITTERS): - print(f"Env {i},{j}: d={d} b={b} jitter={jitter}") - for q, c in results[(i, j)]: - print(f" config q={q} c={c} ", end="") - for mean, stdev in results[(i, j)][(q, c)]: - print(f" {mean:7.2f}", end="") - print() - - -def plot_env_result_subplot(i, j, results): - POWERS = results["powers"] - JITTERS = results["jitters"] - VSIZES = results["vsizes"] - results = results["results"] - - subplot_id = len(POWERS) * 100 + len(JITTERS) * 10 - subplot_id += i * len(JITTERS) + j + 1 - ax = plt.subplot(subplot_id) - - for q, c in results[(i, j)]: - xs = [s / 1024 for s in VSIZES] - ys = [t[0] for t in results[(i, j)][(q, c)]] - plt.plot( - xs, - ys, - label=f"q={q} c={c}", - color=QUORUM_COLOR_WIDTH[q][0], - linewidth=QUORUM_COLOR_WIDTH[q][1], - ) - - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - ax.tick_params(direction="in") - - if i == len(POWERS) - 1 and j == len(JITTERS) - 1: - plt.xlabel("Instance\nsize (MB)", loc="right", multialignment="left") - ax.xaxis.set_label_coords(2, 0.18) - if i < len(POWERS) - 1: - ax.tick_params(bottom=False, labelbottom=False) - - if i == 0 and j == 0: - plt.ylabel( - "Response\ntime (ms)", - loc="top", - rotation=0, - multialignment="left", - backgroundcolor="white", - ) - ax.yaxis.set_label_coords(0.45, 1.02) - if j > 0: - ax.tick_params(left=False, labelleft=False) - - xright = max(VSIZES) / 1024 - ybottom, ytop = float("inf"), 0 - for jj in range(len(JITTERS)): - for cf in results[(i, j)]: - for v in range(len(VSIZES)): - y = results[(i, jj)][cf][v][0] - if y > ytop: - ytop = y - if y < ybottom: - ybottom = y - - plt.xlim(0, xright * 1.1) - plt.ylim(0, ytop * 1.2) - - plt.xticks( - [0, xright], ["0", f"{int(xright)}"], fontsize="x-small", color="dimgray" - ) - plt.yticks( - [ybottom, ytop], - [f"{int(ybottom)}", f"{int(ytop)}"], - fontsize="x-small", - color="dimgray", - ) - - if i == len(POWERS) - 1: - jitter = JITTERS[j] - plt.text( - xright * 0.5 if j > 0 else xright * 0.65, - -ytop * 0.48, - f"+{jitter / 100:.1f}d", - horizontalalignment="center", - verticalalignment="center", - ) - if j == 0: - d, b = POWERS[i] - i_env_strs = { - 0: "datacenter", - 1: "regional", - 2: "wide-area", - } - plt.text( - -xright * 1.05, - ytop * 0.6 if i < len(POWERS) - 1 else ytop * 0.8, - f"{i_env_strs[i]}\n{d}ms\n{b}Gbps", - horizontalalignment="center", - verticalalignment="center", - ) - if i == len(POWERS) - 1 and j == 0: - # plt.text( - # -xright * 1.1, - # -ytop * 0.5, - # "Env.", - # horizontalalignment="center", - # verticalalignment="center", - # # color="dimgray", - # weight="bold", - # ) - plt.text( - -xright * 1.05, - 0, - "RTT (d)\nBW (b)", - horizontalalignment="center", - verticalalignment="center", - weight="bold", - ) - plt.text( - -xright * 0.48, - -ytop * 0.48, - "Base Jitter", - horizontalalignment="center", - verticalalignment="center", - weight="bold", - ) - - print(f"Plotted subplot {subplot_id}") - return ax - - -def plot_all_env_results(results, output_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (5, 4), - "font.size": 10, - "axes.axisbelow": False, - } - ) - fig = plt.figure() - - handles, labels = None, None - for i in range(len(results["powers"])): - for j in range(len(results["jitters"])): - ax = plot_env_result_subplot(i, j, results) - if i == 0 and j == 0: - handles, labels = ax.get_legend_handles_labels() - - leg = fig.legend( - handles, - labels, - loc="center left", - bbox_to_anchor=(0.76, 0.5), - handlelength=0.8, - title="Configs", - ) - - fig.subplots_adjust(bottom=0.16, top=0.9, left=0.23, right=0.75) - - plt.savefig( - f"{output_dir}/calc.envs.r_{CLUSTER}.pdf", - bbox_inches=0, - ) - plt.close() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", "--output_dir", type=str, default="./results", help="output folder" - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not args.plot: - results = calc_all_env_results(CLUSTER) - # print_all_env_results(results) - - results = { - "vsizes": SIZES, - "powers": POWERS, - "jitters": JITTERS, - "results": results, - } - - with open(f"{args.output_dir}/calc.envs.r_{CLUSTER}.pkl", "wb") as fpkl: - pickle.dump(results, fpkl) - print(f"Dumped: {CLUSTER}") - - else: - with open(f"{args.output_dir}/calc.envs.r_{CLUSTER}.pkl", "rb") as fpkl: - results = pickle.load(fpkl) - plot_all_env_results(results, args.output_dir) diff --git a/publish/crossword/ARTIFACT.md b/publish/crossword/ARTIFACT.md deleted file mode 100644 index 10cae769..00000000 --- a/publish/crossword/ARTIFACT.md +++ /dev/null @@ -1,42 +0,0 @@ -## Commands Memo for AE - -For a shell command, `$` indicates running it on the local development machine, while `%` indicates running it on a CloudLab remote host. - -1. On you local dev machine, change into the repo's path - 1. `$ cd path/to/summerset` -2. Create CloudLab machines and fill in `scripts/remote_hosts.toml` -3. Add the following setting to your `~/.ssh/config` for skipping the SSH `known_hosts` check - - ```text - Host * - StrictHostKeyChecking no - ``` - -4. For each of the hosts (examples below are for `host0`), do the following setup work - 1. SSH to it - 2. Create `/eval` path and acquire its ownership: - 1. `% sudo mkdir /eval` - 2. `% sudo chown -R $USER /eval` - 3. Back to the local machine, sync the repo folder to the remote host - 1. `$ python3 scripts/remote_mirror.py -g 1dc` - 4. On `host0`, you will find the mirrored repo at `/eval/summerset` - 5. Resize the root partition to make more space - 1. `% cd /eval/summerset` - 2. `% ./scripts/setup/resize_partition.sh` - 6. Update Linux kernel version to v6.1.64, the one used for evaluations presented in the paper - 1. `% ./scripts/setup/install_kernel.sh` - 2. `% sudo reboot` - 7. After rebooting, double check the kernel version - 1. `% uname -a` - 2. `% cd /eval/summerset` - 8. Install necessary dependencies - 1. `% ./scripts/setup/install_devdeps.sh` - 2. `% ./scripts/crossword/install_devdeps.sh` - 9. Set up network devices (for netem experiments) - 1. `% ./scripts/setup/setup_net_devs.sh` - 10. Set up TCP buffer sizes - 1. `% ./scripts/setup/setup_tcp_bufs.sh` - 11. Configure & open TCP ports - 1. `% ./scripts/setup/open_tcp_ports.sh` - 12. Record the SSH key pair for mutual login between remote nodes - 1. `% ./scripts/setup/sshkey_record.sh` diff --git a/publish/crossword/archive_results.sh b/publish/crossword/archive_results.sh deleted file mode 100755 index a44e668e..00000000 --- a/publish/crossword/archive_results.sh +++ /dev/null @@ -1,49 +0,0 @@ -#! /bin/bash - -# Usage: -# To archive: ./publish//archive_results.sh -# To extract: ./publish//archive_results.sh ex - - -if [ $(id -u) -eq 0 ]; -then - echo "Please run this script as normal user!" - exit 1 -fi - -if [ $# -le 0 ]; -then - echo "ERROR: please give conference name!" - exit 1 -fi - - -TAR_NAME="$1.tar.xz" - -DROPBOX_DIR="${HOME}/Dropbox/UW-Madison/ADSL-Systems-Lab/Data-Backups/Crossword" - - -if [ $# -ge 2 ] && [ "$2" = "ex" ]; -then - # extracting... - echo - echo "Downloading archive from Dropbox..." - cp ${DROPBOX_DIR}/${TAR_NAME} backups/ - - echo - echo "Extracting results/ <- backups/${TAR_NAME}..." - tar -xf backups/${TAR_NAME} -C results/ - -else - # archiving... - echo - echo "Archiving results/ -> backups/${TAR_NAME}..." - cd results/ - tar -Jcf ${TAR_NAME} * - cd .. - mv results/${TAR_NAME} backups/ - - echo - echo "Replicating archive to Dropbox..." - cp backups/${TAR_NAME} ${DROPBOX_DIR}/ -fi diff --git a/publish/crossword/crop_bench_figs.sh b/publish/crossword/crop_bench_figs.sh deleted file mode 100755 index ecab095b..00000000 --- a/publish/crossword/crop_bench_figs.sh +++ /dev/null @@ -1,77 +0,0 @@ -#! /bin/bash - -# Requires: -# sudo apt install python3-pip python3-tk ghostscript poppler-utils -# pip3 install pdfCropMargins -# Add user ~/.local/bin to PATH - -# Usage: -# 1. save final bench plots to results/... -# 2. run from repo root: ./publish//crop_bench_figs.sh - - -if [ $(id -u) -eq 0 ]; -then - echo "Please run this script as normal user!" - exit 1 -fi - - -PLOT_FILES=("models/cstr_bounds" - "plots/breakdown/exper-breakdown" - "plots/breakdown/legend-breakdown" - "plots/adaptive/exper-adaptive" - "plots/failover/exper-failover" - "plots/unbalanced/exper-unbalanced" - "plots/unbalanced/legend-unbalanced" - "plots/critical/exper-critical-5.small.50.1dc" - "plots/critical/exper-critical-5.small.50.wan" - "plots/critical/exper-critical-5.large.50.1dc" - "plots/critical/exper-critical-5.large.50.wan" - "plots/critical/exper-critical-5.mixed.50.1dc" - "plots/critical/exper-critical-5.mixed.50.wan" - "plots/critical/exper-critical-cluster_size" - "plots/critical/exper-critical-write_ratio" - "plots/critical/legend-critical" - "plots/critical/legend-critical-minor" - "plots/staleness/exper-staleness" - "plots/ycsb_trace/exper-ycsb_trace") -PLOT_FILES_BOTTOM_MORE=("intros/legend-motiv_profile" - "plots/critical/ylabels-critical" - "plots/failover/legend-failover" - "plots/adaptive/legend-adaptive" - "plots/staleness/legend-staleness" - "plots/ycsb_trace/legend-ycsb_trace") -PLOT_FILES_RIGHT_MORE=("intros/motiv_profile_cdf") - - -echo -echo "Cropping bench plots..." -for FILE_NAME in ${PLOT_FILES[@]}; -do - echo " cropping results/${FILE_NAME}.pdf" - pdfcropmargins -p 0 -t 255 -mo -o results "results/${FILE_NAME}.pdf" -done - - -echo -echo "Cropping plots with more space at the bottom..." -for FILE_NAME in ${PLOT_FILES_BOTTOM_MORE[@]}; -do - echo " cropping results/${FILE_NAME}.pdf" - pdfcropmargins -p4 0 50 0 0 -t 255 -mo -o results "results/${FILE_NAME}.pdf" -done - - -echo -echo "Cropping plots with more space to the right..." -for FILE_NAME in ${PLOT_FILES_RIGHT_MORE[@]}; -do - echo " cropping results/${FILE_NAME}.pdf" - pdfcropmargins -p4 0 0 5 0 -t 255 -mo -o results "results/${FILE_NAME}.pdf" -done - - -echo -echo "Deleting uncropped files..." -rm results/*_uncropped.pdf diff --git a/publish/crossword/crop_slide_figs.sh b/publish/crossword/crop_slide_figs.sh deleted file mode 100755 index 6053e087..00000000 --- a/publish/crossword/crop_slide_figs.sh +++ /dev/null @@ -1,67 +0,0 @@ -#! /bin/bash - -# Requires: -# sudo apt install python3-pip python3-tk ghostscript poppler-utils -# pip3 install pdfCropMargins -# Add user ~/.local/bin to PATH - -# Usage: -# 1. save slides exported PDF as results/slide-figures.pdf -# 2. run from repo root: ./publish//crop_slide_figs.sh - - -if [ $(id -u) -eq 0 ]; -then - echo "Please run this script as normal user!" - exit 1 -fi - - -ORIGINAL_PDF=results/slide-figures.pdf -TAKE_PAGES=9 -TARGET_NAMES=("status_diagram" - "log_in_action" - "rs_codeword_space" - "policy-multipaxos" - "policy-rspaxos" - "policy-balanced_rr_2" - "policy-balanced_rr_3" - "policy-unbalanced" - "concurrent_failures") - - -echo -echo "Deleting old results..." -rm results/slides/*.pdf -rm "results/slide-figures.pdf:Zone.Identifier" - - -echo -echo "Separating desired pages..." -pdfseparate -l $TAKE_PAGES $ORIGINAL_PDF "results/slides/slide-%d.pdf" - - -echo -echo "Cropping separated pages..." -for FILE in $(ls results/slides/ | grep .pdf); -do - echo " cropping $FILE" - pdfcropmargins -p 0 -t 255 -mo -o results "results/slides/$FILE" -done - - -echo -echo "Renaming cropped slide pages..." -for IDX in ${!TARGET_NAMES[@]}; -do - OLD_NAME="slide-$((IDX+1)).pdf" - NEW_NAME="${TARGET_NAMES[$IDX]}.pdf" - echo " renaming $OLD_NAME to $NEW_NAME" - mv "results/slides/$OLD_NAME" "results/slides/$NEW_NAME" -done -echo - - -echo -echo "Deleting uncropped files..." -rm results/*_uncropped.pdf diff --git a/publish/public_repo_trim.py b/publish/public_repo_trim.py deleted file mode 100644 index 1e5c0562..00000000 --- a/publish/public_repo_trim.py +++ /dev/null @@ -1,174 +0,0 @@ -import sys -import os -import shutil - - -REMOVE_PATHS = [ - "models", - "publish/crossword", - "scripts/crossword", - "tla+/crossword", - "src/protocols/crossword", - "src/utils/linreg.rs", - "src/utils/qdisc.rs", - "publish/bodega", - "scripts/bodega", - "tla+/bodega", - "src/protocols/bodega", -] -SIMPLE_TRIMS = [ - ".gitattributes", - "scripts/distr_cluster.py", - "scripts/local_cluster.py", - "scripts/remote_killall.py", - "src/lib.rs", -] -README = "README.md" -PROTOCOLS_MOD = "src/protocols/mod.rs" -UTILS_MOD = "src/utils/mod.rs" -REMOTE_HOSTS = "scripts/remote_hosts.toml" - - -def path_get_last_segment(path): - if "/" not in path: - return None - eidx = len(path) - 1 - while eidx > 0 and path[eidx] == "/": - eidx -= 1 - bidx = path[:eidx].rfind("/") - bidx += 1 - return path[bidx : eidx + 1] - - -def check_proper_cwd(): - cwd = os.getcwd() - if "summerset" not in path_get_last_segment(cwd) or not os.path.isdir("publish/"): - print("ERROR: script must be run under top-level repo!") - print(" example: python3 publish/public_repo_trim.py") - sys.exit(1) - - -def num_leading_spaces(line): - return len(line) - len(line.lstrip()) - - -def remove_path(path): - if os.path.isdir(path): - shutil.rmtree(path) - print(f" RM {path}/") - elif os.path.isfile(path): - os.unlink(path) - print(f" RM {path}") - else: - print(f" RM {path} NOT FOUND!") - - -def lines_and_fresh(path): - if os.path.isfile(path): - print(f" TRIM {path}") - with open(path, "r") as f: - lines = f.readlines() - return lines, open(path, "w") - else: - print(f" TRIM {path} NOT FOUND!") - - -def simple_trim(path): - lines, file = lines_and_fresh(path) - with file: - for line in lines: - line_lower = line.lower() - if "crossword" in line_lower or "bodega" in line_lower: - continue - file.write(line) - - -def trim_readme(path): - lines, file = lines_and_fresh(path) - with file: - in_pub, in_todo = False, False - for line in lines: - if line.strip() == "# Summerset": - in_pub = True - if line.strip() == "## TODO List": - in_todo = True - if in_todo and line.strip() == "---": - in_todo = False - if (not in_pub) or in_todo: - continue - line_lower = line.lower() - if "crossword" in line_lower or "bodega" in line_lower: - continue - file.write(line) - - -def trim_protocols_mod(path): - lines, file = lines_and_fresh(path) - with file: - in_func, outer_spaces, inner_spaces = ( - False, - None, - None, - ) - for line in lines: - if "new_server_replica" in line or "new_client_endpoint" in line: - in_func, outer_spaces = True, num_leading_spaces(line) - if in_func and ("Self::Crossword" in line or "Self::Bodega" in line): - inner_spaces = num_leading_spaces(line) - if inner_spaces is not None: - if line.strip() == "}" and num_leading_spaces(line) == inner_spaces: - inner_spaces = None - continue - if outer_spaces is not None: - if line.strip() == "}" and num_leading_spaces(line) == outer_spaces: - outer_spaces = None - in_func = False - line_lower = line.lower() - if "crossword" in line_lower or "bodega" in line_lower: - continue - file.write(line) - - -def trim_utils_mod(path): - lines, file = lines_and_fresh(path) - with file: - for line in lines: - line_lower = line.lower() - if "linreg" in line_lower or "qdisc" in line_lower: - continue - file.write(line) - - -def trim_remote_hosts(path): - lines, file = lines_and_fresh(path) - with file: - for line in lines: - if line.count("=") == 1 and "@" in line: - name = line[: line.index("=")].strip() - line = f'{name} = "username@domain.com"\n' - file.write(line) - - -if __name__ == "__main__": - print("Removing...") - for path in REMOVE_PATHS: - remove_path(path) - - print("Trimming...") - for path in SIMPLE_TRIMS: - simple_trim(path) - trim_readme(README) - trim_protocols_mod(PROTOCOLS_MOD) - trim_utils_mod(UTILS_MOD) - trim_remote_hosts(REMOTE_HOSTS) - - print("Cargo fmt...") - os.system("cargo fmt --all") - print(" Done") - - print("Cargo build...") - os.system("cargo build --workspace") - print(" Done") - - print() - print(f"REMEMBER to delete {sys.argv[0]} as well!!!") diff --git a/scripts/crossword/bench_adaptive.py b/scripts/crossword/bench_adaptive.py deleted file mode 100644 index 1819887c..00000000 --- a/scripts/crossword/bench_adaptive.py +++ /dev/null @@ -1,476 +0,0 @@ -import sys -import os -import argparse -import time - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "adaptive" -PROTOCOLS = ["MultiPaxos", "RSPaxos", "Raft", "CRaft", "Crossword"] - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_REPLICAS = 5 -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 -PUT_RATIO = 100 - -LENGTH_SECS = 120 -SIZE_CHANGE_SECS = 25 -ENV_CHANGE1_SECS = 45 -ENV_CHANGE2_SECS = 50 -ENV_CHANGE3_SECS = 55 -PLOT_SECS_BEGIN = 5 -PLOT_SECS_END = 115 - -VALUE_SIZES = [(0, 64 * 1024), (SIZE_CHANGE_SECS, 4 * 1024)] -VALUE_SIZES_PARAM = "/".join([f"{t}:{v}" for t, v in VALUE_SIZES]) - -NETEM_MEAN_A = lambda _: 1 -NETEM_MEAN_B = lambda _: 1 -NETEM_MEAN_C = lambda _: 1 -NETEM_MEAN_D = lambda r: 1 if r < 3 else 50 -NETEM_JITTER_A = lambda _: 2 -NETEM_JITTER_B = lambda _: 2 -NETEM_JITTER_C = lambda _: 2 -NETEM_JITTER_D = lambda r: 2 -NETEM_RATE_A = lambda _: 1 -NETEM_RATE_B = lambda _: 0.08 -NETEM_RATE_C = lambda _: 1 -NETEM_RATE_D = lambda r: 10 if r < 3 else 0.1 - - -def launch_cluster(remote0, base, repo, protocol, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-r", - "--force_leader", - "0", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients(remote0, base, repo, protocol): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(NUM_REPLICAS), - "-f", - str(0), # closed-loop - "-v", - VALUE_SIZES_PARAM, - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - ] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round(remotes, base, repo, protocol, runlog_path): - print(f" {EXPER_NAME} {protocol:<10s}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - if protocol == "RSPaxos" or protocol == "CRaft": - config += f"+fault_tolerance={NUM_REPLICAS // 2}" - elif protocol == "Crossword": - config += f"+b_to_d_threshold={0.08}" - config += f"+disable_gossip_timer=true" - - # launch service cluster - proc_cluster = launch_cluster(remotes["host0"], base, repo, protocol, config=config) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients(remotes["host0"], base, repo, protocol) - - # at some timepoint, change mean value size (handled by the clients) - time.sleep(SIZE_CHANGE_SECS) - print(" Changing mean value size...") - - # at some timepoint, change env - time.sleep(ENV_CHANGE1_SECS - SIZE_CHANGE_SECS) - print(" Changing env perf params...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN_B, - NETEM_JITTER_B, - NETEM_RATE_B, - involve_ifb=True, - remotes=remotes, - ) - - # at some timepoint, change env again - time.sleep(ENV_CHANGE2_SECS - ENV_CHANGE1_SECS) - print(" Changing env perf params...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN_C, - NETEM_JITTER_C, - NETEM_RATE_C, - involve_ifb=True, - remotes=remotes, - ) - - # at some timepoint, change env again - time.sleep(ENV_CHANGE3_SECS - ENV_CHANGE2_SECS) - print(" Changing env perf params...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN_D, - NETEM_JITTER_D, - NETEM_RATE_D, - involve_ifb=True, - remotes=remotes, - ) - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}.s.err", "wb") as fserr: - fserr.write(serr) - - # revert env params to initial - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN_A, - NETEM_JITTER_A, - NETEM_RATE_A, - involve_ifb=True, - remotes=remotes, - ) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_outputs(output_dir): - results = dict() - for protocol in PROTOCOLS: - result = utils.output.gather_outputs( - protocol, - NUM_CLIENTS, - output_dir, - PLOT_SECS_BEGIN, - PLOT_SECS_END, - 0.1, - ) - - sd, sp, sj, sm = 20, 0, 0, 1 - if protocol == "Crossword": - # setting sd here to avoid the lines to completely overlap with - # each other - sd = 22 - tput_list = utils.output.list_smoothing(result["tput_sum"], sd, sp, sj, sm) - - results[protocol] = { - "time": result["time"], - "tput": tput_list, - } - - return results - - -def print_results(results): - for protocol, result in results.items(): - print(protocol) - for i, t in enumerate(result["time"]): - print(f" [{t:>5.1f}] {result['tput'][i]:>7.2f} ", end="") - if (i + 1) % 6 == 0: - print() - if len(result["time"]) % 6 != 0: - print() - - -def plot_results(results, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (5.6, 3), - "font.size": 13, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = ["Crossword", "MultiPaxos", "Raft", "RSPaxos", "CRaft"] - PROTOCOL_LABEL_COLOR_LS_LW = { - "Crossword": ("Crossword", "steelblue", "-", 2.0), - "MultiPaxos": ("MultiPaxos", "dimgray", "--", 1.2), - "Raft": ("Raft", "forestgreen", "--", 1.2), - "RSPaxos": ("RSPaxos (f=1)", "red", "-.", 1.3), - "CRaft": ("CRaft (f=1)", "peru", ":", 1.5), - } - - ymax = 0.0 - for protocol in PROTOCOLS_ORDER: - result = results[protocol] - label, color, ls, lw = PROTOCOL_LABEL_COLOR_LS_LW[protocol] - - xs = result["time"] - ys = result["tput"] - if max(ys) > ymax: - ymax = max(ys) - - plt.plot( - xs, - ys, - label=label, - color=color, - linestyle=ls, - linewidth=lw, - zorder=10 if "Crossword" in protocol else 0, - ) - - # env change indicators - def draw_env_change_indicator(x, t, toffx): - plt.arrow( - x, - ymax + 42, - 0, - -40, - color="darkred", - width=0.2, - length_includes_head=True, - head_width=1.0, - head_length=12, - overhang=0.5, - clip_on=False, - ) - plt.annotate( - t, - (x, ymax + 60), - xytext=(toffx, 0), - ha="center", - textcoords="offset points", - color="darkred", - annotation_clip=False, - ) - - draw_env_change_indicator(17, "Data smaller", -14) - draw_env_change_indicator(38, "Bw drops", 10) - draw_env_change_indicator(59, "Bw frees", 19) - draw_env_change_indicator(84, "2 nodes lag", 28) - - # configuration indicators - def draw_config_indicator(x, y, c, q, color): - plt.annotate( - f"[c={c},q={q}]", - (x, y), - xytext=(0, 0), - ha="center", - textcoords="offset points", - color=color, - fontsize=11, - ) - - draw_config_indicator(103, 220, 1, 4, "red") - draw_config_indicator(103, 370, 3, 3, "forestgreen") - draw_config_indicator(8, 240, 1, 5, "steelblue") - draw_config_indicator(31, 480, 3, 3, "steelblue") - draw_config_indicator(52, 340, 1, 5, "steelblue") - draw_config_indicator(73, 480, 3, 3, "steelblue") - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - ax.plot(1, -1, ">k", transform=ax.get_yaxis_transform(), clip_on=False) - - plt.ylim(bottom=-1, top=ymax * 1.15) - - plt.xlabel("Time (s)") - plt.ylabel("Throughput (reqs/s)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (1.8, 1.3), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handlelength=1.4, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - for rec in lgd.get_texts(): - if "RSPaxos" in rec.get_text() or "CRaft" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes["host0"]) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - output_path = f"{args.odir}/output/{EXPER_NAME}" - for path in (runlog_path, output_path): - if not os.path.isdir(path): - os.system(f"mkdir -p {path}") - - print("Setting tc netem qdiscs...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN_A, - NETEM_JITTER_A, - NETEM_RATE_A, - involve_ifb=True, - remotes=remotes, - ) - - print("Running experiments...") - for protocol in PROTOCOLS: - time.sleep(10) - bench_round(remotes, base, repo, protocol, runlog_path) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes, - ) - utils.file.clear_fs_caches(remotes=remotes) - - print("Clearing tc netem qdiscs...") - utils.net.clear_tc_qdisc_netems_main(remotes=remotes) - - print("Fetching client output logs...") - utils.file.fetch_files_of_dir( - remotes["host0"], f"{base}/output/{EXPER_NAME}", output_path - ) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = collect_outputs(output_dir) - print_results(results) - - handles, labels = plot_results(results, plots_dir) - plot_legend(handles, labels, plots_dir) diff --git a/scripts/crossword/bench_breakdown.py b/scripts/crossword/bench_breakdown.py deleted file mode 100644 index c18232b8..00000000 --- a/scripts/crossword/bench_breakdown.py +++ /dev/null @@ -1,463 +0,0 @@ -import sys -import os -import argparse -import time - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "breakdown" -PROTOCOLS = ["MultiPaxos", "Crossword"] - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_REPLICAS = 5 -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 -VALUE_SIZE = 64 * 1024 -PUT_RATIO = 100 - -LENGTH_SECS = 30 - - -def launch_cluster(remote0, base, repo, protocol, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-r", - "--force_leader", - "0", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients(remote0, base, repo, protocol): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(NUM_REPLICAS), - "-f", - str(0), # closed-loop - "-v", - str(VALUE_SIZE), - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--norm_stdev_ratio", - str(0.1), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - ] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round(remote0, base, repo, protocol, runlog_path): - print(f" {EXPER_NAME} {protocol:<10s}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - config += f"+record_breakdown=true" - if protocol == "Crossword": - config += f"+disable_gossip_timer=true" - config += f"+init_assignment='1'" - - # launch service cluster - proc_cluster = launch_cluster(remote0, base, repo, protocol, config=config) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients(remote0, base, repo, protocol) - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}.s.err", "wb") as fserr: - fserr.write(serr) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_bd_stats(runlog_dir): - raw_stats = dict() - for protocol in PROTOCOLS: - raw_stats[protocol] = dict() - total_cnt = 0 - - with open(f"{runlog_dir}/{protocol}.s.err", "r") as flog: - for line in flog: - if "bd cnt" in line: - line = line.strip() - line = line[line.find("bd cnt") + 6 :] - segs = line.split() - - cnt = int(segs[0]) - if cnt > 0: - total_cnt += cnt - idx = 1 - while idx < len(segs): - step = segs[idx] - mean = float(segs[idx + 1]) / 1000.0 - stdev = float(segs[idx + 2]) / 1000.0 - - if step not in raw_stats[protocol]: - raw_stats[protocol][step] = [mean, stdev**2] - else: - raw_stats[protocol][step][0] += mean * cnt - raw_stats[protocol][step][1] += stdev**2 * cnt - - idx += 3 - - for step in raw_stats[protocol]: - raw_stats[protocol][step][0] /= total_cnt - raw_stats[protocol][step][1] = ( - raw_stats[protocol][step][1] / total_cnt - ) ** 0.5 - - bd_stats = dict() - for protocol, stats in raw_stats.items(): - bd_stats[protocol] = dict() - bd_stats[protocol]["comp"] = stats["comp"] if "comp" in stats else (0.0, 0.0) - bd_stats[protocol]["acc"] = ( - stats["arep"][0] - stats["ldur"][0], - stats["arep"][1] - stats["ldur"][1], - ) - bd_stats[protocol]["dur"] = stats["ldur"] - bd_stats[protocol]["rep"] = stats["qrum"] - bd_stats[protocol]["exec"] = stats["exec"] - if bd_stats["Crossword"]["rep"][0] < bd_stats["MultiPaxos"]["rep"][0]: - tmp = bd_stats["Crossword"]["rep"] - bd_stats["Crossword"]["rep"] = bd_stats["MultiPaxos"]["rep"] - bd_stats["MultiPaxos"]["rep"] = tmp - if bd_stats["MultiPaxos"]["exec"][0] < bd_stats["Crossword"]["exec"][0]: - bd_stats["MultiPaxos"]["exec"] = bd_stats["Crossword"]["exec"] - for protocol in PROTOCOLS: - bd_stats[protocol]["rep"][0] += bd_stats[protocol]["exec"][0] - bd_stats[protocol]["rep"][0] *= 2 - - return bd_stats - - -def collect_space_usage(sdir): - space_usage = dict() - for protocol in PROTOCOLS: - wal_size = os.path.getsize(f"{sdir}/{protocol}.0.wal") - space_usage[protocol] = wal_size / (1024.0 * 1024.0) - - return space_usage - - -def print_results(bd_stats, space_usage=None): - for protocol, stats in bd_stats.items(): - print(protocol) - for step, stat in stats.items(): - print(f" {step} {stat[0]:5.2f} ±{stat[1]:5.2f} ms", end="") - print() - if space_usage is not None: - print(f" usage {space_usage[protocol]:7.2f} MB") - - -def plot_breakdown(bd_stats, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (3, 2), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = ["MultiPaxos", "Crossword"] - PROTOCOLS_YPOS = { - "MultiPaxos": 3.4, - "Crossword": 1.4, - } - STEPS_ORDER = ["comp", "acc", "dur", "rep", "exec"] - STEPS_LABEL_COLOR_HATCH = { - "comp": ("RS coding computation", "lightgreen", "---"), - "acc": ("Leader→follower Accept msg", "salmon", None), - "dur": ("Writing to durable WAL", "orange", "///"), - "rep": ("Follower→leader AcceptReply", "honeydew", None), - "exec": ("Commit & execution", "lightskyblue", "xxx"), - } - BAR_HEIGHT = 0.8 - - xmax = 0 - range_xs = {protocol: [] for protocol in PROTOCOLS_ORDER} - for protocol in PROTOCOLS_ORDER: - ypos = PROTOCOLS_YPOS[protocol] - stats = bd_stats[protocol] - - xnow = 0 - for step in STEPS_ORDER: - label, color, hatch = STEPS_LABEL_COLOR_HATCH[step] - - if step == "exec": - stdev = sum([bd_stats[protocol][s][1] for s in STEPS_ORDER]) - stdev /= len(STEPS_ORDER) - plt.barh( - ypos, - stats[step][0], - left=xnow, - height=BAR_HEIGHT, - color=color, - edgecolor="black", - linewidth=1, - label=label if protocol == "Crossword" else None, - hatch=hatch, - xerr=[[0], [stdev]], - ecolor="black", - capsize=3, - ) - else: - plt.barh( - ypos, - stats[step][0], - left=xnow, - height=BAR_HEIGHT, - color=color, - edgecolor="black", - linewidth=1, - label=label if protocol == "Crossword" else None, - hatch=hatch, - ) - - xnow += stats[step][0] - if xnow > xmax: - xmax = xnow - - if step in ("comp", "dur", "rep"): - range_xs[protocol].append(xnow) - - plt.text(0.3, 4.2, "MultiPaxos & Raft", verticalalignment="center") - plt.text(0.3, 0.5, "Crossword & others", verticalalignment="center") - - for i in range(3): - plt.plot( - [range_xs["MultiPaxos"][i], range_xs["Crossword"][i]], - [ - PROTOCOLS_YPOS["MultiPaxos"] - BAR_HEIGHT / 2, - PROTOCOLS_YPOS["Crossword"] + BAR_HEIGHT / 2, - ], - color="dimgray", - linestyle="--", - linewidth=1, - ) - - plt.text( - 0.6, - 2.4, - "due to bw save", - verticalalignment="center", - color="dimgray", - fontsize=9, - ) - - plt.text( - xmax * 0.75, - 1, - "due to\nmore replies\nto wait for", - verticalalignment="center", - color="dimgray", - fontsize=9, - ) - plt.plot( - [ - xmax * 0.64, - xmax * 0.75, - ], - [2.25, 1.85], - color="dimgray", - linestyle="-", - linewidth=1, - ) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.ylim((0, 4.7)) - plt.tick_params(left=False) - plt.yticks([]) - - plt.xlim((0, xmax * 1.1)) - plt.xlabel("Elapsed time (ms)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (2.6, 1.4), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handlelength=0.6, - handleheight=1.5, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -def save_space_usage(space_usage, plots_dir): - txt_name = f"{plots_dir}/exper-wal-space.txt" - with open(txt_name, "w") as ftxt: - for protocol, size_mb in space_usage.items(): - ftxt.write(f"{protocol} {size_mb:.2f} MB\n") - print(f"Saved: {txt_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes["host0"]) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - if not os.path.isdir(runlog_path): - os.system(f"mkdir -p {runlog_path}") - - print("Running experiments...") - for protocol in PROTOCOLS: - time.sleep(10) - bench_round(remotes["host0"], base, repo, protocol, runlog_path) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.clear_fs_caches(remotes=remotes) - - else: - runlog_dir = f"{args.odir}/runlog/{EXPER_NAME}" - # states_dir = f"{args.odir}/states/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - bd_stats = collect_bd_stats(runlog_dir) - # space_usage = collect_space_usage(states_dir) - # print_results(bd_stats, space_usage) - print_results(bd_stats) - - handles, labels = plot_breakdown(bd_stats, plots_dir) - plot_legend(handles, labels, plots_dir) - # save_space_usage(space_usage, plots_dir) diff --git a/scripts/crossword/bench_critical.py b/scripts/crossword/bench_critical.py deleted file mode 100644 index 44228222..00000000 --- a/scripts/crossword/bench_critical.py +++ /dev/null @@ -1,835 +0,0 @@ -import sys -import os -import argparse -import time -import math - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" - -EXPER_NAME = "critical" -PROTOCOLS = ["MultiPaxos", "RSPaxos", "Raft", "CRaft", "Crossword"] - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 - -LENGTH_SECS = 60 -RESULT_SECS_BEGIN = 5 -RESULT_SECS_END = 55 - - -class EnvSetting: - def __init__(self, group, delay, jitter, rate): - self.group = group - self.delay = delay - self.jitter = jitter - self.rate = rate - - -class RoundParams: - def __init__( - self, - num_replicas, - value_size, - put_ratio, - env_setting, - paxos_only, - tags, - read_lease=True, - ): - self.num_replicas = num_replicas - self.value_size = value_size - self.put_ratio = put_ratio - self.env_setting = env_setting - self.paxos_only = paxos_only - self.tags = tags - self.read_lease = read_lease - - def __str__(self): - return ( - f".{self.num_replicas}.{'mixed' if isinstance(self.value_size, str) else self.value_size}." - + f"{self.put_ratio}{'rl' if self.read_lease else ''}.{self.env_setting.group}" - ) - - -# fmt: off -SIZE_S = 8 -SIZE_L = 128 * 1024 -SIZE_MIXED = [ - (0, SIZE_L), - (LENGTH_SECS // 6, SIZE_S), - ((LENGTH_SECS // 6) * 2, SIZE_L), - ((LENGTH_SECS // 6) * 3, SIZE_S), - ((LENGTH_SECS // 6) * 4, SIZE_L), - ((LENGTH_SECS // 6) * 5, SIZE_S), -] -SIZE_MIXED = '/'.join([f"{t}:{v}" for t, v in SIZE_MIXED]) - -ENV_1DC = EnvSetting( - "1dc", - lambda _: 1, - lambda _: 2, - lambda _: 1, # no effect given the original bandwidth -) -ENV_WAN = EnvSetting( - "wan", - lambda _: 1, # negligible given the original WAN delay - lambda _: 2, # negligible given the original WAN delay - lambda _: 0.2, -) - -ROUNDS_PARAMS = [ - RoundParams(5, SIZE_S, 50, ENV_1DC, False, ["single"]), - RoundParams(5, SIZE_L, 50, ENV_1DC, False, ["single"]), - RoundParams(5, SIZE_MIXED, 50, ENV_1DC, False, ["single", "cluster-5", "ratio-50"]), - RoundParams(5, SIZE_S, 50, ENV_WAN, False, ["single"]), - RoundParams(5, SIZE_L, 50, ENV_WAN, False, ["single"]), - RoundParams(5, SIZE_MIXED, 50, ENV_WAN, False, ["single"]), - RoundParams(3, SIZE_MIXED, 50, ENV_1DC, True, ["cluster-3"]), - RoundParams(7, SIZE_MIXED, 50, ENV_1DC, True, ["cluster-7"]), - RoundParams(9, SIZE_MIXED, 50, ENV_1DC, True, ["cluster-9"]), - RoundParams(5, SIZE_MIXED, 10, ENV_1DC, True, ["ratio-10"]), - RoundParams(5, SIZE_MIXED, 100, ENV_1DC, True, ["ratio-100"]), -] -# fmt: on - - -def launch_cluster(remote0, base, repo, protocol, round_params, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(round_params.num_replicas), - "-r", - "--force_leader", - "0", - "-g", - round_params.env_setting.group, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--file_midfix", - str(round_params), - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 30 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(30) - - -def run_bench_clients(remote0, base, repo, protocol, round_params): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - round_params.env_setting.group, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(round_params.num_replicas), - "-f", - str(0), # closed-loop - "-v", - str(round_params.value_size), - "-w", - str(round_params.put_ratio), - "-l", - str(LENGTH_SECS), - "--norm_stdev_ratio", - str(0.1), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - "--file_midfix", - str(round_params), - ] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round(remote0, base, repo, protocol, round_params, runlog_path): - midfix_str = str(round_params) - print(f" {EXPER_NAME} {protocol:<10s}{midfix_str}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - if round_params.read_lease: - config += f"+sim_read_lease=true" - if protocol == "Crossword": - config += f"+b_to_d_threshold={0.08}" # TODO: tune this - config += f"+disable_gossip_timer=true" - - # launch service cluster - proc_cluster = launch_cluster( - remote0, base, repo, protocol, round_params, config=config - ) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients(remote0, base, repo, protocol, round_params) - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(round_params.env_setting.group) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.s.err", "wb") as fserr: - fserr.write(serr) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_outputs(output_dir): - results = dict() - for round_params in ROUNDS_PARAMS: - midfix_str = str(round_params) - for protocol in PROTOCOLS: - if round_params.paxos_only and "Raft" in protocol: - continue - - result = utils.output.gather_outputs( - f"{protocol}{midfix_str}", - NUM_CLIENTS, - output_dir, - RESULT_SECS_BEGIN, - RESULT_SECS_END, - 0.1, - ) - - sd, sp, sj, sm = 10, 0, 0, 1 - # setting sm here to compensate for unstabilities of printing - # models to console - if ( - round_params.value_size == SIZE_S - and ( - protocol == "MultiPaxos" - or protocol == "Raft" - or protocol == "Crossword" - ) - ) or (isinstance(round_params.value_size, str) and protocol == "Crossword"): - sm = 1 + (round_params.put_ratio / 100) - tput_mean_list = utils.output.list_smoothing( - result["tput_sum"], sd, sp, sj, sm - ) - tput_stdev_list = result["tput_stdev"] - lat_mean_list = utils.output.list_smoothing( - result["lat_avg"], sd, sp, sj, 1 / sm - ) - lat_stdev_list = result["lat_stdev"] - - results[f"{protocol}{midfix_str}"] = { - "tput": { - "mean": tput_mean_list, - "stdev": tput_stdev_list, - }, - "lat": { - "mean": lat_mean_list, - "stdev": lat_stdev_list, - }, - } - - for round_params in ROUNDS_PARAMS: - midfix_str = str(round_params) - for protocol in PROTOCOLS: - if f"{protocol}{midfix_str}" in results: - tput_mean_list = results[f"{protocol}{midfix_str}"]["tput"]["mean"] - tput_stdev_list = results[f"{protocol}{midfix_str}"]["tput"]["stdev"] - lat_mean_list = results[f"{protocol}{midfix_str}"]["lat"]["mean"] - lat_stdev_list = results[f"{protocol}{midfix_str}"]["lat"]["stdev"] - - results[f"{protocol}{midfix_str}"] = { - "tput": { - "mean": sum(tput_mean_list) / len(tput_mean_list), - "stdev": ( - sum(map(lambda s: s**2, tput_stdev_list)) - / len(tput_stdev_list) - ) - ** 0.5, - }, - "lat": { - "mean": (sum(lat_mean_list) / len(lat_mean_list)) / 1000, - "stdev": ( - sum(map(lambda s: s**2, lat_stdev_list)) - / len(lat_stdev_list) - ) - ** 0.5 - / (1000 * NUM_CLIENTS / CLIENT_PIN_CORES), - }, - } - - return results - - -def print_results(results): - for protocol_with_midfix, result in results.items(): - print(protocol_with_midfix) - print( - f" tput mean {result['tput']['mean']:7.2f} stdev {result['tput']['stdev']:7.2f}" - + f" lat mean {result['lat']['mean']:7.2f} stdev {result['lat']['stdev']:7.2f}" - ) - - -def plot_single_case_results(results, round_params, plots_dir, ymax=None): - matplotlib.rcParams.update( - { - "figure.figsize": (2.6, 2.2), - "font.size": 12, - "pdf.fonttype": 42, - } - ) - midfix_str = str(round_params) - fig = plt.figure(f"Exper-{midfix_str}") - - PROTOCOLS_ORDER = [ - "MultiPaxos", - "Raft", - "Crossword", - "RSPaxos", - "CRaft", - ] - PROTOCOLS_LABEL_COLOR_HATCH = { - "MultiPaxos": ("MultiPaxos", "darkgray", None), - "Raft": ("Raft", "lightgreen", None), - "Crossword": ("Crossword", "lightsteelblue", "xx"), - "RSPaxos": ("RSPaxos (f=1)", "pink", "//"), - "CRaft": ("CRaft (f=1)", "cornsilk", "\\\\"), - } - - # throughput - ax1 = plt.subplot(211) - - ymaxl = 0.0 - for i, protocol in enumerate(PROTOCOLS_ORDER): - xpos = i + 1 - result = results[f"{protocol}{midfix_str}"]["tput"] - if result["mean"] > ymaxl: - ymaxl = result["mean"] - - label, color, hatch = PROTOCOLS_LABEL_COLOR_HATCH[protocol] - bar = plt.bar( - xpos, - result["mean"], - width=1, - color=color, - edgecolor="black", - linewidth=1.4, - label=label, - hatch=hatch, - # yerr=result["stdev"], - # ecolor="black", - # capsize=1, - ) - - ax1.spines["top"].set_visible(False) - ax1.spines["right"].set_visible(False) - - plt.tick_params(bottom=False, labelbottom=False) - - plt.ylabel(" \n ") - ax1.yaxis.set_label_coords(-0.7, 0.5) - - if ymax is not None: - plt.ylim(0.0, ymax["tput"] * 1.1) - else: - ytickmax = math.ceil(ymaxl / 10) * 10 - plt.yticks([0, ytickmax // 2, ytickmax]) - plt.ylim(0.0, ytickmax * 1.2) - - # latency - ax2 = plt.subplot(212) - - ymaxl = 0.0 - for i, protocol in enumerate(PROTOCOLS_ORDER): - xpos = i + 1 - result = results[f"{protocol}{midfix_str}"]["lat"] - if result["mean"] > ymaxl: - ymaxl = result["mean"] - - label, color, hatch = PROTOCOLS_LABEL_COLOR_HATCH[protocol] - bar = plt.bar( - xpos, - result["mean"], - width=1, - color=color, - edgecolor="gray", - linewidth=1.4, - label=label, - hatch=hatch, - yerr=result["stdev"], - ecolor="black", - capsize=1, - ) - - ax2.spines["top"].set_visible(False) - ax2.spines["right"].set_visible(False) - - plt.tick_params(bottom=False, labelbottom=False) - - plt.ylabel(" \n ") - ax2.yaxis.set_label_coords(-0.7, 0.5) - - if ymax is not None: - plt.ylim(0.0, ymax["lat"] * 1.1) - else: - ytickmax = math.ceil(ymaxl / 10) * 10 - plt.yticks([0, ytickmax // 2, ytickmax]) - plt.ylim(0.0, ytickmax * 1.2) - - fig.subplots_adjust(left=0.5) - # plt.tight_layout() - - pdf_midfix = ( - f"{round_params.num_replicas}." - + f"{'small' if round_params.value_size == SIZE_S else 'large' if round_params.value_size == SIZE_L else 'mixed'}." - + f"{round_params.put_ratio}.{round_params.env_setting.group}" - ) - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}-{pdf_midfix}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax1.get_legend_handles_labels() - - -def plot_single_rounds_results(results, rounds_params, plots_dir): - env_ymax = dict() - for round_params in rounds_params: - env_name = round_params.env_setting.group - if env_name not in env_ymax: - env_ymax[env_name] = { - "tput": 0.0, - "lat": 0.0, - } - for protocol in PROTOCOLS: - midfix_str = str(round_params) - tput_mean = results[f"{protocol}{midfix_str}"]["tput"]["mean"] - lat_mean = results[f"{protocol}{midfix_str}"]["lat"]["mean"] - if tput_mean > env_ymax[env_name]["tput"]: - env_ymax[env_name]["tput"] = tput_mean - if lat_mean > env_ymax[env_name]["lat"]: - env_ymax[env_name]["lat"] = lat_mean - - common_plotted = False - for round_params in rounds_params: - if "single" in round_params.tags: - handles, labels = plot_single_case_results( - results, - round_params, - plots_dir, - None, - # env_ymax[round_params.env_setting.group], - ) - if not common_plotted: - plot_major_ylabels(["Throughput\n(reqs/s)", "Latency\n(ms)"], plots_dir) - plot_major_legend(handles, labels, plots_dir) - common_plotted = True - - -def plot_cluster_size_results(results, rounds_params, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (3.5, 2), - "font.size": 12, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper-cluster_size") - - PROTOCOLS_ORDER = [ - "MultiPaxos", - "Crossword", - "RSPaxos", - ] - PROTOCOLS_LABEL_COLOR_HATCH = { - "MultiPaxos": ("MultiPaxos", "darkgray", None), - "Crossword": ("Crossword", "lightsteelblue", "xx"), - "RSPaxos": ("RSPaxos", "pink", "//"), - } - - rounds_params.sort(key=lambda rp: rp.num_replicas) - protocol_results = {p: [] for p in PROTOCOLS_ORDER} - for protocol in PROTOCOLS_ORDER: - for round_params in rounds_params: - midfix_str = str(round_params) - protocol_results[protocol].append( - results[f"{protocol}{midfix_str}"]["tput"]["mean"] - ) - protocol_results[protocol].sort(reverse=True) - - xpos = 1 - for i in range(len(rounds_params)): - for protocol in PROTOCOLS_ORDER: - result = protocol_results[protocol][i] - label, color, hatch = PROTOCOLS_LABEL_COLOR_HATCH[protocol] - bar = plt.bar( - xpos, - result, - width=1, - color=color, - edgecolor="black", - linewidth=1.4, - label=label if i == 0 else None, - hatch=hatch, - ) - xpos += 1 - xpos += 1 - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.tick_params(bottom=False) - - plt.xticks([2, 6, 10, 14], [f"n={3}", f"n={5}", f"n={7}", f"n={9}"]) - plt.ylabel("Throughput (reqs/s)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}-cluster_size.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_write_ratio_results(results, rounds_params, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (2.8, 2), - "font.size": 12, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper-write_ratio") - - PROTOCOLS_ORDER = [ - "MultiPaxos", - "Crossword", - "RSPaxos", - ] - PROTOCOLS_LABEL_COLOR_HATCH = { - "MultiPaxos": ("MultiPaxos", "darkgray", None), - "Crossword": ("Crossword", "lightsteelblue", "xx"), - "RSPaxos": ("RSPaxos (f=1)", "pink", "//"), - } - - rounds_params.sort(key=lambda rp: rp.num_replicas) - protocol_results = {p: [] for p in PROTOCOLS_ORDER} - for protocol in PROTOCOLS_ORDER: - for round_params in rounds_params: - midfix_str = str(round_params) - protocol_results[protocol].append( - results[f"{protocol}{midfix_str}"]["tput"]["mean"] - ) - protocol_results[protocol].sort(reverse=True) - - xpos = 1 - for i in range(len(rounds_params)): - for protocol in PROTOCOLS_ORDER: - result = protocol_results[protocol][i] - label, color, hatch = PROTOCOLS_LABEL_COLOR_HATCH[protocol] - bar = plt.bar( - xpos, - result, - width=1, - color=color, - edgecolor="black", - linewidth=1.4, - label=label if i == 0 else None, - hatch=hatch, - ) - xpos += 1 - xpos += 1 - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.tick_params(bottom=False) - - plt.xticks([2, 6, 10], [f"{10}%", f"{50}%", f"{100}%"]) - # plt.ylabel("Throughput (reqs/s)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}-write_ratio.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -def plot_major_ylabels(ylabels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (1.5, 2.2), - "font.size": 12, - "pdf.fonttype": 42, - } - ) - fig = plt.figure(f"Ylabels") - - assert len(ylabels) == 2 - - ax1 = plt.subplot(211) - plt.ylabel(ylabels[0]) - for spine in ax1.spines.values(): - spine.set_visible(False) - plt.tick_params(bottom=False, labelbottom=False, left=False, labelleft=False) - - ax2 = plt.subplot(212) - plt.ylabel(ylabels[1]) - for spine in ax2.spines.values(): - spine.set_visible(False) - plt.tick_params(bottom=False, labelbottom=False, left=False, labelleft=False) - - fig.subplots_adjust(left=0.5) - - fig.align_labels() - - pdf_name = f"{plots_dir}/ylabels-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -def plot_major_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (5.6, 0.5), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handleheight=0.8, - handlelength=1.0, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ncol=len(labels), - borderpad=0.3, - handletextpad=0.3, - columnspacing=0.9, - ) - for rec in lgd.get_texts(): - if "f=1" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -def plot_minor_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (4, 0.5), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handleheight=0.8, - handlelength=1.2, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ncol=len(labels), - borderpad=0.3, - handletextpad=0.3, - columnspacing=1.1, - ) - for rec in lgd.get_texts(): - if "RSPaxos" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}-minor.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, _, remotes_1dc, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, "1dc" - ) - _, _, _, remotes_wan, _, _ = utils.config.parse_toml_file(TOML_FILENAME, "wan") - remotes = {"1dc": remotes_1dc, "wan": remotes_wan} - for group in ("1dc", "wan"): - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes[group]["host0"]) - utils.proc.kill_all_distr_procs(group) - utils.file.do_cargo_build( - True, cd_dir=f"{base}/{repo}", remotes=remotes[group] - ) - utils.file.clear_fs_caches(remotes=remotes[group]) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - output_path = f"{args.odir}/output/{EXPER_NAME}" - for path in (runlog_path, output_path): - if not os.path.isdir(path): - os.system(f"mkdir -p {path}") - - last_env_group = None - for round_params in ROUNDS_PARAMS: - print(f"Running experiments {round_params}...") - - this_env = round_params.env_setting - if this_env.group != last_env_group: - print("Setting tc netem qdiscs...") - utils.net.set_tc_qdisc_netems_main( - this_env.delay, - this_env.jitter, - this_env.rate, - involve_ifb=True, - remotes=remotes[this_env.group], - ) - last_env_group = this_env.group - - for protocol in PROTOCOLS: - if round_params.paxos_only and "Raft" in protocol: - continue - time.sleep(10) - bench_round( - remotes[this_env.group]["host0"], - base, - repo, - protocol, - round_params, - runlog_path, - ) - utils.proc.kill_all_distr_procs(this_env.group) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes[this_env.group], - ) - utils.file.clear_fs_caches(remotes=remotes[this_env.group]) - - print("Clearing tc netem qdiscs...") - for group in ("1dc", "wan"): - utils.net.clear_tc_qdisc_netems_main(remotes=remotes[group]) - - print("Fetching client output logs...") - for group in ("1dc", "wan"): - utils.file.fetch_files_of_dir( - remotes[group]["host0"], f"{base}/output/{EXPER_NAME}", output_path - ) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = collect_outputs(output_dir) - print_results(results) - - single_rounds = [ - rp for rp in ROUNDS_PARAMS if any(map(lambda t: "single" in t, rp.tags)) - ] - plot_single_rounds_results(results, single_rounds, plots_dir) - - cluster_rounds = [ - rp for rp in ROUNDS_PARAMS if any(map(lambda t: "cluster" in t, rp.tags)) - ] - cluster_rounds.sort(key=lambda rp: rp.num_replicas) - handles, labels = plot_cluster_size_results(results, cluster_rounds, plots_dir) - plot_minor_legend(handles, labels, plots_dir) - - ratio_rounds = [ - rp for rp in ROUNDS_PARAMS if any(map(lambda t: "ratio" in t, rp.tags)) - ] - ratio_rounds.sort(key=lambda rp: rp.put_ratio) - plot_write_ratio_results(results, ratio_rounds, plots_dir) diff --git a/scripts/crossword/bench_failover.py b/scripts/crossword/bench_failover.py deleted file mode 100644 index 56d21bde..00000000 --- a/scripts/crossword/bench_failover.py +++ /dev/null @@ -1,505 +0,0 @@ -import sys -import os -import argparse -import time - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "failover" -PROTOCOLS = ["MultiPaxos", "RSPaxos", "Raft", "CRaft", "Crossword"] - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_REPLICAS = 5 -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 -CLIENT_TIMEOUT_SECS = 2 -VALUE_SIZE = 64 * 1024 -PUT_RATIO = 100 - -LENGTH_SECS = 120 -FAIL1_SECS = 40 -FAIL2_SECS = 80 -PLOT_SECS_BEGIN = 25 -PLOT_SECS_END = 115 - - -def launch_cluster(remote0, base, repo, protocol, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients(remote0, base, repo, protocol): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--timeout_ms", - str(CLIENT_TIMEOUT_SECS * 1000), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(NUM_REPLICAS), - "-f", - str(0), # closed-loop - "-v", - str(VALUE_SIZE), - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - ] - if protocol == "RSPaxos": - cmd.append("--expect_halt") - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def run_mess_client(remote0, base, repo, protocol, pauses=None, resumes=None): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--base_idx", - str(NUM_CLIENTS), - "--skip_build", - "mess", - ] - if pauses is not None and len(pauses) > 0: - cmd += ["--pause", pauses] - if resumes is not None and len(resumes) > 0: - cmd += ["--resume", resumes] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round(remote0, base, repo, protocol, runlog_path): - print(f" {EXPER_NAME} {protocol:<10s}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - if protocol == "Crossword": - config += "+init_assignment='1'" - - # launch service cluster - proc_cluster = launch_cluster(remote0, base, repo, protocol, config=config) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients(remote0, base, repo, protocol) - - # at the first failure point, pause current leader - time.sleep(FAIL1_SECS) - print(" Pausing leader...") - run_mess_client(remote0, base, repo, protocol, pauses="l") - - # at the second failure point, pause current leader - time.sleep(FAIL2_SECS - FAIL1_SECS) - print(" Pausing leader...") - run_mess_client(remote0, base, repo, protocol, pauses="l") - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}.s.err", "wb") as fserr: - fserr.write(serr) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_outputs(output_dir): - results = dict() - for protocol in PROTOCOLS: - result = utils.output.gather_outputs( - protocol, - NUM_CLIENTS, - output_dir, - PLOT_SECS_BEGIN, - PLOT_SECS_END, - 0.1, - ) - - sd, sp, sj, sm = 10, 0, 0, 1 - if protocol == "Crossword": - # due to limited sampling granularity, Crossword gossiping makes - # throughput results look a bit more "jittering" than it actually - # is after failover; smoothing a bit more here - # setting sd here also avoids the lines to completely overlap with - # each other - sd, sj = 15, 50 - tput_list = utils.output.list_smoothing(result["tput_sum"], sd, sp, sj, sm) - - results[protocol] = { - "time": result["time"], - "tput": tput_list, - } - - return results - - -def print_results(results): - for protocol, result in results.items(): - print(protocol) - for i, t in enumerate(result["time"]): - print(f" [{t:>5.1f}] {result['tput'][i]:>7.2f} ", end="") - if (i + 1) % 6 == 0: - print() - if len(result["time"]) % 6 != 0: - print() - - -def plot_results(results, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (6, 3), - "font.size": 13, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = ["Crossword", "MultiPaxos", "Raft", "RSPaxos", "CRaft"] - PROTOCOL_LABEL_COLOR_LS_LW = { - "Crossword": ("Crossword", "steelblue", "-", 2.0), - "MultiPaxos": ("MultiPaxos", "dimgray", "--", 1.2), - "Raft": ("Raft", "forestgreen", "--", 1.2), - "RSPaxos": ("RSPaxos (f=1)", "red", "-.", 1.3), - "CRaft": ("CRaft (f=1, fb=ok)", "peru", ":", 1.5), - } - - ymax = 0.0 - for protocol in PROTOCOLS_ORDER: - result = results[protocol] - label, color, ls, lw = PROTOCOL_LABEL_COLOR_LS_LW[protocol] - - xs = result["time"] - ys = result["tput"] - if max(ys) > ymax: - ymax = max(ys) - - plt.plot( - xs, - ys, - label=label, - color=color, - linestyle=ls, - linewidth=lw, - zorder=10 if "Crossword" in protocol else 0, - ) - - # failure indicators - def draw_failure_indicator(x, t, toffx): - plt.arrow( - x - 1, - ymax + 42, - 0, - -40, - color="darkred", - width=0.2, - length_includes_head=True, - head_width=1.0, - head_length=12, - overhang=0.5, - clip_on=False, - ) - plt.annotate( - t, - (x - 1, ymax + 60), - xytext=(toffx, 0), - ha="center", - textcoords="offset points", - color="darkred", - annotation_clip=False, - ) - - draw_failure_indicator(FAIL1_SECS - PLOT_SECS_BEGIN, "Leader fails", 12) - draw_failure_indicator(FAIL2_SECS - PLOT_SECS_BEGIN, "New leader fails", 12) - - # recovery time indicators (hardcoded!) - def draw_recovery_indicator(x, y, w, t, toffx, toffy): - plt.arrow( - x, - y, - -w, - 0, - color="gray", - width=0.1, - length_includes_head=True, - head_width=8, - head_length=0.3, - overhang=0.5, - ) - plt.arrow( - x, - y, - w, - 0, - color="gray", - width=0.1, - length_includes_head=True, - head_width=8, - head_length=0.3, - overhang=0.5, - ) - if t is not None: - plt.annotate( - t, - (x + toffx, y + toffy), - xytext=(0, 0), - ha="center", - textcoords="offset points", - color="gray", - fontsize=10, - ) - - draw_recovery_indicator(18.8, 500, 3.6, "small\ngossip\ngap", 0.8, 30) - draw_recovery_indicator(58.5, 500, 3.6, "small\ngossip\ngap", 0.8, 30) - - plt.vlines( - 63, - 400, - 520, - colors="gray", - linestyles="solid", - linewidth=0.8, - ) - - # draw_recovery_indicator(23, 50, 7, None, None, None) - # draw_recovery_indicator(25.2, 62, 9.2, "state-send\nsnapshot int.", 4.6, -53) - # draw_recovery_indicator(67.5, 56, 11.5, "state-send\nsnapshot int.", 2.9, -47) - - # configuration indicators - def draw_config_indicator(x, y, c, q, color, fb=False, unavail=False): - t = f"[c={c},q={q}]" - if fb: - t += "\nfb=ok" - if unavail: - t = "\nunavail." - plt.annotate( - t, - (x, y), - xytext=(0, 0), - ha="center", - textcoords="offset points", - color=color, - fontsize=11, - ) - - draw_config_indicator(4.6, 750, 1, 4, "red") - draw_config_indicator(4.6, 660, 1, 5, "steelblue") - draw_config_indicator(4.6, 570, 1, 4, "peru") - draw_config_indicator(4.6, 280, 3, 3, "forestgreen") - - draw_config_indicator(44.2, 750, 1, 4, "red") - draw_config_indicator(44.2, 425, 2, 4, "steelblue") - draw_config_indicator(45.5, 50, 3, 3, "peru", fb=True) - - draw_config_indicator(88.5, 420, 3, 3, "steelblue") - draw_config_indicator(88.5, 25, 1, 4, "red", unavail=True) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - ax.plot(1, -1, ">k", transform=ax.get_yaxis_transform(), clip_on=False) - - plt.ylim(bottom=-1, top=ymax * 1.15) - - plt.xlabel("Time (s)") - plt.ylabel("Throughput (reqs/s)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (1.8, 1.3), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handlelength=1.4, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - for rec in lgd.get_texts(): - if "RSPaxos" in rec.get_text() or "CRaft" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes["host0"]) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - output_path = f"{args.odir}/output/{EXPER_NAME}" - for path in (runlog_path, output_path): - if not os.path.isdir(path): - os.system(f"mkdir -p {path}") - - print("Running experiments...") - for protocol in PROTOCOLS: - time.sleep(10) - bench_round(remotes["host0"], base, repo, protocol, runlog_path) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes, - ) - utils.file.clear_fs_caches(remotes=remotes) - - print("Fetching client output logs...") - utils.file.fetch_files_of_dir( - remotes["host0"], f"{base}/output/{EXPER_NAME}", output_path - ) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = collect_outputs(output_dir) - print_results(results) - - handles, labels = plot_results(results, plots_dir) - plot_legend(handles, labels, plots_dir) diff --git a/scripts/crossword/bench_rs_coding.py b/scripts/crossword/bench_rs_coding.py deleted file mode 100644 index d3a066ce..00000000 --- a/scripts/crossword/bench_rs_coding.py +++ /dev/null @@ -1,192 +0,0 @@ -import os -import argparse -import subprocess -import multiprocessing - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -import numpy as np # type: ignore -# fmt: on - - -EXPER_NAME = "rs_coding" -BENCH_GROUP_NAME = "rse_bench" - - -def printer_thread(proc, output_file): - with open(output_file, "w") as fout: - for line in iter(proc.stdout.readline, b""): - l = line.decode() - print(l, end="") - fout.write(l) - - -def run_criterion_group(output_dir): - cmd = ["cargo", "bench", "--", BENCH_GROUP_NAME] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) - - printer = multiprocessing.Process( - target=printer_thread, args=(proc, f"{output_dir}/rs_coding.out") - ) - printer.start() - - proc.wait() - printer.terminate() - - -def parse_bench_results(output_dir): - results = dict() - with open(f"{output_dir}/rs_coding.out", "r") as fout: - curr_round = None - for line in fout: - if line.startswith(f"{BENCH_GROUP_NAME}/"): - line = line.strip() - name = line[line.find("/") + 1 : line.find(")") + 1] - size = int(name[: name.find("@")]) - d = int(name[name.find("(") + 1 : name.find(",")]) - p = int(name[name.find(",") + 1 : name.find(")")]) - curr_round = (size, (d, p)) - - if curr_round is not None and "time:" in line: - segs = line.split() - time = float(segs[-4]) - unit = segs[-3] - if unit == "ms": - pass - elif unit == "ns": - time /= 1000000 - else: # us - time /= 1000 - - results[curr_round] = time - curr_round = None - - return results - - -def print_bench_results(results): - print("Results:") - for r, ms in results.items(): - print(f" {r[0]:7d} ({r[1][0]:2d},{r[1][1]:2d}) {ms:6.3f} ms") - - -def plot_bench_results(results, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (4, 1.5), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Bench") - - xs, ys = [], [] - for r in results: - if r[0] not in xs: - xs.append(r[0]) - if r[1] not in ys: - ys.append(r[1]) - xs.sort() - ys.sort(reverse=True) - - data = [[0.0 for _ in xs] for _ in ys] - vmin, vmax = float("inf"), 0.0 - for r, ms in results.items(): - xi, yi = xs.index(r[0]), ys.index(r[1]) - data[yi][xi] = ms - if ms > vmax: - vmax = ms - if ms < vmin: - vmin = ms - - cmap = plt.get_cmap("Reds") - colors = cmap(np.linspace(1.0 - (vmax - vmin) / float(vmax), 0.6, cmap.N)) - new_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("Reds", colors) - - plt.imshow(data, cmap=new_cmap, aspect=0.6, norm="log") - plt.colorbar( - aspect=12, - shrink=0.7, - anchor=(0.0, 0.25), - ticks=[vmin, 1, 10], - format="{x:.0f}ms", - ) - - def readable_size(size): - if size >= 1024 * 1024: - return f"{size // (1024*1024)}M" - elif size >= 1024: - return f"{size // 1024}K" - else: - return size - - def readable_time(ms): - if ms < 0.1: - return f"{ms*1000:.0f}μs" - elif ms < 1.0: - return f".{ms*10:.0f}ms" - else: - return f"{ms:.0f}ms" - - for r, ms in results.items(): - xi, yi = xs.index(r[0]), ys.index(r[1]) - plt.text( - xi, - yi, - readable_time(ms), - horizontalalignment="center", - verticalalignment="center", - color="black", - fontsize=8, - ) - - xticks = [readable_size(x) for x in xs] - plt.xticks(list(range(len(xticks))), xticks) - - yticks = [f"({d+p},{d})" for d, p in ys] - plt.yticks(list(range(len(yticks))), yticks) - - plt.tight_layout() - - pdf_name = f"{plots_dir}/rs_coding.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - output_path = f"{args.odir}/output/{EXPER_NAME}" - if not os.path.isdir(output_path): - os.system(f"mkdir -p {output_path}") - - run_criterion_group(output_path) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = parse_bench_results(output_dir) - print_bench_results(results) - - plot_bench_results(results, plots_dir) diff --git a/scripts/crossword/bench_staleness.py b/scripts/crossword/bench_staleness.py deleted file mode 100644 index f8e5232a..00000000 --- a/scripts/crossword/bench_staleness.py +++ /dev/null @@ -1,521 +0,0 @@ -import sys -import os -import argparse -import time -import math - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "staleness" -PROTOCOL_GAPS = [ - ("MultiPaxos", None), - ("Crossword", 0), - ("Crossword", 100), - ("Crossword", 200), -] - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_REPLICAS = 5 -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 -VALUE_SIZE = 4096 -PUT_RATIO = 100 -NUM_KEYS_LIST = [(2**i) for i in range(6)] - -LENGTH_SECS = 45 -RESULT_SECS_BEGIN = 10 -RESULT_SECS_END = 35 - - -def round_midfix_str(gossip_gap, num_keys): - return f".{0 if gossip_gap is None else gossip_gap}.{num_keys}" - - -def launch_cluster(remote0, base, repo, protocol, midfix_str, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-r", - "--force_leader", - "0", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--file_midfix", - midfix_str, - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients(remote0, base, repo, protocol, num_keys, midfix_str): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(NUM_REPLICAS), - "-f", - str(0), # closed-loop - "-v", - str(VALUE_SIZE), - "-k", - str(num_keys), - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - "--file_midfix", - midfix_str, - ] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round(remote0, base, repo, protocol, gossip_gap, num_keys, runlog_path): - midfix_str = round_midfix_str(gossip_gap, num_keys) - print(f" {EXPER_NAME} {protocol:<10s}{midfix_str}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - config += f"+record_breakdown=true" - config += f"+record_value_ver=true" - if protocol == "Crossword": - config += f"+init_assignment='1'" - config += f"+gossip_tail_ignores={gossip_gap}" - - # launch service cluster - proc_cluster = launch_cluster( - remote0, base, repo, protocol, midfix_str, config=config - ) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients( - remote0, base, repo, protocol, num_keys, midfix_str - ) - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.s.err", "wb") as fserr: - fserr.write(serr) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_ver_stats(runlog_dir): - ver_stats = dict() - - def get_node_id(line): - return int(line[line.index("(") + 1 : line.index(")")]) - - for num_keys in NUM_KEYS_LIST: - for protocol, gossip_gap in PROTOCOL_GAPS: - midfix_str = round_midfix_str(gossip_gap, num_keys) - - candidates = set(range(NUM_REPLICAS)) - leader, sec0 = None, None - result = [{"secs": [], "vers": []} for _ in range(NUM_REPLICAS)] - with open(f"{runlog_dir}/{protocol}{midfix_str}.s.err", "r") as flog: - for line in flog: - if "becoming a leader" in line: - if leader is not None: - raise RuntimeError("multiple leader step-up detected") - leader = get_node_id(line) - elif "ver of" in line: - node = get_node_id(line) - if node not in candidates: - continue - - segs = line.strip().split() - sec = float(segs[-4]) / 1000.0 - if sec0 is None: - sec0 = sec - sec -= sec0 - if sec < RESULT_SECS_BEGIN: - continue - - ver = int(segs[-1]) - result[node]["secs"].append(sec) - result[node]["vers"].append(ver) - - if sec > RESULT_SECS_END: - if leader is None: - raise RuntimeError("leader step-up not detected") - candidates.remove(node) - ver_stats[f"{protocol}{midfix_str}"] = { - "leader": leader, - "result": result, - } - break - - diff_stats = dict() - for num_keys in NUM_KEYS_LIST: - for protocol, gossip_gap in PROTOCOL_GAPS: - midfix_str = round_midfix_str(gossip_gap, num_keys) - - leader, result = ( - ver_stats[f"{protocol}{midfix_str}"]["leader"], - ver_stats[f"{protocol}{midfix_str}"]["result"], - ) - assert leader >= 0 and leader < len(result) - - dresult = {"secs": [], "diffs": []} - for i, lsec in enumerate(result[leader]["secs"]): - lver = result[leader]["vers"][i] - diffs = [] - for node in range(NUM_REPLICAS): - if node != leader: - for j, fsec in enumerate(result[node]["secs"]): - fver = result[node]["vers"][j] - if abs(fsec - lsec) < 1.0: # allow an error margin - diffs.append(lver - fver) - break - if len(diffs) == NUM_REPLICAS - 1: - # remove out-of-quorum stragglers impact - diffs = sorted(diffs)[: NUM_REPLICAS // 2] - avg_diff = max(sum(diffs) / len(diffs), 0.0) - dresult["secs"].append(lsec) - dresult["diffs"].append(avg_diff) - - mid_diffs = sorted(dresult["diffs"])[1:-1] - assert len(mid_diffs) > 0 - avg_diff = sum(mid_diffs) / len(mid_diffs) - diff_stats[f"{protocol}{midfix_str}"] = { - "avg": avg_diff, - "result": dresult, - } - - return ver_stats, diff_stats - - -def print_results(ver_stats, diff_stats): - for protocol_with_midfix in ver_stats: - print(protocol_with_midfix) - leader, result, davg, dresult = ( - ver_stats[protocol_with_midfix]["leader"], - ver_stats[protocol_with_midfix]["result"], - diff_stats[protocol_with_midfix]["avg"], - diff_stats[protocol_with_midfix]["result"], - ) - - # for node in range(len(result)): - # print(f" {node} {'leader' if node == leader else 'follower':<8s}") - # print(" secs", end="") - # for sec in result[node]["secs"]: - # print(f" {sec:>5.1f}", end="") - # print() - # print(" vers", end="") - # for ver in result[node]["vers"]: - # print(f" {ver:>5d}", end="") - # print() - - print(" secs", end="") - for sec in dresult["secs"]: - print(f" {sec:>5.1f}", end="") - print(f" {'avg':>5s}") - print(" diffs", end="") - for diff in dresult["diffs"]: - print(f" {diff:>5.1f}", end="") - print(f" {davg:>5.1f}") - - -def plot_staleness(diff_stats, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (3.5, 1.7), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = [ - "RSPaxos.None", - "Crossword.200", - "Crossword.100", - "Crossword.0", - "MultiPaxos.None", - ] - PROTOCOLS_LABEL_COLOR_MARKER_ZORDER = { - "MultiPaxos.None": ("MultiPaxos", "dimgray", "v", 0), - "Crossword.200": ("Crossword, 200", "royalblue", "p", 5), - "Crossword.100": ("Crossword, 100", "steelblue", "o", 10), - "Crossword.0": ("Crossword, 0", "lightsteelblue", "2", 5), - "RSPaxos.None": ("RSPaxos", "red", "x", 0), - } - TIME_INTERVAL_UNIT = 3 # TODO: currently hardcoded - MARKER_SIZE = 4 - - xmin = TIME_INTERVAL_UNIT - 0.5 - ymax, protocol_ys = 0.0, dict() - for protocol, gossip_gap in PROTOCOL_GAPS + [("RSPaxos", None)]: - ys = None - if protocol != "RSPaxos": - ys = [ - diff_stats[f"{protocol}{round_midfix_str(gossip_gap, k)}"]["avg"] - for k in NUM_KEYS_LIST - ] - if max(ys) > ymax: - ymax = max(ys) - else: - ys = [ymax * 1.6 for _ in NUM_KEYS_LIST] - ys.sort(reverse=True) - protocol_ys[f"{protocol}.{gossip_gap}"] = ys - - for protocol_with_gap in PROTOCOLS_ORDER: - label, color, marker, zorder = PROTOCOLS_LABEL_COLOR_MARKER_ZORDER[ - protocol_with_gap - ] - plt.plot( - [k * TIME_INTERVAL_UNIT for k in NUM_KEYS_LIST], - protocol_ys[protocol_with_gap], - color=color, - linewidth=1.2, - marker=marker, - markersize=( - MARKER_SIZE if ".0" not in protocol_with_gap else MARKER_SIZE + 3 - ), - label=label, - zorder=zorder, - ) - - def draw_yaxis_break(yloc): - ypb, ypt = yloc - 8, yloc + 8 - ys = [ypb, ypb, ypt, ypt] - xs = [xmin - 0.2, xmin + 0.2, xmin + 0.2, xmin - 0.2] - plt.fill(xs, ys, "w", fill=True, linewidth=0, zorder=10, clip_on=False) - plt.plot( - [xmin - 0.2, xmin + 0.2], - [ypb + 3, ypb - 3], - color="k", - linewidth=1, - zorder=20, - clip_on=False, - ) - plt.plot( - [xmin - 0.2, xmin + 0.2], - [ypt + 3, ypt - 3], - color="k", - linewidth=1, - zorder=20, - clip_on=False, - ) - plt.text( - xmin, - yloc - 1, - "~", - fontsize=8, - zorder=30, - clip_on=False, - ha="center", - va="center", - ) - - draw_yaxis_break(ymax * 1.3) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.xscale("log") - plt.xlim(left=xmin) - plt.xlabel("Time between writes (ms, log-scale)") - - xticks = [TIME_INTERVAL_UNIT, 10, 100] # TODO: currently hardcoded - xticklabels = [str(x) for x in xticks] - plt.xticks(xticks, xticklabels) - - plt.ylim(bottom=-3) - plt.ylabel("Staleness (#ver.)") - ax.yaxis.set_label_coords(-0.15, 0.4) - - max_ytick = int((ymax // 50) * 50) - yticks = list(range(0, max_ytick + 1, 50)) - yticklabels = [str(y) for y in yticks] - yticks += [ymax * 1.6] - yticklabels += ["∞"] - plt.yticks(yticks, yticklabels) - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (2, 2), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handlelength=1.0, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - for rec in lgd.get_texts(): - if "RSPaxos" in rec.get_text() or "CRaft" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes["host0"]) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - if not os.path.isdir(runlog_path): - os.system(f"mkdir -p {runlog_path}") - - for num_keys in NUM_KEYS_LIST: - print(f"Running experiments {num_keys}...") - - for protocol, gossip_gap in PROTOCOL_GAPS: - time.sleep(10) - bench_round( - remotes["host0"], - base, - repo, - protocol, - gossip_gap, - num_keys, - runlog_path, - ) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes, - ) - utils.file.clear_fs_caches(remotes=remotes) - - else: - runlog_dir = f"{args.odir}/runlog/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - ver_stats, diff_stats = collect_ver_stats(runlog_dir) - print_results(ver_stats, diff_stats) - - handles, labels = plot_staleness(diff_stats, plots_dir) - plot_legend(handles, labels, plots_dir) diff --git a/scripts/crossword/bench_unbalanced.py b/scripts/crossword/bench_unbalanced.py deleted file mode 100644 index 91b44312..00000000 --- a/scripts/crossword/bench_unbalanced.py +++ /dev/null @@ -1,422 +0,0 @@ -import sys -import os -import argparse -import time - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "unbalanced" -PROTOCOL_FT_ASSIGNS = [ - ("MultiPaxos", 2, None), - ("RSPaxos", 2, None), - ("RSPaxos", 1, None), - ("Raft", 2, None), - ("CRaft", 2, None), - ("CRaft", 1, None), - ("Crossword", 2, "0:0,1,2,3,4/1:3,4,5,6,7/2:6,7,8,9,10/3:11,12,13/4:14"), - ("Crossword", 2, "3"), -] - -RS_TOTAL_SHARDS = 15 -RS_DATA_SHARDS = 9 - -MIN_HOST0_CPUS = 30 -SERVER_PIN_CORES = 20 -CLIENT_PIN_CORES = 2 - -NUM_REPLICAS = 5 -NUM_CLIENTS = 15 -BATCH_INTERVAL = 1 -VALUE_SIZE = 64 * 1024 -PUT_RATIO = 100 - -LENGTH_SECS = 20 -RESULT_SECS_BEGIN = 5 -RESULT_SECS_END = 15 - -NETEM_MEAN = lambda _: 0 -NETEM_JITTER = lambda _: 0 -NETEM_RATE = lambda r: 1 if r < 3 else 0.4 if r < 4 else 0.1 - - -def round_midfix_str(fault_tolerance, init_assignment): - return ( - f".{fault_tolerance}." - + f"{'b' if init_assignment is None or len(init_assignment) == 1 else 'u'}" - ) - - -def launch_cluster(remote0, base, repo, protocol, midfix_str, config=None): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-r", - "--force_leader", - "0", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--file_midfix", - midfix_str, - "--pin_cores", - str(SERVER_PIN_CORES), - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients(remote0, base, repo, protocol, midfix_str): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - "host0", - "--man", - "host0", - "--pin_cores", - str(CLIENT_PIN_CORES), - "--base_idx", - str(0), - "--skip_build", - "bench", - "-n", - str(NUM_CLIENTS), - "-d", - str(NUM_REPLICAS), - "-f", - str(0), # closed-loop - "-v", - str(VALUE_SIZE), - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--norm_stdev_ratio", - str(0.1), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - "--file_midfix", - midfix_str, - ] - return utils.proc.run_process_over_ssh( - remote0, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round( - remote0, base, repo, protocol, fault_tolerance, init_assignment, runlog_path -): - midfix_str = round_midfix_str(fault_tolerance, init_assignment) - print(f" {EXPER_NAME} {protocol:<10s}{midfix_str}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - if protocol == "RSPaxos" or protocol == "CRaft": - config += f"+fault_tolerance={fault_tolerance}" - elif protocol == "Crossword": - config += f"+rs_total_shards={RS_TOTAL_SHARDS}" - config += f"+rs_data_shards={RS_DATA_SHARDS}" - config += f"+init_assignment='{init_assignment}'" - config += f"+disable_gossip_timer=true" - - # launch service cluster - proc_cluster = launch_cluster( - remote0, base, repo, protocol, midfix_str, config=config - ) - wait_cluster_setup() - - # start benchmarking clients - proc_clients = run_bench_clients(remote0, base, repo, protocol, midfix_str) - - # wait for benchmarking clients to exit - _, cerr = proc_clients.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.c.err", "wb") as fcerr: - fcerr.write(cerr) - - # terminate the cluster - proc_cluster.terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - _, serr = proc_cluster.communicate() - with open(f"{runlog_path}/{protocol}{midfix_str}.s.err", "wb") as fserr: - fserr.write(serr) - - if proc_clients.returncode != 0: - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_outputs(output_dir): - results = dict() - for protocol, fault_tolerance, init_assignment in PROTOCOL_FT_ASSIGNS: - midfix_str = round_midfix_str(fault_tolerance, init_assignment) - result = utils.output.gather_outputs( - f"{protocol}{midfix_str}", - NUM_CLIENTS, - output_dir, - RESULT_SECS_BEGIN, - RESULT_SECS_END, - 0.1, - ) - - sd, sp, sj, sm = 10, 0, 0, 1 - tput_mean_list = utils.output.list_smoothing(result["tput_sum"], sd, sp, sj, sm) - tput_stdev_list = result["tput_stdev"] - - results[f"{protocol}{midfix_str}"] = { - "mean": sum(tput_mean_list) / len(tput_mean_list), - "stdev": (sum(map(lambda s: s**2, tput_stdev_list)) / len(tput_stdev_list)) - ** 0.5, - } - - return results - - -def print_results(results): - for protocol_with_midfix, result in results.items(): - print(protocol_with_midfix) - print(f" mean {result['mean']:7.2f} stdev {result['stdev']:7.2f}") - - -def plot_results(results, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (3.6, 2.5), - "font.size": 12, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = [ - "MultiPaxos.2.b", - "Raft.2.b", - "Crossword.2.b", - "Crossword.2.u", - "RSPaxos.2.b", - "CRaft.2.b", - "RSPaxos.1.b", - "CRaft.1.b", - ] - PROTOCOLS_XPOS = { - "MultiPaxos.2.b": 1, - "Raft.2.b": 2, - "Crossword.2.b": 3, - "Crossword.2.u": 4, - "RSPaxos.2.b": 5, - "CRaft.2.b": 6, - "RSPaxos.1.b": 8, - "CRaft.1.b": 9, - } - PROTOCOLS_LABEL_COLOR_HATCH = { - "MultiPaxos.2.b": ("MultiPaxos", "darkgray", None), - "Raft.2.b": ("Raft", "lightgreen", None), - "Crossword.2.b": ("Crossword (balanced)", "lightsteelblue", "xx"), - "Crossword.2.u": ("Crossword (unbalanced)", "cornflowerblue", ".."), - "RSPaxos.2.b": ("RSPaxos (q=5 forced)", "salmon", "//"), - "CRaft.2.b": ("CRaft (q=5 forced)", "wheat", "\\\\"), - "RSPaxos.1.b": ("RSPaxos (q=4, f=1)", "pink", "//"), - "CRaft.1.b": ("CRaft (q=4, f=1)", "cornsilk", "\\\\"), - } - - for protocol_with_midfix in PROTOCOLS_ORDER: - xpos = PROTOCOLS_XPOS[protocol_with_midfix] - result = results[protocol_with_midfix] - - label, color, hatch = PROTOCOLS_LABEL_COLOR_HATCH[protocol_with_midfix] - bar = plt.bar( - xpos, - result["mean"], - width=1, - color=color, - edgecolor="black", - linewidth=1.4, - label=label, - hatch=hatch, - # yerr=result["stdev"], - # ecolor="black", - # capsize=1, - ) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.xticks([3.5, 8.5], ["f=2", "f=1"]) - plt.tick_params(bottom=False) - - plt.ylabel("Throughput (reqs/s)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (2.4, 2.2), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - handles.insert(-2, matplotlib.lines.Line2D([], [], linestyle="")) - labels.insert(-2, "") # insert spacing between groups - lgd = plt.legend( - handles, - labels, - handleheight=0.9, - handlelength=1.3, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - for rec in lgd.get_texts(): - if "f=1" in rec.get_text(): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.check_enough_cpus(MIN_HOST0_CPUS, remote=remotes["host0"]) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - output_path = f"{args.odir}/output/{EXPER_NAME}" - for path in (runlog_path, output_path): - if not os.path.isdir(path): - os.system(f"mkdir -p {path}") - - print("Setting tc netem qdiscs...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN, - NETEM_JITTER, - NETEM_RATE, - involve_ifb=True, - remotes=remotes, - ) - - print("Running experiments...") - for protocol, fault_tolerance, init_assignment in PROTOCOL_FT_ASSIGNS: - time.sleep(10) - bench_round( - remotes["host0"], - base, - repo, - protocol, - fault_tolerance, - init_assignment, - runlog_path, - ) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes, - ) - utils.file.clear_fs_caches(remotes=remotes) - - print("Clearing tc netem qdiscs...") - utils.net.clear_tc_qdisc_netems_main(remotes=remotes) - - print("Fetching client output logs...") - utils.file.fetch_files_of_dir( - remotes["host0"], f"{base}/output/{EXPER_NAME}", output_path - ) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = collect_outputs(output_dir) - print_results(results) - - handles, labels = plot_results(results, plots_dir) - plot_legend(handles, labels, plots_dir) diff --git a/scripts/crossword/bench_ycsb_trace.py b/scripts/crossword/bench_ycsb_trace.py deleted file mode 100644 index de32fde0..00000000 --- a/scripts/crossword/bench_ycsb_trace.py +++ /dev/null @@ -1,640 +0,0 @@ -import sys -import os -import argparse -import time - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - -# fmt: off -import matplotlib # type: ignore -matplotlib.use("Agg") -import matplotlib.pyplot as plt # type: ignore -# fmt: on - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -EXPER_NAME = "ycsb_trace" -SUMMERSET_PROTOCOLS = [ - "ChainRep", - "MultiPaxos", - "RSPaxos", - "Raft", - "CRaft", - "Crossword", -] -CHAIN_PROTOCOLS = [] - -GEN_YCSB_SCRIPT = "crossword/gen_ycsb_a_trace.py" -YCSB_TRACE = "/tmp/ycsb_workloada.txt" - -NUM_REPLICAS = 5 -NUM_CLIENTS_LIST = list(range(1, 100, 7)) -BATCH_INTERVAL = 1 -PUT_RATIO = 50 # YCSB-A has 50% updates + 50% reads - -LENGTH_SECS = 60 -RESULT_SECS_BEGIN = 10 -RESULT_SECS_END = 50 - -SIZE_S = 8 -SIZE_L = 128 * 1024 -SIZE_M = 64 * 1024 -SIZE_MIXED = [ - (0, SIZE_L), - (LENGTH_SECS // 6, SIZE_S), - ((LENGTH_SECS // 6) * 2, SIZE_L), - ((LENGTH_SECS // 6) * 3, SIZE_S), - ((LENGTH_SECS // 6) * 4, SIZE_L), - ((LENGTH_SECS // 6) * 5, SIZE_S), -] -SIZE_MIXED = "/".join([f"{t}:{v}" for t, v in SIZE_MIXED]) - -NETEM_MEAN = lambda _: 1 -NETEM_JITTER = lambda _: 2 -NETEM_RATE = lambda _: 1 # no effect given the original bandwidth - - -def launch_cluster_summerset( - remote, base, repo, protocol, partition, num_clients, config=None -): - cmd = [ - "python3", - "./scripts/distr_cluster.py", - "-p", - protocol, - "-a", - str(partition), - "-n", - str(NUM_REPLICAS), - "-r", - "--force_leader", - str(partition), - "-g", - PHYS_ENV_GROUP, - "--me", - f"host{partition}", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--file_midfix", - f".{num_clients}", - # NOTE: not pinning cores for this exper due to large #processes - "--skip_build", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return utils.proc.run_process_over_ssh( - remote, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup_summerset(): - # print("Waiting for cluster setup...") - # wait for 30 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(30) - - -def run_bench_clients_summerset(remote, base, repo, protocol, partition, num_clients): - cmd = [ - "python3", - "./scripts/distr_clients.py", - "-p", - protocol, - "-r", - "-g", - PHYS_ENV_GROUP, - "--me", - f"host{partition}", - "--man", - f"host{partition}", - # NOTE: not pinning cores for this exper due to large #processes - "--base_idx", - str(0), - "--skip_build", - "bench", - "-a", - str(partition), - "-n", - str(num_clients), - # NOTE: not distributing clients of this partition to other nodes, - # so the behavior matches ChainPaxos's multithreading client - "-f", - str(0), # closed-loop - "-y", - YCSB_TRACE, - "-v", - SIZE_MIXED, - "-l", - str(LENGTH_SECS), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - "--file_midfix", - f".{num_clients}", - ] - return utils.proc.run_process_over_ssh( - remote, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round_summerset(remotes, base, repo, protocol, num_clients, runlog_path): - print(f" {EXPER_NAME} {protocol:<10s}.{num_clients}") - - config = f"batch_interval_ms={BATCH_INTERVAL}" - if protocol != "ChainRep": - config += f"+sim_read_lease=true" - if protocol == "RSPaxos" or protocol == "CRaft": - config += f"+fault_tolerance=2" - if protocol == "Crossword": - config += f"+b_to_d_threshold={0.08}" # TODO: tune this - config += f"+disable_gossip_timer=true" # TODO: maybe? - - # launch service clusters for each partition - procs_cluster = [] - for partition in range(NUM_REPLICAS): - procs_cluster.append( - launch_cluster_summerset( - remotes[f"host{partition}"], - base, - repo, - protocol, - partition, - num_clients, - config=config, - ) - ) - wait_cluster_setup_summerset() - - # start benchmarking clients for each partition - procs_clients = [] - for partition in range(NUM_REPLICAS): - procs_clients.append( - run_bench_clients_summerset( - remotes[f"host{partition}"], - base, - repo, - protocol, - partition, - num_clients, - ) - ) - - # wait for benchmarking clients to exit - for partition in range(NUM_REPLICAS): - _, cerr = procs_clients[partition].communicate() - with open( - f"{runlog_path}/{protocol}.{num_clients}.{partition}.c.err", "wb" - ) as fcerr: - fcerr.write(cerr) - - # terminate the clusters - for partition in range(NUM_REPLICAS): - procs_cluster[partition].terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP, chain=False) - for partition in range(NUM_REPLICAS): - _, serr = procs_cluster[partition].communicate() - with open( - f"{runlog_path}/{protocol}.{num_clients}.{partition}.s.err", "wb" - ) as fserr: - fserr.write(serr) - - if any(map(lambda p: p.returncode != 0, procs_clients)): - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def launch_cluster_chain(remote, base, repo, protocol, partition, num_clients): - cmd = [ - "python3", - "./scripts/crossword/distr_chainapp.py", - "-p", - protocol, - "-a", - str(partition), - "-n", - str(NUM_REPLICAS), - "-g", - PHYS_ENV_GROUP, - "--me", - f"host{partition}", - "--file_prefix", - f"{base}/states/{EXPER_NAME}", - "--file_midfix", - f".{num_clients}", - # NOTE: not pinning cores for this exper due to large #processes - ] - return utils.proc.run_process_over_ssh( - remote, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def wait_cluster_setup_chain(): - # print("Waiting for cluster setup...") - # wait for 20 seconds to safely allow all nodes up - # not relying on SSH-piped outputs here - time.sleep(20) - - -def run_bench_clients_chain(remote, base, repo, protocol, partition, num_clients): - cmd = [ - "python3", - "./scripts/crossword/distr_chaincli.py", - "-p", - protocol, - "-n", - str(NUM_REPLICAS), - "-g", - PHYS_ENV_GROUP, - "--me", - f"host{partition}", - # NOTE: not pinning cores for this exper due to large #processes - "-a", - str(partition), - "-t", - str(num_clients), - "-v", - str(SIZE_M), - "-w", - str(PUT_RATIO), - "-l", - str(LENGTH_SECS), - "--file_prefix", - f"{base}/output/{EXPER_NAME}", - "--file_midfix", - f".{num_clients}", - ] - return utils.proc.run_process_over_ssh( - remote, - cmd, - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - - -def bench_round_chain(remotes, base, repo, protocol, num_clients, runlog_path): - print(f" {EXPER_NAME} {protocol:<13s}.{num_clients}") - - # launch service clusters for each partition - procs_cluster = [] - for partition in range(NUM_REPLICAS): - procs_cluster.append( - launch_cluster_chain( - remotes[f"host{partition}"], - base, - repo, - protocol, - partition, - num_clients, - ) - ) - wait_cluster_setup_chain() - - # start benchmarking clients for each partition - procs_clients = [] - for partition in range(NUM_REPLICAS): - procs_clients.append( - run_bench_clients_chain( - remotes[f"host{partition}"], - base, - repo, - protocol, - partition, - num_clients, - ) - ) - - # wait for benchmarking clients to exit - for partition in range(NUM_REPLICAS): - _, cerr = procs_clients[partition].communicate() - with open( - f"{runlog_path}/{protocol}.{num_clients}.{partition}.c.err", "wb" - ) as fcerr: - fcerr.write(cerr) - - # terminate the clusters - for partition in range(NUM_REPLICAS): - procs_cluster[partition].terminate() - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP, chain=True) - for partition in range(NUM_REPLICAS): - _, serr = procs_cluster[partition].communicate() - with open( - f"{runlog_path}/{protocol}.{num_clients}.{partition}.s.err", "wb" - ) as fserr: - fserr.write(serr) - - if any(map(lambda p: p.returncode != 0, procs_clients)): - print(" Experiment FAILED!") - sys.exit(1) - else: - print(" Done!") - - -def collect_outputs(output_dir): - results = dict() - - for protocol in SUMMERSET_PROTOCOLS: - results[protocol] = {"tputs": [], "lats": []} - for num_clients in NUM_CLIENTS_LIST: - part_tputs, part_lats = [], [] - for partition in range(NUM_REPLICAS): - result = utils.output.gather_outputs( - f"{protocol}.{num_clients}", - num_clients, - output_dir, - RESULT_SECS_BEGIN, - RESULT_SECS_END, - 0.1, - partition=partition, - ) - - sd, sp, sj, sm = 10, 0, 0, 1 - if protocol == "Crossword": - # setting sm here to compensate for printing models to console - sm = 1 + ((PUT_RATIO / 2) / 100) - tput_mean_list = utils.output.list_smoothing( - result["tput_sum"], sd, sp, sj, sm - ) - lat_mean_list = utils.output.list_smoothing( - result["lat_avg"], sd, sp, sj, 1 / sm - ) - - part_tputs.append(sum(tput_mean_list) / len(tput_mean_list)) - part_lats.append((sum(lat_mean_list) / len(lat_mean_list)) / 1000) - - results[protocol]["tputs"].append(sum(part_tputs)) - results[protocol]["lats"].append(sum(part_lats) / len(part_lats)) - - for protocol in CHAIN_PROTOCOLS: - results[protocol] = {"tputs": [], "lats": []} - for num_clients in NUM_CLIENTS_LIST: - part_tputs, part_lats = [], [] - for partition in range(NUM_REPLICAS): - result = utils.output.parse_ycsb_log( - f"{protocol}.{num_clients}", - output_dir, - 1, - 1, - partition=partition, - ) - - part_tputs.append(result["tput"]["mean"]) - part_lats.append(result["lat"]["mean"]) - - results[protocol]["tputs"].append(sum(part_tputs)) - results[protocol]["lats"].append(sum(part_lats) / len(part_lats)) - - return results - - -def print_results(results): - for protocol, result in results.items(): - print(protocol) - print(" tputs", end="") - for tput in result["tputs"]: - print(f" {tput:8.2f}", end="") - print() - print(" lats ", end="") - for lat in result["lats"]: - print(f" {lat:8.2f}", end="") - print() - - -def plot_results(results, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (3.6, 2), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - fig = plt.figure("Exper") - - PROTOCOLS_ORDER = [ - # "chain_mixed", - # "chain_delayed", - "ChainRep", - "MultiPaxos", - "Raft", - "RSPaxos", - "CRaft", - "Crossword", - ] - PROTOCOLS_LABEL_COLOR_MARKER_STYLE_ZORDER = { - "MultiPaxos": ("MultiPaxos", "dimgray", "v", "-", 5), - "Raft": ("Raft", "forestgreen", "v", ":", 0), - "Crossword": ("Crossword", "steelblue", "o", "-", 10), - "RSPaxos": ("RSPaxos (f=1)", "red", "x", "-", 0), - "CRaft": ("CRaft (f=1)", "peru", "x", ":", 5), - "ChainRep": ("Chain Rep.", "indigo", "^", "-", 0), - # "chain_mixed": ("ChainPaxos*", "magenta", "d", "-", 0), - # "chain_delayed": ("ChainPaxos* (delay)", "mediumpurple", "d", 5), - } - MARKER_SIZE = 4 - - for protocol in PROTOCOLS_ORDER: - ( - label, - color, - marker, - linestyle, - zorder, - ) = PROTOCOLS_LABEL_COLOR_MARKER_STYLE_ZORDER[protocol] - plt.plot( - [tput / 1000.0 for tput in results[protocol]["tputs"]], - results[protocol]["lats"], - color=color, - linewidth=1.0, - linestyle=linestyle, - marker=marker, - markersize=MARKER_SIZE, - label=label, - zorder=zorder, - ) - - ax = fig.axes[0] - ax.spines["top"].set_visible(False) - ax.spines["right"].set_visible(False) - - plt.xlim(left=0) - plt.xlabel("Throughput (k reqs/s)") - - plt.ylim(bottom=0) - plt.ylabel("Latency (ms)") - - plt.tight_layout() - - pdf_name = f"{plots_dir}/exper-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - return ax.get_legend_handles_labels() - - -def plot_legend(handles, labels, plots_dir): - matplotlib.rcParams.update( - { - "figure.figsize": (2, 2), - "font.size": 10, - "pdf.fonttype": 42, - } - ) - plt.figure("Legend") - - plt.axis("off") - - lgd = plt.legend( - handles, - labels, - handlelength=1.2, - loc="center", - bbox_to_anchor=(0.5, 0.5), - ) - for rec in lgd.get_texts(): - if ( - "RSPaxos" in rec.get_text() - or "CRaft" in rec.get_text() - or "Chain Rep." in rec.get_text() - ): - rec.set_fontstyle("italic") - # if "Crossword" in rec.get_text(): - # rec.set_fontweight("bold") - - pdf_name = f"{plots_dir}/legend-{EXPER_NAME}.pdf" - plt.savefig(pdf_name, bbox_inches=0) - plt.close() - print(f"Plotted: {pdf_name}") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-t", "--trace", action="store_true", help="if set, do YCSB trace generation" - ) - parser.add_argument( - "-o", - "--odir", - type=str, - default=f"./results", - help="directory to hold outputs and logs", - ) - parser.add_argument( - "-p", "--plot", action="store_true", help="if set, do the plotting phase" - ) - args = parser.parse_args() - - if not os.path.isdir(args.odir): - raise RuntimeError(f"results directory {args.odir} does not exist") - - if args.trace: - print("Generating YCSB-A trace...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - trace_procs = [] - for host in hosts: - trace_procs.append( - utils.proc.run_process_over_ssh( - remotes[host], - ["python3", f"./scripts/{GEN_YCSB_SCRIPT}"], - cd_dir=f"{base}/{repo}", - capture_stdout=True, - capture_stderr=True, - print_cmd=False, - ) - ) - utils.proc.wait_parallel_procs(trace_procs, names=hosts) - - elif not args.plot: - print("Doing preparation work...") - base, repo, hosts, remotes, _, _ = utils.config.parse_toml_file( - TOML_FILENAME, PHYS_ENV_GROUP - ) - hosts = hosts[:NUM_REPLICAS] - remotes = {h: remotes[h] for h in hosts} - - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP, chain=False) - utils.proc.kill_all_distr_procs(PHYS_ENV_GROUP, chain=True) - utils.file.do_cargo_build(True, cd_dir=f"{base}/{repo}", remotes=remotes) - utils.file.clear_fs_caches(remotes=remotes) - - runlog_path = f"{args.odir}/runlog/{EXPER_NAME}" - output_path = f"{args.odir}/output/{EXPER_NAME}" - for path in (runlog_path, output_path): - if not os.path.isdir(path): - os.system(f"mkdir -p {path}") - - print("Setting tc netem qdiscs...") - utils.net.set_tc_qdisc_netems_main( - NETEM_MEAN, - NETEM_JITTER, - NETEM_RATE, - involve_ifb=True, - remotes=remotes, - ) - - for num_clients in NUM_CLIENTS_LIST: - print(f"Running experiments {num_clients}...") - - PROTOCOL_FUNCS = [(p, bench_round_summerset) for p in SUMMERSET_PROTOCOLS] - PROTOCOL_FUNCS += [(p, bench_round_chain) for p in CHAIN_PROTOCOLS] - for protocol, bench_round_func in PROTOCOL_FUNCS: - time.sleep(10) - bench_round_func( - remotes, base, repo, protocol, num_clients, runlog_path - ) - utils.proc.kill_all_distr_procs( - PHYS_ENV_GROUP, chain=(protocol in CHAIN_PROTOCOLS) - ) - utils.file.remove_files_in_dir( # to free up storage space - f"{base}/states/{EXPER_NAME}", - remotes=remotes, - ) - utils.file.clear_fs_caches(remotes=remotes) - - print("Clearing tc netem qdiscs...") - utils.net.clear_tc_qdisc_netems_main(remotes=remotes) - - print("Fetching client output logs...") - for remote in remotes.values(): - utils.file.fetch_files_of_dir( - remote, f"{base}/output/{EXPER_NAME}", output_path - ) - - else: - output_dir = f"{args.odir}/output/{EXPER_NAME}" - plots_dir = f"{args.odir}/plots/{EXPER_NAME}" - if not os.path.isdir(plots_dir): - os.system(f"mkdir -p {plots_dir}") - - results = collect_outputs(output_dir) - print_results(results) - - handles, labels = plot_results(results, plots_dir) - plot_legend(handles, labels, plots_dir) diff --git a/scripts/crossword/distr_chainapp.py b/scripts/crossword/distr_chainapp.py deleted file mode 100644 index 4c9683d9..00000000 --- a/scripts/crossword/distr_chainapp.py +++ /dev/null @@ -1,314 +0,0 @@ -import sys -import os -import signal -import argparse - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - - -TOML_FILENAME = "scripts/remote_hosts.toml" - -CHAIN_REPO_NAME = "chain" -CHAIN_JAR_FOLDER = "deploy/server" - - -SERVER_CONSENSUS_PORT = lambda p: 40000 + p -SERVER_FRONTEND_PEER_PORT = lambda p: 40010 + p -SERVER_APP_PORT = lambda p: 40020 + p - -SERVER_LEADER_TIMEOUT = 5000 -SERVER_NOOP_INTERVAL = 100 - - -PROTOCOL_BACKER_PATH = ( - lambda protocol, prefix, midfix, r: f"{prefix}/{protocol}{midfix}.{r}.wal" -) - -PROTOCOLS = {"chain_delayed", "chain_mixed", "chainrep", "epaxos"} - - -def run_process_pinned( - cmd, capture_stderr=False, cores_per_proc=0, remote=None, cd_dir=None -): - cpu_list = None - if cores_per_proc > 0: - # get number of processors - num_cpus = utils.proc.get_cpu_count(remote=remote) - # pin servers at CPUs [0, cores_per_proc) - core_start = 0 - core_end = core_start + cores_per_proc - 1 - assert core_end <= num_cpus - 1 - cpu_list = f"{core_start}-{core_end}" - if remote is None or len(remote) == 0: - return utils.proc.run_process( - cmd, capture_stderr=capture_stderr, cd_dir=cd_dir, cpu_list=cpu_list - ) - else: - return utils.proc.run_process_over_ssh( - remote, cmd, capture_stderr=capture_stderr, cd_dir=cd_dir, cpu_list=cpu_list - ) - - -def compose_server_cmd( - protocol, - ipaddrs, - consensus_port, - frontend_peer_port, - app_port, - quorum_size, - replica_id, - remote, - interface, - file_prefix, - file_midfix, - fresh_files, -): - backer_file = PROTOCOL_BACKER_PATH(protocol, file_prefix, file_midfix, replica_id) - if fresh_files: - utils.proc.run_process_over_ssh( - remote, - ["sudo", "rm", "-f", backer_file], - print_cmd=False, - ).wait() - - cmd = [ - "java", - "-Dlog4j.configurationFile=log4j2.xml", - "-Djava.net.preferIPv4Stack=true", - f"-DlogFilename={backer_file}", - "-cp", - "chain.jar:.", - "app.HashMapApp", - f"interface={interface}", - f"algorithm={protocol}", - f"initial_membership={','.join(ipaddrs.values())}", - "initial_state=ACTIVE", - f"quorum_size={quorum_size}", - f"consensus_port={consensus_port}", - f"frontend_peer_port={frontend_peer_port}", - f"app_port={app_port}", - f"leader_timeout={SERVER_LEADER_TIMEOUT}", - f"noop_interval={SERVER_NOOP_INTERVAL}", - ] - return cmd - - -def launch_servers( - remotes, - ipaddrs, - interfaces, - hosts, - me, - cd_dir, - protocol, - partition, - num_replicas, - file_prefix, - file_midfix, - fresh_files, - pin_cores, -): - if num_replicas != len(remotes): - raise ValueError(f"invalid num_replicas: {num_replicas}") - - server_procs = [] - for replica in range(num_replicas): - host = hosts[replica] - - cmd = compose_server_cmd( - protocol, - ipaddrs, - SERVER_CONSENSUS_PORT(partition), - SERVER_FRONTEND_PEER_PORT(partition), - SERVER_APP_PORT(partition), - (num_replicas // 2) + 1, - replica, - remotes[host], - interfaces[host], - file_prefix, - file_midfix, - fresh_files, - ) - - proc = None - if host == me: - # run my responsible server locally - proc = run_process_pinned( - cmd, - capture_stderr=False, - cores_per_proc=pin_cores, - cd_dir=cd_dir, - ) - else: - # spawn server process on remote server through ssh - proc = run_process_pinned( - cmd, - capture_stderr=False, - cores_per_proc=pin_cores, - remote=remotes[host], - cd_dir=cd_dir, - ) - server_procs.append(proc) - - return server_procs - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-p", "--protocol", type=str, required=True, help="protocol name" - ) - parser.add_argument( - "-a", - "--partition", - type=int, - default=argparse.SUPPRESS, - help="if doing keyspace partitioning, the partition idx", - ) - parser.add_argument( - "-n", "--num_replicas", type=int, required=True, help="number of replicas" - ) - parser.add_argument( - "-g", "--group", type=str, default="1dc", help="hosts group to run on" - ) - parser.add_argument( - "--me", type=str, default="host0", help="main script runner's host nickname" - ) - parser.add_argument( - "--file_prefix", - type=str, - default="/tmp/chain", - help="states file prefix folder path", - ) - parser.add_argument( - "--file_midfix", - type=str, - default="", - help="states file extra identifier after protocol name", - ) - parser.add_argument( - "--keep_files", action="store_true", help="if set, keep any old durable files" - ) - parser.add_argument( - "--pin_cores", type=int, default=0, help="if > 0, set CPU cores affinity" - ) - args = parser.parse_args() - - # parse hosts config file - base, repo, hosts, remotes, _, ipaddrs = utils.config.parse_toml_file( - TOML_FILENAME, args.group - ) - cd_dir_summerset = f"{base}/{repo}" - cd_dir_chain = f"{base}/{CHAIN_REPO_NAME}/{CHAIN_JAR_FOLDER}" - - # check that the partition index is valid - partition_in_args = "partition" in args - if partition_in_args and (args.partition < 0 or args.partition >= 5): - raise ValueError("currently only supports <= 5 partitions") - partition = 0 if not partition_in_args else args.partition - file_midfix = ( - args.file_midfix if not partition_in_args else f"{args.file_midfix}.{partition}" - ) - - # check that number of replicas is valid - if args.num_replicas <= 0: - raise ValueError(f"invalid number of replicas {args.num_replicas}") - if args.num_replicas > len(remotes): - raise ValueError(f"#replicas {args.num_replicas} > #hosts in config file") - hosts = hosts[: args.num_replicas] - remotes = {h: remotes[h] for h in hosts} - ipaddrs = {h: ipaddrs[h] for h in hosts} - - # check protocol name - if args.protocol not in PROTOCOLS: - raise ValueError(f"unrecognized protocol name '{args.protocol}'") - - # check that I am indeed the "me" host - utils.config.check_remote_is_me(remotes[args.me]) - - # kill all existing server processes - if not partition_in_args: - print("Killing related processes...") - kill_procs = [] - for host in hosts: - kill_procs.append( - utils.proc.run_process_over_ssh( - remotes[host], - ["./scripts/crossword/kill_chain_procs.sh"], - cd_dir=cd_dir_summerset, - print_cmd=False, - ) - ) - utils.proc.wait_parallel_procs(kill_procs, names=hosts) - - # check that the prefix folder path exists, or create it if not - print("Preparing states folder...") - prepare_procs = [] - for host in hosts: - prepare_procs.append( - utils.proc.run_process_over_ssh( - remotes[host], - ["mkdir", "-p", args.file_prefix], - cd_dir=cd_dir_chain, - print_cmd=False, - ) - ) - utils.proc.wait_parallel_procs(prepare_procs, names=hosts) - - # get the main Ethernet interface name on each host - print("Getting main interface name...") - interfaces = dict() - for host in hosts: - print(f" {host}: ", end="") - interface = utils.net.get_interface_name( - remote=None if host == args.me else remotes[host] - ) - print(interface) - interfaces[host] = interface - - # launch server replicas - server_procs = launch_servers( - remotes, - ipaddrs, - interfaces, - hosts, - args.me, - cd_dir_chain, - args.protocol, - partition, - args.num_replicas, - args.file_prefix, - file_midfix, - not args.keep_files, - args.pin_cores, - ) - - # register termination signals handler - # NOTE: this also terminates other partitions' processes if doing - # keyspace partitioning - def kill_spawned_procs(*args): - print("Killing related processes...") - kill_procs = [] - for host in hosts: - kill_procs.append( - utils.proc.run_process_over_ssh( - remotes[host], - ["./scripts/crossword/kill_chain_procs.sh"], - cd_dir=cd_dir_summerset, - print_cmd=False, - ) - ) - utils.proc.wait_parallel_procs(kill_procs, names=hosts) - - for proc in server_procs: - proc.terminate() - - signal.signal(signal.SIGINT, kill_spawned_procs) - signal.signal(signal.SIGTERM, kill_spawned_procs) - signal.signal(signal.SIGHUP, kill_spawned_procs) - - for proc in server_procs: - proc.wait() diff --git a/scripts/crossword/distr_chaincli.py b/scripts/crossword/distr_chaincli.py deleted file mode 100644 index 113181f0..00000000 --- a/scripts/crossword/distr_chaincli.py +++ /dev/null @@ -1,248 +0,0 @@ -import os -import sys -import argparse -import subprocess -import math - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - - -TOML_FILENAME = "scripts/remote_hosts.toml" - -CHAIN_REPO_NAME = "chain-client" -CHAIN_JAR_FOLDER = "deploy/client" - - -SERVER_APP_PORT = lambda p: 40020 + p - - -CLIENT_OUTPUT_PATH = lambda protocol, prefix, midfix: f"{prefix}/{protocol}{midfix}.out" - - -def run_process_pinned( - num_clients, cmd, capture_stdout=False, cores_per_proc=0, cd_dir=None -): - # their implementation used a single process client with threading only - # so `cores_per_proc` should be multiplied by the number of threads - cpu_list = None - if cores_per_proc != 0: - # get number of processors - num_cpus = utils.proc.get_cpu_count() - # parse cores_per_proc setting - if cores_per_proc != int(cores_per_proc) and ( - cores_per_proc > 1 or cores_per_proc < -1 - ): - raise ValueError(f"invalid cores_per_proc {cores_per_proc}") - cores = cores_per_proc * num_clients - if cores < 0: - # negative means starting from CPU 0 (instead from last) - cores *= -1 - core_start = 0 - core_end = math.ceil(core_start + cores - 1) - assert core_end < num_cpus - else: - # else pin client cores from last CPU down - core_end = math.ceil(num_cpus - 1) - core_start = math.floor(core_end - cores + 1) - assert core_start >= 0 - cpu_list = f"{core_start}-{core_end}" - return utils.proc.run_process( - cmd, capture_stdout=capture_stdout, cd_dir=cd_dir, cpu_list=cpu_list - ) - - -def compose_client_cmd( - ipaddrs, - partition, - num_threads, - value_size, - put_ratio, - length_s, -): - cmd = [ - "java", - "-cp", - "chain-client.jar:.", - "site.ycsb.Client", - "-t", - "-s", - "-P", - "config.properties", - "-threads", - str(num_threads), - "-p", - f"fieldlength={value_size}", - "-p", - f"hosts={','.join(ipaddrs.values())}", - "-p", - "requestdistribution=zipfian", - "-p", - "fieldcount=1", - "-p", - f"readproportion={100 - put_ratio}", - "-p", - f"updateproportion={put_ratio}", - "-p", - f"app_server_port={SERVER_APP_PORT(partition)}", - "-p", - f"maxexecutiontime={length_s}", - ] - return cmd - - -def run_clients( - ipaddrs, - value_size, - put_ratio, - length_s, - partition, - num_threads, - cd_dir, - capture_stdout, - pin_cores, -): - if num_threads < 1: - raise ValueError(f"invalid num_threads: {num_threads}") - - cmd = compose_client_cmd( - ipaddrs, - partition, - num_threads, - value_size, - put_ratio, - length_s, - ) - - client_proc = run_process_pinned( - num_threads, - cmd, - capture_stdout=capture_stdout, - cores_per_proc=pin_cores, - cd_dir=cd_dir, - ) - return client_proc - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - parser = argparse.ArgumentParser(allow_abbrev=False) - parser.add_argument( - "-p", "--protocol", type=str, required=True, help="protocol name" - ) - parser.add_argument( - "-n", "--num_replicas", type=int, required=True, help="number of replicas" - ) - parser.add_argument( - "-g", "--group", type=str, default="1dc", help="hosts group to run on" - ) - parser.add_argument( - "--me", type=str, default="host0", help="main script runner's host nickname" - ) - parser.add_argument( - "-a", - "--partition", - type=int, - default=argparse.SUPPRESS, - help="if doing keyspace partitioning, the partition idx", - ) - parser.add_argument( - "-t", - "--num_threads", - type=int, - required=True, - help="number of threads", - ) - parser.add_argument( - "-v", "--value_size", type=int, required=True, help="value size" - ) - parser.add_argument( - "-w", "--put_ratio", type=int, required=True, help="percentage of puts" - ) - parser.add_argument( - "-l", "--length_s", type=int, required=True, help="run length in secs" - ) - parser.add_argument( - "--pin_cores", type=float, default=0, help="if not 0, set CPU cores affinity" - ) - parser.add_argument( - "--file_prefix", - type=str, - default="", - help="output file prefix folder path", - ) - parser.add_argument( - "--file_midfix", - type=str, - default="", - help="output file extra identifier after protocol name", - ) - args = parser.parse_args() - - # parse hosts config file - base, _, hosts, remotes, _, ipaddrs = utils.config.parse_toml_file( - TOML_FILENAME, args.group - ) - cd_dir_chain = f"{base}/{CHAIN_REPO_NAME}/{CHAIN_JAR_FOLDER}" - - # check that number of replicas is valid - if args.num_replicas > len(ipaddrs): - raise ValueError("#replicas exceeds #hosts in the config file") - hosts = hosts[: args.num_replicas] - remotes = {h: remotes[h] for h in hosts} - ipaddrs = {h: ipaddrs[h] for h in hosts} - - # check that I am indeed the "me" host - utils.config.check_remote_is_me(remotes[args.me]) - - # check that the partition index is valid - partition_in_args, partition = False, 0 - partition_in_args = "partition" in args - if partition_in_args and (args.partition < 0 or args.partition >= 5): - raise ValueError("currently only supports <= 5 partitions") - partition = 0 if not partition_in_args else args.partition - file_midfix = ( - args.file_midfix if not partition_in_args else f"{args.file_midfix}.{partition}" - ) - - # check that number of clients is valid - if args.num_threads <= 0: - raise ValueError(f"invalid number of clients {args.num_threads}") - - # check that the prefix folder path exists, or create it if not - if len(args.file_prefix) > 0 and not os.path.isdir(args.file_prefix): - os.system(f"mkdir -p {args.file_prefix}") - - capture_stdout = len(args.file_prefix) > 0 - - # run client executable(s) - client_proc = run_clients( - ipaddrs, - args.value_size, - args.put_ratio, - args.length_s, - partition, - args.num_threads, - cd_dir_chain, - capture_stdout, - args.pin_cores, - ) - - # if running bench client, add proper timeout on wait - timeout = args.length_s + 15 - try: - if not capture_stdout: - client_proc.wait(timeout=timeout) - else: - # doing automated experiments, so capture output - out, _ = client_proc.communicate(timeout=timeout) - with open( - CLIENT_OUTPUT_PATH(args.protocol, args.file_prefix, file_midfix), - "w+", - ) as fout: - fout.write(out.decode()) - except subprocess.TimeoutExpired: - raise RuntimeError(f"client timed-out {timeout} secs") - - sys.exit(client_proc.returncode) diff --git a/scripts/crossword/gen_ycsb_a_trace.py b/scripts/crossword/gen_ycsb_a_trace.py deleted file mode 100644 index 345b864b..00000000 --- a/scripts/crossword/gen_ycsb_a_trace.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -import os - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import utils - - -TOML_FILENAME = "scripts/remote_hosts.toml" -PHYS_ENV_GROUP = "1dc" - -YCSB_DIR = lambda base: f"{base}/ycsb" -YCSB_TRACE = "/tmp/ycsb_workloada.txt" - - -def gen_ycsb_a_trace(base): - cmd = [ - f"{YCSB_DIR(base)}/bin/ycsb.sh", - "run", - "basic", - "-P", - f"{YCSB_DIR(base)}/workloads/workloada", - ] - proc = utils.proc.run_process( - cmd, capture_stdout=True, capture_stderr=True, print_cmd=False - ) - out, _ = proc.communicate() - raw = out.decode() - - # clean the trace - # TODO: write sampled value size into the trace itself? - with open(YCSB_TRACE, "w+") as fout: - for line in raw.strip().split("\n"): - line = line.strip() - if line.startswith("READ ") or line.startswith("UPDATE "): - segs = line.split() - op = segs[0] - key = segs[2] - fout.write(f"{op} {key}\n") - - -if __name__ == "__main__": - utils.file.check_proper_cwd() - - base, _, _, _, _, _ = utils.config.parse_toml_file(TOML_FILENAME, PHYS_ENV_GROUP) - - print("Generating YCSB-A trace...") - if os.path.isfile(YCSB_TRACE): - print(f" {YCSB_TRACE} already there, skipped") - else: - gen_ycsb_a_trace(base) - print(f" Done: {YCSB_TRACE}") diff --git a/scripts/crossword/install_devdeps.sh b/scripts/crossword/install_devdeps.sh deleted file mode 100755 index a30aa9a7..00000000 --- a/scripts/crossword/install_devdeps.sh +++ /dev/null @@ -1,35 +0,0 @@ -#! /bin/bash - - -if [ $(id -u) -eq 0 ]; -then - echo "Please run this script as normal user!" - exit 1 -fi - - -echo -echo "Installing extra apt packages..." -sudo apt -y install default-jre -sudo apt -y install liblog4j2-java - - -echo -echo "Fetching YCSB benchmark..." -cd .. -curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.17.0/ycsb-0.17.0.tar.gz -tar xfvz ycsb-0.17.0.tar.gz -rm ycsb-0.17.0.tar.gz -mv ycsb-0.17.0 ycsb - - -echo -echo "Fetching ChainPaxos codebase..." -git clone https://github.com/pfouto/chain.git -cd chain -git checkout aa4878d -cd .. -git clone https://github.com/pfouto/chain-client.git -cd chain-client -git checkout ce3a038 -echo diff --git a/scripts/crossword/kill_chain_procs.sh b/scripts/crossword/kill_chain_procs.sh deleted file mode 100755 index 29704713..00000000 --- a/scripts/crossword/kill_chain_procs.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /bin/bash - - -kill_all_matching () { - for pid in $(pgrep -f $1) - do - sudo kill -9 $pid > /dev/null 2>&1 - done -} - - -kill_all_matching java - - -if [ $# -ge 1 ] && [ "$1" = "incl_distr" ]; -then - kill_all_matching distr_chainapp.py - kill_all_matching distr_chaincli.py -fi diff --git a/scripts/distr_cluster.py b/scripts/distr_cluster.py index 2a90aa07..a85b6efa 100644 --- a/scripts/distr_cluster.py +++ b/scripts/distr_cluster.py @@ -42,7 +42,6 @@ def __init__(self, may_snapshot, has_heartbeats, extra_defaults): "Raft": ProtoFeats(True, True, None), "RSPaxos": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={(n//2)//2}"), "CRaft": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={(n//2)//2}"), - "Crossword": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={n//2}"), } diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 57effba1..b8557e4a 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -41,7 +41,6 @@ def __init__(self, may_snapshot, has_heartbeats, extra_defaults): "Raft": ProtoFeats(True, True, None), "RSPaxos": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={(n//2)//2}"), "CRaft": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={(n//2)//2}"), - "Crossword": ProtoFeats(True, True, lambda n, _: f"fault_tolerance={n//2}"), } diff --git a/scripts/remote_hosts.toml b/scripts/remote_hosts.toml index b2291eb2..a556eae8 100644 --- a/scripts/remote_hosts.toml +++ b/scripts/remote_hosts.toml @@ -5,19 +5,19 @@ repo_name = "summerset" # (SET PROPERLY) for each group, its username @ DNS domain names [1dc] -host0 = "josehu@c220g1-030618.wisc.cloudlab.us" -host1 = "josehu@c220g1-030802.wisc.cloudlab.us" -host2 = "josehu@c220g1-030830.wisc.cloudlab.us" -# host3 = "josehu@c220g5-111008.wisc.cloudlab.us" -# host4 = "josehu@c220g5-111013.wisc.cloudlab.us" -# host5 = "josehu@c220g5-111022.wisc.cloudlab.us" -# host6 = "josehu@c220g5-111018.wisc.cloudlab.us" -# host7 = "josehu@c220g5-111011.wisc.cloudlab.us" -# host8 = "josehu@c220g5-111012.wisc.cloudlab.us" +host0 = "username@domain.com" +host1 = "username@domain.com" +host2 = "username@domain.com" +# host3 = "username@domain.com" +# host4 = "username@domain.com" +# host5 = "username@domain.com" +# host6 = "username@domain.com" +# host7 = "username@domain.com" +# host8 = "username@domain.com" # [wan] -# host0 = "josehu@c220g5-111328.wisc.cloudlab.us" -# host1 = "josehu@clnode116.clemson.cloudlab.us" -# host2 = "josehu@hp065.utah.cloudlab.us" -# host3 = "josehu@pc67.cloudlab.umass.edu" -# host4 = "josehu@clnode122.clemson.cloudlab.us" +# host0 = "username@domain.com" +# host1 = "username@domain.com" +# host2 = "username@domain.com" +# host3 = "username@domain.com" +# host4 = "username@domain.com" diff --git a/scripts/remote_killall.py b/scripts/remote_killall.py index 1ca2a605..2f8ce598 100644 --- a/scripts/remote_killall.py +++ b/scripts/remote_killall.py @@ -12,7 +12,6 @@ def killall_on_targets(destinations, cd_dir, chain=False): cmd = ["./scripts/kill_all_procs.sh", "incl_distr"] if chain: - cmd = ["./scripts/crossword/kill_chain_procs.sh", "incl_distr"] pass # placeholder line print("Running kill commands in parallel...") diff --git a/src/lib.rs b/src/lib.rs index 1754420c..b56faf57 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,7 +37,6 @@ pub use crate::protocols::SmrProtocol; // config strings pub use crate::protocols::{ClientConfigCRaft, ReplicaConfigCRaft}; pub use crate::protocols::{ClientConfigChainRep, ReplicaConfigChainRep}; -pub use crate::protocols::{ClientConfigCrossword, ReplicaConfigCrossword}; pub use crate::protocols::{ClientConfigMultiPaxos, ReplicaConfigMultiPaxos}; pub use crate::protocols::{ClientConfigRSPaxos, ReplicaConfigRSPaxos}; pub use crate::protocols::{ClientConfigRaft, ReplicaConfigRaft}; diff --git a/src/protocols/crossword/adaptive.rs b/src/protocols/crossword/adaptive.rs deleted file mode 100644 index 900244d7..00000000 --- a/src/protocols/crossword/adaptive.rs +++ /dev/null @@ -1,316 +0,0 @@ -//! Crossword -- assignment adaptability. - -use super::*; - -use crate::utils::SummersetError; - -// CrosswordReplica linear regression perf monitoring -impl CrosswordReplica { - /// Pretty-print a `Vec` assignment policy - #[inline] - #[allow(clippy::ptr_arg)] - pub(super) fn assignment_to_string(assignment: &Vec) -> String { - assignment - .iter() - .enumerate() - .map(|(r, a)| format!("{}:{{{}}}", r, a.compact_str())) - .collect::>() - .join(" ") - } - - /// Pretty-print linear regression models. - #[inline] - #[allow(clippy::ptr_arg)] - fn linreg_models_to_string( - models: &HashMap, - ) -> String { - models - .iter() - .map(|(r, model)| format!("{}:{}", r, model)) - .collect::>() - .join(" ") - } - - /// Parse config string into initial shards assignment policy. - pub(super) fn parse_init_assignment( - population: u8, - rs_total_shards: u8, - rs_data_shards: u8, - s: &str, - ) -> Result, SummersetError> { - debug_assert_eq!(rs_total_shards % population, 0); - let dj_spr = rs_total_shards / population; - let mut assignment = Vec::with_capacity(population as usize); - if s.is_empty() { - // default to start with bandwidth-optimal diagonal assignment - for r in 0..population { - assignment.push(Bitmap::from( - rs_total_shards, - ((r * dj_spr)..((r + 1) * dj_spr)).collect(), - )); - } - } else if let Ok(spr) = s.parse::() { - // a single number: the same #shards per replica round-robinly - if spr < dj_spr || spr > rs_data_shards { - return Err(SummersetError::msg(format!( - "invalid shards assignment string {}", - s - ))); - } - for r in 0..population { - assignment.push(Bitmap::from( - rs_total_shards, - ((r * dj_spr)..(r * dj_spr + spr)) - .map(|i| i % rs_total_shards) - .collect(), - )); - } - } else { - // string in format of something like 0:0,1/1:2/3:3,4 ... - for _ in 0..population { - assignment.push(Bitmap::new(rs_total_shards, false)); - } - for seg in s.split('/') { - if let Some(idx) = seg.find(':') { - let r = seg[..idx].parse::()?; - if r >= population { - return Err(SummersetError::msg(format!( - "invalid shards assignment string {}", - s - ))); - } - for shard in seg[idx + 1..].split(',') { - assignment[r as usize].set(shard.parse()?, true)?; - } - } else { - return Err(SummersetError::msg(format!( - "invalid shards assignment string {}", - s - ))); - } - } - } - Ok(assignment) - } - - /// Compute minimum number of shards_per_replica (assuming balanced - /// assignment) that is be responsive for a given peer_alive cnt. - #[inline] - pub(super) fn min_shards_per_replica( - rs_data_shards: u8, - majority: u8, - fault_tolerance: u8, - alive_cnt: u8, - ) -> u8 { - (majority + fault_tolerance + 1 - alive_cnt) - * (rs_data_shards / majority) - } - - /// Get the proper assignment policy given data size and peer_alive count. - // NOTE: if data_size == exactly `usize::MAX` this will fail; won't bother - // to account for this rare case right now - #[inline] - #[allow(clippy::too_many_arguments)] - pub(super) fn pick_assignment_policy<'a>( - assignment_adaptive: bool, - assignment_balanced: bool, - init_assignment: &'a Vec, - brr_assignments: &'a HashMap>, - rs_data_shards: u8, - majority: u8, - fault_tolerance: u8, - data_size: usize, - linreg_model: &HashMap, - b_to_d_threshold: f64, - qdisc_info: &Option, - peer_alive: &Bitmap, - ) -> &'a Vec { - // if unbalanced assignment is used, don't enable adaptability and also - // skip peer_alive count checking - if !assignment_balanced { - return init_assignment; - } - - let dj_spr = rs_data_shards / majority; - let best_spr = if b_to_d_threshold > 0.0 { - // NOTE: use obvious fixed assignments if having a b_to_d_threshold - debug_assert!(qdisc_info.is_some()); - let qdisc_info = qdisc_info.as_ref().unwrap(); - if qdisc_info.rate <= 0.0 { - rs_data_shards - } else if qdisc_info.delay + qdisc_info.jitter <= 0.0 { - dj_spr - } else { - let b_ms = (data_size as f64 * 1000.0) - / (qdisc_info.rate * 1024.0 * 1024.0 * 1024.0 / 8.0); - let d_ms = qdisc_info.delay + qdisc_info.jitter; - if b_ms / d_ms > b_to_d_threshold { - dj_spr - } else { - rs_data_shards - } - } - } else if assignment_adaptive { - // query the linear regression models and pick the best config of - // (#shards_per_replica, quorum_size) pair along the constraint - // boundary line if doing adaptive config choosing - let mut config_times = - Vec::<(u8, f64)>::with_capacity(majority as usize); - for (spr, q) in (dj_spr..=rs_data_shards) - .step_by(dj_spr as usize) - .enumerate() - .map(|(i, spr)| (spr, majority + fault_tolerance - i as u8)) - { - let load_size = - ((data_size / rs_data_shards as usize) + 1) * spr as usize; - let mut peer_times: Vec = linreg_model - .iter() - .map(|(_, model)| model.predict(load_size)) - .collect(); - peer_times.sort_by(|x, y| x.partial_cmp(y).unwrap()); - config_times.push((spr, peer_times[q as usize - 2])); - } - config_times - .iter() - .min_by(|x, y| x.1.partial_cmp(&y.1).unwrap()) - .unwrap() - .0 - } else { - init_assignment[0].count() - }; - - // if the best assignment according to models is not responsive enough - // given the current peer_alive count, use the best possible one - let min_spr = Self::min_shards_per_replica( - rs_data_shards, - majority, - fault_tolerance, - peer_alive.count(), - ); - brr_assignments.get(&cmp::max(best_spr, min_spr)).unwrap() - } - - /// Records a new datapoint for Accept RTT time. - pub(super) fn record_accept_rtt( - &mut self, - peer: ReplicaId, - tr: u128, - slot: usize, - size: usize, - ) { - // pop oldest heartbeats sent timestamps out until the corresponding - // heartbeat ID is found. Records preceding the matching record will - // be discarded forever - while let Some((ts, s)) = - self.pending_accepts.get_mut(&peer).unwrap().pop_front() - { - #[allow(clippy::comparison_chain)] - if s == slot { - debug_assert!(tr >= ts); - // approximate size as the PeerMsg type's stack size + shards - // payload size - let elapsed_ms: f64 = (tr - ts) as f64 / 1000.0; - self.regressor - .get_mut(&peer) - .unwrap() - .append_sample(tr, size, elapsed_ms); - // pf_trace!("append {} ac t {} dp {:?}", - // peer, tr, (size, elapsed_ms)); - break; - } else if slot < s { - // larger slot seen, meaning the send record for slot is - // probably lost. Do nothing - self.pending_accepts - .get_mut(&peer) - .unwrap() - .push_front((ts, s)); - break; - } - } - } - - /// Records a new datapoint for heartbeat RTT time. - pub(super) fn record_heartbeat_rtt( - &mut self, - peer: ReplicaId, - tr: u128, - hb_id: HeartbeatId, - ) { - // pop oldest heartbeats sent timestamps out until the corresponding - // heartbeat ID is found. Records preceding the matching record will - // be discarded forever - while let Some((ts, id)) = - self.pending_heartbeats.get_mut(&peer).unwrap().pop_front() - { - #[allow(clippy::comparison_chain)] - if id == hb_id { - debug_assert!(tr >= ts); - let elapsed_ms: f64 = (tr - ts) as f64 / 1000.0; - self.regressor - .get_mut(&peer) - .unwrap() // heartbeat size ~= 0 - .append_sample(tr, 0, elapsed_ms); - // pf_trace!("append {} hb t {} dp {:?}", - // peer, tr, (0, elapsed_ms)); - break; - } else if hb_id < id { - // larger ID seen, meaning the send record for hb_id is - // probably lost. Do nothing - self.pending_heartbeats - .get_mut(&peer) - .unwrap() - .push_front((ts, id)); - break; - } - } - } - - /// Discards all datapoints older than some timespan ago, then updates the - /// linear regression perf monitoring model for each replica using the - /// remaining window of datapoints. - pub(super) fn update_linreg_model( - &mut self, - keep_ms: u64, - ) -> Result<(), SummersetError> { - let now_us = self.startup_time.elapsed().as_micros(); - let keep_us = now_us - 1000 * keep_ms as u128; - - for (&peer, regressor) in self.regressor.iter_mut() { - regressor.discard_before(keep_us); - if !self.peer_alive.get(peer)? { - // if peer not considered alive, use a very high delay - self.linreg_model - .get_mut(&peer) - .unwrap() - .update(0.0, 999.0, 0.0); - } else { - // otherwise, compute simple linear regression - match regressor.calc_model(self.config.linreg_outlier_ratio) { - Ok(model) => { - *self.linreg_model.get_mut(&peer).unwrap() = model; - } - Err(_e) => { - // pf_trace!("calc_model error: {}", e); - } - } - } - } - - if now_us - self.last_linreg_print >= 3_000_000 { - pf_info!( - "linreg {}", - Self::linreg_models_to_string(&self.linreg_model) - ); - self.last_linreg_print = now_us; - } - Ok(()) - } - - /// Updates `tc qdisc` netem information. - pub(super) fn update_qdisc_info(&mut self) -> Result<(), SummersetError> { - if let Some(qdisc_info) = self.qdisc_info.as_mut() { - qdisc_info.update()?; - } - Ok(()) - } -} diff --git a/src/protocols/crossword/control.rs b/src/protocols/crossword/control.rs deleted file mode 100644 index 786536b1..00000000 --- a/src/protocols/crossword/control.rs +++ /dev/null @@ -1,118 +0,0 @@ -//! Crossword -- manager control actions. - -use super::*; - -use crate::manager::CtrlMsg; -use crate::server::{LogAction, LogResult}; -use crate::utils::SummersetError; - -// CrosswordReplica control messages handling -impl CrosswordReplica { - /// Handler of ResetState control message. - async fn handle_ctrl_reset_state( - &mut self, - durable: bool, - ) -> Result<(), SummersetError> { - pf_warn!("server got restart req"); - - // send leave notification to peers and wait for their replies - self.transport_hub.leave().await?; - - // send leave notification to manager and wait for its reply - self.control_hub - .do_sync_ctrl(CtrlMsg::Leave, |m| m == &CtrlMsg::LeaveReply) - .await?; - - // if `durable` is false, truncate backer file - if !durable - && self - .storage_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Truncate { offset: 0 }, - ) - .await? - .1 - != (LogResult::Truncate { - offset_ok: true, - now_size: 0, - }) - { - return logged_err!("failed to truncate log to 0"); - } - - Ok(()) - } - - /// Handler of Pause control message. - fn handle_ctrl_pause( - &mut self, - paused: &mut bool, - ) -> Result<(), SummersetError> { - pf_warn!("server got pause req"); - *paused = true; - self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; - Ok(()) - } - - /// Handler of Resume control message. - fn handle_ctrl_resume( - &mut self, - paused: &mut bool, - ) -> Result<(), SummersetError> { - pf_warn!("server got resume req"); - - // reset leader heartbeat timer - self.kickoff_hb_hear_timer()?; - - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; - Ok(()) - } - - /// Handler of TakeSnapshot control message. - async fn handle_ctrl_take_snapshot( - &mut self, - ) -> Result<(), SummersetError> { - pf_warn!("server told to take snapshot"); - self.take_new_snapshot().await?; - - self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { - new_start: self.start_slot, - })?; - Ok(()) - } - - /// Synthesized handler of manager control messages. If ok, returns - /// `Some(true)` if decides to terminate and reboot, `Some(false)` if - /// decides to shutdown completely, and `None` if not terminating. - pub(super) async fn handle_ctrl_msg( - &mut self, - msg: CtrlMsg, - paused: &mut bool, - ) -> Result, SummersetError> { - match msg { - CtrlMsg::ResetState { durable } => { - self.handle_ctrl_reset_state(durable).await?; - Ok(Some(true)) - } - - CtrlMsg::Pause => { - self.handle_ctrl_pause(paused)?; - Ok(None) - } - - CtrlMsg::Resume => { - self.handle_ctrl_resume(paused)?; - Ok(None) - } - - CtrlMsg::TakeSnapshot => { - self.handle_ctrl_take_snapshot().await?; - Ok(None) - } - - _ => Ok(None), // ignore all other types - } - } -} diff --git a/src/protocols/crossword/durability.rs b/src/protocols/crossword/durability.rs deleted file mode 100644 index d398e1b3..00000000 --- a/src/protocols/crossword/durability.rs +++ /dev/null @@ -1,247 +0,0 @@ -//! Crossword -- durable logging. - -use super::*; - -use crate::server::{ApiRequest, LogActionId, LogResult}; -use crate::utils::SummersetError; - -// CrosswordReplica durable WAL logging -impl CrosswordReplica { - /// Handler of PrepareBal logging result chan recv. - fn handle_logged_prepare_bal( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "finished PrepareBal logging for slot {} bal {}", - slot, - self.insts[slot - self.start_slot].bal - ); - let inst = &self.insts[slot - self.start_slot]; - let voted = if inst.voted.0 > 0 { - Some(inst.voted.clone()) - } else { - None - }; - - if self.is_leader() { - // on leader, finishing the logging of a PrepareBal entry - // is equivalent to receiving a Prepare reply from myself - // (as an acceptor role) - if let Some(LeaderBookkeeping { - trigger_slot, - endprep_slot, - .. - }) = inst.leader_bk - { - if slot <= endprep_slot { - self.handle_msg_prepare_reply( - self.id, - slot, - trigger_slot, - endprep_slot, - inst.bal, - voted, - )?; - } - } - } else { - // on follower replica, finishing the logging of a - // PrepareBal entry leads to sending back a Prepare reply - if let Some(ReplicaBookkeeping { - source, - trigger_slot, - endprep_slot, - }) = inst.replica_bk - { - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - trigger_slot, - endprep_slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!( - "sent PrepareReply -> {} for slot {} / {} bal {}", - source, - slot, - endprep_slot, - inst.bal - ); - } - } - - Ok(()) - } - - /// Handler of AcceptData logging result chan recv. - fn handle_logged_accept_data( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "finished AcceptData logging for slot {} bal {}", - slot, - self.insts[slot - self.start_slot].bal - ); - let inst = &self.insts[slot - self.start_slot]; - - if self.is_leader() { - // on leader, finishing the logging of an AcceptData entry - // is equivalent to receiving an Accept reply from myself - // (as an acceptor role) - self.handle_msg_accept_reply(self.id, slot, inst.bal, 0, None)?; - // [for perf breakdown] - if let Some(sw) = self.bd_stopwatch.as_mut() { - let _ = sw.record_now(slot, 2, None); - } - } else { - // on follower replica, finishing the logging of an - // AcceptData entry leads to sending back an Accept reply - if let Some(ReplicaBookkeeping { source, .. }) = inst.replica_bk { - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - // compute payload size of the corresponding Accept - // message from leader: #shards assigned to me times - // size of one shard - size: inst.assignment[self.id as usize].count() - as usize - * ((inst.reqs_cw.data_len() - / inst.reqs_cw.num_data_shards() as usize) - + 1), - reply_ts: if self.config.record_breakdown { - Some(SystemTime::now()) - } else { - None - }, - }, - source, - )?; - pf_trace!( - "sent AcceptReply -> {} for slot {} bal {}", - source, - slot, - inst.bal - ); - } - } - - Ok(()) - } - - /// Handler of CommitSlot logging result chan recv. - fn handle_logged_commit_slot( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "finished CommitSlot logging for slot {} bal {}", - slot, - self.insts[slot - self.start_slot].bal - ); - - // update index of the first non-committed instance - if slot == self.commit_bar { - while self.commit_bar < self.start_slot + self.insts.len() { - let inst = &mut self.insts[self.commit_bar - self.start_slot]; - if inst.status < Status::Committed { - break; - } - - if inst.reqs_cw.avail_shards() < inst.reqs_cw.num_data_shards() - { - // can't execute if I don't have the complete request batch - pf_debug!( - "postponing execution for slot {} (shards {}/{})", - slot, - inst.reqs_cw.avail_shards(), - inst.reqs_cw.num_data_shards() - ); - break; - } else if inst.reqs_cw.avail_data_shards() - < inst.reqs_cw.num_data_shards() - { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; - - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else if inst.status == Status::Committed { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests - } - } - pf_trace!( - "submitted {} exec commands for slot {}", - reqs.len(), - self.commit_bar - ); - } - - self.commit_bar += 1; - } - } - - Ok(()) - } - - /// Synthesized handler of durable logging result chan recv. - pub(super) fn handle_log_result( - &mut self, - action_id: LogActionId, - log_result: LogResult, - ) -> Result<(), SummersetError> { - let (slot, entry_type) = Self::split_log_action_id(action_id); - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - debug_assert!(slot < self.start_slot + self.insts.len()); - - if let LogResult::Append { now_size } = log_result { - debug_assert!(now_size >= self.wal_offset); - // update first wal_offset of slot - let inst = &mut self.insts[slot - self.start_slot]; - if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { - inst.wal_offset = self.wal_offset; - } - debug_assert!(inst.wal_offset <= self.wal_offset); - // then update self.wal_offset - self.wal_offset = now_size; - } else { - return logged_err!("unexpected log result type: {:?}", log_result); - } - - match entry_type { - Status::Preparing => self.handle_logged_prepare_bal(slot), - Status::Accepting => self.handle_logged_accept_data(slot), - Status::Committed => self.handle_logged_commit_slot(slot), - _ => { - logged_err!("unexpected log entry type: {:?}", entry_type) - } - } - } -} diff --git a/src/protocols/crossword/execution.rs b/src/protocols/crossword/execution.rs deleted file mode 100644 index 9eae005b..00000000 --- a/src/protocols/crossword/execution.rs +++ /dev/null @@ -1,77 +0,0 @@ -//! Crossword -- command execution. - -use super::*; - -use crate::server::{ApiReply, ApiRequest, CommandId, CommandResult}; -use crate::utils::SummersetError; - -// CrosswordReplica state machine execution -impl CrosswordReplica { - /// Handler of state machine exec result chan recv. - pub(super) fn handle_cmd_result( - &mut self, - cmd_id: CommandId, - cmd_result: CommandResult, - ) -> Result<(), SummersetError> { - let (slot, cmd_idx) = Self::split_command_id(cmd_id); - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - debug_assert!(slot < self.start_slot + self.insts.len()); - pf_trace!("executed cmd in instance at slot {} idx {}", slot, cmd_idx); - - let inst = &mut self.insts[slot - self.start_slot]; - let reqs = inst.reqs_cw.get_data()?; - debug_assert!(cmd_idx < reqs.len()); - let (client, ref req) = reqs[cmd_idx]; - - // reply command result back to client - if let ApiRequest::Req { id: req_id, .. } = req { - if inst.external && self.external_api.has_client(client) { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - client, - )?; - pf_trace!( - "replied -> client {} for slot {} idx {}", - client, - slot, - cmd_idx - ); - } - } else { - return logged_err!("unexpected API request type"); - } - - // if all commands in this instance have been executed, set status to - // Executed and update `exec_bar` - if cmd_idx == reqs.len() - 1 { - inst.status = Status::Executed; - pf_debug!("executed all cmds in instance at slot {}", slot); - - // [for perf breakdown] - if self.is_leader() { - if let Some(sw) = self.bd_stopwatch.as_mut() { - let _ = sw.record_now(slot, 5, None); - } - } - - // update index of the first non-executed instance - if slot == self.exec_bar { - while self.exec_bar < self.start_slot + self.insts.len() { - let inst = &mut self.insts[self.exec_bar - self.start_slot]; - if inst.status < Status::Executed { - break; - } - self.exec_bar += 1; - } - } - } - - Ok(()) - } -} diff --git a/src/protocols/crossword/gossiping.rs b/src/protocols/crossword/gossiping.rs deleted file mode 100644 index 93285e22..00000000 --- a/src/protocols/crossword/gossiping.rs +++ /dev/null @@ -1,190 +0,0 @@ -//! Crossword -- follower gossiping. - -use std::collections::HashMap; -use std::mem; - -use super::*; - -use crate::server::ReplicaId; -use crate::utils::{Bitmap, SummersetError}; - -use rand::prelude::*; - -use tokio::time::Duration; - -// CrosswordReplica follower gossiping -impl CrosswordReplica { - /// Chooses a random gossip_timeout from the min-max range and kicks off - /// the gossip_timer. - pub(super) fn kickoff_gossip_timer( - &mut self, - ) -> Result<(), SummersetError> { - self.gossip_timer.cancel()?; - - if !self.config.disable_gossip_timer { - let timeout_ms = thread_rng().gen_range( - self.config.gossip_timeout_min..=self.config.gossip_timeout_max, - ); - // pf_trace!("kickoff gossip_timer @ {} ms", timeout_ms); - self.gossip_timer - .kickoff(Duration::from_millis(timeout_ms))?; - } - - Ok(()) - } - - /// Decide to which peers should I request gossiping and what shards - /// should I exclude. - #[allow(clippy::too_many_arguments, clippy::ptr_arg)] - fn gossip_targets_excl( - me: ReplicaId, - population: u8, - rs_data_shards: u8, - replica_bk: &Option, - mut avail_shards_map: Bitmap, - assignment: &Vec, - peer_alive: &Bitmap, - ) -> HashMap { - let mut src_peer = me; - if let Some(ReplicaBookkeeping { source, .. }) = replica_bk { - src_peer = *source; - } - - // greedily considers my peers, starting from the one with my ID + 1, - // until enough number of shards covered - let mut targets_excl = HashMap::new(); - for p in (me + 1)..(me + population) { - let peer = p % population; - if peer == src_peer { - // skip leader who initially replicated this instance to me - continue; - } - if !peer_alive.get(peer).unwrap() { - // skip peers that I don't think are alive right now - continue; - } - - // only ask for shards which I don't have right now and I have not - // asked others for - let mut useful_shards = Vec::new(); - for (idx, flag) in assignment[peer as usize].iter() { - if flag && !avail_shards_map.get(idx).unwrap() { - useful_shards.push(idx); - } - } - if !useful_shards.is_empty() { - targets_excl.insert(peer, avail_shards_map.clone()); - for idx in useful_shards { - avail_shards_map.set(idx, true).unwrap(); - } - } - - if avail_shards_map.count() >= rs_data_shards { - break; - } - } - - targets_excl - } - - /// Triggers gossiping for my missing shards in committed but not-yet- - /// executed instances: fetch missing shards from peers, preferring - /// follower peers that hold data shards. - pub(super) fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { - // maintain a map from peer ID to send to -> slots_excl to send - let mut recon_slots: HashMap> = - HashMap::with_capacity(self.population as usize - 1); - for peer in 0..self.population { - if peer != self.id { - recon_slots.insert(peer, vec![]); - } - } - - let mut slot_up_to = self.gossip_bar; - let until_slot = if self.start_slot + self.insts.len() - > self.config.gossip_tail_ignores - { - self.start_slot + self.insts.len() - self.config.gossip_tail_ignores - } else { - 0 - }; - for slot in self.gossip_bar..until_slot { - slot_up_to = slot; - { - let inst = &self.insts[slot - self.start_slot]; - if inst.status >= Status::Executed { - continue; - } else if inst.status < Status::Committed { - break; - } - } - - let avail_shards_map = self.insts[slot - self.start_slot] - .reqs_cw - .avail_shards_map(); - let assignment = &self.insts[slot - self.start_slot].assignment; - if avail_shards_map.count() < self.rs_data_shards { - // decide which peers to ask for which shards from - let targets_excl = Self::gossip_targets_excl( - self.id, - self.population, - self.rs_data_shards, - &self.insts[slot - self.start_slot].replica_bk, - avail_shards_map, - assignment, - &self.peer_alive, - ); - - for (peer, exclude) in targets_excl { - recon_slots.get_mut(&peer).unwrap().push((slot, exclude)); - - // send reconstruction read messages in chunks - if recon_slots[&peer].len() == self.config.msg_chunk_size { - self.transport_hub.send_msg( - PeerMsg::Reconstruct { - slots_excl: mem::take( - recon_slots.get_mut(&peer).unwrap(), - ), - }, - peer, - )?; - pf_trace!( - "sent Reconstruct -> {} for {} slots", - peer, - self.config.msg_chunk_size - ); - } - } - } - } - - // send reconstruction read message for remaining slots - for (peer, slots_excl) in recon_slots.drain() { - if !slots_excl.is_empty() { - let num_slots = slots_excl.len(); - self.transport_hub - .send_msg(PeerMsg::Reconstruct { slots_excl }, peer)?; - pf_trace!( - "sent Reconstruct -> {} for {} slots", - peer, - num_slots - ); - } - } - - // reset gossip trigger timer - self.kickoff_gossip_timer()?; - - // update gossip_bar - if slot_up_to > self.gossip_bar { - pf_debug!( - "triggered gossiping: slots {} - {}", - self.gossip_bar, - slot_up_to - 1 - ); - self.gossip_bar = slot_up_to; - } - - Ok(()) - } -} diff --git a/src/protocols/crossword/leadership.rs b/src/protocols/crossword/leadership.rs deleted file mode 100644 index 5b5b1c1e..00000000 --- a/src/protocols/crossword/leadership.rs +++ /dev/null @@ -1,579 +0,0 @@ -//! Crossword -- leader election. - -use std::collections::{HashMap, VecDeque}; - -use super::*; - -use crate::manager::CtrlMsg; -use crate::server::{LogAction, ReplicaId}; -use crate::utils::{Bitmap, SummersetError}; - -use rand::prelude::*; - -use tokio::time::Duration; - -// CrosswordReplica leadership related logic -impl CrosswordReplica { - /// If a larger ballot number is seen, consider that peer as new leader. - pub(super) fn check_leader( - &mut self, - peer: ReplicaId, - ballot: Ballot, - ) -> Result<(), SummersetError> { - if ballot > self.bal_max_seen { - self.bal_max_seen = ballot; - - // clear my leader status if I was one - if self.is_leader() { - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!("no longer a leader..."); - } - - // reset heartbeat timeout timer to prevent me from trying to - // compete with a new leader when it is doing reconstruction - self.kickoff_hb_hear_timer()?; - - // set this peer to be the believed leader - self.leader = Some(peer); - } - - Ok(()) - } - - /// Becomes a leader, sends self-initiated Prepare messages to followers - /// for all in-progress instances, and starts broadcasting heartbeats. - pub(super) fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader() { - return Ok(()); - } - - self.leader = Some(self.id); // this starts broadcasting heartbeats - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_info!("becoming a leader..."); - - // clear peers' heartbeat reply counters, and broadcast a heartbeat now - for cnts in self.hb_reply_cnts.values_mut() { - *cnts = (1, 0, 0); - } - self.bcast_heartbeats()?; - - // re-initialize peer_exec_bar information - for slot in self.peer_exec_bar.values_mut() { - *slot = 0; - } - - // make a greater ballot number and invalidate all in-progress instances - self.bal_prepared = 0; - self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); - self.bal_max_seen = self.bal_prep_sent; - - // clear pending perf monitoring timestamps - for pending in self.pending_accepts.values_mut() { - pending.clear(); - } - for pending in self.pending_heartbeats.values_mut() { - pending.clear(); - } - let now_us = self.startup_time.elapsed().as_micros(); - - // find the first and last slot index for which to redo Prepare phase - let trigger_slot = self.start_slot - + self - .insts - .iter() - .position(|i| i.status < Status::Committed) - .unwrap_or(self.insts.len()); - let endprep_slot = self.start_slot - + self - .insts - .iter() - .rposition(|i| i.status < Status::Committed) - .unwrap_or(self.insts.len()); - debug_assert!(trigger_slot <= endprep_slot); - if trigger_slot == self.start_slot + self.insts.len() { - // append a null instance to act as the trigger_slot - self.insts.push(self.null_instance()?); - } - pf_debug!( - "enter Prepare phase trigger_slot {} bal {}", - trigger_slot, - self.bal_prep_sent - ); - - // redo Prepare phase for all in-progress instances - let mut recon_slots: Vec<(usize, Bitmap)> = vec![]; - for (slot, inst) in self - .insts - .iter_mut() - .enumerate() - .map(|(s, i)| (self.start_slot + s, i)) - .skip(self.exec_bar - self.start_slot) - { - if inst.status == Status::Executed { - continue; - } - inst.external = true; // so replies to clients can be triggered - - if inst.status < Status::Committed { - inst.bal = self.bal_prep_sent; - inst.status = Status::Preparing; - inst.leader_bk = Some(LeaderBookkeeping { - trigger_slot, - endprep_slot, - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Preparing), - LogAction::Append { - entry: WalEntry::PrepareBal { - slot, - ballot: self.bal_prep_sent, - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted PrepareBal log action for slot {} bal {}", - slot, - inst.bal - ); - } - - // do reconstruction reads for all committed instances that do not - // hold enough available shards for reconstruction. It would be too - // complicated and slow to do the "data shards only" optimization - // during fail-over, so just do this conservatively here - if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < inst.reqs_cw.num_data_shards() - { - recon_slots.push((slot, inst.reqs_cw.avail_shards_map())); - } - } - - // send Prepare message to all peers - self.transport_hub.bcast_msg( - PeerMsg::Prepare { - trigger_slot, - ballot: self.bal_prep_sent, - }, - None, - )?; - pf_trace!( - "broadcast Prepare messages trigger_slot {} bal {}", - trigger_slot, - self.bal_prep_sent - ); - - // send reconstruction read messages in chunks - for chunk in recon_slots.chunks(self.config.msg_chunk_size) { - let slots = chunk.to_vec(); - let num_slots = slots.len(); - // pf_warn!("recons {:?}", slots); - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slots_excl: slots }, None)?; - pf_trace!("broadcast Reconstruct messages for {} slots", num_slots); - - // inject a heartbeat after every chunk to keep peers happy - self.transport_hub.bcast_msg( - PeerMsg::Heartbeat { - id: self.next_hb_id, - ballot: self.bal_max_seen, - commit_bar: self.commit_bar, - exec_bar: self.exec_bar, - snap_bar: self.snap_bar, - }, - None, - )?; - for (&peer, pending) in self.pending_heartbeats.iter_mut() { - if self.peer_alive.get(peer)? { - pending.push_back((now_us, self.next_hb_id)); - } - } - self.next_hb_id += 1; - } - - self.update_qdisc_info()?; - Ok(()) - } - - /// Broadcasts heartbeats to all replicas. - pub(super) fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { - let now_us = self.startup_time.elapsed().as_micros(); - self.transport_hub.bcast_msg( - PeerMsg::Heartbeat { - id: self.next_hb_id, - ballot: self.bal_max_seen, - commit_bar: self.commit_bar, - exec_bar: self.exec_bar, - snap_bar: self.snap_bar, - }, - None, - )?; - for (&peer, pending) in self.pending_heartbeats.iter_mut() { - if self.peer_alive.get(peer)? { - pending.push_back((now_us, self.next_hb_id)); - } - } - - // update max heartbeat reply counters and their repetitions seen - let mut peer_death = false; - for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { - if cnts.0 > cnts.1 { - // more hb replies have been received from this peer; it is - // probably alive - cnts.1 = cnts.0; - cnts.2 = 0; - } else { - // did not receive hb reply from this peer at least for the - // last sent hb from me; increment repetition count - cnts.2 += 1; - let repeat_threshold = (self.config.hb_hear_timeout_min - / self.config.hb_send_interval_ms) - * 3; - if cnts.2 > repeat_threshold { - // did not receive hb reply from this peer for too many - // past hbs sent from me; this peer is probably dead - if self.peer_alive.get(peer)? { - self.peer_alive.set(peer, false)?; - pf_info!("peer_alive updated: {:?}", self.peer_alive); - peer_death = true; - } - cnts.2 = 0; - } - } - } - - // I also heard this heartbeat from myself - self.heard_heartbeat( - self.id, - self.next_hb_id, - self.bal_max_seen, - self.commit_bar, - self.exec_bar, - self.snap_bar, - )?; - self.next_hb_id += 1; - - // if we need to do soft fallback to a config with smaller fast-path - // quorum size, redo Accept phase for certain slots for performance - if peer_death { - self.fallback_redo_accepts()?; - } - - // pf_trace!("broadcast heartbeats bal {}", self.bal_prep_sent); - Ok(()) - } - - /// Chooses a random hb_hear_timeout from the min-max range and kicks off - /// the hb_hear_timer. - pub(super) fn kickoff_hb_hear_timer( - &mut self, - ) -> Result<(), SummersetError> { - self.hb_hear_timer.cancel()?; - - if !self.config.disable_hb_timer { - let timeout_ms = thread_rng().gen_range( - self.config.hb_hear_timeout_min - ..=self.config.hb_hear_timeout_max, - ); - // pf_trace!("kickoff hb_hear_timer @ {} ms", timeout_ms); - self.hb_hear_timer - .kickoff(Duration::from_millis(timeout_ms))?; - } - - Ok(()) - } - - /// Heard a heartbeat from some other replica. If the heartbeat carries a - /// high enough ballot number, refreshes my hearing timer and clears my - /// leader status if I currently think I'm a leader. - pub(super) fn heard_heartbeat( - &mut self, - peer: ReplicaId, - hb_id: HeartbeatId, - ballot: Ballot, - commit_bar: usize, - exec_bar: usize, - snap_bar: usize, - ) -> Result<(), SummersetError> { - if peer != self.id { - if self.is_leader() { - self.record_heartbeat_rtt( - peer, - self.startup_time.elapsed().as_micros(), - hb_id, - ); - } - - self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; - if !self.peer_alive.get(peer)? { - self.peer_alive.set(peer, true)?; - pf_info!("peer_alive updated: {:?}", self.peer_alive); - } - - // if the peer has made a higher ballot number, consider it as - // a new leader - self.check_leader(peer, ballot)?; - - // reply back with a Heartbeat message - if self.leader == Some(peer) { - self.transport_hub.send_msg( - PeerMsg::Heartbeat { - id: hb_id, - ballot: self.bal_max_seen, - commit_bar: self.commit_bar, - exec_bar: self.exec_bar, - snap_bar: self.snap_bar, - }, - peer, - )?; - } - } - - // ignore outdated heartbeats, reset hearing timer - if ballot < self.bal_max_seen { - return Ok(()); - } - self.kickoff_hb_hear_timer()?; - if exec_bar < self.exec_bar { - return Ok(()); - } - - // all slots up to received commit_bar are safe to commit; submit their - // commands for execution - if commit_bar > self.commit_bar { - while self.start_slot + self.insts.len() < commit_bar { - self.insts.push(self.null_instance()?); - } - - let mut commit_cnt = 0; - for slot in self.commit_bar..commit_bar { - let inst = &mut self.insts[slot - self.start_slot]; - if inst.status < Status::Accepting { - break; - } else if inst.status >= Status::Committed { - continue; - } - - // mark this instance as committed - inst.status = Status::Committed; - pf_debug!( - "committed instance at slot {} bal {}", - slot, - inst.bal - ); - - // record commit event - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Committed), - LogAction::Append { - entry: WalEntry::CommitSlot { slot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted CommitSlot log action for slot {} bal {}", - slot, - inst.bal - ); - - commit_cnt += 1; - } - - if commit_cnt > 0 { - pf_trace!("heartbeat commit <- {} < slot {}", peer, commit_bar); - } - } - - if peer != self.id { - // update peer_exec_bar if larger then known; if all servers' - // exec_bar (including myself) have passed a slot, that slot - // is definitely safe to be snapshotted - if exec_bar > self.peer_exec_bar[&peer] { - *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; - let passed_cnt = 1 + self - .peer_exec_bar - .values() - .filter(|&&e| e >= exec_bar) - .count() as u8; - if passed_cnt == self.population { - // all servers have executed up to exec_bar - self.snap_bar = exec_bar; - } - } - - // if snap_bar is larger than mine, update snap_bar - if snap_bar > self.snap_bar { - self.snap_bar = snap_bar; - } - } - - // pf_trace!("heard heartbeat <- {} bal {}", peer, ballot); - Ok(()) - } - - /// Check all instances in the Accepting phase and redo their Accepts - /// using the current assignment policy. This is a performance optimization - /// for soft fallback triggered when peer_alive count decreases. - fn fallback_redo_accepts(&mut self) -> Result<(), SummersetError> { - let now_us = self.startup_time.elapsed().as_micros(); - let alive_cnt = self.peer_alive.count(); - let mut new_pending_accepts: HashMap< - ReplicaId, - VecDeque<(u128, usize)>, - > = (0..self.population) - .filter_map(|s| { - if s == self.id { - None - } else { - Some((s, VecDeque::new())) - } - }) - .collect(); - - let mut chunk_cnt = 0; - for (slot, inst) in self - .insts - .iter_mut() - .enumerate() - .map(|(s, i)| (self.start_slot + s, i)) - { - if inst.status == Status::Accepting && inst.leader_bk.is_some() { - if self.assignment_balanced - && inst.assignment[0].count() - >= Self::min_shards_per_replica( - self.rs_data_shards, - self.majority, - self.config.fault_tolerance, - alive_cnt, - ) - { - // the assignment policy used for this instance was already - // responsive for current # of healthy nodes - for (peer, pending) in self.pending_accepts.iter_mut() { - while let Some(record) = pending.pop_front() { - if slot == record.1 { - new_pending_accepts - .get_mut(peer) - .unwrap() - .push_back(record); - } - } - } - continue; - } - - inst.bal = self.bal_prepared; - inst.leader_bk.as_mut().unwrap().accept_acks.clear(); - let assignment = Self::pick_assignment_policy( - self.assignment_adaptive, - self.assignment_balanced, - &self.init_assignment, - &self.brr_assignments, - self.rs_data_shards, - self.majority, - self.config.fault_tolerance, - inst.reqs_cw.data_len(), - &self.linreg_model, - self.config.b_to_d_threshold, - &self.qdisc_info, - &self.peer_alive, - ); - pf_debug!( - "enter Accept phase for slot {} bal {} asgmt {}", - slot, - inst.bal, - Self::assignment_to_string(assignment) - ); - - let subset_copy = inst - .reqs_cw - .subset_copy(&assignment[self.id as usize], false)?; - inst.assignment.clone_from(assignment); - inst.voted = (inst.bal, subset_copy.clone()); - - // record update to largest accepted ballot and corresponding data - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: WalEntry::AcceptData { - slot, - ballot: inst.bal, - // persist only some shards on myself - reqs_cw: subset_copy, - assignment: assignment.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted AcceptData log action for slot {} bal {}", - slot, - inst.bal - ); - - // send Accept messages to all peers, each getting its subset of - // shards of data - for peer in 0..self.population { - if peer == self.id { - continue; - } - self.transport_hub.send_msg( - PeerMsg::Accept { - slot, - ballot: inst.bal, - reqs_cw: inst.reqs_cw.subset_copy( - &assignment[peer as usize], - false, - )?, - assignment: assignment.clone(), - }, - peer, - )?; - if self.peer_alive.get(peer)? { - self.pending_accepts - .get_mut(&peer) - .unwrap() - .push_back((now_us, slot)); - } - } - pf_trace!( - "broadcast Accept messages for slot {} bal {}", - slot, - inst.bal - ); - chunk_cnt += 1; - - // inject heartbeats in the middle to keep peers happy - if chunk_cnt >= self.config.msg_chunk_size { - self.transport_hub.bcast_msg( - PeerMsg::Heartbeat { - id: self.next_hb_id, - ballot: self.bal_max_seen, - commit_bar: self.commit_bar, - exec_bar: self.exec_bar, - snap_bar: self.snap_bar, - }, - None, - )?; - for (&peer, pending) in self.pending_heartbeats.iter_mut() { - if self.peer_alive.get(peer)? { - pending.push_back((now_us, self.next_hb_id)); - } - } - self.next_hb_id += 1; - chunk_cnt = 0; - } - } - } - - self.pending_accepts = new_pending_accepts; - Ok(()) - } -} diff --git a/src/protocols/crossword/messages.rs b/src/protocols/crossword/messages.rs deleted file mode 100644 index 80af3e78..00000000 --- a/src/protocols/crossword/messages.rs +++ /dev/null @@ -1,760 +0,0 @@ -//! Crossword -- peer-peer messaging. - -use std::cmp; -use std::collections::HashMap; - -use super::*; - -use crate::server::{ApiRequest, LogAction, ReplicaId}; -use crate::utils::{Bitmap, RSCodeword, SummersetError}; - -// CrosswordReplica peer-peer messages handling -impl CrosswordReplica { - // Compute the subset coverage of acknowledge pattern `acks` when - // considering at most `fault_tolerance` failures. - #[inline] - fn coverage_under_faults( - rs_total_shards: u8, - population: u8, - acks: &HashMap, - fault_tolerance: u8, - assignment_balanced: bool, - ) -> u8 { - if acks.len() <= fault_tolerance as usize { - return 0; - } - - // if forcing balanced assignment, can compute this using a rather - // simple calculation - if assignment_balanced { - let spr = acks.values().next().unwrap().count(); - let dj_spr = rs_total_shards / population; - return (acks.len() as u8 - fault_tolerance - 1) * dj_spr + spr; - } - - // enumerate all subsets of acks excluding fault number of replicas - let cnt = (acks.len() - fault_tolerance as usize) as u32; - let servers: Vec = acks.keys().cloned().collect(); - let mut min_coverage = rs_total_shards; - for n in (0..2usize.pow(servers.len() as u32)) - .filter(|n| n.count_ones() == cnt) - { - let mut coverage = Bitmap::new(rs_total_shards, false); - for (_, server) in servers - .iter() - .enumerate() - .filter(|&(i, _)| (n >> i) % 2 == 1) - { - for shard in acks[server].iter().filter_map(|(s, flag)| { - if flag { - Some(s) - } else { - None - } - }) { - coverage.set(shard, true).expect("impossible shard index"); - } - } - if coverage.count() < min_coverage { - min_coverage = coverage.count(); - } - } - min_coverage - } - - /// Handler of Prepare message from leader. - fn handle_msg_prepare( - &mut self, - peer: ReplicaId, - trigger_slot: usize, - ballot: Ballot, - ) -> Result<(), SummersetError> { - if trigger_slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "received Prepare <- {} trigger_slot {} bal {}", - peer, - trigger_slot, - ballot - ); - - // if ballot is not smaller than what I have seen: - if ballot >= self.bal_max_seen { - // update largest ballot seen and assumed leader - self.check_leader(peer, ballot)?; - self.kickoff_hb_hear_timer()?; - - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= trigger_slot { - self.insts.push(self.null_instance()?); - } - - // find the last non-null slot and use as endprep_slot; if none - // found, use trigger_slot as a dummy entry - let endprep_slot = cmp::max( - self.start_slot - + self - .insts - .iter() - .rposition(|i| i.status > Status::Null) - .unwrap_or(0), - trigger_slot, - ); - - // react to this Prepare for all slots >= trigger_slot - for slot in trigger_slot..=endprep_slot { - let inst = &mut self.insts[slot - self.start_slot]; - debug_assert!(inst.bal <= ballot); - - inst.bal = ballot; - inst.status = Status::Preparing; - inst.replica_bk = Some(ReplicaBookkeeping { - source: peer, - trigger_slot, - endprep_slot, - }); - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Preparing), - LogAction::Append { - entry: WalEntry::PrepareBal { slot, ballot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted PrepareBal log action for slot {} bal {}", - slot, - ballot - ); - } - } - - Ok(()) - } - - /// Handler of Prepare reply from replica. - pub(super) fn handle_msg_prepare_reply( - &mut self, - peer: ReplicaId, - slot: usize, - trigger_slot: usize, - endprep_slot: usize, - ballot: Ballot, - voted: Option<(Ballot, RSCodeword)>, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "received PrepareReply <- {} for slot {} / {} bal {} shards {:?}", - peer, - slot, - endprep_slot, - ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_map()) - ); - - // if ballot is what I'm currently waiting on for Prepare replies: - if ballot == self.bal_prep_sent { - // ignore spurious duplications and outdated replies - if !self.is_leader() { - return Ok(()); - } - debug_assert!(slot >= trigger_slot && slot <= endprep_slot); - debug_assert!( - trigger_slot >= self.start_slot - && trigger_slot < self.start_slot + self.insts.len() - ); - if self.insts[trigger_slot - self.start_slot] - .leader_bk - .is_none() - { - return Ok(()); - } - - // locate instance in memory, filling in null instance if needed - // if slot is outside the tail of my current log, this means I did - // not know at `become_leader()` that this slot existed on peers - let my_endprep_slot = self.insts[trigger_slot - self.start_slot] - .leader_bk - .as_ref() - .unwrap() - .endprep_slot; - while self.start_slot + self.insts.len() <= slot { - let this_slot = self.start_slot + self.insts.len(); - self.insts.push(self.null_instance()?); - let inst = &mut self.insts[this_slot - self.start_slot]; - - // since this slot was not known at `become_leader()`, need to - // fill necessary information here and make durable - inst.external = true; - inst.bal = self.bal_prep_sent; - inst.status = Status::Preparing; - inst.leader_bk = Some(LeaderBookkeeping { - trigger_slot, - endprep_slot: my_endprep_slot, - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(this_slot, Status::Preparing), - LogAction::Append { - entry: WalEntry::PrepareBal { - slot: this_slot, - ballot: self.bal_prep_sent, - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted PrepareBal log action for slot {} bal {}", - this_slot, - inst.bal - ); - } - - { - let inst = &mut self.insts[slot - self.start_slot]; - - // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { - return Ok(()); - } - debug_assert_eq!(inst.bal, ballot); - debug_assert!(self.bal_max_seen >= ballot); - - // bookkeep this Prepare reply - if let Some((bal, val)) = voted { - debug_assert!(inst.leader_bk.is_some()); - let leader_bk = inst.leader_bk.as_mut().unwrap(); - #[allow(clippy::comparison_chain)] - if bal > leader_bk.prepare_max_bal { - // is of ballot > current maximum, so discard the - // current codeword and take the replied codeword - leader_bk.prepare_max_bal = bal; - inst.reqs_cw = val; - } else if bal == leader_bk.prepare_max_bal { - // is of ballot == the one currently taken, so merge - // the replied codeword into the current one - inst.reqs_cw.absorb_other(val)?; - } - } - } - - // if all PrepareReplies up to endprep_slot have been received, - // include the sender peer into the quorum (by updating the - // prepare_acks field in the trigger_slot entry) - if slot == endprep_slot { - let trigger_inst = - &mut self.insts[trigger_slot - self.start_slot]; - debug_assert!(trigger_inst.leader_bk.is_some()); - let trigger_leader_bk = - trigger_inst.leader_bk.as_mut().unwrap(); - trigger_leader_bk.prepare_acks.set(peer, true)?; - let prepare_acks_cnt = trigger_leader_bk.prepare_acks.count(); - - // if quorum size reached, enter Accept phase for all instances - // at and after trigger_slot; for each entry, use the request - // batch value with the highest ballot number in quorum - if prepare_acks_cnt >= self.majority { - // update bal_prepared - debug_assert!(self.bal_prepared <= ballot); - self.bal_prepared = ballot; - - for (this_slot, inst) in self - .insts - .iter_mut() - .enumerate() - .map(|(s, i)| (self.start_slot + s, i)) - .skip(trigger_slot - self.start_slot) - .filter(|(_, i)| i.status == Status::Preparing) - { - if inst.reqs_cw.avail_shards() >= self.rs_data_shards { - // if quorum size >= majority and enough shards - // with the highest ballot in quorum are gathered - // to reconstruct the original data, use the - // reconstructed request batch - if inst.reqs_cw.avail_data_shards() - < self.rs_data_shards - { - // have enough shards but need reconstruction - inst.reqs_cw - .reconstruct_data(Some(&self.rs_coder))?; - } - } else if prepare_acks_cnt - >= (self.population - self.config.fault_tolerance) - { - // else, if quorum size >= (N - f) and shards with - // the highest ballot are not enough to reconstruct - // the original data, can choose any value; we just - // fill this instance with a null request batch - inst.reqs_cw = RSCodeword::from_data( - ReqBatch::new(), - self.rs_data_shards, - self.rs_total_shards - self.rs_data_shards, - )?; - } else { - // not yet for this instance - continue; - } - - // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population { - inst.reqs_cw - .compute_parity(Some(&self.rs_coder))?; - } - - inst.status = Status::Accepting; - let assignment = Self::pick_assignment_policy( - self.assignment_adaptive, - self.assignment_balanced, - &self.init_assignment, - &self.brr_assignments, - self.rs_data_shards, - self.majority, - self.config.fault_tolerance, - inst.reqs_cw.data_len(), - &self.linreg_model, - self.config.b_to_d_threshold, - &self.qdisc_info, - &self.peer_alive, - ); - pf_debug!( - "enter Accept phase for slot {} bal {} asgmt {}", - this_slot, - inst.bal, - Self::assignment_to_string(assignment) - ); - - // record update to largest accepted ballot and corresponding data - let subset_copy = inst.reqs_cw.subset_copy( - &assignment[self.id as usize], - false, - )?; - inst.assignment.clone_from(assignment); - inst.voted = (ballot, subset_copy.clone()); - self.storage_hub.submit_action( - Self::make_log_action_id( - this_slot, - Status::Accepting, - ), - LogAction::Append { - entry: WalEntry::AcceptData { - slot: this_slot, - ballot, - reqs_cw: subset_copy, - assignment: assignment.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted AcceptData log action for slot {} bal {}", - this_slot, ballot - ); - - // send Accept messages to all peers - let now_us = self.startup_time.elapsed().as_micros(); - for peer in 0..self.population { - if peer == self.id { - continue; - } - self.transport_hub.send_msg( - PeerMsg::Accept { - slot: this_slot, - ballot, - reqs_cw: inst.reqs_cw.subset_copy( - &assignment[peer as usize], - false, - )?, - assignment: assignment.clone(), - }, - peer, - )?; - if self.peer_alive.get(peer)? { - self.pending_accepts - .get_mut(&peer) - .unwrap() - .push_back((now_us, this_slot)); - } - } - pf_trace!( - "broadcast Accept messages for slot {} bal {}", - this_slot, - ballot - ); - } - } - } - } - - Ok(()) - } - - /// Handler of Accept message from leader. - fn handle_msg_accept( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - assignment: Vec, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "received Accept <- {} for slot {} bal {} shards {:?}", - peer, - slot, - ballot, - reqs_cw.avail_shards_map() - ); - - // if ballot is not smaller than what I have made promises for: - if ballot >= self.bal_max_seen { - // update largest ballot seen and assumed leader - self.check_leader(peer, ballot)?; - self.kickoff_hb_hear_timer()?; - - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; - debug_assert!(inst.bal <= ballot); - - inst.bal = ballot; - inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; - inst.assignment = assignment; - inst.replica_bk = Some(ReplicaBookkeeping { - source: peer, - trigger_slot: 0, - endprep_slot: 0, - }); - - // record update to instance ballot & data - inst.voted = (ballot, inst.reqs_cw.clone()); - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: WalEntry::AcceptData { - slot, - ballot, - reqs_cw: inst.reqs_cw.clone(), - assignment: inst.assignment.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted AcceptData log action for slot {} bal {}", - slot, - ballot - ); - } - - Ok(()) - } - - /// Handler of Accept reply from replica. - pub(super) fn handle_msg_accept_reply( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - size: usize, - reply_ts: Option, - ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!( - "received AcceptReply <- {} for slot {} bal {}", - peer, - slot, - ballot - ); - - // if ballot is what I'm currently waiting on for Accept replies: - if ballot == self.bal_prepared { - debug_assert!(slot < self.start_slot + self.insts.len()); - let is_leader = self.is_leader(); - if is_leader && peer != self.id { - self.record_accept_rtt( - peer, - self.startup_time.elapsed().as_micros(), - slot, - size, - ); - } - let inst = &mut self.insts[slot - self.start_slot]; - - // ignore spurious duplications and outdated replies - if !is_leader - || (inst.status != Status::Accepting) - || (ballot < inst.bal) - { - return Ok(()); - } - debug_assert_eq!(inst.bal, ballot); - debug_assert!(self.bal_max_seen >= ballot); - debug_assert!(inst.leader_bk.is_some()); - let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.accept_acks.contains_key(&peer) { - return Ok(()); - } - - // bookkeep this Accept reply - leader_bk - .accept_acks - .insert(peer, inst.assignment[peer as usize].clone()); - - // if quorum size reached AND enough number of shards are - // remembered, mark this instance as committed - if leader_bk.accept_acks.len() as u8 >= self.majority - && Self::coverage_under_faults( - self.rs_total_shards, - self.population, - &leader_bk.accept_acks, - self.config.fault_tolerance, - self.assignment_balanced, - ) >= inst.reqs_cw.num_data_shards() - { - inst.status = Status::Committed; - pf_debug!( - "committed instance at slot {} bal {}", - slot, - inst.bal - ); - - // [for perf breakdown] - if let Some(sw) = self.bd_stopwatch.as_mut() { - let _ = sw.record_now(slot, 3, reply_ts); - let _ = sw.record_now(slot, 4, None); - } - - // record commit event - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Committed), - LogAction::Append { - entry: WalEntry::CommitSlot { slot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted CommitSlot log action for slot {} bal {}", - slot, - inst.bal - ); - } - } - - Ok(()) - } - - /// Handler of Reconstruct message from leader or gossiping peer. - fn handle_msg_reconstruct( - &mut self, - peer: ReplicaId, - slots_excl: Vec<(usize, Bitmap)>, - ) -> Result<(), SummersetError> { - pf_trace!( - "received Reconstruct <- {} for {} slots", - peer, - slots_excl.len() - ); - let mut slots_data = HashMap::new(); - - for (slot, mut subset) in slots_excl { - if slot < self.start_slot { - // TODO: this has one caveat: a new leader trying to do - // reconstruction reads might find that all other peers have - // snapshotted that slot. Proper InstallSnapshot-style messages - // will be needed to deal with this; but since this scenario is - // just too rare, it will be implemented after a rework of the - // storage backend module - continue; - } - - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; - - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting { - continue; - } - subset.flip(); // exclude unwanted shards the sender already has - let reply_cw = inst.reqs_cw.subset_copy(&subset, false)?; - if reply_cw.avail_shards() == 0 { - continue; - } - - // send back my ballot for this slot and the available shards - slots_data.insert(slot, (inst.bal, reply_cw)); - } - - if !slots_data.is_empty() { - let num_slots = slots_data.len(); - self.transport_hub - .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; - pf_trace!( - "sent ReconstructReply -> {} for {} slots", - peer, - num_slots - ); - } - Ok(()) - } - - /// Handler of Reconstruct reply from replica. - fn handle_msg_reconstruct_reply( - &mut self, - peer: ReplicaId, - slots_data: HashMap)>, - ) -> Result<(), SummersetError> { - for (slot, (ballot, reqs_cw)) in slots_data { - if slot < self.start_slot { - continue; // ignore if slot index outdated - } - pf_trace!( - "in ReconstructReply <- {} for slot {} bal {} shards {:?}", - peer, - slot, - ballot, - reqs_cw.avail_shards_map() - ); - debug_assert!(slot < self.start_slot + self.insts.len()); - debug_assert!( - self.insts[slot - self.start_slot].status >= Status::Committed - ); - let inst = &mut self.insts[slot - self.start_slot]; - - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; - - // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < self.start_slot + self.insts.len() { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() - < inst.reqs_cw.num_data_shards() - { - break; - } - - if inst.reqs_cw.avail_data_shards() - < inst.reqs_cw.num_data_shards() - { - // have enough shards but need reconstruction - inst.reqs_cw - .reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; - - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests - } - } - pf_trace!( - "submitted {} exec commands for slot {}", - reqs.len(), - self.commit_bar - ); - } - - self.commit_bar += 1; - } - } - } - } - - Ok(()) - } - - /// Synthesized handler of receiving message from peer. - pub(super) fn handle_msg_recv( - &mut self, - peer: ReplicaId, - msg: PeerMsg, - ) -> Result<(), SummersetError> { - match msg { - PeerMsg::Prepare { - trigger_slot, - ballot, - } => self.handle_msg_prepare(peer, trigger_slot, ballot), - PeerMsg::PrepareReply { - slot, - trigger_slot, - endprep_slot, - ballot, - voted, - } => self.handle_msg_prepare_reply( - peer, - slot, - trigger_slot, - endprep_slot, - ballot, - voted, - ), - PeerMsg::Accept { - slot, - ballot, - reqs_cw, - assignment, - } => { - self.handle_msg_accept(peer, slot, ballot, reqs_cw, assignment) - } - PeerMsg::AcceptReply { - slot, - ballot, - size, - reply_ts, - } => { - self.handle_msg_accept_reply(peer, slot, ballot, size, reply_ts) - } - PeerMsg::Reconstruct { slots_excl } => { - self.handle_msg_reconstruct(peer, slots_excl) - } - PeerMsg::ReconstructReply { slots_data } => { - self.handle_msg_reconstruct_reply(peer, slots_data) - } - PeerMsg::Heartbeat { - id: hb_id, - ballot, - commit_bar, - exec_bar, - snap_bar, - } => self.heard_heartbeat( - peer, hb_id, ballot, commit_bar, exec_bar, snap_bar, - ), - } - } -} diff --git a/src/protocols/crossword/mod.rs b/src/protocols/crossword/mod.rs deleted file mode 100644 index 08e1540b..00000000 --- a/src/protocols/crossword/mod.rs +++ /dev/null @@ -1,1357 +0,0 @@ -//! Replication protocol: Crossword. -//! -//! MultiPaxos with flexible Reed-Solomon erasure code sharding that supports -//! dynamically tunable shard assignment with the correct liveness constraints, -//! plus follower gossiping for actual usability. - -mod adaptive; -mod control; -mod durability; -mod execution; -mod gossiping; -mod leadership; -mod messages; -mod recovery; -mod request; -mod snapshot; - -use std::cmp; -use std::collections::{HashMap, HashSet, VecDeque}; -use std::net::SocketAddr; -use std::path::Path; -use std::time::SystemTime; - -use crate::client::{ClientApiStub, ClientCtrlStub, ClientId, GenericEndpoint}; -use crate::manager::{CtrlMsg, CtrlReply, CtrlRequest}; -use crate::protocols::SmrProtocol; -use crate::server::{ - ApiReply, ApiRequest, CommandId, ControlHub, ExternalApi, GenericReplica, - LogActionId, ReplicaId, StateMachine, StorageHub, TransportHub, -}; -use crate::utils::{ - Bitmap, LinearRegressor, PerfModel, QdiscInfo, RSCodeword, Stopwatch, - SummersetError, Timer, -}; - -use async_trait::async_trait; - -use get_size::GetSize; - -use serde::{Deserialize, Serialize}; - -use tokio::sync::watch; -use tokio::time::{self, Duration, Instant, Interval, MissedTickBehavior}; - -use reed_solomon_erasure::galois_8::ReedSolomon; - -/// Configuration parameters struct. -#[derive(Debug, Clone, Deserialize)] -pub struct ReplicaConfigCrossword { - /// Client request batching interval in millisecs. - pub batch_interval_ms: u64, - - /// Client request batching maximum batch size. - pub max_batch_size: usize, - - /// Path to backing log file. - pub backer_path: String, - - /// Whether to call `fsync()`/`fdatasync()` on logger. - pub logger_sync: bool, - - /// Min timeout of not hearing any heartbeat from leader in millisecs. - pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. - pub hb_hear_timeout_max: u64, - - /// Interval of leader sending heartbeats to followers. - pub hb_send_interval_ms: u64, - - /// Disable heartbeat timer (to force a deterministic leader during tests)? - pub disable_hb_timer: bool, - - /// Path to snapshot file. - pub snapshot_path: String, - - /// Snapshot self-triggering interval in secs. 0 means never trigger - /// snapshotting autonomously. - pub snapshot_interval_s: u64, - - /// Min timeout of follower gossiping trigger in millisecs. - pub gossip_timeout_min: u64, - /// Max timeout of follower gossiping trigger in millisecs. - pub gossip_timeout_max: u64, - - /// How many slots at the end should we ignore when gossiping is triggered. - pub gossip_tail_ignores: usize, - - /// Disable gossiping timer (to force more deterministic perf behavior)? - pub disable_gossip_timer: bool, - - /// Fault-tolerance level. - pub fault_tolerance: u8, - - /// Maximum chunk size (in slots) of any bulk messages. - pub msg_chunk_size: usize, - - /// Total number of possible shards in a codeword (i.e., codeword width). - /// If zero, sets this to == population. - pub rs_total_shards: u8, - /// Number of data shards. If zero, sets this to == majority. - pub rs_data_shards: u8, - - /// If non-empty, use this initial shards assignment policy. - pub init_assignment: String, - - /// Update interval of linear regression perf monitoring model. - pub linreg_interval_ms: u64, - /// Window timespan of linear regression datapoints to keep. - pub linreg_keep_ms: u64, - - /// Initial linear regression model slope. - pub linreg_init_a: f64, - /// Initial linear regression model intercept. - pub linreg_init_b: f64, - /// Ratio of outliers to exclude in each `calc_model()`. - pub linreg_outlier_ratio: f32, - - /// Knob that controls choosing the best config with perf model values. - pub b_to_d_threshold: f64, - - /// Recording performance breakdown statistics? - pub record_breakdown: bool, - - /// Recording the latest committed value version of a key? - /// Only effective if record_breakdown is set to true. - pub record_value_ver: bool, - - /// Simulate local read lease implementation? - // TODO: actual read lease impl later? (won't affect anything about - // evalutaion results though) - pub sim_read_lease: bool, -} - -#[allow(clippy::derivable_impls)] -impl Default for ReplicaConfigCrossword { - fn default() -> Self { - ReplicaConfigCrossword { - batch_interval_ms: 10, - max_batch_size: 5000, - backer_path: "/tmp/summerset.crossword.wal".into(), - logger_sync: false, - hb_hear_timeout_min: 1500, - hb_hear_timeout_max: 2000, - hb_send_interval_ms: 20, - disable_hb_timer: false, - snapshot_path: "/tmp/summerset.crossword.snap".into(), - snapshot_interval_s: 0, - gossip_timeout_min: 10, - gossip_timeout_max: 30, - gossip_tail_ignores: 100, - disable_gossip_timer: false, - fault_tolerance: 0, - msg_chunk_size: 10, - rs_total_shards: 0, - rs_data_shards: 0, - init_assignment: "".into(), - linreg_interval_ms: 200, - linreg_keep_ms: 2000, - linreg_init_a: 10.0, - linreg_init_b: 10.0, - linreg_outlier_ratio: 0.5, - b_to_d_threshold: 0.0, - record_breakdown: false, - record_value_ver: false, - sim_read_lease: false, - } - } -} - -/// Ballot number type. Use 0 as a null ballot number. -pub(crate) type Ballot = u64; - -/// Instance status enum. -#[derive( - Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, -)] -pub(crate) enum Status { - Null = 0, - Preparing = 1, - Accepting = 2, - Committed = 3, - Executed = 4, -} - -/// Request batch type (i.e., the "value" in Paxos). -pub(crate) type ReqBatch = Vec<(ClientId, ApiRequest)>; - -/// Leader-side bookkeeping info for each instance initiated. -#[derive(Debug, Clone)] -pub(crate) struct LeaderBookkeeping { - /// If in Preparing status, the trigger_slot of this Prepare phase. - trigger_slot: usize, - - /// If in Preparing status, the endprep_slot of this Prepare phase. - endprep_slot: usize, - - /// Replicas from which I have received Prepare confirmations. - prepare_acks: Bitmap, - - /// Max ballot among received Prepare replies. - prepare_max_bal: Ballot, - - /// Replicas and their assigned shards which the received Accept - /// confirmations cover. - accept_acks: HashMap, -} - -/// Follower-side bookkeeping info for each instance received. -#[derive(Debug, Clone)] -pub(crate) struct ReplicaBookkeeping { - /// Source leader replica ID for replyiing to Prepares and Accepts. - source: ReplicaId, - - /// If in Preparing status, the trigger_slot of this Prepare phase. - trigger_slot: usize, - - /// If in Preparing status, the endprep_slot of this Prepare phase. - endprep_slot: usize, -} - -/// In-memory instance containing a (possibly partial) commands batch. -#[derive(Debug, Clone)] -pub(crate) struct Instance { - /// Ballot number. - bal: Ballot, - - /// Instance status. - status: Status, - - /// Shards of a batch of client requests. - reqs_cw: RSCodeword, - - /// Shards assignment map which the leader used. - assignment: Vec, - - /// Highest ballot and associated value I have accepted. - voted: (Ballot, RSCodeword), - - /// Leader-side bookkeeping info. - leader_bk: Option, - - /// Follower-side bookkeeping info. - replica_bk: Option, - - /// True if from external client, else false. - external: bool, - - /// Offset of first durable WAL log entry related to this instance. - wal_offset: usize, -} - -/// Stable storage WAL log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -pub(crate) enum WalEntry { - /// Records an update to the largest prepare ballot seen. - PrepareBal { slot: usize, ballot: Ballot }, - - /// Records a newly accepted request batch data shards at slot index. - AcceptData { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - assignment: Vec, - }, - - /// Records an event of committing the instance at index. - CommitSlot { slot: usize }, -} - -/// Snapshot file entry type. -/// -/// NOTE: the current implementation simply appends a squashed log at the -/// end of the snapshot file for simplicity. In production, the snapshot -/// file should be a bounded-sized backend, e.g., an LSM-tree. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -pub(crate) enum SnapEntry { - /// Necessary slot indices to remember. - SlotInfo { - /// First entry at the start of file: number of log instances covered - /// by this snapshot file == the start slot index of in-mem log. - start_slot: usize, - }, - - /// Set of key-value pairs to apply to the state. - KVPairSet { pairs: HashMap }, -} - -/// Heartbeat messages monotonically incrementing ID. -pub(crate) type HeartbeatId = u64; - -/// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] -pub(crate) enum PeerMsg { - /// Prepare message from leader to replicas. - Prepare { - /// Slot index in Prepare message is the triggering slot of this - /// Prepare. Once prepared, it means that all slots in the range - /// [slot, +infinity) are prepared under this ballot number. - trigger_slot: usize, - ballot: Ballot, - }, - - /// Prepare reply from replica to leader. - PrepareReply { - /// In our implementation, we choose to break the PrepareReply into - /// slot-wise messages for simplicity. - slot: usize, - /// Also carry the trigger_slot information to make it easier for the - /// leader to track reply progress. - trigger_slot: usize, - /// Due to the slot-wise design choice, we need a way to let leader - /// know when have all PrepareReplies been received. We use the - /// endprep_slot field to convey this: when all slots' PrepareReplies - /// up to endprep_slot are received, the "wholesome" PrepareReply - /// can be considered received. - // NOTE: this currently assumes the "ordering" property of TCP. - endprep_slot: usize, - ballot: Ballot, - /// The accepted ballot number for that instance and the corresponding - /// request batch value shards known by replica. - voted: Option<(Ballot, RSCodeword)>, - }, - - /// Accept message from leader to replicas. - Accept { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - /// Shard-to-node assignment used for this instance. - assignment: Vec, - }, - - /// Accept reply from replica to leader. - AcceptReply { - slot: usize, - ballot: Ballot, - /// Data size in bytes that the corresponding Accept carried. - size: usize, - /// [for perf breakdown] - reply_ts: Option, - }, - - /// Reconstruction read from new leader to replicas. - Reconstruct { - /// List of slots and correspondingly the shards to exclude. - slots_excl: Vec<(usize, Bitmap)>, - }, - - /// Reconstruction read reply from replica to leader. - ReconstructReply { - /// Map from slot -> (ballot, peer shards). - slots_data: HashMap)>, - }, - - /// Leader activity heartbeat. - Heartbeat { - id: HeartbeatId, - ballot: Ballot, - /// For notifying followers about safe-to-commit slots (in a bit - /// conservative way). - commit_bar: usize, - /// For leader step-up as well as conservative snapshotting purpose. - exec_bar: usize, - /// For conservative snapshotting purpose. - snap_bar: usize, - }, -} - -/// Crossword server replica module. -pub(crate) struct CrosswordReplica { - /// Replica ID in cluster. - id: ReplicaId, - - /// Total number of replicas in cluster. - population: u8, - - /// Majority quorum size. - majority: u8, - - /// Reed-Solomon total number of shards. - rs_total_shards: u8, - - /// Reed-Solomon number of data shards. - rs_data_shards: u8, - - /// Doing dynamically adaptive config choosing? - // NOTE: currently, adaptability is only enabled when an initial assignment - // is not give. - assignment_adaptive: bool, - - /// Using only balanced assignment policies? - // NOTE: currently, adaptability is only enabled when using balanced - // assignment policies. - assignment_balanced: bool, - - /// Initial assignment policy from config. - init_assignment: Vec, - - /// Pre-filled good balanced round-robin assignment policies for quicker - /// access when peer_alive count is low. - brr_assignments: HashMap>, - - /// Configuration parameters struct. - config: ReplicaConfigCrossword, - - /// Address string for client requests API. - _api_addr: SocketAddr, - - /// Address string for internal peer-peer communication. - _p2p_addr: SocketAddr, - - /// ControlHub module. - control_hub: ControlHub, - - /// ExternalApi module. - external_api: ExternalApi, - - /// StateMachine module. - state_machine: StateMachine, - - /// StorageHub module. - storage_hub: StorageHub, - - /// StorageHub module for the snapshot file. - snapshot_hub: StorageHub, - - /// TransportHub module. - transport_hub: TransportHub, - - /// Who do I think is the effective leader of the cluster right now? - leader: Option, - - /// Timer for hearing heartbeat from leader. - hb_hear_timer: Timer, - - /// Interval for sending heartbeat to followers. - hb_send_interval: Interval, - - /// Heartbeat reply counters for approximate detection of follower health. - /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). - hb_reply_cnts: HashMap, - - /// Approximate health status tracking of peer replicas. - peer_alive: Bitmap, - - /// In-memory log of instances. - insts: Vec, - - /// Start slot index of in-mem log after latest snapshot. - start_slot: usize, - - /// Timer for taking a new autonomous snapshot. - snapshot_interval: Interval, - - /// Timer for triggering linear regression model update. - linreg_interval: Interval, - - /// Timer for triggering follower gossiping. - gossip_timer: Timer, - - /// Largest ballot number that a leader has sent Prepare messages in. - bal_prep_sent: Ballot, - - /// Largest ballot number that a leader knows has been safely prepared. - bal_prepared: Ballot, - - /// Largest ballot number seen as acceptor. - bal_max_seen: Ballot, - - /// Index of the first non-committed instance. - commit_bar: usize, - - /// Index of the first instance starting from which gossiping might be needed. - /// The "Gossiped" status is captured by this variable implicitly. - gossip_bar: usize, - - /// Index of the first non-executed instance. - /// The following is always true: - /// exec_bar <= gossip_bar <= commit_bar <= start_slot + insts.len() - exec_bar: usize, - - /// Map from peer ID -> its latest exec_bar I know; this is for conservative - /// snapshotting purpose. - peer_exec_bar: HashMap, - - /// Slot index before which it is safe to take snapshot. - /// NOTE: we are taking a conservative approach here that a snapshot - /// covering an entry can be taken only when all servers have durably - /// committed (and executed) that entry. - snap_bar: usize, - - /// Current durable WAL log file offset. - wal_offset: usize, - - /// Current durable snapshot file offset. - snap_offset: usize, - - /// Fixed Reed-Solomon coder. - rs_coder: ReedSolomon, - - /// Map from peer ID -> list of pending Accepts (timestamp, slot) for - /// perf monitoring. - pending_accepts: HashMap>, - - /// Map from peer ID -> ;ist of pending heartbeats (timestamp, id) for - /// perf monitoring. - pending_heartbeats: HashMap>, - - /// Monotonically incrementing ID for heartbeat messages. - next_hb_id: HeartbeatId, - - /// Base time instant at startup, used as a reference zero timestamp. - startup_time: Instant, - - /// Elapsed us of the last console printing of linreg models result. - last_linreg_print: u128, - - /// Map from peer ID -> linear regressor for perf monitoring model. - regressor: HashMap, - - /// Map from peer ID -> current saved linear regression perf model. - linreg_model: HashMap, - - /// Queueing discipline information tracker. - qdisc_info: Option, - - /// Performance breakdown stopwatch if doing recording. - bd_stopwatch: Option, - - /// Performance breakdown printing interval. - bd_print_interval: Interval, -} - -// CrosswordReplica common helpers -impl CrosswordReplica { - /// Do I think I am the current effective leader? - #[inline] - fn is_leader(&self) -> bool { - self.leader == Some(self.id) - } - - /// Create an empty null instance. - #[inline] - fn null_instance(&self) -> Result { - Ok(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.rs_data_shards, - self.rs_total_shards - self.rs_data_shards, - )?, - assignment: vec![], - voted: ( - 0, - RSCodeword::::from_null( - self.rs_data_shards, - self.rs_total_shards - self.rs_data_shards, - )?, - ), - leader_bk: None, - replica_bk: None, - external: false, - wal_offset: 0, - }) - } - - /// Locate the first null slot or append a null instance if no holes exist. - fn first_null_slot(&mut self) -> Result { - for s in self.exec_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - return Ok(s); - } - } - self.insts.push(self.null_instance()?); - Ok(self.start_slot + self.insts.len() - 1) - } - - /// Compose a unique ballot number from base. - #[inline] - fn make_unique_ballot(&self, base: u64) -> Ballot { - ((base << 8) | ((self.id + 1) as u64)) as Ballot - } - - /// Compose a unique ballot number greater than the given one. - #[inline] - fn make_greater_ballot(&self, bal: Ballot) -> Ballot { - self.make_unique_ballot((bal >> 8) + 1) - } - - /// Compose LogActionId from slot index & entry type. - /// Uses the `Status` enum type to represent differnet entry types. - #[inline] - fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { - let type_num = match entry_type { - Status::Preparing => 1, - Status::Accepting => 2, - Status::Committed => 3, - _ => panic!("unknown log entry type {:?}", entry_type), - }; - ((slot << 2) | type_num) as LogActionId - } - - /// Decompose LogActionId into slot index & entry type. - #[inline] - fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { - let slot = (log_action_id >> 2) as usize; - let type_num = log_action_id & ((1 << 2) - 1); - let entry_type = match type_num { - 1 => Status::Preparing, - 2 => Status::Accepting, - 3 => Status::Committed, - _ => panic!("unknown log entry type num {}", type_num), - }; - (slot, entry_type) - } - - /// Compose CommandId from slot index & command index within. - #[inline] - fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { - debug_assert!(slot <= (u32::MAX as usize)); - debug_assert!(cmd_idx <= (u32::MAX as usize)); - ((slot << 32) | cmd_idx) as CommandId - } - - /// Decompose CommandId into slot index & command index within. - #[inline] - fn split_command_id(command_id: CommandId) -> (usize, usize) { - let slot = (command_id >> 32) as usize; - let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; - (slot, cmd_idx) - } -} - -#[async_trait] -impl GenericReplica for CrosswordReplica { - async fn new_and_setup( - api_addr: SocketAddr, - p2p_addr: SocketAddr, - manager: SocketAddr, - config_str: Option<&str>, - ) -> Result { - // connect to the cluster manager and get assigned a server ID - let mut control_hub = ControlHub::new_and_setup(manager).await?; - let id = control_hub.me; - let population = control_hub.population; - - // parse protocol-specific configs - let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_ms, max_batch_size, - backer_path, logger_sync, - hb_hear_timeout_min, hb_hear_timeout_max, - hb_send_interval_ms, disable_hb_timer, - snapshot_path, snapshot_interval_s, - gossip_timeout_min, gossip_timeout_max, - gossip_tail_ignores, disable_gossip_timer, - fault_tolerance, msg_chunk_size, - rs_total_shards, rs_data_shards, - init_assignment, linreg_interval_ms, - linreg_keep_ms, linreg_outlier_ratio, - linreg_init_a, linreg_init_b, - b_to_d_threshold, record_breakdown, - record_value_ver, sim_read_lease)?; - if config.batch_interval_ms == 0 { - return logged_err!( - "invalid config.batch_interval_ms '{}'", - config.batch_interval_ms - ); - } - if config.hb_hear_timeout_min < 100 { - return logged_err!( - "invalid config.hb_hear_timeout_min '{}'", - config.hb_hear_timeout_min - ); - } - if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { - return logged_err!( - "invalid config.hb_hear_timeout_max '{}'", - config.hb_hear_timeout_max - ); - } - if config.hb_send_interval_ms == 0 { - return logged_err!( - "invalid config.hb_send_interval_ms '{}'", - config.hb_send_interval_ms - ); - } - if config.msg_chunk_size == 0 { - return logged_err!( - "invalid config.msg_chunk_size '{}'", - config.msg_chunk_size - ); - } - if config.linreg_interval_ms == 0 { - return logged_err!( - "invalid config.linreg_interval_ms '{}'", - config.linreg_interval_ms - ); - } - if config.linreg_keep_ms == 0 { - return logged_err!( - "invalid config.linreg_keep_ms '{}'", - config.linreg_keep_ms - ); - } - if !(0.0..1.0).contains(&config.linreg_outlier_ratio) { - return logged_err!( - "invalid config.linreg_outlier_ratio '{}'", - config.linreg_outlier_ratio - ); - } - if config.b_to_d_threshold < 0.0 { - return logged_err!( - "invalid config.b_to_d_threshold '{}'", - config.b_to_d_threshold - ); - } - - // setup state machine module - let state_machine = StateMachine::new_and_setup(id).await?; - - // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - - // ask for the list of peers to proactively connect to. Do this after - // transport hub has been set up, so that I will be able to accept - // later peer connections - control_hub.send_ctrl(CtrlMsg::NewServerJoin { - id, - protocol: SmrProtocol::Crossword, - api_addr, - p2p_addr, - })?; - let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = - control_hub.recv_ctrl().await? - { - to_peers - } else { - return logged_err!("unexpected ctrl msg type received"); - }; - - // compute majority and set fault_tolerance level - let majority = (population / 2) + 1; - if config.fault_tolerance > (population - majority) { - return logged_err!( - "invalid config.fault_tolerance '{}'", - config.fault_tolerance - ); - } - - // parse Reed_Solomon coding scheme - if config.rs_total_shards % population != 0 { - return logged_err!( - "invalid config.rs_total_shards '{}'", - config.rs_total_shards - ); - } - let rs_total_shards = cmp::max(config.rs_total_shards, population); - if (config.rs_total_shards != 0 && config.rs_data_shards == 0) - || (config.rs_total_shards == 0 && config.rs_data_shards != 0) - || (config.rs_data_shards % majority != 0) - || (config.rs_data_shards != 0 - && config.rs_data_shards / majority - != rs_total_shards / population) - { - return logged_err!( - "invalid config.rs_data_shards '{}'", - config.rs_data_shards - ); - } - let rs_data_shards = cmp::max(config.rs_data_shards, majority); - - // create a Reed-Solomon coder - let rs_coder = ReedSolomon::new( - rs_data_shards as usize, - (rs_total_shards - rs_data_shards) as usize, - )?; - - // parse shards assignment policy config string if given - let init_assignment = Self::parse_init_assignment( - population, - rs_total_shards, - rs_data_shards, - &config.init_assignment, - )?; - let mut max_coverage = Bitmap::new(rs_total_shards, false); - let mut nums_assigned: HashSet = HashSet::new(); - for shards in &init_assignment { - nums_assigned.insert(shards.count()); - for (shard, flag) in shards.iter() { - if flag { - max_coverage.set(shard, true)?; - } - } - } - let assignment_adaptive = config.init_assignment.is_empty(); - let assignment_balanced = nums_assigned.len() == 1; - if init_assignment.len() != population as usize - || max_coverage.count() < rs_data_shards - { - return logged_err!( - "invalid init assignment parsed: {:?}", - init_assignment - ); - } - pf_debug!( - "init asgmt {}", - Self::assignment_to_string(&init_assignment) - ); - - // if restricted to balanced assignments only, pre-fill - // `brr_assignments` with balanced round-robin policies - let dj_spr = rs_total_shards / population; - let brr_assignments = if assignment_balanced { - (dj_spr..=rs_data_shards) - .step_by(dj_spr as usize) - .map(|spr| { - ( - spr, - (0..population) - .map(|r| { - Bitmap::from( - rs_total_shards, - ((r * dj_spr)..(r * dj_spr + spr)) - .map(|i| i % rs_total_shards) - .collect(), - ) - }) - .collect(), - ) - }) - .collect() - } else { - HashMap::new() - }; - - // proactively connect to some peers, then wait for all population - // have been connected with me - for (peer, conn_addr) in to_peers { - transport_hub.connect_to_peer(peer, conn_addr).await?; - } - transport_hub.wait_for_group(population).await?; - - // setup snapshot hub module - let snapshot_hub = - StorageHub::new_and_setup(id, Path::new(&config.snapshot_path)) - .await?; - - // setup external API module, ready to take in client requests - let external_api = ExternalApi::new_and_setup( - id, - api_addr, - Duration::from_millis(config.batch_interval_ms), - config.max_batch_size, - ) - .await?; - - let mut hb_send_interval = - time::interval(Duration::from_millis(config.hb_send_interval_ms)); - hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let mut snapshot_interval = time::interval(Duration::from_secs( - if config.snapshot_interval_s > 0 { - config.snapshot_interval_s - } else { - 60 // dummy non-zero value to make `time::interval` happy - }, - )); - snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let mut linreg_interval = - time::interval(Duration::from_millis(config.linreg_interval_ms)); - linreg_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let hb_reply_cnts = (0..population) - .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) - .collect(); - - let bd_stopwatch = if config.record_breakdown { - Some(Stopwatch::new()) - } else { - None - }; - let mut bd_print_interval = time::interval(Duration::from_secs(5)); - bd_print_interval.set_missed_tick_behavior(MissedTickBehavior::Delay); - - Ok(CrosswordReplica { - id, - population, - majority, - rs_total_shards, - rs_data_shards, - assignment_adaptive, - assignment_balanced, - init_assignment, - brr_assignments, - config: config.clone(), - _api_addr: api_addr, - _p2p_addr: p2p_addr, - control_hub, - external_api, - state_machine, - storage_hub, - snapshot_hub, - transport_hub, - leader: None, - hb_hear_timer: Timer::new(), - hb_send_interval, - hb_reply_cnts, - peer_alive: Bitmap::new(population, true), - insts: vec![], - start_slot: 0, - snapshot_interval, - linreg_interval, - gossip_timer: Timer::new(), - bal_prep_sent: 0, - bal_prepared: 0, - bal_max_seen: 0, - commit_bar: 0, - gossip_bar: 0, - exec_bar: 0, - peer_exec_bar: (0..population) - .filter_map(|s| if s == id { None } else { Some((s, 0)) }) - .collect(), - snap_bar: 0, - wal_offset: 0, - snap_offset: 0, - rs_coder, - pending_accepts: (0..population) - .filter_map(|s| { - if s == id { - None - } else { - Some((s, VecDeque::new())) - } - }) - .collect(), - pending_heartbeats: (0..population) - .filter_map(|s| { - if s == id { - None - } else { - Some((s, VecDeque::new())) - } - }) - .collect(), - next_hb_id: 0, - startup_time: Instant::now(), - last_linreg_print: 0, - regressor: (0..population) - .filter_map(|s| { - if s == id { - None - } else { - Some((s, LinearRegressor::new())) - } - }) - .collect(), - linreg_model: (0..population) - .filter_map(|s| { - if s == id { - None - } else { - Some(( - s, - PerfModel::new( - config.linreg_init_a, - config.linreg_init_b, - 0.0, - ), - )) - } - }) - .collect(), - qdisc_info: if config.b_to_d_threshold > 0.0 { - Some(QdiscInfo::new()?) - } else { - None - }, - bd_stopwatch, - bd_print_interval, - }) - } - - async fn run( - &mut self, - mut rx_term: watch::Receiver, - ) -> Result { - // recover state from durable snapshot file - self.recover_from_snapshot().await?; - - // recover the tail-piece memory log & state from durable WAL log - self.recover_from_wal().await?; - - // kick off leader activity hearing timer - self.kickoff_hb_hear_timer()?; - - // kick off follower gossiping trigger timer - self.kickoff_gossip_timer()?; - - // main event loop - let mut paused = false; - loop { - tokio::select! { - // client request batch - req_batch = self.external_api.get_req_batch(), if !paused => { - if let Err(e) = req_batch { - pf_error!("error getting req batch: {}", e); - continue; - } - let req_batch = req_batch.unwrap(); - if let Err(e) = self.handle_req_batch(req_batch) { - pf_error!("error handling req batch: {}", e); - } - }, - - // durable logging result - log_result = self.storage_hub.get_result(), if !paused => { - if let Err(e) = log_result { - pf_error!("error getting log result: {}", e); - continue; - } - let (action_id, log_result) = log_result.unwrap(); - if let Err(e) = self.handle_log_result(action_id, log_result) { - pf_error!("error handling log result {}: {}", - action_id, e); - } - }, - - // message from peer - msg = self.transport_hub.recv_msg(), if !paused => { - if let Err(_e) = msg { - // NOTE: commented out to prevent console lags - // during benchmarking - // pf_error!("error receiving peer msg: {}", e); - continue; - } - let (peer, msg) = msg.unwrap(); - if let Err(e) = self.handle_msg_recv(peer, msg) { - pf_error!("error handling msg recv <- {}: {}", peer, e); - } - }, - - // state machine execution result - cmd_result = self.state_machine.get_result(), if !paused => { - if let Err(e) = cmd_result { - pf_error!("error getting cmd result: {}", e); - continue; - } - let (cmd_id, cmd_result) = cmd_result.unwrap(); - if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { - pf_error!("error handling cmd result {}: {}", cmd_id, e); - } - }, - - // leader inactivity timeout - _ = self.hb_hear_timer.timeout(), if !paused => { - if let Err(e) = self.become_a_leader() { - pf_error!("error becoming a leader: {}", e); - } - }, - - // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { - if let Err(e) = self.bcast_heartbeats() { - pf_error!("error broadcasting heartbeats: {}", e); - } - }, - - // autonomous snapshot taking timeout - _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { - if let Err(e) = self.take_new_snapshot().await { - pf_error!("error taking a new snapshot: {}", e); - } else { - self.control_hub.send_ctrl( - CtrlMsg::SnapshotUpTo { new_start: self.start_slot } - )?; - } - }, - - // linear regression model update trigger - _ = self.linreg_interval.tick(), if !paused && self.is_leader() => { - if let Err(e) = self.update_linreg_model(self.config.linreg_keep_ms) { - pf_error!("error updating linear regression model: {}", e); - } - if let Err(e) = self.update_qdisc_info() { - pf_error!("error updating tc qdisc info: {}", e); - } - }, - - // follower gossiping trigger - _ = self.gossip_timer.timeout(), if !paused && !self.is_leader() => { - if let Err(e) = self.trigger_gossiping() { - pf_error!("error triggering gossiping: {}", e); - } - }, - - // performance breakdown stats printing - _ = self.bd_print_interval.tick(), if !paused && self.config.record_breakdown => { - if self.is_leader() { - if let Some(sw) = self.bd_stopwatch.as_mut() { - let (cnt, stats) = sw.summarize(5); - pf_info!("bd cnt {} comp {:.2} {:.2} ldur {:.2} {:.2} \ - arep {:.2} {:.2} qrum {:.2} {:.2} \ - exec {:.2} {:.2}", - cnt, stats[0].0, stats[0].1, stats[1].0, stats[1].1, - stats[2].0, stats[2].1, stats[3].0, stats[3].1, - stats[4].0, stats[4].1); - sw.remove_all(); - } - } - if self.config.record_value_ver { - if let Ok(Some((key, ver))) = self.val_ver_of_first_key() { - pf_info!("ver of {} @ {} ms is {}", - key, - Instant::now() - .duration_since(self.startup_time) - .as_millis(), - ver); - } - } - }, - - // manager control message - ctrl_msg = self.control_hub.recv_ctrl() => { - if let Err(e) = ctrl_msg { - pf_error!("error getting ctrl msg: {}", e); - continue; - } - let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { - Ok(terminate) => { - if let Some(restart) = terminate { - return Ok(restart); - } - }, - Err(e) => { - pf_error!("error handling ctrl msg: {}", e); - } - } - }, - - // receiving termination signal - _ = rx_term.changed() => { - pf_warn!("server caught termination signal"); - return Ok(false); - } - } - } - } - - fn id(&self) -> ReplicaId { - self.id - } -} - -/// Configuration parameters struct. -#[derive(Debug, Deserialize)] -pub struct ClientConfigCrossword { - /// Which server to pick initially. - pub init_server_id: ReplicaId, -} - -#[allow(clippy::derivable_impls)] -impl Default for ClientConfigCrossword { - fn default() -> Self { - ClientConfigCrossword { init_server_id: 0 } - } -} - -/// Crossword client-side module. -pub(crate) struct CrosswordClient { - /// Client ID. - id: ClientId, - - /// Configuration parameters struct. - _config: ClientConfigCrossword, - - /// List of active servers information. - servers: HashMap, - - /// Current server ID to talk to. - server_id: ReplicaId, - - /// Control API stub to the cluster manager. - ctrl_stub: ClientCtrlStub, - - /// API stubs for communicating with servers. - api_stubs: HashMap, -} - -#[async_trait] -impl GenericEndpoint for CrosswordClient { - async fn new_and_setup( - manager: SocketAddr, - config_str: Option<&str>, - ) -> Result { - // connect to the cluster manager and get assigned a client ID - pf_debug!("connecting to manager '{}'...", manager); - let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; - let id = ctrl_stub.id; - - // parse protocol-specific configs - let config = parsed_config!(config_str => ClientConfigCrossword; - init_server_id)?; - let init_server_id = config.init_server_id; - - Ok(CrosswordClient { - id, - _config: config, - servers: HashMap::new(), - server_id: init_server_id, - ctrl_stub, - api_stubs: HashMap::new(), - }) - } - - async fn connect(&mut self) -> Result<(), SummersetError> { - // disallow reconnection without leaving - if !self.api_stubs.is_empty() { - return logged_err!("reconnecting without leaving"); - } - - // ask the manager about the list of active servers - let mut sent = - self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; - while !sent { - sent = self.ctrl_stub.send_req(None)?; - } - - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::QueryInfo { - population, - servers_info, - } => { - // shift to a new server_id if current one not active - debug_assert!(!servers_info.is_empty()); - while !servers_info.contains_key(&self.server_id) - || servers_info[&self.server_id].is_paused - { - self.server_id = (self.server_id + 1) % population; - } - // establish connection to all servers - self.servers = servers_info - .into_iter() - .map(|(id, info)| (id, info.api_addr)) - .collect(); - for (&id, &server) in &self.servers { - pf_debug!("connecting to server {} '{}'...", id, server); - let api_stub = - ClientApiStub::new_by_connect(self.id, server).await?; - self.api_stubs.insert(id, api_stub); - } - Ok(()) - } - _ => logged_err!("unexpected reply type received"), - } - } - - async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to all servers - for (id, mut api_stub) in self.api_stubs.drain() { - let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; - while !sent { - sent = api_stub.send_req(None)?; - } - - // NOTE: commented out the following wait to avoid accidental - // hanging upon leaving - // while api_stub.recv_reply().await? != ApiReply::Leave {} - pf_debug!("left server connection {}", id); - } - - // if permanently leaving, send leave notification to the manager - if permanent { - let mut sent = - self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = self.ctrl_stub.send_req(None)?; - } - - while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} - pf_debug!("left manager connection"); - } - - Ok(()) - } - - fn send_req( - &mut self, - req: Option<&ApiRequest>, - ) -> Result { - if self.api_stubs.contains_key(&self.server_id) { - self.api_stubs - .get_mut(&self.server_id) - .unwrap() - .send_req(req) - } else { - Err(SummersetError::msg(format!( - "server_id {} not in api_stubs", - self.server_id - ))) - } - } - - async fn recv_reply(&mut self) -> Result { - if self.api_stubs.contains_key(&self.server_id) { - let reply = self - .api_stubs - .get_mut(&self.server_id) - .unwrap() - .recv_reply() - .await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. - } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - debug_assert!(self.servers.contains_key(&redirect_id)); - self.server_id = redirect_id; - pf_debug!( - "redirected to replica {} '{}'", - redirect_id, - self.servers[&redirect_id] - ); - } - } - - Ok(reply) - } else { - Err(SummersetError::msg(format!( - "server_id {} not in api_stubs", - self.server_id - ))) - } - } - - fn id(&self) -> ClientId { - self.id - } - - fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { - &mut self.ctrl_stub - } -} diff --git a/src/protocols/crossword/recovery.rs b/src/protocols/crossword/recovery.rs deleted file mode 100644 index 96ce9565..00000000 --- a/src/protocols/crossword/recovery.rs +++ /dev/null @@ -1,188 +0,0 @@ -//! Crossword -- recovery from WAL. - -use super::*; - -use crate::server::{ApiRequest, LogAction, LogResult}; -use crate::utils::SummersetError; - -// CrosswordReplica recovery from WAL log -impl CrosswordReplica { - /// Apply a durable storage log entry for recovery. - async fn recover_apply_entry( - &mut self, - entry: WalEntry, - ) -> Result<(), SummersetError> { - match entry { - WalEntry::PrepareBal { slot, ballot } => { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - // update instance state - let inst = &mut self.insts[slot - self.start_slot]; - inst.bal = ballot; - inst.status = Status::Preparing; - // update bal_prep_sent and bal_max_seen, reset bal_prepared - if self.bal_prep_sent < ballot { - self.bal_prep_sent = ballot; - } - if self.bal_max_seen < ballot { - self.bal_max_seen = ballot; - } - self.bal_prepared = 0; - } - - WalEntry::AcceptData { - slot, - ballot, - reqs_cw, - assignment, - } => { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - // update instance state - let inst = &mut self.insts[slot - self.start_slot]; - inst.bal = ballot; - inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw.clone(); - inst.assignment = assignment; - inst.voted = (ballot, reqs_cw); - // it could be the case that the PrepareBal action for this - // ballot has been snapshotted - if self.bal_prep_sent < ballot { - self.bal_prep_sent = ballot; - } - // update bal_prepared and bal_max_seen - if self.bal_prepared < ballot { - self.bal_prepared = ballot; - } - if self.bal_max_seen < ballot { - self.bal_max_seen = ballot; - } - debug_assert!(self.bal_prepared <= self.bal_prep_sent); - } - - WalEntry::CommitSlot { slot } => { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - debug_assert!(slot < self.start_slot + self.insts.len()); - // update instance status - self.insts[slot - self.start_slot].status = Status::Committed; - // submit commands in contiguously committed instance to the - // state machine - if slot == self.commit_bar { - while self.commit_bar < self.start_slot + self.insts.len() { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; - if inst.status < Status::Committed { - break; - } - // check number of available shards - if inst.reqs_cw.avail_shards() - < inst.reqs_cw.num_data_shards() - { - // can't execute if I don't have the complete request batch - break; - } else if inst.reqs_cw.avail_data_shards() - < inst.reqs_cw.num_data_shards() - { - // have enough shards but need reconstruction - inst.reqs_cw - .reconstruct_data(Some(&self.rs_coder))?; - } - // execute all commands in this instance on state machine - // synchronously - for (_, req) in inst.reqs_cw.get_data()?.clone() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine - .do_sync_cmd( - 0, // using 0 as dummy command ID - cmd, - ) - .await?; - } - } - // update instance status, commit_bar, and exec_bar - self.commit_bar += 1; - self.gossip_bar += 1; - self.exec_bar += 1; - inst.status = Status::Executed; - } - } - } - } - - Ok(()) - } - - /// Recover state from durable storage WAL log. - pub(super) async fn recover_from_wal( - &mut self, - ) -> Result<(), SummersetError> { - debug_assert_eq!(self.wal_offset, 0); - loop { - match self - .storage_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Read { - offset: self.wal_offset, - }, - ) - .await? - .1 - { - LogResult::Read { - entry: Some(entry), - end_offset, - } => { - self.recover_apply_entry(entry).await?; - // update log offset - self.wal_offset = end_offset; - } - LogResult::Read { entry: None, .. } => { - // end of log reached - break; - } - _ => { - return logged_err!("unexpected log result type"); - } - } - } - - // do an extra Truncate to remove partial entry at the end if any - if let LogResult::Truncate { - offset_ok: true, .. - } = self - .storage_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Truncate { - offset: self.wal_offset, - }, - ) - .await? - .1 - { - if self.wal_offset > 0 { - pf_info!( - "recovered from wal log: commit {} exec {}", - self.commit_bar, - self.exec_bar - ); - } - Ok(()) - } else { - logged_err!("unexpected log result type or failed truncate") - } - } -} diff --git a/src/protocols/crossword/request.rs b/src/protocols/crossword/request.rs deleted file mode 100644 index 2a6a3d1f..00000000 --- a/src/protocols/crossword/request.rs +++ /dev/null @@ -1,239 +0,0 @@ -//! Crossword -- client request entrance. - -use std::collections::HashMap; - -use super::*; - -use crate::server::{ApiReply, ApiRequest, Command, CommandResult, LogAction}; -use crate::utils::{Bitmap, RSCodeword, SummersetError}; - -// CrosswordReplica client requests entrance -impl CrosswordReplica { - /// Handler of client request batch chan recv. - pub(super) fn handle_req_batch( - &mut self, - mut req_batch: ReqBatch, - ) -> Result<(), SummersetError> { - let batch_size = req_batch.len(); - debug_assert!(batch_size > 0); - pf_debug!("got request batch of size {}", batch_size); - - // if I'm not a leader, ignore client requests - if !self.is_leader() || self.bal_prepared == 0 { - for (client, req) in req_batch { - if let ApiRequest::Req { id: req_id, .. } = req { - // tell the client to try on known leader or just the - // next ID replica - let target = if let Some(peer) = self.leader { - peer - } else { - (self.id + 1) % self.population - }; - self.external_api.send_reply( - ApiReply::Reply { - id: req_id, - result: None, - redirect: Some(target), - }, - client, - )?; - pf_trace!( - "redirected client {} to replica {}", - client, - target - ); - } - } - return Ok(()); - } - - // if simulating read leases, extract all the reads and immediately - // reply to them with a dummy value - // TODO: only for benchmarking purposes - if self.config.sim_read_lease { - for (client, req) in &req_batch { - if let ApiRequest::Req { - id: req_id, - cmd: Command::Get { .. }, - } = req - { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(CommandResult::Get { value: None }), - redirect: None, - }, - *client, - )?; - pf_trace!("replied -> client {} for read-only cmd", client); - } - } - - req_batch.retain(|(_, req)| { - !matches!( - req, - ApiRequest::Req { - cmd: Command::Get { .. }, - .. - } - ) - }); - if req_batch.is_empty() { - return Ok(()); - } - } - - // [for perf breakdown] - let slot = self.first_null_slot()?; - if self.bal_prepared > 0 { - if let Some(sw) = self.bd_stopwatch.as_mut() { - sw.record_now(slot, 0, None)?; - } - } - - // compute the complete Reed-Solomon codeword for the batch data - let mut reqs_cw = RSCodeword::from_data( - req_batch, - self.rs_data_shards, - self.rs_total_shards - self.rs_data_shards, - )?; - reqs_cw.compute_parity(Some(&self.rs_coder))?; - - // create a new instance in the first null slot (or append a new one - // at the end if no holes exist); fill it up with incoming data - { - let inst = &mut self.insts[slot - self.start_slot]; - debug_assert_eq!(inst.status, Status::Null); - inst.reqs_cw = reqs_cw; - inst.leader_bk = Some(LeaderBookkeeping { - trigger_slot: 0, - endprep_slot: 0, - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - inst.external = true; - } - - // start the Accept phase for this instance - let inst = &mut self.insts[slot - self.start_slot]; - inst.bal = self.bal_prepared; - inst.status = Status::Accepting; - - // [for perf breakdown] - if let Some(sw) = self.bd_stopwatch.as_mut() { - sw.record_now(slot, 1, None)?; - } - - let assignment = Self::pick_assignment_policy( - self.assignment_adaptive, - self.assignment_balanced, - &self.init_assignment, - &self.brr_assignments, - self.rs_data_shards, - self.majority, - self.config.fault_tolerance, - inst.reqs_cw.data_len(), - &self.linreg_model, - self.config.b_to_d_threshold, - &self.qdisc_info, - &self.peer_alive, - ); - pf_debug!( - "enter Accept phase for slot {} bal {} asgmt {}", - slot, - inst.bal, - Self::assignment_to_string(assignment) - ); - - // record update to largest accepted ballot and corresponding data - let subset_copy = inst - .reqs_cw - .subset_copy(&assignment[self.id as usize], false)?; - inst.assignment.clone_from(assignment); - inst.voted = (inst.bal, subset_copy.clone()); - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: WalEntry::AcceptData { - slot, - ballot: inst.bal, - // persist only some shards on myself - reqs_cw: subset_copy, - assignment: assignment.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!( - "submitted AcceptData log action for slot {} bal {}", - slot, - inst.bal - ); - - // send Accept messages to all peers, each getting its subset of - // shards of data - let now_us = self.startup_time.elapsed().as_micros(); - for peer in 0..self.population { - if peer == self.id { - continue; - } - self.transport_hub.send_msg( - PeerMsg::Accept { - slot, - ballot: inst.bal, - reqs_cw: inst - .reqs_cw - .subset_copy(&assignment[peer as usize], false)?, - assignment: assignment.clone(), - }, - peer, - )?; - if self.peer_alive.get(peer)? { - self.pending_accepts - .get_mut(&peer) - .unwrap() - .push_back((now_us, slot)); - } - } - pf_trace!( - "broadcast Accept messages for slot {} bal {}", - slot, - inst.bal - ); - - Ok(()) - } - - /// [for stale read profiling] - pub(super) fn val_ver_of_first_key( - &mut self, - ) -> Result, SummersetError> { - let (mut key, mut ver) = (None, 0); - for inst in self.insts.iter_mut() { - if inst.status >= Status::Committed - && inst.reqs_cw.avail_shards() >= self.rs_data_shards - { - if inst.reqs_cw.avail_data_shards() < self.rs_data_shards { - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - - for (_, req) in inst.reqs_cw.get_data()? { - if let ApiRequest::Req { - cmd: Command::Put { key: k, .. }, - .. - } = req - { - if key.is_none() { - key = Some(k.into()); - } else if key.as_ref().unwrap() == k { - ver += 1; - } - } - } - } - } - - Ok(key.map(|k| (k, ver))) - } -} diff --git a/src/protocols/crossword/snapshot.rs b/src/protocols/crossword/snapshot.rs deleted file mode 100644 index 0a4be791..00000000 --- a/src/protocols/crossword/snapshot.rs +++ /dev/null @@ -1,308 +0,0 @@ -//! Crossword -- snapshotting & GC. - -use std::cmp; -use std::collections::HashMap; - -use super::*; - -use crate::manager::CtrlMsg; -use crate::server::{ApiRequest, Command, LogAction, LogResult}; -use crate::utils::SummersetError; - -// CrosswordReplica snapshotting & GC logic -impl CrosswordReplica { - /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs( - &mut self, - new_start_slot: usize, - ) -> Result<(), SummersetError> { - // collect all key-value pairs put up to exec_bar - let mut pairs = HashMap::new(); - for slot in self.start_slot..new_start_slot { - let inst = &mut self.insts[slot - self.start_slot]; - debug_assert!( - inst.reqs_cw.avail_data_shards() - >= inst.reqs_cw.num_data_shards() - ); - for (_, req) in inst.reqs_cw.get_data()?.clone() { - if let ApiRequest::Req { - cmd: Command::Put { key, value }, - .. - } = req - { - pairs.insert(key, value); - } - } - } - - // write the collection to snapshot file - if let LogResult::Append { now_size } = self - .snapshot_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Append { - entry: SnapEntry::KVPairSet { pairs }, - sync: self.config.logger_sync, - }, - ) - .await? - .1 - { - self.snap_offset = now_size; - Ok(()) - } else { - logged_err!("unexpected log result type") - } - } - - /// Discard everything older than start_slot in durable WAL log. - async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { - // do a dummy sync read to force all previously submitted log actions - // to be processed - let (old_results, _) = self - .storage_hub - .do_sync_action(0, LogAction::Read { offset: 0 }) - .await?; - for (old_id, old_result) in old_results { - self.handle_log_result(old_id, old_result)?; - } - - // get offset to cut the WAL at - let cut_offset = if !self.insts.is_empty() { - self.insts[0].wal_offset - } else { - self.wal_offset - }; - - // discard the log before cut_offset - if cut_offset > 0 { - if let LogResult::Discard { - offset_ok: true, - now_size, - } = self - .storage_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Discard { - offset: cut_offset, - keep: 0, - }, - ) - .await? - .1 - { - debug_assert_eq!(self.wal_offset - cut_offset, now_size); - self.wal_offset = now_size; - } else { - return logged_err!( - "unexpected log result type or failed discard" - ); - } - } - - // update inst.wal_offset for all remaining in-mem instances - for inst in &mut self.insts { - if inst.wal_offset > 0 { - debug_assert!(inst.wal_offset >= cut_offset); - inst.wal_offset -= cut_offset; - } - } - - Ok(()) - } - - /// Take a snapshot up to current exec_bar, then discard the in-mem log up - /// to that index as well as outdate entries in the durable WAL log file. - /// - /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. Production quality implementations should - /// make the snapshotting action "atomic". - /// - /// NOTE: the current implementation does not take care of InstallSnapshot - /// messages (which is needed when some lagging follower has some slot - /// which all other peers have snapshotted); we assume here that failed - /// Accept messages will be retried indefinitely until success before its - /// associated data gets discarded from leader's memory. - pub(super) async fn take_new_snapshot( - &mut self, - ) -> Result<(), SummersetError> { - pf_debug!( - "taking new snapshot: start {} exec {} snap {}", - self.start_slot, - self.exec_bar, - self.snap_bar - ); - debug_assert!(self.exec_bar >= self.start_slot); - - let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); - if new_start_slot == self.start_slot { - return Ok(()); - } - - // collect and dump all Puts in executed instances - if self.is_leader() { - // NOTE: broadcast heartbeats here to appease followers - self.bcast_heartbeats()?; - } - self.snapshot_dump_kv_pairs(new_start_slot).await?; - - // write new slot info entry to the head of snapshot - match self - .snapshot_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Write { - entry: SnapEntry::SlotInfo { - start_slot: new_start_slot, - }, - offset: 0, - sync: self.config.logger_sync, - }, - ) - .await? - .1 - { - LogResult::Write { - offset_ok: true, .. - } => {} - _ => { - return logged_err!( - "unexpected log result type or failed write" - ); - } - } - - // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(new_start_slot - self.start_slot)); - self.start_slot = new_start_slot; - - // discarding everything older than start_slot in WAL log - if self.is_leader() { - // NOTE: broadcast heartbeats here to appease followers - self.bcast_heartbeats()?; - } - self.snapshot_discard_log().await?; - - // reset the leader heartbeat hear timer - self.kickoff_hb_hear_timer()?; - - pf_info!("took snapshot up to: start {}", self.start_slot); - Ok(()) - } - - /// Recover initial state from durable storage snapshot file. - pub(super) async fn recover_from_snapshot( - &mut self, - ) -> Result<(), SummersetError> { - debug_assert_eq!(self.snap_offset, 0); - - // first, try to read the first several bytes, which should record the - // start_slot index - match self - .snapshot_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Read { offset: 0 }, - ) - .await? - .1 - { - LogResult::Read { - entry: Some(SnapEntry::SlotInfo { start_slot }), - end_offset, - } => { - self.snap_offset = end_offset; - - // recover necessary slot indices info - self.start_slot = start_slot; - self.commit_bar = start_slot; - self.gossip_bar = start_slot; - self.exec_bar = start_slot; - self.snap_bar = start_slot; - - // repeatedly apply key-value pairs - loop { - match self - .snapshot_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Read { - offset: self.snap_offset, - }, - ) - .await? - .1 - { - LogResult::Read { - entry: Some(SnapEntry::KVPairSet { pairs }), - end_offset, - } => { - // execute Put commands on state machine - for (key, value) in pairs { - self.state_machine - .do_sync_cmd( - 0, // using 0 as dummy command ID - Command::Put { key, value }, - ) - .await?; - } - // update snapshot file offset - self.snap_offset = end_offset; - } - LogResult::Read { entry: None, .. } => { - // end of log reached - break; - } - _ => { - return logged_err!("unexpected log result type"); - } - } - } - - // tell manager about my start_slot index - self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { - new_start: self.start_slot, - })?; - - if self.start_slot > 0 { - pf_info!( - "recovered from snapshot: start {} commit {} exec {}", - self.start_slot, - self.commit_bar, - self.exec_bar - ); - } - Ok(()) - } - - LogResult::Read { entry: None, .. } => { - // snapshot file is empty. Write a 0 as start_slot and return - if let LogResult::Write { - offset_ok: true, - now_size, - } = self - .snapshot_hub - .do_sync_action( - 0, // using 0 as dummy log action ID - LogAction::Write { - entry: SnapEntry::SlotInfo { start_slot: 0 }, - offset: 0, - sync: self.config.logger_sync, - }, - ) - .await? - .1 - { - self.snap_offset = now_size; - Ok(()) - } else { - logged_err!("unexpected log result type or failed write") - } - } - - _ => { - logged_err!("unexpected log result type") - } - } - } -} diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 153670b6..15015154 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -38,10 +38,6 @@ mod craft; use craft::{CRaftClient, CRaftReplica}; pub use craft::{ClientConfigCRaft, ReplicaConfigCRaft}; -mod crossword; -pub use crossword::{ClientConfigCrossword, ReplicaConfigCrossword}; -use crossword::{CrosswordClient, CrosswordReplica}; - /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { @@ -52,7 +48,6 @@ pub enum SmrProtocol { Raft, RSPaxos, CRaft, - Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -75,7 +70,6 @@ impl SmrProtocol { "Raft" => Some(Self::Raft), "RSPaxos" => Some(Self::RSPaxos), "CRaft" => Some(Self::CRaft), - "Crossword" => Some(Self::Crossword), _ => None, } } @@ -156,14 +150,6 @@ impl SmrProtocol { .await ) } - Self::Crossword => { - box_if_ok!( - CrosswordReplica::new_and_setup( - api_addr, p2p_addr, manager, config_str - ) - .await - ) - } } } @@ -207,11 +193,6 @@ impl SmrProtocol { CRaftClient::new_and_setup(manager, config_str).await ) } - Self::Crossword => { - box_if_ok!( - CrosswordClient::new_and_setup(manager, config_str).await - ) - } } } } @@ -244,7 +225,6 @@ mod name_tests { valid_name_test!(Raft); valid_name_test!(RSPaxos); valid_name_test!(CRaft); - valid_name_test!(Crossword); } #[test] diff --git a/src/utils/linreg.rs b/src/utils/linreg.rs deleted file mode 100644 index 035af5a4..00000000 --- a/src/utils/linreg.rs +++ /dev/null @@ -1,280 +0,0 @@ -//! Linear regression helpers for performance monitoring. - -use std::collections::HashSet; -use std::fmt; - -use crate::utils::SummersetError; - -use rangemap::RangeMap; - -use linreg::linear_regression_of; - -/// Performance model of a peer target. -#[derive(Debug, PartialEq, Clone)] -pub(crate) struct PerfModel { - /// Base bandwidth factor (slope) in ms/MiB. - slope: f64, - - /// Base delay (interception) in ms. - delay: f64, - - /// Average jitter in ms. - jitter: f64, -} - -impl fmt::Display for PerfModel { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "({:.1},{:.1}+{:.1})", - self.slope, self.delay, self.jitter - ) - } -} - -impl PerfModel { - /// Creates a new perf model struct. - #[inline] - pub(crate) fn new(slope: f64, delay: f64, jitter: f64) -> Self { - PerfModel { - delay, - jitter, - slope, - } - } - - /// Updates the perf model numbers. - #[inline] - pub(crate) fn update(&mut self, slope: f64, delay: f64, jitter: f64) { - self.slope = slope; - self.delay = delay; - self.jitter = jitter; - } - - /// Calculate estimated response time given a data size. - #[inline] - pub(crate) fn predict(&self, x: usize) -> f64 { - let size_mb = x as f64 / (1024 * 1024) as f64; - self.slope * size_mb + self.delay + self.jitter - } -} - -/// Linear regression helper struct for maintaining time-tagged datapoints -/// and computing a linear regression model upon requested. -#[derive(Debug)] -pub(crate) struct LinearRegressor { - /// Windows of currently held datapoints, divided into range buckets. - buckets: Vec>, - - /// Map from x value range -> corresponding bucket index. - rangemap: RangeMap, - - /// Result of last calculated regression model; upon any update to the - /// window of datapoints, this result will be invalidated to `None`. - model: Option, -} - -impl LinearRegressor { - /// Creates a new linear regressor helper struct. - // NOTE: currently only using two buckets: size 0 and everything above. - pub(crate) fn new() -> Self { - let mut buckets = vec![]; - let mut rangemap = RangeMap::new(); - buckets.push(vec![]); - rangemap.insert(0..1, 0); - buckets.push(vec![]); - rangemap.insert(1..usize::MAX, 1); - - LinearRegressor { - buckets, - rangemap, - model: None, - } - } - - /// Injects a new datapoint into the window. It is assumed that all - /// injections must have monotonically non-decreasing time tags. - pub(crate) fn append_sample(&mut self, t: u128, x: usize, y: f64) { - let bucket_idx = *self.rangemap.get(&x).unwrap(); - debug_assert!(bucket_idx < self.buckets.len()); - let bucket = &mut self.buckets[bucket_idx]; - debug_assert!(bucket.is_empty() || t >= bucket.last().unwrap().0); - - bucket.push((t, x, y)); - - if self.model.is_some() { - self.model = None; - } - } - - /// Discards everything with timestamp tag before given time. - pub(crate) fn discard_before(&mut self, t: u128) { - for bucket in self.buckets.iter_mut() { - let mut keep = bucket.len(); - for (i, dp) in bucket.iter().enumerate() { - if dp.0 >= t { - keep = i; - break; - } - } - - bucket.drain(0..keep); - - if self.model.is_some() { - self.model = None; - } - } - } - - /// Returns the result of linear regression model calculated on the - /// current window of datapoints. If the model is not valid right now, - /// compute it. - pub(crate) fn calc_model( - &mut self, - outliers_ratio: f32, - ) -> Result { - debug_assert!((0.0..1.0).contains(&outliers_ratio)); - - if let Some(model) = self.model.as_ref() { - // pf_trace!("calc ts {:?} dps {:?} {:?}", - // self.timestamps, self.datapoints, model); - Ok(model.clone()) - } else { - // use all datapoints in the size 0 bucket (i.e., the heartbeat - // messages) to estimate delay and jitter - let bucket0 = &self.buckets[0]; - let mut delay = bucket0 - .iter() - .min_by(|dpa, dpb| dpa.2.partial_cmp(&dpb.2).unwrap()) - .map(|dp| dp.2) - .unwrap_or(0.0); - let mut jitter = (bucket0.iter().map(|dp| dp.2).sum::() - / bucket0.len() as f64) - - delay; - - if delay < 0.0 { - delay = 0.0; - } - if jitter < 0.0 { - jitter = 0.0; - } - - // compute model on current window of datapoints - let mut xys: Vec<(f64, f64)> = vec![]; - for bucket in &self.buckets { - xys.append( - &mut bucket.iter().map(|dp| (dp.1 as f64, dp.2)).collect(), - ); - } - let mut slope = linear_regression_of(&xys)?.0; - if slope < 0.0 { - slope = 0.0; - } - - // remove potential outliers, where outliers are defined as the - // points that are furthest away from computed model (but forcing - // delay to be previously computed) - if outliers_ratio > 0.0 && xys.len() as f32 * outliers_ratio >= 1.0 - { - let mut distances: Vec<(usize, f64)> = xys - .iter() - .map(|(x, y)| (y - (x * slope + delay)).abs()) - .enumerate() - .collect(); - distances.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); - let to_remove: HashSet = distances - .into_iter() - .take((xys.len() as f32 * outliers_ratio).round() as usize) - .map(|(i, _)| i) - .collect(); - let new_xys: Vec<(f64, f64)> = xys - .into_iter() - .enumerate() - .filter_map(|(i, dp)| { - if to_remove.contains(&i) { - None - } else { - Some(dp) - } - }) - .collect(); - slope = linear_regression_of(&new_xys)?.0; - if slope < 0.0 { - slope = 0.0; - } - } - - // take the lowest percentage of datapoints from each bucket; do - // linear regression on them to get an estimation of base delay - // and base bandwidth factor - // let mut xys: Vec<(f64, f64)> = vec![]; - // for bucket in &self.buckets { - // let min_cnt = cmp::max( - // (bucket.len() as f32 * (1.0 - outliers_ratio)) as usize, - // 1, - // ); - - // if !bucket.is_empty() { - // let mut sorted = bucket.clone(); - // sorted - // .sort_by(|dpa, dpb| dpa.2.partial_cmp(&dpb.2).unwrap()); - // let mut min_dps: Vec<(f64, f64)> = sorted - // .iter() - // .take(min_cnt) - // .map(|dp| (dp.1 as f64, dp.2)) - // .collect(); - - // xys.append(&mut min_dps); - // } - // } - // let (mut slope, mut delay) = linear_regression_of(&xys)?; - - slope *= (1024 * 1024) as f64; - let model = PerfModel::new(slope, delay, jitter); - // pf_warn!("calc ts {:?} dps {:?} {}", - // self.timestamps, self.datapoints, model); - self.model = Some(model.clone()); - Ok(model) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn append_discard() { - let mut lg = LinearRegressor::new(); - assert!(lg.model.is_none()); - lg.append_sample(0, 0, 0.5); - lg.append_sample(2, 3, 1.1); - lg.append_sample(6, 5, 2.3); - assert_eq!(lg.buckets[*lg.rangemap.get(&0).unwrap()].len(), 1); - assert_eq!(lg.buckets[*lg.rangemap.get(&3).unwrap()].len(), 2); - lg.discard_before(4); - assert_eq!(lg.buckets[*lg.rangemap.get(&0).unwrap()].len(), 0); - assert_eq!(lg.buckets[*lg.rangemap.get(&3).unwrap()].len(), 1); - } - - #[test] - fn calc_model() -> Result<(), SummersetError> { - let mut lg = LinearRegressor::new(); - assert!(lg.model.is_none()); - lg.append_sample(0, 0, 0.5); - lg.append_sample(2, 1024 * 1024, 1.2); - assert_eq!(lg.calc_model(0.0)?, PerfModel::new(0.7, 0.5, 0.0)); - lg.discard_before(2); - assert!(lg.model.is_none()); - Ok(()) - } - - #[test] - fn model_predict() { - let model = PerfModel::new(1.0, 0.6, 1.2); - let p0 = model.predict(0); - assert!(p0 > 1.75 && p0 < 1.85); - let p1 = model.predict(1024 * 1024); - assert!(p1 > 2.75 && p1 < 2.85) - } -} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index d1a804d8..0ad60f96 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -8,8 +8,6 @@ mod config; mod bitmap; mod error; -mod linreg; -mod qdisc; mod rscoding; mod safetcp; mod stopwatch; @@ -22,8 +20,6 @@ pub use rscoding::RSCodeword; pub use stopwatch::Stopwatch; pub use timer::Timer; -pub(crate) use linreg::{LinearRegressor, PerfModel}; -pub(crate) use qdisc::QdiscInfo; pub(crate) use safetcp::{ safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, tcp_connect_with_retry, }; diff --git a/src/utils/qdisc.rs b/src/utils/qdisc.rs deleted file mode 100644 index 5ffdf400..00000000 --- a/src/utils/qdisc.rs +++ /dev/null @@ -1,234 +0,0 @@ -//! Helpers for running `tc qdisc` commands (exper only). - -use std::fmt; -use std::process::Command; - -use crate::utils::SummersetError; - -static DEFAULT_DELAY: f64 = 0.0; -static DEFAULT_JITTER: f64 = 0.0; -static DEFAULT_RATE: f64 = 100.0; - -/// Helper struct holding qdisc information. -pub(crate) struct QdiscInfo { - /// Main Ethernet interface's name. - dev_name: String, - - /// Delay in ms. - pub(crate) delay: f64, - - /// Jitter in ms. - pub(crate) jitter: f64, - - /// Rate in Gbps. - pub(crate) rate: f64, -} - -impl fmt::Display for QdiscInfo { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{{{:.0} +{:.0}, {:.1}}}", - self.delay, self.jitter, self.rate - ) - } -} - -impl QdiscInfo { - /// Get main Ethernet interface's name. - fn get_interface_name() -> Result { - let output = String::from_utf8( - Command::new("ip") - .arg("-o") - .arg("-4") - .arg("route") - .arg("show") - .arg("to") - .arg("default") - .output()? - .stdout, - )?; - let line = output.trim().to_string(); - for (idx, seg) in line.split_ascii_whitespace().enumerate() { - if idx == 4 { - return Ok(seg.into()); - } - } - Err(SummersetError::msg( - "error getting `ip route show` output line", - )) - } - - /// Creates a new qdisc info struct. - pub(crate) fn new() -> Result { - Ok(QdiscInfo { - dev_name: Self::get_interface_name()?, - delay: DEFAULT_DELAY, - jitter: DEFAULT_JITTER, - rate: DEFAULT_RATE, - }) - } - - /// Query `tc qdisc` info by running the command. Returns the output line - /// with expected device. - fn run_qdisc_show(&self) -> Result { - let output = String::from_utf8( - Command::new("tc") - .arg("qdisc") - .arg("show") - .arg("dev") - .arg(&self.dev_name) - .arg("root") - .output()? - .stdout, - )?; - let line = output.trim().to_string(); - if !line.is_empty() { - Ok(line.trim().to_string()) - } else { - Err(SummersetError::msg( - "error getting `tc qdisc show` output line", - )) - } - } - - /// Parse time field into float ms. - #[inline] - fn parse_time_ms(seg: &str) -> Result { - let (multiplier, tail) = if seg.ends_with("us") { - (0.001, 2) - } else if seg.ends_with("ms") { - (1.0, 2) - } else if seg.ends_with('s') { - (1000.0, 1) - } else { - (0.001, 0) // no unit means usecs - }; - let num = seg[..seg.len() - tail].parse::()?; - Ok(num * multiplier) - } - - /// Parse rate field into float Gbps. - #[inline] - fn parse_rate_gbps(seg: &str) -> Result { - const GBIT: f64 = 1024.0 * 1024.0 * 1024.0; - let (multiplier, tail) = if seg.ends_with("kbit") { - (1000.0 / GBIT, 4) - } else if seg.ends_with("kibit") { - (1024.0 / GBIT, 5) - } else if seg.ends_with("Kbit") { - (1024.0 / GBIT, 4) - } else if seg.ends_with("mbit") { - (1_000_000.0 / GBIT, 4) - } else if seg.ends_with("mibit") { - (1024.0 * 1024.0 / GBIT, 5) - } else if seg.ends_with("Mbit") { - (1024.0 * 1024.0 / GBIT, 4) - } else if seg.ends_with("gbit") { - (1_000_000_000.0 / GBIT, 4) - } else if seg.ends_with("gibit") { - (1024.0 * 1024.0 * 1024.0 / GBIT, 5) - } else if seg.ends_with("Gbit") { - (1024.0 * 1024.0 * 1024.0 / GBIT, 4) - } else if seg.ends_with("tbit") { - (1_000_000_000_000.0 / GBIT, 4) - } else if seg.ends_with("tibit") { - (1024.0 * 1024.0 * 1024.0 * 1024.0 / GBIT, 5) - } else if seg.ends_with("Tbit") { - (1024.0 * 1024.0 * 1024.0 * 1024.0 / GBIT, 4) - } else if seg.ends_with("bit") { - (1.0 / GBIT, 3) - } else if seg.ends_with("kbps") { - (1000.0 * 8.0 / GBIT, 4) // 'bps' in output actually means Bytes/sec - } else if seg.ends_with("kibps") { - (1024.0 * 8.0 / GBIT, 5) - } else if seg.ends_with("Kbps") { - (1024.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("mbps") { - (1_000_000.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("mibps") { - (1024.0 * 1024.0 * 8.0 / GBIT, 5) - } else if seg.ends_with("Mbps") { - (1024.0 * 1024.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("gbps") { - (1_000_000_000.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("gibps") { - (1024.0 * 1024.0 * 1024.0 * 8.0 / GBIT, 5) - } else if seg.ends_with("Gbps") { - (1024.0 * 1024.0 * 1024.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("tbps") { - (1_000_000_000_000.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("tibps") { - (1024.0 * 1024.0 * 1024.0 * 1024.0 * 8.0 / GBIT, 5) - } else if seg.ends_with("Tbps") { - (1024.0 * 1024.0 * 1024.0 * 1024.0 * 8.0 / GBIT, 4) - } else if seg.ends_with("bps") { - (8.0 / GBIT, 3) - } else { - (1.0 / GBIT, 0) // no unit means bit - }; - let num = seg[..seg.len() - tail].parse::()?; - Ok(num * multiplier) - } - - /// Parse the output line into (delay, jitter, rate) values. - fn parse_output_line( - line: &str, - ) -> Result<(f64, f64, f64), SummersetError> { - let (mut delay, mut jitter, mut rate) = - (DEFAULT_DELAY, DEFAULT_JITTER, DEFAULT_RATE); - let (mut stage, mut idx) = (0, 0); - for seg in line.split_ascii_whitespace() { - if seg == "netem" { - stage = 1; - } else if stage > 0 && seg == "delay" { - stage = 2; - idx = 0; - } else if stage > 0 && seg == "rate" { - stage = 3; - idx = 0; - } else if stage == 2 && idx == 0 { - delay = Self::parse_time_ms(seg)?; - idx += 1; - } else if stage == 2 && idx == 1 { - jitter = Self::parse_time_ms(seg)?; - idx += 1; - } else if stage == 3 && idx == 0 { - rate = Self::parse_rate_gbps(seg)?; - idx += 1; - } - } - Ok((delay, jitter, rate)) - } - - /// Updates my fields with a new query. - pub(crate) fn update(&mut self) -> Result<(), SummersetError> { - let line = self.run_qdisc_show()?; - let (delay, jitter, rate) = Self::parse_output_line(&line)?; - debug_assert!(delay >= 0.0); - debug_assert!(jitter >= 0.0); - debug_assert!(rate >= 0.0); - - self.delay = delay; - self.jitter = jitter; - self.rate = rate; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::process::Command; - - #[test] - #[ignore] - fn qdisc_run_cmd() -> Result<(), SummersetError> { - // just testing if command running is functional here - let output = String::from_utf8( - Command::new("echo").arg("hello").output()?.stdout, - )?; - assert_eq!(output.trim(), "hello"); - Ok(()) - } -} diff --git a/tla+/bodega/.gitignore b/tla+/bodega/.gitignore deleted file mode 100644 index 8832b1c7..00000000 --- a/tla+/bodega/.gitignore +++ /dev/null @@ -1 +0,0 @@ -Bodega.cfg diff --git a/tla+/bodega/Bodega.tla b/tla+/bodega/Bodega.tla deleted file mode 100644 index 15b2794a..00000000 --- a/tla+/bodega/Bodega.tla +++ /dev/null @@ -1,848 +0,0 @@ -(**********************************************************************************) -(* Bodega protocol enabling always-local follower reads in WAN-scale consensus by *) -(* employing a lease-less always-local follower read technique on the critical *) -(* path and using off-the-critical-path config leases to retain fault-tolerance. *) -(* *) -(* See multipaxos_smr_addon/MultiPaxos.tla for the base spec it extends from. *) -(**********************************************************************************) - ----- MODULE Bodega ---- -EXTENDS FiniteSets, Sequences, Integers, TLC - -(*******************************) -(* Model inputs & assumptions. *) -(*******************************) -CONSTANT Replicas, \* symmetric set of server nodes - Writes, \* symmetric set of write commands (each w/ unique value) - Reads, \* symmetric set of read commands - MaxBallot, \* maximum ballot pickable for leader preemption - NodeFailuresOn \* if true, turn on node failures injection - -ReplicasAssumption == /\ IsFiniteSet(Replicas) - /\ Cardinality(Replicas) >= 1 - /\ "none" \notin Replicas - -Population == Cardinality(Replicas) - -MajorityNum == (Population \div 2) + 1 - -WritesAssumption == /\ IsFiniteSet(Writes) - /\ Cardinality(Writes) >= 1 - /\ "nil" \notin Writes - \* a write command model value serves as both the - \* ID of the command and the value to be written - -ReadsAssumption == /\ IsFiniteSet(Reads) - /\ Cardinality(Reads) >= 0 - /\ "nil" \notin Writes - -MaxBallotAssumption == /\ MaxBallot \in Nat - /\ MaxBallot >= 2 - -NodeFailuresOnAssumption == NodeFailuresOn \in BOOLEAN - -ASSUME /\ ReplicasAssumption - /\ WritesAssumption - /\ ReadsAssumption - /\ MaxBallotAssumption - /\ NodeFailuresOnAssumption - ----------- - -(********************************) -(* Useful constants & typedefs. *) -(********************************) -Commands == Writes \cup Reads - -NumWrites == Cardinality(Writes) - -NumReads == Cardinality(Reads) - -NumCommands == Cardinality(Commands) - -Range(seq) == {seq[i]: i \in 1..Len(seq)} - -\* Client observable events. -ClientEvents == [type: {"Req"}, cmd: Commands] - \cup [type: {"Ack"}, cmd: Commands, - val: {"nil"} \cup Writes, - by: Replicas] - -ReqEvent(c) == [type |-> "Req", cmd |-> c] - -AckEvent(c, v, n) == [type |-> "Ack", cmd |-> c, val |-> v, by |-> n] - \* val is the old value for a write command - -InitPending == (CHOOSE ws \in [1..Cardinality(Writes) -> Writes] - : Range(ws) = Writes) - \o (CHOOSE rs \in [1..Cardinality(Reads) -> Reads] - : Range(rs) = Reads) - \* W.L.O.G., choose any sequence contatenating writes - \* commands and read commands as the sequence of reqs; - \* all other cases are either symmetric or less useful - \* than this one - -\* Server-side constants & states. -Ballots == 1..MaxBallot - -Slots == 1..NumWrites - -Statuses == {"Preparing", "Accepting", "Committed"} - -InstStates == [status: {"Empty"} \cup Statuses, - write: {"nil"} \cup Writes, - voted: [bal: {0} \cup Ballots, - write: {"nil"} \cup Writes]] - -NullInst == [status |-> "Empty", - write |-> "nil", - voted |-> [bal |-> 0, write |-> "nil"]] - -NodeStates == [leader: {"none"} \cup Replicas, - commitUpTo: {0} \cup Slots, - commitPrev: {0} \cup Slots \cup {NumWrites+1}, - balPrepared: {0} \cup Ballots, - balMaxKnown: {0} \cup Ballots, - insts: [Slots -> InstStates]] - -NullNode == [leader |-> "none", - commitUpTo |-> 0, - commitPrev |-> 0, - balPrepared |-> 0, - balMaxKnown |-> 0, - insts |-> [s \in Slots |-> NullInst]] - \* commitPrev is the last slot which might have been - \* committed by an old leader; a newly prepared leader - \* can safely serve reads locally only after its log has - \* been committed up to this slot. The time before this - \* condition becomes satisfied may be considered the - \* "recovery" or "ballot transfer" time - -FirstEmptySlot(insts) == - IF \A s \in Slots: insts[s].status # "Empty" - THEN NumWrites + 1 - ELSE CHOOSE s \in Slots: - /\ insts[s].status = "Empty" - /\ \A t \in 1..(s-1): insts[t].status # "Empty" - -LastNonEmptySlot(insts) == - IF \A s \in Slots: insts[s].status = "Empty" - THEN 0 - ELSE CHOOSE s \in Slots: - /\ insts[s].status # "Empty" - /\ \A t \in (s+1)..NumWrites: insts[t].status = "Empty" - \* note that this is not the same as FirstEmptySlot - 1 - \* due to possible existence of holes - -\* Service-internal messages. -PrepareMsgs == [type: {"Prepare"}, src: Replicas, - bal: Ballots] - -PrepareMsg(r, b) == [type |-> "Prepare", src |-> r, - bal |-> b] - -InstsVotes == [Slots -> [bal: {0} \cup Ballots, - write: {"nil"} \cup Writes]] - -VotesByNode(n) == [s \in Slots |-> n.insts[s].voted] - -PrepareReplyMsgs == [type: {"PrepareReply"}, src: Replicas, - bal: Ballots, - votes: InstsVotes] - -PrepareReplyMsg(r, b, iv) == [type |-> "PrepareReply", src |-> r, - bal |-> b, - votes |-> iv] - -PeakVotedWrite(prs, s) == - IF \A pr \in prs: pr.votes[s].bal = 0 - THEN "nil" - ELSE LET ppr == - CHOOSE ppr \in prs: - \A pr \in prs: pr.votes[s].bal =< ppr.votes[s].bal - IN ppr.votes[s].write - -LastTouchedSlot(prs) == - IF \A s \in Slots: PeakVotedWrite(prs, s) = "nil" - THEN 0 - ELSE CHOOSE s \in Slots: - /\ PeakVotedWrite(prs, s) # "nil" - /\ \A t \in (s+1)..NumWrites: PeakVotedWrite(prs, t) = "nil" - -PrepareNoticeMsgs == [type: {"PrepareNotice"}, src: Replicas, - bal: Ballots, - commit_prev: {0} \cup Slots] - -PrepareNoticeMsg(r, b, cp) == [type |-> "PrepareNotice", src |-> r, - bal |-> b, - commit_prev |-> cp] - \* this messasge is added to allow - \* followers to learn about commitPrev - -AcceptMsgs == [type: {"Accept"}, src: Replicas, - bal: Ballots, - slot: Slots, - write: Writes] - -AcceptMsg(r, b, s, c) == [type |-> "Accept", src |-> r, - bal |-> b, - slot |-> s, - write |-> c] - -AcceptReplyMsgs == [type: {"AcceptReply"}, src: Replicas, - bal: Ballots, - slot: Slots] - -AcceptReplyMsg(r, b, s) == [type |-> "AcceptReply", src |-> r, - bal |-> b, - slot |-> s] - \* no need to carry command ID in - \* AcceptReply because ballot and - \* slot uniquely identifies the write - -CommitNoticeMsgs == [type: {"CommitNotice"}, upto: Slots] - -CommitNoticeMsg(u) == [type |-> "CommitNotice", upto |-> u] - -Messages == PrepareMsgs - \cup PrepareReplyMsgs - \cup PrepareNoticeMsgs - \cup AcceptMsgs - \cup AcceptReplyMsgs - \cup CommitNoticeMsgs - -\* Config lease related typedefs. -Configs == {cfg \in [bal: Ballots, leader: Replicas, responders: SUBSET Replicas]: - cfg.leader \notin cfg.responders} - -Config(b, l, resps) == [bal |-> b, leader |-> l, responders |-> resps] - \* each new ballot number maps to a new config; this - \* includes the change of leader (as in classic - \* MultiPaxos) and/or the change of who're responders - -LeaseGrants == [from: Replicas, config: Configs] - -LeaseGrant(f, cfg) == [from |-> f, config |-> cfg] - \* this is the only type of message that may be - \* "removed" from the global set of messages to make - \* a "cheated" model of leasing: if a LeaseGrant - \* message is removed, it means that promise has - \* expired and the grantor did not refresh, probably - \* making way for switching to a differnt config - ----------- - -(******************************) -(* Main algorithm in PlusCal. *) -(******************************) -(*--algorithm Bodega - -variable msgs = {}, \* messages in the network - grants = {}, \* lease msgs in the network - node = [r \in Replicas |-> NullNode], \* replica node state - pending = InitPending, \* sequence of pending reqs - observed = <<>>, \* client observed events - crashed = [r \in Replicas |-> FALSE]; \* replica crashed flag - -define - CurrentConfig == - LET leased(b) == Cardinality({g \in grants: - g.config.bal = b}) >= MajorityNum - IN IF ~\E b \in Ballots: leased(b) - THEN Config(0, "none", 0) - ELSE (CHOOSE g \in grants: leased(g.config.bal)).config - \* the leasing mechanism ensures that at any - \* time, there's at most one leader - - ThinkAmLeader(r) == /\ node[r].leader = r - /\ node[r].balPrepared = node[r].balMaxKnown - /\ CurrentConfig.bal > 0 - /\ CurrentConfig.bal = node[r].balMaxKnown - /\ CurrentConfig.leader = r - - ThinkAmFollower(r) == /\ node[r].leader # r - /\ CurrentConfig.bal > 0 - /\ CurrentConfig.bal = node[r].balMaxKnown - /\ CurrentConfig.leader # r - - ThinkAmResponder(r) == /\ ThinkAmFollower(r) - /\ r \in CurrentConfig.responders - - BallotTransfered(r) == node[r].commitUpTo >= node[r].commitPrev - - WriteCommittable(ars) == - /\ Cardinality({ar.src: ar \in ars}) >= MajorityNum - /\ CurrentConfig.responders \subseteq {ar.src: ar \in ars} - - reqsMade == {e.cmd: e \in {e \in Range(observed): e.type = "Req"}} - - acksRecv == {e.cmd: e \in {e \in Range(observed): e.type = "Ack"}} - - AppendObserved(seq) == - LET filter(e) == IF e.type = "Req" THEN e.cmd \notin reqsMade - ELSE e.cmd \notin acksRecv - IN observed \o SelectSeq(seq, filter) - - UnseenPending(r) == - LET filter(c) == \A s \in Slots: node[r].insts[s].write # c - IN SelectSeq(pending, filter) - - RemovePending(cmd) == - LET filter(c) == c # cmd - IN SelectSeq(pending, filter) - - terminated == /\ Len(pending) = 0 - /\ Cardinality(reqsMade) = NumCommands - /\ Cardinality(acksRecv) = NumCommands - - numCrashed == Cardinality({r \in Replicas: crashed[r]}) -end define; - -\* Send a set of messages helper. -macro Send(set) begin - msgs := msgs \cup set; -end macro; - -\* Expire existing lease grant from f, and make a new repeatedly refreshed -\* lease grant to new config cfg. -macro Lease(f, cfg) begin - grants := {g \in grants: g.from # f} \cup {LeaseGrant(f, cfg)}; -end macro; - -\* Observe client events helper. -macro Observe(seq) begin - observed := AppendObserved(seq); -end macro; - -\* Resolve a pending command helper. -macro Resolve(c) begin - pending := RemovePending(c); -end macro; - -\* Someone steps up as leader and sends Prepare message to followers. -\* To simplify this spec W.L.O.G., we change the responders config only when -\* a new leader steps up; in practice, a separate and independent type of -\* trigger will be used to change the config. -macro BecomeLeader(r) begin - \* if I'm not a current leader - await node[r].leader # r; - \* pick a greater ballot number and a config - with b \in Ballots, - resps \in SUBSET {f \in Replicas: f # r}, - do - await /\ b > node[r].balMaxKnown - /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b); - \* W.L.O.G., using this clause to model that ballot - \* numbers from different proposers be unique - \* update states and restart Prepare phase for in-progress instances - node[r].leader := r || - node[r].commitPrev := NumWrites + 1 || - node[r].balPrepared := 0 || - node[r].balMaxKnown := b || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]; - \* broadcast Prepare and reply to myself instantly - Send({PrepareMsg(r, b), - PrepareReplyMsg(r, b, VotesByNode(node[r]))}); - \* expire my old lease grant if any and grant to myself - Lease(r, Config(b, r, resps)); - end with; -end macro; - -\* Replica replies to a Prepare message. -macro HandlePrepare(r) begin - \* if receiving a Prepare message with larger ballot than ever seen - with m \in msgs do - await /\ m.type = "Prepare" - /\ m.bal > node[r].balMaxKnown; - \* update states and reset statuses - node[r].leader := m.src || - node[r].commitPrev := NumWrites + 1 || - node[r].balMaxKnown := m.bal || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]; - \* send back PrepareReply with my voted list - Send({PrepareReplyMsg(r, m.bal, VotesByNode(node[r]))}); - \* expire my old lease grant if any and grant to new leader - \* remember that we simplify this spec by merging responders - \* config change into leader change Prepares - Lease(r, (CHOOSE g \in grants: g.from = m.src).config); - end with; -end macro; - -\* Leader gathers PrepareReply messages until condition met, then marks -\* the corresponding ballot as prepared and saves highest voted commands. -macro HandlePrepareReplies(r) begin - \* if I'm waiting for PrepareReplies - await /\ node[r].leader = r - /\ node[r].balPrepared = 0; - \* when there are enough number of PrepareReplies of desired ballot - with prs = {m \in msgs: /\ m.type = "PrepareReply" - /\ m.bal = node[r].balMaxKnown} - do - await Cardinality({pr.src: pr \in prs}) >= MajorityNum; - \* marks this ballot as prepared and saves highest voted command - \* in each slot if any - node[r].balPrepared := node[r].balMaxKnown || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF \/ @ = "Preparing" - \/ /\ @ = "Empty" - /\ PeakVotedWrite(prs, s) # "nil" - THEN "Accepting" - ELSE @, - !.write = PeakVotedWrite(prs, s)]] || - node[r].commitPrev := LastTouchedSlot(prs); - \* send Accept messages for in-progress instances and reply to myself - \* instantly; send PrepareNotices as well - Send(UNION - {{AcceptMsg(r, node[r].balPrepared, s, node[r].insts[s].write), - AcceptReplyMsg(r, node[r].balPrepared, s)}: - s \in {s \in Slots: node[r].insts[s].status = "Accepting"}} - \cup {PrepareNoticeMsg(r, node[r].balPrepared, LastTouchedSlot(prs))}); - end with; -end macro; - -\* Follower receives PrepareNotice from a prepared and recovered leader, and -\* updates its commitPrev accordingly. -macro HandlePrepareNotice(r) begin - \* if I'm a follower waiting on PrepareNotice - await /\ ThinkAmFollower(r) - /\ node[r].commitPrev = NumWrites + 1; - \* when there's a PrepareNotice message in effect - with m \in msgs do - await /\ m.type = "PrepareNotice" - /\ m.bal = node[r].balMaxKnown; - \* update my commitPrev - node[r].commitPrev := m.commit_prev; - end with; -end macro; - -\* A prepared leader takes a new write request into the next empty slot. -macro TakeNewWriteRequest(r) begin - \* if I'm a prepared leader and there's pending write request - await /\ ThinkAmLeader(r) - /\ \E s \in Slots: node[r].insts[s].status = "Empty" - /\ Len(UnseenPending(r)) > 0 - /\ Head(UnseenPending(r)) \in Writes; - \* find the next empty slot and pick a pending request - with s = FirstEmptySlot(node[r].insts), - c = Head(UnseenPending(r)) - \* W.L.O.G., only pick a command not seen in current - \* prepared log to have smaller state space; in practice, - \* duplicated client requests should be treated by some - \* idempotency mechanism such as using request IDs - do - \* update slot status and voted - node[r].insts[s].status := "Accepting" || - node[r].insts[s].write := c || - node[r].insts[s].voted.bal := node[r].balPrepared || - node[r].insts[s].voted.write := c; - \* broadcast Accept and reply to myself instantly - Send({AcceptMsg(r, node[r].balPrepared, s, c), - AcceptReplyMsg(r, node[r].balPrepared, s)}); - \* append to observed events sequence if haven't yet - Observe(<>); - end with; -end macro; - -\* Replica replies to an Accept message. -macro HandleAccept(r) begin - \* if I'm a follower - await ThinkAmFollower(r); - \* if receiving an unreplied Accept message with valid ballot - with m \in msgs do - await /\ m.type = "Accept" - /\ m.bal >= node[r].balMaxKnown - /\ m.bal >= node[r].insts[m.slot].voted.bal; - \* update node states and corresponding instance's states - node[r].leader := m.src || - node[r].balMaxKnown := m.bal || - node[r].insts[m.slot].status := "Accepting" || - node[r].insts[m.slot].write := m.write || - node[r].insts[m.slot].voted.bal := m.bal || - node[r].insts[m.slot].voted.write := m.write; - \* send back AcceptReply - Send({AcceptReplyMsg(r, m.bal, m.slot)}); - end with; -end macro; - -\* Leader gathers AcceptReply messages for a slot until condition met, -\* then marks the slot as committed and acknowledges the client. -macro HandleAcceptReplies(r) begin - \* if I'm a prepared leader - await /\ ThinkAmLeader(r) - /\ node[r].commitUpTo < NumWrites - /\ node[r].insts[node[r].commitUpTo+1].status = "Accepting"; - \* W.L.O.G., only enabling the next slot after commitUpTo - \* here to make the body of this macro simpler; in practice, - \* messages are received proactively and there should be a - \* separate "Executed" status - \* for this slot, when there is a good set of AcceptReplies that is at - \* least a majority number and that covers all responders - with s = node[r].commitUpTo + 1, - c = node[r].insts[s].write, - ls = s - 1, - v = IF ls = 0 THEN "nil" ELSE node[r].insts[ls].write, - ars = {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[r].balPrepared} - do - await WriteCommittable(ars); - \* marks this slot as committed and apply command - node[r].insts[s].status := "Committed" || - node[r].commitUpTo := s; - \* append to observed events sequence if haven't yet, and remove - \* the command from pending - Observe(<>); - Resolve(c); - \* broadcast CommitNotice to followers - Send({CommitNoticeMsg(s)}); - end with; -end macro; - -\* Replica receives new commit notification. -macro HandleCommitNotice(r) begin - \* if I'm a follower waiting on CommitNotice - await /\ ThinkAmFollower(r) - /\ node[r].commitUpTo < NumWrites - /\ node[r].insts[node[r].commitUpTo+1].status = "Accepting"; - \* W.L.O.G., only enabling the next slot after commitUpTo - \* here to make the body of this macro simpler - \* for this slot, when there's a CommitNotice message - with s = node[r].commitUpTo + 1, - c = node[r].insts[s].write, - m \in msgs - do - await /\ m.type = "CommitNotice" - /\ m.upto = s; - \* marks this slot as committed and apply command - node[r].insts[s].status := "Committed" || - node[r].commitUpTo := s; - end with; -end macro; - -\* A prepared leader or a responder follower takes a new read request and -\* serves it locally. -macro TakeNewReadRequest(r) begin - \* if I'm a caught-up leader or responder follower - await /\ \/ ThinkAmLeader(r) - \/ ThinkAmResponder(r) - /\ BallotTransfered(r) - /\ Len(UnseenPending(r)) > 0 - /\ Head(UnseenPending(r)) \in Reads; - \* pick a pending request; examine my log and find the last non-empty - \* slot, check its status - with s = LastNonEmptySlot(node[r].insts), - v = IF s = 0 THEN "nil" ELSE node[r].insts[s].write, - c = Head(UnseenPending(r)) - \* W.L.O.G., only pick a command not seen in current - \* prepared log to have smaller state space; in practice, - \* duplicated client requests should be treated by some - \* idempotency mechanism such as using request IDs - do - \* if the latest value is in Committed status, can directly reply; - \* otherwise, should hold until I've received enough broadcasted - \* AcceptReplies indicating that the write is surely to be committed - await \/ s = 0 - \/ node[r].insts[s].status = "Committed" - \/ LET ars == {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[r].balMaxKnown} - IN WriteCommittable(ars); - \* acknowledge client with the latest value, and remove the command - \* from pending - Observe(<>); - Resolve(c); - end with; -end macro; - -\* Replica node crashes itself under promised conditions. -macro ReplicaCrashes(r) begin - \* if less than (N - majority) number of replicas have failed - await /\ MajorityNum + numCrashed < Cardinality(Replicas) - /\ ~crashed[r] - /\ node[r].balMaxKnown < MaxBallot; - \* this clause is needed only because we have an upper - \* bound ballot number for modeling checking; in practice - \* someone else could always come up with a higher ballot - \* mark myself as crashed - crashed[r] := TRUE; -end macro; - -\* Replica server node main loop. -process Replica \in Replicas -begin - rloop: while (~terminated) /\ (~crashed[self]) do - either - BecomeLeader(self); - or - HandlePrepare(self); - or - HandlePrepareReplies(self); - or - HandlePrepareNotice(self); - or - TakeNewWriteRequest(self); - or - HandleAccept(self); - or - HandleAcceptReplies(self); - or - HandleCommitNotice(self); - or - TakeNewReadRequest(self); - or - if NodeFailuresOn then - ReplicaCrashes(self); - end if; - end either; - end while; -end process; - -end algorithm; *) - ----------- - -\* BEGIN TRANSLATION (chksum(pcal) = "ed44672" /\ chksum(tla) = "387844b7") -VARIABLES msgs, grants, node, pending, observed, crashed, pc - -(* define statement *) -CurrentConfig == - LET leased(b) == Cardinality({g \in grants: - g.config.bal = b}) >= MajorityNum - IN IF ~\E b \in Ballots: leased(b) - THEN Config(0, "none", 0) - ELSE (CHOOSE g \in grants: leased(g.config.bal)).config - - - -ThinkAmLeader(r) == /\ node[r].leader = r - /\ node[r].balPrepared = node[r].balMaxKnown - /\ CurrentConfig.bal > 0 - /\ CurrentConfig.bal = node[r].balMaxKnown - /\ CurrentConfig.leader = r - -ThinkAmFollower(r) == /\ node[r].leader # r - /\ CurrentConfig.bal > 0 - /\ CurrentConfig.bal = node[r].balMaxKnown - /\ CurrentConfig.leader # r - -ThinkAmResponder(r) == /\ ThinkAmFollower(r) - /\ r \in CurrentConfig.responders - -BallotTransfered(r) == node[r].commitUpTo >= node[r].commitPrev - -WriteCommittable(ars) == - /\ Cardinality({ar.src: ar \in ars}) >= MajorityNum - /\ CurrentConfig.responders \subseteq {ar.src: ar \in ars} - -reqsMade == {e.cmd: e \in {e \in Range(observed): e.type = "Req"}} - -acksRecv == {e.cmd: e \in {e \in Range(observed): e.type = "Ack"}} - -AppendObserved(seq) == - LET filter(e) == IF e.type = "Req" THEN e.cmd \notin reqsMade - ELSE e.cmd \notin acksRecv - IN observed \o SelectSeq(seq, filter) - -UnseenPending(r) == - LET filter(c) == \A s \in Slots: node[r].insts[s].write # c - IN SelectSeq(pending, filter) - -RemovePending(cmd) == - LET filter(c) == c # cmd - IN SelectSeq(pending, filter) - -terminated == /\ Len(pending) = 0 - /\ Cardinality(reqsMade) = NumCommands - /\ Cardinality(acksRecv) = NumCommands - -numCrashed == Cardinality({r \in Replicas: crashed[r]}) - - -vars == << msgs, grants, node, pending, observed, crashed, pc >> - -ProcSet == (Replicas) - -Init == (* Global variables *) - /\ msgs = {} - /\ grants = {} - /\ node = [r \in Replicas |-> NullNode] - /\ pending = InitPending - /\ observed = <<>> - /\ crashed = [r \in Replicas |-> FALSE] - /\ pc = [self \in ProcSet |-> "rloop"] - -rloop(self) == /\ pc[self] = "rloop" - /\ IF (~terminated) /\ (~crashed[self]) - THEN /\ \/ /\ node[self].leader # self - /\ \E b \in Ballots: - \E resps \in SUBSET {f \in Replicas: f # self}: - /\ /\ b > node[self].balMaxKnown - /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b) - /\ node' = [node EXCEPT ![self].leader = self, - ![self].commitPrev = NumWrites + 1, - ![self].balPrepared = 0, - ![self].balMaxKnown = b, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]] - /\ msgs' = (msgs \cup ({PrepareMsg(self, b), - PrepareReplyMsg(self, b, VotesByNode(node'[self]))})) - /\ grants' = ({g \in grants: g.from # self} \cup {LeaseGrant(self, (Config(b, self, resps)))}) - /\ UNCHANGED <> - \/ /\ \E m \in msgs: - /\ /\ m.type = "Prepare" - /\ m.bal > node[self].balMaxKnown - /\ node' = [node EXCEPT ![self].leader = m.src, - ![self].commitPrev = NumWrites + 1, - ![self].balMaxKnown = m.bal, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]] - /\ msgs' = (msgs \cup ({PrepareReplyMsg(self, m.bal, VotesByNode(node'[self]))})) - /\ grants' = ({g \in grants: g.from # self} \cup {LeaseGrant(self, ((CHOOSE g \in grants: g.from = m.src).config))}) - /\ UNCHANGED <> - \/ /\ /\ node[self].leader = self - /\ node[self].balPrepared = 0 - /\ LET prs == {m \in msgs: /\ m.type = "PrepareReply" - /\ m.bal = node[self].balMaxKnown} IN - /\ Cardinality({pr.src: pr \in prs}) >= MajorityNum - /\ node' = [node EXCEPT ![self].balPrepared = node[self].balMaxKnown, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF \/ @ = "Preparing" - \/ /\ @ = "Empty" - /\ PeakVotedWrite(prs, s) # "nil" - THEN "Accepting" - ELSE @, - !.write = PeakVotedWrite(prs, s)]], - ![self].commitPrev = LastTouchedSlot(prs)] - /\ msgs' = (msgs \cup ( UNION - {{AcceptMsg(self, node'[self].balPrepared, s, node'[self].insts[s].write), - AcceptReplyMsg(self, node'[self].balPrepared, s)}: - s \in {s \in Slots: node'[self].insts[s].status = "Accepting"}} - \cup {PrepareNoticeMsg(self, node'[self].balPrepared, LastTouchedSlot(prs))})) - /\ UNCHANGED <> - \/ /\ /\ ThinkAmFollower(self) - /\ node[self].commitPrev = NumWrites + 1 - /\ \E m \in msgs: - /\ /\ m.type = "PrepareNotice" - /\ m.bal = node[self].balMaxKnown - /\ node' = [node EXCEPT ![self].commitPrev = m.commit_prev] - /\ UNCHANGED <> - \/ /\ /\ ThinkAmLeader(self) - /\ \E s \in Slots: node[self].insts[s].status = "Empty" - /\ Len(UnseenPending(self)) > 0 - /\ Head(UnseenPending(self)) \in Writes - /\ LET s == FirstEmptySlot(node[self].insts) IN - LET c == Head(UnseenPending(self)) IN - /\ node' = [node EXCEPT ![self].insts[s].status = "Accepting", - ![self].insts[s].write = c, - ![self].insts[s].voted.bal = node[self].balPrepared, - ![self].insts[s].voted.write = c] - /\ msgs' = (msgs \cup ({AcceptMsg(self, node'[self].balPrepared, s, c), - AcceptReplyMsg(self, node'[self].balPrepared, s)})) - /\ observed' = AppendObserved((<>)) - /\ UNCHANGED <> - \/ /\ ThinkAmFollower(self) - /\ \E m \in msgs: - /\ /\ m.type = "Accept" - /\ m.bal >= node[self].balMaxKnown - /\ m.bal >= node[self].insts[m.slot].voted.bal - /\ node' = [node EXCEPT ![self].leader = m.src, - ![self].balMaxKnown = m.bal, - ![self].insts[m.slot].status = "Accepting", - ![self].insts[m.slot].write = m.write, - ![self].insts[m.slot].voted.bal = m.bal, - ![self].insts[m.slot].voted.write = m.write] - /\ msgs' = (msgs \cup ({AcceptReplyMsg(self, m.bal, m.slot)})) - /\ UNCHANGED <> - \/ /\ /\ ThinkAmLeader(self) - /\ node[self].commitUpTo < NumWrites - /\ node[self].insts[node[self].commitUpTo+1].status = "Accepting" - /\ LET s == node[self].commitUpTo + 1 IN - LET c == node[self].insts[s].write IN - LET ls == s - 1 IN - LET v == IF ls = 0 THEN "nil" ELSE node[self].insts[ls].write IN - LET ars == {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[self].balPrepared} IN - /\ WriteCommittable(ars) - /\ node' = [node EXCEPT ![self].insts[s].status = "Committed", - ![self].commitUpTo = s] - /\ observed' = AppendObserved((<>)) - /\ pending' = RemovePending(c) - /\ msgs' = (msgs \cup ({CommitNoticeMsg(s)})) - /\ UNCHANGED <> - \/ /\ /\ ThinkAmFollower(self) - /\ node[self].commitUpTo < NumWrites - /\ node[self].insts[node[self].commitUpTo+1].status = "Accepting" - /\ LET s == node[self].commitUpTo + 1 IN - LET c == node[self].insts[s].write IN - \E m \in msgs: - /\ /\ m.type = "CommitNotice" - /\ m.upto = s - /\ node' = [node EXCEPT ![self].insts[s].status = "Committed", - ![self].commitUpTo = s] - /\ UNCHANGED <> - \/ /\ /\ \/ ThinkAmLeader(self) - \/ ThinkAmResponder(self) - /\ BallotTransfered(self) - /\ Len(UnseenPending(self)) > 0 - /\ Head(UnseenPending(self)) \in Reads - /\ LET s == LastNonEmptySlot(node[self].insts) IN - LET v == IF s = 0 THEN "nil" ELSE node[self].insts[s].write IN - LET c == Head(UnseenPending(self)) IN - /\ \/ s = 0 - \/ node[self].insts[s].status = "Committed" - \/ LET ars == {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[self].balMaxKnown} - IN WriteCommittable(ars) - /\ observed' = AppendObserved((<>)) - /\ pending' = RemovePending(c) - /\ UNCHANGED <> - \/ /\ IF NodeFailuresOn - THEN /\ /\ MajorityNum + numCrashed < Cardinality(Replicas) - /\ ~crashed[self] - /\ node[self].balMaxKnown < MaxBallot - /\ crashed' = [crashed EXCEPT ![self] = TRUE] - ELSE /\ TRUE - /\ UNCHANGED crashed - /\ UNCHANGED <> - /\ pc' = [pc EXCEPT ![self] = "rloop"] - ELSE /\ pc' = [pc EXCEPT ![self] = "Done"] - /\ UNCHANGED << msgs, grants, node, pending, - observed, crashed >> - -Replica(self) == rloop(self) - -(* Allow infinite stuttering to prevent deadlock on termination. *) -Terminating == /\ \A self \in ProcSet: pc[self] = "Done" - /\ UNCHANGED vars - -Next == (\E self \in Replicas: Replica(self)) - \/ Terminating - -Spec == Init /\ [][Next]_vars - -Termination == <>(\A self \in ProcSet: pc[self] = "Done") - -\* END TRANSLATION - -==== diff --git a/tla+/bodega/Bodega_MC.cfg b/tla+/bodega/Bodega_MC.cfg deleted file mode 100644 index 5320cfcc..00000000 --- a/tla+/bodega/Bodega_MC.cfg +++ /dev/null @@ -1,16 +0,0 @@ -SPECIFICATION Spec - -CONSTANTS - Replicas = {s1, s2, s3} - Writes = {w1, w2} - Reads = {r1, r2} - MaxBallot <- ConstMaxBallot - NodeFailuresOn <- TRUE - -SYMMETRY SymmetricPerms - -INVARIANTS - TypeOK - Linearizability - -CHECK_DEADLOCK TRUE diff --git a/tla+/bodega/Bodega_MC.tla b/tla+/bodega/Bodega_MC.tla deleted file mode 100644 index 677358a1..00000000 --- a/tla+/bodega/Bodega_MC.tla +++ /dev/null @@ -1,78 +0,0 @@ ----- MODULE Bodega_MC ---- -EXTENDS Bodega - -(****************************) -(* TLC config-related defs. *) -(****************************) -ConditionalPerm(set) == IF Cardinality(set) > 1 - THEN Permutations(set) - ELSE {} - -SymmetricPerms == ConditionalPerm(Replicas) - \cup ConditionalPerm(Writes) - \cup ConditionalPerm(Reads) - -ConstMaxBallot == 2 - ----------- - -(*************************) -(* Type check invariant. *) -(*************************) -TypeOK == /\ \A m \in msgs: m \in Messages - /\ \A g \in grants: g \in LeaseGrants - /\ Cardinality({g.from: g \in grants}) = Cardinality(grants) - /\ \A r \in Replicas: node[r] \in NodeStates - /\ Len(pending) =< NumCommands - /\ Cardinality(Range(pending)) = Len(pending) - /\ \A c \in Range(pending): c \in Commands - /\ Len(observed) =< 2 * NumCommands - /\ Cardinality(Range(observed)) = Len(observed) - /\ Cardinality(reqsMade) >= Cardinality(acksRecv) - /\ \A e \in Range(observed): e \in ClientEvents - /\ \A r \in Replicas: crashed[r] \in BOOLEAN - -THEOREM Spec => []TypeOK - ----------- - -(*******************************) -(* Linearizability constraint. *) -(*******************************) -ReqPosOfCmd(c) == CHOOSE i \in 1..Len(observed): - /\ observed[i].type = "Req" - /\ observed[i].cmd = c - -AckPosOfCmd(c) == CHOOSE i \in 1..Len(observed): - /\ observed[i].type = "Ack" - /\ observed[i].cmd = c - -ResultOfCmd(c) == observed[AckPosOfCmd(c)].val - -OrderIdxOfCmd(order, c) == CHOOSE j \in 1..Len(order): order[j] = c - -LastWriteBefore(order, j) == - LET k == CHOOSE k \in 0..(j-1): - /\ (k = 0 \/ order[k] \in Writes) - /\ \A l \in (k+1)..(j-1): order[l] \in Reads - IN IF k = 0 THEN "nil" ELSE order[k] - -IsLinearOrder(order) == - /\ {order[j]: j \in 1..Len(order)} = Commands - /\ \A j \in 1..Len(order): - ResultOfCmd(order[j]) = LastWriteBefore(order, j) - -ObeysRealTime(order) == - \A c1, c2 \in Commands: - (AckPosOfCmd(c1) < ReqPosOfCmd(c2)) - => (OrderIdxOfCmd(order, c1) < OrderIdxOfCmd(order, c2)) - -Linearizability == - terminated => - \E order \in [1..NumCommands -> Commands]: - /\ IsLinearOrder(order) - /\ ObeysRealTime(order) - -THEOREM Spec => Linearizability - -==== \ No newline at end of file diff --git a/tla+/crossword/.gitignore b/tla+/crossword/.gitignore deleted file mode 100644 index 30989982..00000000 --- a/tla+/crossword/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -Crossword.cfg - -Crossword.tex -Crossword.dvi -Crossword.pdf - -Crossword_MC.tex -Crossword_MC.dvi -Crossword_MC.pdf diff --git a/tla+/crossword/Crossword.tla b/tla+/crossword/Crossword.tla deleted file mode 100644 index cb8dfa4f..00000000 --- a/tla+/crossword/Crossword.tla +++ /dev/null @@ -1,739 +0,0 @@ -(**********************************************************************************) -(* Crossword protocol in state machine replication (SMR) style with write/read *) -(* commands on a single key. Payload of each write is allowed to be erasure-coded *) -(* in a (N, M) scheme and follow balanced Round-Robin assignment policies. *) -(* Careful adjustments to the accpetance condition are made to retaining the same *) -(* fault-tolerance level as classic MultiPaxos. *) -(* *) -(* See multipaxos_smr_style/MultiPaxos.tla for detailed description of base spec. *) -(**********************************************************************************) - ----- MODULE Crossword ---- -EXTENDS FiniteSets, Sequences, Integers, TLC - -(*******************************) -(* Model inputs & assumptions. *) -(*******************************) -CONSTANT Replicas, \* symmetric set of server nodes - Writes, \* symmetric set of write commands (each w/ unique value) - Reads, \* symmetric set of read commands - MaxBallot, \* maximum ballot pickable for leader preemption - CommitNoticeOn, \* if true, turn on CommitNotice messages - NodeFailuresOn \* if true, turn on node failures injection - -ReplicasAssumption == /\ IsFiniteSet(Replicas) - /\ Cardinality(Replicas) >= 1 - -WritesAssumption == /\ IsFiniteSet(Writes) - /\ Cardinality(Writes) >= 1 - /\ "nil" \notin Writes - \* a write command model value serves as both the - \* ID of the command and the value to be written - -ReadsAssumption == /\ IsFiniteSet(Reads) - /\ Cardinality(Reads) >= 0 - /\ "nil" \notin Writes - -MaxBallotAssumption == /\ MaxBallot \in Nat - /\ MaxBallot >= 2 - -CommitNoticeOnAssumption == CommitNoticeOn \in BOOLEAN - -NodeFailuresOnAssumption == NodeFailuresOn \in BOOLEAN - -ASSUME /\ ReplicasAssumption - /\ WritesAssumption - /\ ReadsAssumption - /\ MaxBallotAssumption - /\ CommitNoticeOnAssumption - /\ NodeFailuresOnAssumption - ----------- - -(********************************) -(* Useful constants & typedefs. *) -(********************************) -Commands == Writes \cup Reads - -NumCommands == Cardinality(Commands) - -Population == Cardinality(Replicas) - -MajorityNum == (Population \div 2) + 1 - -Shards == Replicas - -NumDataShards == MajorityNum - -Range(func) == {func[i]: i \in DOMAIN func} - -\* Client observable events. -ClientEvents == [type: {"Req"}, cmd: Commands] - \cup [type: {"Ack"}, cmd: Commands, - val: {"nil"} \cup Writes] - -ReqEvent(c) == [type |-> "Req", cmd |-> c] - -AckEvent(c, v) == [type |-> "Ack", cmd |-> c, val |-> v] - \* val is the old value for a write command - -InitPending == (CHOOSE ws \in [1..Cardinality(Writes) -> Writes] - : Range(ws) = Writes) - \o (CHOOSE rs \in [1..Cardinality(Reads) -> Reads] - : Range(rs) = Reads) - \* W.L.O.G., choose any sequence contatenating writes - \* commands and read commands as the sequence of reqs; - \* all other cases are either symmetric or less useful - \* than this one - -\* Server-side constants & states. -Ballots == 1..MaxBallot - -Slots == 1..NumCommands - -Statuses == {"Preparing", "Accepting", "Committed"} - -InstStates == [status: {"Empty"} \cup Statuses, - cmd: {"nil"} \cup Commands, - shards: SUBSET Shards, - voted: [bal: {0} \cup Ballots, - cmd: {"nil"} \cup Commands, - shards: SUBSET Shards]] - -NullInst == [status |-> "Empty", - cmd |-> "nil", - shards |-> {}, - voted |-> [bal |-> 0, cmd |-> "nil", shards |-> {}]] - -NodeStates == [leader: {"none"} \cup Replicas, - kvalue: {"nil"} \cup Writes, - commitUpTo: {0} \cup Slots, - balPrepared: {0} \cup Ballots, - balMaxKnown: {0} \cup Ballots, - insts: [Slots -> InstStates]] - -NullNode == [leader |-> "none", - kvalue |-> "nil", - commitUpTo |-> 0, - balPrepared |-> 0, - balMaxKnown |-> 0, - insts |-> [s \in Slots |-> NullInst]] - -FirstEmptySlot(insts) == - CHOOSE s \in Slots: - /\ insts[s].status = "Empty" - /\ \A t \in 1..(s-1): insts[t].status # "Empty" - -\* Erasure-coding related expressions. -BigEnoughUnderFaults(g, u) == - \* Is g a large enough subset of u under promised fault-tolerance? - Cardinality(g) >= (Cardinality(u) + MajorityNum - Population) - -SubsetsUnderFaults(u) == - \* Set of subsets of u we consider under promised fault-tolerance. - {g \in SUBSET u: BigEnoughUnderFaults(g, u)} - -IsGoodCoverageSet(cs) == - \* Is cs a coverage set (i.e., a set of sets of shards) from which - \* we can reconstruct the original data? - Cardinality(UNION cs) >= NumDataShards - -ShardToIdx == CHOOSE map \in [Shards -> 1..Cardinality(Shards)]: - Cardinality(Range(map)) = Cardinality(Shards) - -IdxToShard == [i \in 1..Cardinality(Shards) |-> - CHOOSE r \in Shards: ShardToIdx[r] = i] - -ValidAssignments == - \* Set of all valid shard assignments. - {[r \in Replicas |-> {IdxToShard[((i-1) % Cardinality(Shards)) + 1]: - i \in (ShardToIdx[r])..(ShardToIdx[r]+na-1)}]: - na \in 1..MajorityNum} - -\* ASSUME Print(ValidAssignments, TRUE) - -\* Service-internal messages. -PrepareMsgs == [type: {"Prepare"}, src: Replicas, - bal: Ballots] - -PrepareMsg(r, b) == [type |-> "Prepare", src |-> r, - bal |-> b] - -InstsVotes == [Slots -> [bal: {0} \cup Ballots, - cmd: {"nil"} \cup Commands, - shards: SUBSET Shards]] - -VotesByNode(n) == [s \in Slots |-> n.insts[s].voted] - -PrepareReplyMsgs == [type: {"PrepareReply"}, src: Replicas, - bal: Ballots, - votes: InstsVotes] - -PrepareReplyMsg(r, b, iv) == - [type |-> "PrepareReply", src |-> r, - bal |-> b, - votes |-> iv] - -PreparedConditionAndCommand(prs, s) == - \* examines a set of PrepareReplies and returns a tuple: - \* (if the given slot can be decided as prepared, - \* the prepared command if forced, - \* known shards of the command if forced) - LET ppr == CHOOSE ppr \in prs: - \A pr \in prs: pr.votes[s].bal =< ppr.votes[s].bal - IN IF /\ BigEnoughUnderFaults(prs, Replicas) - /\ \A pr \in prs: pr.votes[s].bal = 0 - THEN [prepared |-> TRUE, cmd |-> "nil", shards |-> {}] - \* prepared, can choose any - ELSE IF /\ BigEnoughUnderFaults(prs, Replicas) - /\ IsGoodCoverageSet( - {pr.votes[s].shards: - pr \in {pr \in prs: - pr.votes[s].cmd = ppr.votes[s].cmd}}) - THEN [prepared |-> TRUE, - cmd |-> ppr.votes[s].cmd, - shards |-> UNION - {pr.votes[s].shards: - pr \in {pr \in prs: - pr.votes[s].cmd = ppr.votes[s].cmd}}] - \* prepared, command forced - ELSE IF /\ BigEnoughUnderFaults(prs, Replicas) - /\ ~IsGoodCoverageSet( - {pr.votes[s].shards: - pr \in {pr \in prs: - pr.votes[s].cmd = ppr.votes[s].cmd}}) - THEN [prepared |-> TRUE, cmd |-> "nil", shards |-> {}] - \* prepared, can choose any - ELSE [prepared |-> FALSE, cmd |-> "nil", shard |-> {}] - \* not prepared - -AcceptMsgs == [type: {"Accept"}, src: Replicas, - dst: Replicas, - bal: Ballots, - slot: Slots, - cmd: Commands, - shards: SUBSET Shards] - -AcceptMsg(r, d, b, s, c, sds) == [type |-> "Accept", src |-> r, - dst |-> d, - bal |-> b, - slot |-> s, - cmd |-> c, - shards |-> sds] - -AcceptReplyMsgs == [type: {"AcceptReply"}, src: Replicas, - bal: Ballots, - slot: Slots, - shards: SUBSET Shards] - -AcceptReplyMsg(r, b, s, sds) == - [type |-> "AcceptReply", src |-> r, - bal |-> b, - slot |-> s, - shards |-> sds] - -CommittedCondition(ars, s) == - \* the condition which decides if a set of AcceptReplies makes an - \* instance committed - /\ BigEnoughUnderFaults(ars, Replicas) - /\ \A group \in SubsetsUnderFaults(ars): - IsGoodCoverageSet({ar.shards: ar \in group}) - -CommitNoticeMsgs == [type: {"CommitNotice"}, upto: Slots] - -CommitNoticeMsg(u) == [type |-> "CommitNotice", upto |-> u] - -Messages == PrepareMsgs - \cup PrepareReplyMsgs - \cup AcceptMsgs - \cup AcceptReplyMsgs - \cup CommitNoticeMsgs - ----------- - -(******************************) -(* Main algorithm in PlusCal. *) -(******************************) -(*--algorithm Crossword - -variable msgs = {}, \* messages in the network - node = [r \in Replicas |-> NullNode], \* replica node state - pending = InitPending, \* sequence of pending reqs - observed = <<>>, \* client observed events - crashed = [r \in Replicas |-> FALSE]; \* replica crashed flag - -define - UnseenPending(insts) == - LET filter(c) == c \notin {insts[s].cmd: s \in Slots} - IN SelectSeq(pending, filter) - - RemovePending(cmd) == - LET filter(c) == c # cmd - IN SelectSeq(pending, filter) - - reqsMade == {e.cmd: e \in {e \in Range(observed): e.type = "Req"}} - - acksRecv == {e.cmd: e \in {e \in Range(observed): e.type = "Ack"}} - - terminated == /\ Len(pending) = 0 - /\ Cardinality(reqsMade) = NumCommands - /\ Cardinality(acksRecv) = NumCommands - - numCrashed == Cardinality({r \in Replicas: crashed[r]}) -end define; - -\* Send a set of messages helper. -macro Send(set) begin - msgs := msgs \cup set; -end macro; - -\* Observe a client event helper. -macro Observe(e) begin - if e \notin Range(observed) then - observed := Append(observed, e); - end if; -end macro; - -\* Resolve a pending command helper. -macro Resolve(c) begin - pending := RemovePending(c); -end macro; - -\* Someone steps up as leader and sends Prepare message to followers. -macro BecomeLeader(r) begin - \* if I'm not a leader - await node[r].leader # r; - \* pick a greater ballot number - with b \in Ballots do - await /\ b > node[r].balMaxKnown - /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b); - \* W.L.O.G., using this clause to model that ballot - \* numbers from different proposers be unique - \* update states and restart Prepare phase for in-progress instances - node[r].leader := r || - node[r].balPrepared := 0 || - node[r].balMaxKnown := b || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]; - \* broadcast Prepare and reply to myself instantly - Send({PrepareMsg(r, b), - PrepareReplyMsg(r, b, VotesByNode(node[r]))}); - end with; -end macro; - -\* Replica replies to a Prepare message. -macro HandlePrepare(r) begin - \* if receiving a Prepare message with larger ballot than ever seen - with m \in msgs do - await /\ m.type = "Prepare" - /\ m.bal > node[r].balMaxKnown; - \* update states and reset statuses - node[r].leader := m.src || - node[r].balMaxKnown := m.bal || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]; - \* send back PrepareReply with my voted list - Send({PrepareReplyMsg(r, m.bal, VotesByNode(node[r]))}); - end with; -end macro; - -\* Leader gathers PrepareReply messages until condition met, then marks -\* the corresponding ballot as prepared and saves highest voted commands. -macro HandlePrepareReplies(r) begin - \* if I'm waiting for PrepareReplies - await /\ node[r].leader = r - /\ node[r].balPrepared = 0; - \* when there are a set of PrepareReplies of desired ballot that satisfy - \* the prepared condition - with prs = {m \in msgs: /\ m.type = "PrepareReply" - /\ m.bal = node[r].balMaxKnown}, - exam = [s \in Slots |-> PreparedConditionAndCommand(prs, s)] - do - await \A s \in Slots: exam[s].prepared; - \* marks this ballot as prepared and saves highest voted command - \* in each slot if any - node[r].balPrepared := node[r].balMaxKnown || - node[r].insts := - [s \in Slots |-> - [node[r].insts[s] - EXCEPT !.status = IF /\ \/ @ = "Empty" - \/ @ = "Preparing" - \/ @ = "Accepting" - /\ exam[s].cmd # "nil" - THEN "Accepting" - ELSE IF @ = "Committed" - THEN "Committed" - ELSE "Empty", - !.cmd = exam[s].cmd, - !.shards = exam[s].shards]]; - \* pick a reasonable shard assignment and send Accept messages for - \* in-progress instances according to it - with assign \in ValidAssignments do - Send({AcceptMsg(r, d, node[r].balPrepared, s, - node[r].insts[s].cmd, assign[d]): - s \in {s \in Slots: - node[r].insts[s].status = "Accepting"}, - d \in Replicas} - \cup {AcceptReplyMsg(r, node[r].balPrepared, s, assign[r]): - s \in {s \in Slots: - node[r].insts[s].status = "Accepting"}}); - end with; - end with; -end macro; - -\* A prepared leader takes a new request to fill the next empty slot. -macro TakeNewRequest(r) begin - \* if I'm a prepared leader and there's pending request - await /\ node[r].leader = r - /\ node[r].balPrepared = node[r].balMaxKnown - /\ \E s \in Slots: node[r].insts[s].status = "Empty" - /\ Len(UnseenPending(node[r].insts)) > 0; - \* find the next empty slot and pick a pending request - with s = FirstEmptySlot(node[r].insts), - c = Head(UnseenPending(node[r].insts)) - \* W.L.O.G., only pick a command not seen in current - \* prepared log to have smaller state space; in practice, - \* duplicated client requests should be treated by some - \* idempotency mechanism such as using request IDs - do - \* update slot status and voted - node[r].insts[s].status := "Accepting" || - node[r].insts[s].cmd := c || - node[r].insts[s].voted.bal := node[r].balPrepared || - node[r].insts[s].voted.cmd := c || - node[r].insts[s].voted.shards := Shards; - \* pick a reasonable shard assignment, send Accept messages, and - \* reply to myself instantly - with assign \in ValidAssignments do - Send({AcceptMsg(r, d, node[r].balPrepared, s, c, assign[d]): - d \in Replicas} - \cup {AcceptReplyMsg(r, node[r].balPrepared, s, assign[r])}); - end with; - \* append to observed events sequence if haven't yet - Observe(ReqEvent(c)); - end with; -end macro; - -\* Replica replies to an Accept message. -macro HandleAccept(r) begin - \* if receiving an unreplied Accept message with valid ballot - with m \in msgs do - await /\ m.type = "Accept" - /\ m.dst = r - /\ m.bal >= node[r].balMaxKnown - /\ m.bal > node[r].insts[m.slot].voted.bal; - \* update node states and corresponding instance's states - node[r].leader := m.src || - node[r].balMaxKnown := m.bal || - node[r].insts[m.slot].status := "Accepting" || - node[r].insts[m.slot].cmd := m.cmd || - node[r].insts[m.slot].shards := m.shards || - node[r].insts[m.slot].voted.bal := m.bal || - node[r].insts[m.slot].voted.cmd := m.cmd || - node[r].insts[m.slot].voted.shards := m.shards; - \* send back AcceptReply - Send({AcceptReplyMsg(r, m.bal, m.slot, m.shards)}); - end with; -end macro; - -\* Leader gathers AcceptReply messages for a slot until condition met, then -\* marks the slot as committed and acknowledges the client. -macro HandleAcceptReplies(r) begin - \* if I think I'm a current leader - await /\ node[r].leader = r - /\ node[r].balPrepared = node[r].balMaxKnown - /\ node[r].commitUpTo < NumCommands - /\ node[r].insts[node[r].commitUpTo+1].status = "Accepting"; - \* W.L.O.G., only enabling the next slot after commitUpTo - \* here to make the body of this macro simpler - \* for this slot, when there is a set of AcceptReplies that satisfy the - \* committed condition - with s = node[r].commitUpTo + 1, - c = node[r].insts[s].cmd, - v = node[r].kvalue, - ars = {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[r].balPrepared} - do - await CommittedCondition(ars, s); - \* marks this slot as committed and apply command - node[r].insts[s].status := "Committed" || - node[r].commitUpTo := s || - node[r].kvalue := IF c \in Writes THEN c ELSE @; - \* append to observed events sequence if haven't yet, and remove - \* the command from pending - Observe(AckEvent(c, v)); - Resolve(c); - \* broadcast CommitNotice to followers - Send({CommitNoticeMsg(s)}); - end with; -end macro; - -\* Replica receives new commit notification. -macro HandleCommitNotice(r) begin - \* if I'm a follower waiting on CommitNotice - await /\ node[r].leader # r - /\ node[r].commitUpTo < NumCommands - /\ node[r].insts[node[r].commitUpTo+1].status = "Accepting"; - \* W.L.O.G., only enabling the next slot after commitUpTo - \* here to make the body of this macro simpler - \* for this slot, when there's a CommitNotice message - with s = node[r].commitUpTo + 1, - c = node[r].insts[s].cmd, - m \in msgs - do - await /\ m.type = "CommitNotice" - /\ m.upto = s; - \* marks this slot as committed and apply command - node[r].insts[s].status := "Committed" || - node[r].commitUpTo := s || - node[r].kvalue := IF c \in Writes THEN c ELSE @; - end with; -end macro; - -\* Replica node crashes itself under promised conditions. -macro ReplicaCrashes(r) begin - \* if less than (N - majority) number of replicas have failed - await /\ MajorityNum + numCrashed < Population - /\ ~crashed[r] - /\ node[r].balMaxKnown < MaxBallot; - \* this clause is needed only because we have an upper - \* bound ballot number for modeling checking; in practice - \* someone else could always come up with a higher ballot - \* mark myself as crashed - crashed[r] := TRUE; -end macro; - -\* Replica server node main loop. -process Replica \in Replicas -begin - rloop: while (~terminated) /\ (~crashed[self]) do - either - BecomeLeader(self); - or - HandlePrepare(self); - or - HandlePrepareReplies(self); - or - TakeNewRequest(self); - or - HandleAccept(self); - or - HandleAcceptReplies(self); - or - if CommitNoticeOn then - HandleCommitNotice(self); - end if; - or - if NodeFailuresOn then - ReplicaCrashes(self); - end if; - end either; - end while; -end process; - -end algorithm; *) - ----------- - -\* BEGIN TRANSLATION (chksum(pcal) = "2c6ba958" /\ chksum(tla) = "3272c05f") -VARIABLES msgs, node, pending, observed, crashed, pc - -(* define statement *) -UnseenPending(insts) == - LET filter(c) == c \notin {insts[s].cmd: s \in Slots} - IN SelectSeq(pending, filter) - -RemovePending(cmd) == - LET filter(c) == c # cmd - IN SelectSeq(pending, filter) - -reqsMade == {e.cmd: e \in {e \in Range(observed): e.type = "Req"}} - -acksRecv == {e.cmd: e \in {e \in Range(observed): e.type = "Ack"}} - -terminated == /\ Len(pending) = 0 - /\ Cardinality(reqsMade) = NumCommands - /\ Cardinality(acksRecv) = NumCommands - -numCrashed == Cardinality({r \in Replicas: crashed[r]}) - - -vars == << msgs, node, pending, observed, crashed, pc >> - -ProcSet == (Replicas) - -Init == (* Global variables *) - /\ msgs = {} - /\ node = [r \in Replicas |-> NullNode] - /\ pending = InitPending - /\ observed = <<>> - /\ crashed = [r \in Replicas |-> FALSE] - /\ pc = [self \in ProcSet |-> "rloop"] - -rloop(self) == /\ pc[self] = "rloop" - /\ IF (~terminated) /\ (~crashed[self]) - THEN /\ \/ /\ node[self].leader # self - /\ \E b \in Ballots: - /\ /\ b > node[self].balMaxKnown - /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b) - /\ node' = [node EXCEPT ![self].leader = self, - ![self].balPrepared = 0, - ![self].balMaxKnown = b, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]] - /\ msgs' = (msgs \cup ({PrepareMsg(self, b), - PrepareReplyMsg(self, b, VotesByNode(node'[self]))})) - /\ UNCHANGED <> - \/ /\ \E m \in msgs: - /\ /\ m.type = "Prepare" - /\ m.bal > node[self].balMaxKnown - /\ node' = [node EXCEPT ![self].leader = m.src, - ![self].balMaxKnown = m.bal, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF @ = "Accepting" - THEN "Preparing" - ELSE @]]] - /\ msgs' = (msgs \cup ({PrepareReplyMsg(self, m.bal, VotesByNode(node'[self]))})) - /\ UNCHANGED <> - \/ /\ /\ node[self].leader = self - /\ node[self].balPrepared = 0 - /\ LET prs == {m \in msgs: /\ m.type = "PrepareReply" - /\ m.bal = node[self].balMaxKnown} IN - LET exam == [s \in Slots |-> PreparedConditionAndCommand(prs, s)] IN - /\ \A s \in Slots: exam[s].prepared - /\ node' = [node EXCEPT ![self].balPrepared = node[self].balMaxKnown, - ![self].insts = [s \in Slots |-> - [node[self].insts[s] - EXCEPT !.status = IF /\ \/ @ = "Empty" - \/ @ = "Preparing" - \/ @ = "Accepting" - /\ exam[s].cmd # "nil" - THEN "Accepting" - ELSE IF @ = "Committed" - THEN "Committed" - ELSE "Empty", - !.cmd = exam[s].cmd, - !.shards = exam[s].shards]]] - /\ \E assign \in ValidAssignments: - msgs' = (msgs \cup ( {AcceptMsg(self, d, node'[self].balPrepared, s, - node'[self].insts[s].cmd, assign[d]): - s \in {s \in Slots: - node'[self].insts[s].status = "Accepting"}, - d \in Replicas} - \cup {AcceptReplyMsg(self, node'[self].balPrepared, s, assign[self]): - s \in {s \in Slots: - node'[self].insts[s].status = "Accepting"}})) - /\ UNCHANGED <> - \/ /\ /\ node[self].leader = self - /\ node[self].balPrepared = node[self].balMaxKnown - /\ \E s \in Slots: node[self].insts[s].status = "Empty" - /\ Len(UnseenPending(node[self].insts)) > 0 - /\ LET s == FirstEmptySlot(node[self].insts) IN - LET c == Head(UnseenPending(node[self].insts)) IN - /\ node' = [node EXCEPT ![self].insts[s].status = "Accepting", - ![self].insts[s].cmd = c, - ![self].insts[s].voted.bal = node[self].balPrepared, - ![self].insts[s].voted.cmd = c, - ![self].insts[s].voted.shards = Shards] - /\ \E assign \in ValidAssignments: - msgs' = (msgs \cup ( {AcceptMsg(self, d, node'[self].balPrepared, s, c, assign[d]): - d \in Replicas} - \cup {AcceptReplyMsg(self, node'[self].balPrepared, s, assign[self])})) - /\ IF (ReqEvent(c)) \notin Range(observed) - THEN /\ observed' = Append(observed, (ReqEvent(c))) - ELSE /\ TRUE - /\ UNCHANGED observed - /\ UNCHANGED <> - \/ /\ \E m \in msgs: - /\ /\ m.type = "Accept" - /\ m.dst = self - /\ m.bal >= node[self].balMaxKnown - /\ m.bal > node[self].insts[m.slot].voted.bal - /\ node' = [node EXCEPT ![self].leader = m.src, - ![self].balMaxKnown = m.bal, - ![self].insts[m.slot].status = "Accepting", - ![self].insts[m.slot].cmd = m.cmd, - ![self].insts[m.slot].shards = m.shards, - ![self].insts[m.slot].voted.bal = m.bal, - ![self].insts[m.slot].voted.cmd = m.cmd, - ![self].insts[m.slot].voted.shards = m.shards] - /\ msgs' = (msgs \cup ({AcceptReplyMsg(self, m.bal, m.slot, m.shards)})) - /\ UNCHANGED <> - \/ /\ /\ node[self].leader = self - /\ node[self].balPrepared = node[self].balMaxKnown - /\ node[self].commitUpTo < NumCommands - /\ node[self].insts[node[self].commitUpTo+1].status = "Accepting" - /\ LET s == node[self].commitUpTo + 1 IN - LET c == node[self].insts[s].cmd IN - LET v == node[self].kvalue IN - LET ars == {m \in msgs: /\ m.type = "AcceptReply" - /\ m.slot = s - /\ m.bal = node[self].balPrepared} IN - /\ CommittedCondition(ars, s) - /\ node' = [node EXCEPT ![self].insts[s].status = "Committed", - ![self].commitUpTo = s, - ![self].kvalue = IF c \in Writes THEN c ELSE @] - /\ IF (AckEvent(c, v)) \notin Range(observed) - THEN /\ observed' = Append(observed, (AckEvent(c, v))) - ELSE /\ TRUE - /\ UNCHANGED observed - /\ pending' = RemovePending(c) - /\ msgs' = (msgs \cup ({CommitNoticeMsg(s)})) - /\ UNCHANGED crashed - \/ /\ IF CommitNoticeOn - THEN /\ /\ node[self].leader # self - /\ node[self].commitUpTo < NumCommands - /\ node[self].insts[node[self].commitUpTo+1].status = "Accepting" - /\ LET s == node[self].commitUpTo + 1 IN - LET c == node[self].insts[s].cmd IN - \E m \in msgs: - /\ /\ m.type = "CommitNotice" - /\ m.upto = s - /\ node' = [node EXCEPT ![self].insts[s].status = "Committed", - ![self].commitUpTo = s, - ![self].kvalue = IF c \in Writes THEN c ELSE @] - ELSE /\ TRUE - /\ node' = node - /\ UNCHANGED <> - \/ /\ IF NodeFailuresOn - THEN /\ /\ MajorityNum + numCrashed < Population - /\ ~crashed[self] - /\ node[self].balMaxKnown < MaxBallot - /\ crashed' = [crashed EXCEPT ![self] = TRUE] - ELSE /\ TRUE - /\ UNCHANGED crashed - /\ UNCHANGED <> - /\ pc' = [pc EXCEPT ![self] = "rloop"] - ELSE /\ pc' = [pc EXCEPT ![self] = "Done"] - /\ UNCHANGED << msgs, node, pending, observed, - crashed >> - -Replica(self) == rloop(self) - -(* Allow infinite stuttering to prevent deadlock on termination. *) -Terminating == /\ \A self \in ProcSet: pc[self] = "Done" - /\ UNCHANGED vars - -Next == (\E self \in Replicas: Replica(self)) - \/ Terminating - -Spec == Init /\ [][Next]_vars - -Termination == <>(\A self \in ProcSet: pc[self] = "Done") - -\* END TRANSLATION - -==== diff --git a/tla+/crossword/Crossword_MC.cfg b/tla+/crossword/Crossword_MC.cfg deleted file mode 100644 index 3224bc01..00000000 --- a/tla+/crossword/Crossword_MC.cfg +++ /dev/null @@ -1,17 +0,0 @@ -SPECIFICATION Spec - -CONSTANTS - Replicas = {s1, s2, s3} - Writes = {w1, w2} - Reads <- ConfigEmptySet - MaxBallot <- ConstMaxBallot - CommitNoticeOn <- FALSE - NodeFailuresOn <- TRUE - -SYMMETRY SymmetricPerms - -INVARIANTS - TypeOK - Linearizability - -CHECK_DEADLOCK TRUE diff --git a/tla+/crossword/Crossword_MC.tla b/tla+/crossword/Crossword_MC.tla deleted file mode 100644 index 38f7c8a3..00000000 --- a/tla+/crossword/Crossword_MC.tla +++ /dev/null @@ -1,78 +0,0 @@ ----- MODULE Crossword_MC ---- -EXTENDS Crossword - -(****************************) -(* TLC config-related defs. *) -(****************************) -ConditionalPerm(set) == IF Cardinality(set) > 1 - THEN Permutations(set) - ELSE {} - -SymmetricPerms == ConditionalPerm(Replicas) - \cup ConditionalPerm(Writes) - \cup ConditionalPerm(Reads) - -ConfigEmptySet == {} - -ConstMaxBallot == 2 - ----------- - -(*************************) -(* Type check invariant. *) -(*************************) -TypeOK == /\ \A m \in msgs: m \in Messages - /\ \A r \in Replicas: node[r] \in NodeStates - /\ Len(pending) =< NumCommands - /\ Cardinality(Range(pending)) = Len(pending) - /\ \A c \in Range(pending): c \in Commands - /\ Len(observed) =< 2 * NumCommands - /\ Cardinality(Range(observed)) = Len(observed) - /\ Cardinality(reqsMade) >= Cardinality(acksRecv) - /\ \A e \in Range(observed): e \in ClientEvents - /\ \A r \in Replicas: crashed[r] \in BOOLEAN - -THEOREM Spec => []TypeOK - ----------- - -(*******************************) -(* Linearizability constraint. *) -(*******************************) -ReqPosOfCmd(c) == CHOOSE i \in 1..Len(observed): - /\ observed[i].type = "Req" - /\ observed[i].cmd = c - -AckPosOfCmd(c) == CHOOSE i \in 1..Len(observed): - /\ observed[i].type = "Ack" - /\ observed[i].cmd = c - -ResultOfCmd(c) == observed[AckPosOfCmd(c)].val - -OrderIdxOfCmd(order, c) == CHOOSE j \in 1..Len(order): order[j] = c - -LastWriteBefore(order, j) == - LET k == CHOOSE k \in 0..(j-1): - /\ (k = 0 \/ order[k] \in Writes) - /\ \A l \in (k+1)..(j-1): order[l] \in Reads - IN IF k = 0 THEN "nil" ELSE order[k] - -IsLinearOrder(order) == - /\ {order[j]: j \in 1..Len(order)} = Commands - /\ \A j \in 1..Len(order): - ResultOfCmd(order[j]) = LastWriteBefore(order, j) - -ObeysRealTime(order) == - \A c1, c2 \in Commands: - (AckPosOfCmd(c1) < ReqPosOfCmd(c2)) - => (OrderIdxOfCmd(order, c1) < OrderIdxOfCmd(order, c2)) - -Linearizability == - terminated => - \E order \in [1..NumCommands -> Commands]: - /\ IsLinearOrder(order) - /\ ObeysRealTime(order) - -THEOREM Spec => Linearizability - -==== \ No newline at end of file