From 0d6aa24659c7677fafb476abbf225369ad5d858e Mon Sep 17 00:00:00 2001
From: Guanzhou Jose Hu <35757009+josehu07@users.noreply.github.com>
Date: Sun, 8 Oct 2023 19:48:21 -0500
Subject: [PATCH] Raft implemented & several bug fixes (#19)
* Implemented ATC '14 version of Raft
* Made snapshotting conservative
* Added hole-filling mechanism for missing log slots to the Paxos variants
* Several bug fixes
---
.github/workflows/tests_proc.yml | 6 +-
.github/workflows/tests_unit.yml | 2 +-
Cargo.lock | 137 +-
README.md | 11 +-
scripts/local_cluster.py | 15 +-
scripts/workflow_test.py | 14 +
src/lib.rs | 1 +
src/manager/clusman.rs | 74 +-
src/manager/reigner.rs | 2 +-
src/protocols/mod.rs | 18 +
src/protocols/multipaxos.rs | 596 +++--
src/protocols/raft.rs | 2222 +++++++++++++++++++
src/protocols/rep_nothing.rs | 72 +-
src/protocols/rs_paxos.rs | 861 ++++---
src/protocols/simple_push.rs | 83 +-
src/server/external.rs | 3 +-
src/server/storage.rs | 79 +-
src/server/transport.rs | 20 +-
src/utils/bitmap.rs | 24 +
src/utils/error.rs | 2 +
src/utils/rscoding.rs | 15 +-
summerset_client/src/clients/repl.rs | 167 +-
summerset_client/src/clients/tester.rs | 21 +
summerset_client/src/drivers/closed_loop.rs | 148 +-
summerset_client/src/drivers/open_loop.rs | 60 +-
25 files changed, 3933 insertions(+), 720 deletions(-)
create mode 100644 src/protocols/raft.rs
diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml
index e8fbde3f..dbd7195e 100644
--- a/.github/workflows/tests_proc.yml
+++ b/.github/workflows/tests_proc.yml
@@ -16,5 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Run proc tests
- run: python3 scripts/workflow_test.py
+ - name: Run proc tests (MultiPaxos)
+ run: python3 scripts/workflow_test.py -p MultiPaxos
+ - name: Run proc tests (Raft)
+ run: python3 scripts/workflow_test.py -p Raft
diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml
index 0a1fd8d6..57aa8fb3 100644
--- a/.github/workflows/tests_unit.yml
+++ b/.github/workflows/tests_unit.yml
@@ -16,5 +16,5 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Run unit tests
+ - name: Run all unit tests
run: cargo test --workspace --verbose
diff --git a/Cargo.lock b/Cargo.lock
index 48d1c203..3a18786a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -39,9 +39,9 @@ dependencies = [
[[package]]
name = "anstream"
-version = "0.5.0"
+version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c"
+checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44"
dependencies = [
"anstyle",
"anstyle-parse",
@@ -53,15 +53,15 @@ dependencies = [
[[package]]
name = "anstyle"
-version = "1.0.3"
+version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
+checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
[[package]]
name = "anstyle-parse"
-version = "0.2.1"
+version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
+checksum = "317b9a89c1868f5ea6ff1d9539a69f45dffc21ce321ac1fd1160dfa48c8e2140"
dependencies = [
"utf8parse",
]
@@ -77,9 +77,9 @@ dependencies = [
[[package]]
name = "anstyle-wincon"
-version = "2.1.0"
+version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd"
+checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628"
dependencies = [
"anstyle",
"windows-sys",
@@ -93,7 +93,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -105,7 +105,7 @@ dependencies = [
"attribute-derive-macro",
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -121,7 +121,7 @@ dependencies = [
"proc-macro2",
"quote",
"quote-use",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -159,9 +159,9 @@ checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
[[package]]
name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
@@ -189,9 +189,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
-version = "4.4.4"
+version = "4.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136"
+checksum = "d04704f56c2cde07f43e8e2c154b43f216dc5c92fc98ada720177362f953b956"
dependencies = [
"clap_builder",
"clap_derive",
@@ -199,9 +199,9 @@ dependencies = [
[[package]]
name = "clap_builder"
-version = "4.4.4"
+version = "4.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56"
+checksum = "0e231faeaca65ebd1ea3c737966bf858971cd38c3849107aa3ea7de90a804e45"
dependencies = [
"anstream",
"anstyle",
@@ -218,7 +218,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -280,7 +280,7 @@ checksum = "146398d62142a0f35248a608f17edf0dde57338354966d6e41d0eb2d16980ccb"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -324,25 +324,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
-version = "0.3.3"
+version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd"
+checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
- "errno-dragonfly",
"libc",
"windows-sys",
]
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
-dependencies = [
- "cc",
- "libc",
-]
-
[[package]]
name = "fixedbitset"
version = "0.4.2"
@@ -423,7 +412,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -486,7 +475,7 @@ checksum = "13a1bcfb855c1f340d5913ab542e36f25a1c56f57de79022928297632435dec2"
dependencies = [
"attribute-derive",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -517,9 +506,9 @@ dependencies = [
[[package]]
name = "hashbrown"
-version = "0.14.0"
+version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12"
[[package]]
name = "heck"
@@ -541,12 +530,12 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "indexmap"
-version = "2.0.0"
+version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d"
+checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897"
dependencies = [
"equivalent",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.1",
]
[[package]]
@@ -589,21 +578,21 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.148"
+version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "libm"
-version = "0.2.7"
+version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
-version = "0.4.7"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128"
+checksum = "45786cec4d5e54a224b15cb9f06751883103a27c19c93eda09b0b4f5f08fefac"
[[package]]
name = "lock_api"
@@ -656,9 +645,9 @@ dependencies = [
[[package]]
name = "memchr"
-version = "2.6.3"
+version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "minimal-lexical"
@@ -729,9 +718,9 @@ dependencies = [
[[package]]
name = "num-traits"
-version = "0.2.16"
+version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
@@ -914,9 +903,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.67"
+version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
+checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
dependencies = [
"unicode-ident",
]
@@ -938,7 +927,7 @@ checksum = "a7b5abe3fe82fdeeb93f44d66a7b444dedf2e4827defb0a8e69c437b2de2ef94"
dependencies = [
"quote",
"quote-use-macros",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -950,7 +939,7 @@ dependencies = [
"derive-where",
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -1029,13 +1018,13 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.9.5"
+version = "1.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
+checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff"
dependencies = [
"aho-corasick",
"memchr",
- "regex-automata 0.3.8",
+ "regex-automata 0.3.9",
"regex-syntax 0.7.5",
]
@@ -1050,9 +1039,9 @@ dependencies = [
[[package]]
name = "regex-automata"
-version = "0.3.8"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
+checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
dependencies = [
"aho-corasick",
"memchr",
@@ -1101,9 +1090,9 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "rustix"
-version = "0.38.14"
+version = "0.38.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f"
+checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7"
dependencies = [
"bitflags 2.4.0",
"errno",
@@ -1153,7 +1142,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -1178,9 +1167,9 @@ dependencies = [
[[package]]
name = "sharded-slab"
-version = "0.1.4"
+version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"lazy_static",
]
@@ -1313,9 +1302,9 @@ dependencies = [
[[package]]
name = "syn"
-version = "2.0.37"
+version = "2.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
dependencies = [
"proc-macro2",
"quote",
@@ -1346,22 +1335,22 @@ dependencies = [
[[package]]
name = "thiserror"
-version = "1.0.48"
+version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7"
+checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.48"
+version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35"
+checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -1401,7 +1390,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -1458,7 +1447,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.37",
+ "syn 2.0.38",
]
[[package]]
@@ -1638,9 +1627,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "winnow"
-version = "0.5.15"
+version = "0.5.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c2e3184b9c4e92ad5167ca73039d0c42476302ab603e2fec4487511f38ccefc"
+checksum = "037711d82167854aff2018dfd193aa0fef5370f456732f0d5a0c59b0f1b4b907"
dependencies = [
"memchr",
]
diff --git a/README.md b/README.md
index 58d3a50a..b83a7a64 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
[![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.
+Summerset is a distributed, replicated, protocol-generic key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.
@@ -21,6 +21,7 @@ Summerset is a distributed key-value store supporting a wide range of state mach
| `SimplePush` | Pushing to peers w/o any consistency guarantees |
| `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol |
| `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding |
+| `Raft` | [Raft](https://raft.github.io/raft.pdf) with an explicit log and strong leadership |
Formal TLA+ specifications of some protocols are provided in `tla+/`.
@@ -32,6 +33,7 @@ Formal TLA+ specification of some protocols are provided in `tla+/`.
- **Async Rust**: Summerset is written in Rust and demonstrates canonical usage of async programming structures backed by the [`tokio`](https://tokio.rs/) framework;
- **Event-based**: Summerset adopts a channel-oriented, event-based system architecture; each replication protocol is basically just a set of event handlers plus a `tokio::select!` loop;
- **Modularized**: Common components of a distributed KV store, e.g. network transport and durable logger, are cleanly separated from each other and connected through channels.
+- **Protocol-generic**: Combining the above two points, Summerset supports a set of different replication protocols in one codebase, each implemented in just a single file, with common functionality abstracted out.
These design choices make protocol implementation in Summerset surprisingly straightforward and **understandable**, without sacrificing performance. Comments / issues / PRs are always welcome!
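To make the "Event-based" and "Protocol-generic" points above concrete, here is a minimal, self-contained sketch (illustrative names only, not Summerset source) of the `tokio::select!` event-loop shape that each protocol module follows:

```rust
// Sketch of a protocol replica as event handlers plus a select! loop.
// Requires the `tokio` crate with the "full" feature.
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (req_tx, mut req_rx) = mpsc::unbounded_channel::<String>();
    let (msg_tx, mut msg_rx) = mpsc::unbounded_channel::<String>();

    // feed one event of each kind, then drop senders so the loop ends
    req_tx.send("client request".into()).unwrap();
    msg_tx.send("peer message".into()).unwrap();
    drop(req_tx);
    drop(msg_tx);

    loop {
        tokio::select! {
            Some(req) = req_rx.recv() => println!("handle_req_batch: {req}"),
            Some(msg) = msg_rx.recv() => println!("handle_msg_recv: {msg}"),
            else => break, // all event sources exhausted
        }
    }
}
```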
@@ -118,12 +120,15 @@ Complete cluster management and benchmarking scripts are available in another re
- [ ] specialize read-only commands?
- [ ] separate commit vs. exec responses?
- [ ] membership discovery & view changes?
-- [ ] implementation of Raft
+- [x] implementation of Raft
+ - [x] state persistence & restart check
+ - [x] snapshotting & garbage collection
+ - [ ] membership discovery & view changes?
- [x] client-side utilities
- [x] REPL-style client
- [x] random benchmarking client
- [x] testing client
- - [ ] YCSB-driven benchmarking
+ - [ ] YCSB-driven client
- [ ] better README & documentation
---
diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py
index 72234864..ae356fa0 100644
--- a/scripts/local_cluster.py
+++ b/scripts/local_cluster.py
@@ -44,11 +44,13 @@ def kill_all_matching(name, force=False):
"RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'",
"SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'",
"MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'",
+ "Raft": lambda r: f"backer_path='/tmp/summerset.raft.{r}.wal'",
"RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'",
}
PROTOCOL_SNAPSHOT_PATH = {
"MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'",
+ "Raft": lambda r: f"snapshot_path='/tmp/summerset.raft.{r}.snap'",
"RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'",
}
@@ -70,19 +72,6 @@ def config_with_file_paths(protocol, config, replica):
return result_config
-def config_with_backer_path(protocol, config, replica):
- result_config = PROTOCOL_BACKER_PATH[protocol](replica)
-
- if config is not None and len(config) > 0:
- if "backer_path" in config:
- result_config = config # use user-supplied path
- else:
- result_config += "+"
- result_config += config
-
- return result_config
-
-
def compose_manager_cmd(protocol, srv_port, cli_port, num_replicas, release):
cmd = [f"./target/{'release' if release else 'debug'}/summerset_manager"]
cmd += [
diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py
index 33484aca..eb176a7f 100644
--- a/scripts/workflow_test.py
+++ b/scripts/workflow_test.py
@@ -1,5 +1,6 @@
import sys
import os
+import argparse
import subprocess
@@ -76,6 +77,12 @@ def run_tester_client(protocol, test_name):
if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-p", "--protocol", type=str, required=True, help="protocol name"
+ )
+ args = parser.parse_args()
+
do_cargo_build()
kill_all_matching("local_client.py", force=True)
@@ -85,6 +92,13 @@ def run_tester_client(protocol, test_name):
kill_all_matching("summerset_manager", force=True)
PROTOCOL = "MultiPaxos"
+ if args.protocol == "MultiPaxos":
+ pass
+ elif args.protocol == "Raft":
+ PROTOCOL = "Raft"
+ else:
+ raise ValueError(f"unrecognized protocol {args.protocol} to run workflow test")
+
NUM_REPLICAS = 3
TEST_NAME = "primitive_ops"
TIMEOUT = 300
diff --git a/src/lib.rs b/src/lib.rs
index 9e044072..feb47c90 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,4 +35,5 @@ pub use crate::protocols::SmrProtocol;
pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing};
pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush};
pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos};
+pub use crate::protocols::{ReplicaConfigRaft, ClientConfigRaft};
pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos};
diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs
index a21ef9c7..daefadf0 100644
--- a/src/manager/clusman.rs
+++ b/src/manager/clusman.rs
@@ -186,19 +186,12 @@ impl ClusterManager {
protocol);
}
- // tell it to connect to all existing known servers
+ // gather the list of all existing known servers
let to_peers: HashMap<ReplicaId, SocketAddr> = self
.server_info
.iter()
.map(|(&server, info)| (server, info.p2p_addr))
.collect();
- self.server_reigner.send_ctrl(
- CtrlMsg::ConnectToPeers {
- population: self.population,
- to_peers,
- },
- server,
- )?;
// save new server's info
self.server_info.insert(
@@ -211,6 +204,16 @@ impl ClusterManager {
start_slot: 0,
},
);
+
+ // tell it to connect to all other existing known servers
+ self.server_reigner.send_ctrl(
+ CtrlMsg::ConnectToPeers {
+ population: self.population,
+ to_peers,
+ },
+ server,
+ )?;
+
Ok(())
}
@@ -406,9 +409,13 @@ impl ClusterManager {
self.server_info.get_mut(&s).unwrap().is_paused = true;
// wait for dummy reply
- let (_, reply) = self.server_reigner.recv_ctrl().await?;
- if reply != CtrlMsg::PauseReply {
- return logged_err!("m"; "unexpected reply type received");
+ loop {
+ let (server, reply) = self.server_reigner.recv_ctrl().await?;
+ if server != s || reply != CtrlMsg::PauseReply {
+ self.handle_ctrl_msg(server, reply).await?;
+ } else {
+ break;
+ }
}
pause_done.insert(s);
@@ -442,9 +449,13 @@ impl ClusterManager {
self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?;
// wait for dummy reply
- let (_, reply) = self.server_reigner.recv_ctrl().await?;
- if reply != CtrlMsg::ResumeReply {
- return logged_err!("m"; "unexpected reply type received");
+ loop {
+ let (server, reply) = self.server_reigner.recv_ctrl().await?;
+ if server != s || reply != CtrlMsg::ResumeReply {
+ self.handle_ctrl_msg(server, reply).await?;
+ } else {
+ break;
+ }
}
// clear the is_paused flag
@@ -482,22 +493,27 @@ impl ClusterManager {
self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?;
// wait for reply
- let (_, reply) = self.server_reigner.recv_ctrl().await?;
- if let CtrlMsg::SnapshotUpTo { new_start } = reply {
- // update the log start index info
- assert!(self.server_info.contains_key(&s));
- if new_start < self.server_info[&s].start_slot {
- return logged_err!("m"; "server {} snapshot up to {} < {}",
- s, new_start,
- self.server_info[&s].start_slot);
- } else {
- self.server_info.get_mut(&s).unwrap().start_slot =
- new_start;
- }
+ loop {
+ let (server, reply) = self.server_reigner.recv_ctrl().await?;
+ match reply {
+ CtrlMsg::SnapshotUpTo { new_start } if server == s => {
+ // update the log start index info
+ assert!(self.server_info.contains_key(&s));
+ if new_start < self.server_info[&s].start_slot {
+ return logged_err!("m"; "server {} snapshot up to {} < {}",
+ s, new_start,
+ self.server_info[&s].start_slot);
+ } else {
+ self.server_info.get_mut(&s).unwrap().start_slot =
+ new_start;
+ }
+
+ snapshot_up_to.insert(s, new_start);
+ break;
+ }
- snapshot_up_to.insert(s, new_start);
- } else {
- return logged_err!("m"; "unexpected reply type received");
+ _ => self.handle_ctrl_msg(server, reply).await?,
+ }
}
}
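The three wait-loops patched above all share one shape: while blocking for a specific reply from server `s`, any unrelated control message gets dispatched to the regular handler instead of being treated as an error. A hedged, synchronous sketch of that pattern (illustrative types; the actual manager code is async):

```rust
use std::collections::VecDeque;

#[derive(Debug, PartialEq)]
enum CtrlMsg {
    PauseReply,
    Other(u32),
}

// Pop messages until the expected (server, PauseReply) pair arrives;
// everything else goes through the generic handler, not an error path.
fn wait_for_pause_reply(
    inbox: &mut VecDeque<(u8, CtrlMsg)>,
    s: u8,
    handle_other: &mut dyn FnMut(u8, CtrlMsg),
) {
    while let Some((server, reply)) = inbox.pop_front() {
        if server == s && reply == CtrlMsg::PauseReply {
            break;
        }
        handle_other(server, reply);
    }
}

fn main() {
    let mut inbox =
        VecDeque::from([(2u8, CtrlMsg::Other(7)), (1, CtrlMsg::PauseReply)]);
    let mut seen = Vec::new();
    wait_for_pause_reply(&mut inbox, 1, &mut |srv, msg| {
        seen.push((srv, format!("{msg:?}")));
    });
    assert_eq!(seen.len(), 1); // unrelated message dispatched, not dropped
}
```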
diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs
index 41ae38ec..3be28cde 100644
--- a/src/manager/reigner.rs
+++ b/src/manager/reigner.rs
@@ -21,7 +21,7 @@ use tokio::task::JoinHandle;
/// Control message from/to servers. Control traffic could be bidirectional:
/// some initiated by the manager and some by servers.
-// TODO: later add leader change, membership change, etc.
+// TODO: later add membership/view change, link drop, etc.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub enum CtrlMsg {
/// Server -> Manager: new server up, requesting a list of peers' addresses
diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs
index b7aaaf4f..ff7f88f1 100644
--- a/src/protocols/mod.rs
+++ b/src/protocols/mod.rs
@@ -22,6 +22,10 @@ mod multipaxos;
use multipaxos::{MultiPaxosReplica, MultiPaxosClient};
pub use multipaxos::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos};
+mod raft;
+use raft::{RaftReplica, RaftClient};
+pub use raft::{ReplicaConfigRaft, ClientConfigRaft};
+
mod rs_paxos;
use rs_paxos::{RSPaxosReplica, RSPaxosClient};
pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos};
@@ -32,6 +36,7 @@ pub enum SmrProtocol {
RepNothing,
SimplePush,
MultiPaxos,
+ Raft,
RSPaxos,
}
@@ -51,6 +56,7 @@ impl SmrProtocol {
"RepNothing" => Some(Self::RepNothing),
"SimplePush" => Some(Self::SimplePush),
"MultiPaxos" => Some(Self::MultiPaxos),
+ "Raft" => Some(Self::Raft),
"RSPaxos" => Some(Self::RSPaxos),
_ => None,
}
@@ -100,6 +106,14 @@ impl SmrProtocol {
.await
)
}
+ Self::Raft => {
+ box_if_ok!(
+ RaftReplica::new_and_setup(
+ api_addr, p2p_addr, manager, config_str
+ )
+ .await
+ )
+ }
Self::RSPaxos => {
box_if_ok!(
RSPaxosReplica::new_and_setup(
@@ -133,6 +147,9 @@ impl SmrProtocol {
MultiPaxosClient::new_and_setup(manager, config_str).await
)
}
+ Self::Raft => {
+ box_if_ok!(RaftClient::new_and_setup(manager, config_str).await)
+ }
Self::RSPaxos => {
box_if_ok!(
RSPaxosClient::new_and_setup(manager, config_str).await
@@ -166,6 +183,7 @@ mod protocols_name_tests {
valid_name_test!(RepNothing);
valid_name_test!(SimplePush);
valid_name_test!(MultiPaxos);
+ valid_name_test!(Raft);
valid_name_test!(RSPaxos);
}
diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs
index 00e5f964..f52ca472 100644
--- a/src/protocols/multipaxos.rs
+++ b/src/protocols/multipaxos.rs
@@ -7,6 +7,7 @@
//! -
//! -
+use std::cmp;
use std::collections::HashMap;
use std::path::Path;
use std::net::SocketAddr;
@@ -35,8 +36,8 @@ use tokio::sync::watch;
/// Configuration parameters struct.
#[derive(Debug, Deserialize)]
pub struct ReplicaConfigMultiPaxos {
- /// Client request batching interval in microsecs.
- pub batch_interval_us: u64,
+ /// Client request batching interval in millisecs.
+ pub batch_interval_ms: u64,
/// Client request batching maximum batch size.
pub max_batch_size: usize,
@@ -49,7 +50,6 @@ pub struct ReplicaConfigMultiPaxos {
/// Min timeout of not hearing any heartbeat from leader in millisecs.
pub hb_hear_timeout_min: u64,
-
/// Max timeout of not hearing any heartbeat from leader in millisecs.
pub hb_hear_timeout_max: u64,
@@ -74,7 +74,7 @@ pub struct ReplicaConfigMultiPaxos {
impl Default for ReplicaConfigMultiPaxos {
fn default() -> Self {
ReplicaConfigMultiPaxos {
- batch_interval_us: 1000,
+ batch_interval_ms: 10,
max_batch_size: 5000,
backer_path: "/tmp/summerset.multipaxos.wal".into(),
logger_sync: false,
@@ -154,12 +154,12 @@ struct Instance {
external: bool,
/// Offset of first durable WAL log entry related to this instance.
- log_offset: usize,
+ wal_offset: usize,
}
-/// Stable storage log entry type.
+/// Stable storage WAL log entry type.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
-enum LogEntry {
+enum WalEntry {
/// Records an update to the largest prepare ballot seen.
PrepareBal { slot: usize, ballot: Ballot },
@@ -175,11 +175,20 @@ enum LogEntry {
}
/// Snapshot file entry type.
+///
+/// NOTE: the current implementation simply appends a squashed log at the
+/// end of the snapshot file for simplicity. In production, the snapshot
+/// file should be a bounded-size backend, e.g., an LSM-tree.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
enum SnapEntry {
- /// First entry at the start of file: number of log instances covered by
- /// this snapshot file == the start slot index of in-mem log.
- StartSlot { slot: usize },
+ /// Necessary slot indices to remember.
+ SlotInfo {
+ /// First entry at the start of file: number of log instances covered
+ /// by this snapshot file == the start slot index of in-mem log.
+ start_slot: usize,
+ /// Index of the first non-committed slot.
+ commit_bar: usize,
+ },
/// Set of key-value pairs to apply to the state.
KVPairSet { pairs: HashMap<String, String> },
@@ -189,7 +198,13 @@ enum SnapEntry {
#[derive(Debug, Clone, Serialize, Deserialize, GetSize)]
enum PeerMsg {
/// Prepare message from leader to replicas.
- Prepare { slot: usize, ballot: Ballot },
+ Prepare {
+ /// Slot index in Prepare message is the triggering slot of this
+ /// Prepare. Once prepared, it means that all slots in the range
+ /// [slot, +infinity) are prepared under this ballot number.
+ slot: usize,
+ ballot: Ballot,
+ },
/// Prepare reply from replica to leader.
PrepareReply {
@@ -213,8 +228,18 @@ enum PeerMsg {
/// Commit notification from leader to replicas.
Commit { slot: usize },
+ /// Request from a lagging replica to the leader, asking it to re-send
+ /// Accepts for missing hole slots.
+ FillHoles { slots: Vec<usize> },
+
/// Leader activity heartbeat.
- Heartbeat { ballot: Ballot, exec_bar: usize },
+ Heartbeat {
+ ballot: Ballot,
+ /// For leader step-up as well as conservative snapshotting purposes.
+ exec_bar: usize,
+ /// For conservative snapshotting purposes.
+ snap_bar: usize,
+ },
}
/// MultiPaxos server replica module.
@@ -247,7 +272,7 @@ pub struct MultiPaxosReplica {
state_machine: StateMachine,
/// StorageHub module.
- storage_hub: StorageHub<LogEntry>,
+ storage_hub: StorageHub<WalEntry>,
/// StorageHub module for the snapshot file.
snapshot_hub: StorageHub<SnapEntry>,
@@ -255,14 +280,21 @@ pub struct MultiPaxosReplica {
/// TransportHub module.
transport_hub: TransportHub,
+ /// Who do I think is the effective leader of the cluster right now?
+ leader: Option<ReplicaId>,
+
/// Timer for hearing heartbeat from leader.
hb_hear_timer: Timer,
/// Interval for sending heartbeat to followers.
hb_send_interval: Interval,
- /// Do I think I am the leader?
- is_leader: bool,
+ /// Heartbeat reply counters for approximate detection of follower health.
+ /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition).
+ hb_reply_cnts: HashMap<ReplicaId, (u64, u64, u8)>,
+
+ /// Approximate health status tracking of peer replicas.
+ peer_alive: Bitmap,
/// In-memory log of instances.
insts: Vec,
@@ -289,15 +321,33 @@ pub struct MultiPaxosReplica {
/// It is always true that exec_bar <= commit_bar <= start_slot + insts.len()
exec_bar: usize,
- /// Current durable log file offset.
- log_offset: usize,
+ /// Map from peer ID -> its latest exec_bar that I know of; this is for
+ /// conservative snapshotting purposes.
+ peer_exec_bar: HashMap<ReplicaId, usize>,
+
+ /// Slot index before which it is safe to take snapshot.
+ /// NOTE: we are taking a conservative approach here that a snapshot
+ /// covering an entry can be taken only when all servers have durably
+ /// committed (and executed) that entry.
+ snap_bar: usize,
+
+ /// Current durable WAL log file offset.
+ wal_offset: usize,
/// Current durable snapshot file offset.
snap_offset: usize,
}
+// MultiPaxosReplica common helpers
impl MultiPaxosReplica {
+ /// Do I think I am the current effective leader?
+ #[inline]
+ fn is_leader(&self) -> bool {
+ self.leader == Some(self.id)
+ }
+
/// Create an empty null instance.
+ #[inline]
fn null_instance(&self) -> Instance {
Instance {
bal: 0,
@@ -307,22 +357,36 @@ impl MultiPaxosReplica {
leader_bk: None,
replica_bk: None,
external: false,
- log_offset: 0,
+ wal_offset: 0,
+ }
+ }
+
+ /// Locate the first null slot or append a null instance if no holes exist.
+ fn first_null_slot(&mut self) -> usize {
+ for s in self.commit_bar..(self.start_slot + self.insts.len()) {
+ if self.insts[s - self.start_slot].status == Status::Null {
+ return s;
+ }
}
+ self.insts.push(self.null_instance());
+ self.start_slot + self.insts.len() - 1
}
/// Compose a unique ballot number from base.
+ #[inline]
fn make_unique_ballot(&self, base: u64) -> Ballot {
((base << 8) | ((self.id + 1) as u64)) as Ballot
}
/// Compose a unique ballot number greater than the given one.
+ #[inline]
fn make_greater_ballot(&self, bal: Ballot) -> Ballot {
self.make_unique_ballot((bal >> 8) + 1)
}
/// Compose LogActionId from slot index & entry type.
/// Uses the `Status` enum type to represent different entry types.
+ #[inline]
fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId {
let type_num = match entry_type {
Status::Preparing => 1,
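The ballot helpers above pack a monotonically increasing base into the high bits and `replica id + 1` into the low 8 bits, so ballots composed by different replicas never collide and bumping the base always yields a numerically greater ballot. A standalone worked example (free functions mirroring the methods above, not the module's code):

```rust
// Low 8 bits: replica id + 1; high bits: monotonic base.
fn make_unique_ballot(id: u64, base: u64) -> u64 {
    (base << 8) | (id + 1)
}

fn make_greater_ballot(id: u64, bal: u64) -> u64 {
    make_unique_ballot(id, (bal >> 8) + 1)
}

fn main() {
    let bal = make_unique_ballot(2, 5); // replica 2, base 5
    assert_eq!(bal, 0x503); // = 1283
    // bumping the base compares strictly higher, no matter which
    // replica composed the original ballot
    assert_eq!(make_greater_ballot(2, bal), 0x603);
}
```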
@@ -334,6 +398,7 @@ impl MultiPaxosReplica {
}
/// Decompose LogActionId into slot index & entry type.
+ #[inline]
fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) {
let slot = (log_action_id >> 2) as usize;
let type_num = log_action_id & ((1 << 2) - 1);
@@ -347,6 +412,7 @@ impl MultiPaxosReplica {
}
/// Compose CommandId from slot index & command index within.
+ #[inline]
fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId {
assert!(slot <= (u32::MAX as usize));
assert!(cmd_idx <= (u32::MAX as usize));
@@ -354,12 +420,16 @@ impl MultiPaxosReplica {
}
/// Decompose CommandId into slot index & command index within.
+ #[inline]
fn split_command_id(command_id: CommandId) -> (usize, usize) {
let slot = (command_id >> 32) as usize;
let cmd_idx = (command_id & ((1 << 32) - 1)) as usize;
(slot, cmd_idx)
}
+}
+// MultiPaxosReplica client requests entrance
+impl MultiPaxosReplica {
/// Handler of client request batch chan recv.
fn handle_req_batch(
&mut self,
@@ -370,52 +440,44 @@ impl MultiPaxosReplica {
pf_debug!(self.id; "got request batch of size {}", batch_size);
// if I'm not a leader, ignore client requests
- if !self.is_leader {
+ if !self.is_leader() {
for (client, req) in req_batch {
if let ApiRequest::Req { id: req_id, .. } = req {
- // tell the client to try on the next replica
- let next_replica = (self.id + 1) % self.population;
+ // tell the client to try on known leader or just the
+ // next ID replica
+ let target = if let Some(peer) = self.leader {
+ peer
+ } else {
+ (self.id + 1) % self.population
+ };
self.external_api.send_reply(
ApiReply::Reply {
id: req_id,
result: None,
- redirect: Some(next_replica),
+ redirect: Some(target),
},
client,
)?;
pf_trace!(self.id; "redirected client {} to replica {}",
- client, next_replica);
+ client, target);
}
}
return Ok(());
}
// create a new instance in the first null slot (or append a new one
- // at the end if no holes exist)
- let mut slot = self.start_slot + self.insts.len();
- for s in self.commit_bar..(self.start_slot + self.insts.len()) {
- let old_inst = &mut self.insts[s - self.start_slot];
- if old_inst.status == Status::Null {
- old_inst.reqs = req_batch.clone();
- old_inst.leader_bk = Some(LeaderBookkeeping {
- prepare_acks: Bitmap::new(self.population, false),
- prepare_max_bal: 0,
- accept_acks: Bitmap::new(self.population, false),
- });
- slot = s;
- break;
- }
- }
- if slot == self.start_slot + self.insts.len() {
- let mut new_inst = self.null_instance();
- new_inst.reqs = req_batch.clone();
- new_inst.leader_bk = Some(LeaderBookkeeping {
+ // at the end if no holes exist); fill it up with incoming data
+ let slot = self.first_null_slot();
+ {
+ let inst = &mut self.insts[slot - self.start_slot];
+ assert_eq!(inst.status, Status::Null);
+ inst.reqs = req_batch.clone();
+ inst.leader_bk = Some(LeaderBookkeeping {
prepare_acks: Bitmap::new(self.population, false),
prepare_max_bal: 0,
accept_acks: Bitmap::new(self.population, false),
});
- new_inst.external = true;
- self.insts.push(new_inst);
+ inst.external = true;
}
// decide whether we can enter fast path for this instance
@@ -439,7 +501,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Preparing),
LogAction::Append {
- entry: LogEntry::PrepareBal {
+ entry: WalEntry::PrepareBal {
slot,
ballot: self.bal_prep_sent,
},
@@ -472,7 +534,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Accepting),
LogAction::Append {
- entry: LogEntry::AcceptData {
+ entry: WalEntry::AcceptData {
slot,
ballot: inst.bal,
reqs: req_batch.clone(),
@@ -498,7 +560,10 @@ impl MultiPaxosReplica {
Ok(())
}
+}
+// MultiPaxosReplica durable WAL logging
+impl MultiPaxosReplica {
/// Handler of PrepareBal logging result chan recv.
fn handle_logged_prepare_bal(
&mut self,
@@ -516,7 +581,7 @@ impl MultiPaxosReplica {
None
};
- if self.is_leader {
+ if self.is_leader() {
// on leader, finishing the logging of a PrepareBal entry
// is equivalent to receiving a Prepare reply from myself
// (as an acceptor role)
@@ -553,7 +618,7 @@ impl MultiPaxosReplica {
slot, self.insts[slot - self.start_slot].bal);
let inst = &self.insts[slot - self.start_slot];
- if self.is_leader {
+ if self.is_leader() {
// on leader, finishing the logging of an AcceptData entry
// is equivalent to receiving an Accept reply from myself
// (as an acceptor role)
@@ -619,6 +684,21 @@ impl MultiPaxosReplica {
}
}
+ // if there are hole(s) between current commit_bar and newly committed
+ // slot, ask the leader to re-send Accept messages for those slots
+ if slot > self.commit_bar && !self.is_leader() {
+ if let Some(leader) = self.leader {
+ let holes: Vec<usize> = (self.commit_bar..slot).collect();
+ self.transport_hub.send_msg(
+ PeerMsg::FillHoles {
+ slots: holes.clone(),
+ },
+ leader,
+ )?;
+ pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes);
+ }
+ }
+
Ok(())
}
@@ -626,7 +706,7 @@ impl MultiPaxosReplica {
fn handle_log_result(
&mut self,
action_id: LogActionId,
- log_result: LogResult<LogEntry>,
+ log_result: LogResult<WalEntry>,
) -> Result<(), SummersetError> {
let (slot, entry_type) = Self::split_log_action_id(action_id);
if slot < self.start_slot {
@@ -635,15 +715,15 @@ impl MultiPaxosReplica {
assert!(slot < self.start_slot + self.insts.len());
if let LogResult::Append { now_size } = log_result {
- assert!(now_size >= self.log_offset);
- // update first log_offset of slot
+ assert!(now_size >= self.wal_offset);
+ // update first wal_offset of slot
let inst = &mut self.insts[slot - self.start_slot];
- if inst.log_offset == 0 || inst.log_offset > self.log_offset {
- inst.log_offset = self.log_offset;
+ if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset {
+ inst.wal_offset = self.wal_offset;
}
- assert!(inst.log_offset <= self.log_offset);
- // then update self.log_offset
- self.log_offset = now_size;
+ assert!(inst.wal_offset <= self.wal_offset);
+ // then update self.wal_offset
+ self.wal_offset = now_size;
} else {
return logged_err!(self.id; "unexpected log result type: {:?}", log_result);
}
@@ -657,7 +737,10 @@ impl MultiPaxosReplica {
}
}
}
+}
+// MultiPaxosReplica peer-peer messages handling
+impl MultiPaxosReplica {
/// Handler of Prepare message from leader.
fn handle_msg_prepare(
&mut self,
@@ -691,7 +774,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Preparing),
LogAction::Append {
- entry: LogEntry::PrepareBal { slot, ballot },
+ entry: WalEntry::PrepareBal { slot, ballot },
sync: self.config.logger_sync,
},
)?;
@@ -719,10 +802,11 @@ impl MultiPaxosReplica {
// if ballot is what I'm currently waiting on for Prepare replies:
if ballot == self.bal_prep_sent {
assert!(slot < self.start_slot + self.insts.len());
+ let is_leader = self.is_leader();
let inst = &mut self.insts[slot - self.start_slot];
// ignore spurious duplications and outdated replies
- if !self.is_leader
+ if !is_leader
|| (inst.status != Status::Preparing)
|| (ballot < inst.bal)
{
@@ -761,7 +845,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Accepting),
LogAction::Append {
- entry: LogEntry::AcceptData {
+ entry: WalEntry::AcceptData {
slot,
ballot,
reqs: inst.reqs.clone(),
@@ -825,7 +909,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Accepting),
LogAction::Append {
- entry: LogEntry::AcceptData { slot, ballot, reqs },
+ entry: WalEntry::AcceptData { slot, ballot, reqs },
sync: self.config.logger_sync,
},
)?;
@@ -852,10 +936,11 @@ impl MultiPaxosReplica {
// if ballot is what I'm currently waiting on for Accept replies:
if ballot == self.bal_prepared {
assert!(slot < self.start_slot + self.insts.len());
+ let is_leader = self.is_leader();
let inst = &mut self.insts[slot - self.start_slot];
// ignore spurious duplications and outdated replies
- if !self.is_leader
+ if !is_leader
|| (inst.status != Status::Accepting)
|| (ballot < inst.bal)
{
@@ -882,7 +967,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Committed),
LogAction::Append {
- entry: LogEntry::CommitSlot { slot },
+ entry: WalEntry::CommitSlot { slot },
sync: self.config.logger_sync,
},
)?;
@@ -931,7 +1016,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Committed),
LogAction::Append {
- entry: LogEntry::CommitSlot { slot },
+ entry: WalEntry::CommitSlot { slot },
sync: self.config.logger_sync,
},
)?;
@@ -941,6 +1026,43 @@ impl MultiPaxosReplica {
Ok(())
}
+ /// Handler of FillHoles message from a lagging peer.
+ fn handle_msg_fill_holes(
+ &mut self,
+ peer: ReplicaId,
+ slots: Vec<usize>,
+ ) -> Result<(), SummersetError> {
+ if !self.is_leader() {
+ return Ok(());
+ }
+ pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots);
+
+ for slot in slots {
+ if slot < self.start_slot {
+ continue;
+ } else if slot >= self.start_slot + self.insts.len() {
+ break;
+ }
+ let inst = &self.insts[slot - self.start_slot];
+
+ if inst.status >= Status::Committed {
+ // re-send Accept message for this slot
+ self.transport_hub.send_msg(
+ PeerMsg::Accept {
+ slot,
+ ballot: self.bal_prepared,
+ reqs: inst.reqs.clone(),
+ },
+ peer,
+ )?;
+ pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}",
+ peer, slot, self.bal_prepared);
+ }
+ }
+
+ Ok(())
+ }
+
/// Synthesized handler of receiving message from peer.
fn handle_msg_recv(
&mut self,
@@ -963,12 +1085,20 @@ impl MultiPaxosReplica {
self.handle_msg_accept_reply(peer, slot, ballot)
}
PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot),
- PeerMsg::Heartbeat { ballot, exec_bar } => {
- self.heard_heartbeat(peer, ballot, exec_bar)
+ PeerMsg::FillHoles { slots } => {
+ self.handle_msg_fill_holes(peer, slots)
}
+ PeerMsg::Heartbeat {
+ ballot,
+ exec_bar,
+ snap_bar,
+ } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar),
}
}
+}
+// MultiPaxosReplica state machine execution
+impl MultiPaxosReplica {
/// Handler of state machine exec result chan recv.
fn handle_cmd_result(
&mut self,
@@ -1026,22 +1156,39 @@ impl MultiPaxosReplica {
Ok(())
}
+}
+// MultiPaxosReplica leadership related logic
+impl MultiPaxosReplica {
/// Becomes a leader, sends self-initiated Prepare messages to followers
/// for all in-progress instances, and starts broadcasting heartbeats.
fn become_a_leader(&mut self) -> Result<(), SummersetError> {
- if self.is_leader {
+ if self.is_leader() {
return Ok(());
+ } else if let Some(peer) = self.leader {
+ // mark old leader as dead
+ if self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, false)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
}
- self.is_leader = true; // this starts broadcasting heartbeats
+ self.leader = Some(self.id); // this starts broadcasting heartbeats
self.control_hub
.send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?;
pf_info!(self.id; "becoming a leader...");
- // broadcast a heartbeat right now
+ // clear peers' heartbeat reply counters, and broadcast a heartbeat now
+ for cnts in self.hb_reply_cnts.values_mut() {
+ *cnts = (1, 0, 0);
+ }
self.bcast_heartbeats()?;
+ // re-initialize peer_exec_bar information
+ for slot in self.peer_exec_bar.values_mut() {
+ *slot = 0;
+ }
+
// make a greater ballot number and invalidate all in-progress instances
self.bal_prepared = 0;
self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen);
@@ -1069,7 +1216,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
Self::make_log_action_id(slot, Status::Preparing),
LogAction::Append {
- entry: LogEntry::PrepareBal {
+ entry: WalEntry::PrepareBal {
slot,
ballot: self.bal_prep_sent,
},
@@ -1091,7 +1238,6 @@ impl MultiPaxosReplica {
slot, inst.bal);
}
}
-
Ok(())
}
@@ -1101,10 +1247,43 @@ impl MultiPaxosReplica {
PeerMsg::Heartbeat {
ballot: self.bal_prep_sent,
exec_bar: self.exec_bar,
+ snap_bar: self.snap_bar,
},
None,
)?;
- self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?;
+
+ // update max heartbeat reply counters and their repetitions seen
+ for (&peer, cnts) in self.hb_reply_cnts.iter_mut() {
+ if cnts.0 > cnts.1 {
+ // more hb replies have been received from this peer; it is
+ // probably alive
+ cnts.1 = cnts.0;
+ cnts.2 = 0;
+ } else {
+ // did not receive hb reply from this peer at least for the
+ // last sent hb from me; increment repetition count
+ cnts.2 += 1;
+ let repeat_threshold = (self.config.hb_hear_timeout_min
+ / self.config.hb_send_interval_ms)
+ as u8;
+ if cnts.2 > repeat_threshold {
+ // did not receive hb reply from this peer for too many
+ // past hbs sent from me; this peer is probably dead
+ if self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, false)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
+ }
+ }
+ }
+
+ // I also heard this heartbeat from myself
+ self.heard_heartbeat(
+ self.id,
+ self.bal_prep_sent,
+ self.exec_bar,
+ self.snap_bar,
+ )?;
// pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent);
Ok(())
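The `repeat_threshold` computed above means a peer is marked dead after it misses replies to enough consecutive heartbeats to span the minimum hear-timeout window, roughly mirroring the election timeout a follower itself uses. A tiny worked example (both config values here are assumed purely for illustration):

```rust
fn main() {
    let hb_hear_timeout_min: u64 = 600; // ms (assumed value)
    let hb_send_interval_ms: u64 = 20; // ms (assumed value)
    let repeat_threshold = (hb_hear_timeout_min / hb_send_interval_ms) as u8;
    assert_eq!(repeat_threshold, 30); // ~30 missed replies flag the peer
}
```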
@@ -1113,6 +1292,8 @@ impl MultiPaxosReplica {
/// Chooses a random hb_hear_timeout from the min-max range and kicks off
/// the hb_hear_timer.
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+ self.hb_hear_timer.cancel()?;
+
let timeout_ms = thread_rng().gen_range(
self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
);
@@ -1128,10 +1309,19 @@ impl MultiPaxosReplica {
/// leader status if I currently think I'm a leader.
fn heard_heartbeat(
&mut self,
- _peer: ReplicaId,
+ peer: ReplicaId,
ballot: Ballot,
exec_bar: usize,
+ snap_bar: usize,
) -> Result<(), SummersetError> {
+ if peer != self.id {
+ self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1;
+ if !self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, true)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
+ }
+
// ignore outdated heartbeats and those from peers with exec_bar < mine
if ballot < self.bal_max_seen || exec_bar < self.exec_bar {
return Ok(());
@@ -1140,18 +1330,61 @@ impl MultiPaxosReplica {
// reset hearing timer
self.kickoff_hb_hear_timer()?;
- // clear my leader status if it carries a higher ballot number
- if self.is_leader && ballot > self.bal_max_seen {
- self.is_leader = false;
- self.control_hub
- .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?;
- pf_info!(self.id; "no longer a leader...");
+ if peer != self.id {
+ // reply back with a Heartbeat message
+ self.transport_hub.send_msg(
+ PeerMsg::Heartbeat {
+ ballot,
+ exec_bar: self.exec_bar,
+ snap_bar: self.snap_bar,
+ },
+ peer,
+ )?;
+
+ // update peer_exec_bar if larger than known; if all servers'
+ // exec_bar (including my own) have passed a slot, that slot
+ // is definitely safe to be snapshotted
+ if exec_bar > self.peer_exec_bar[&peer] {
+ *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar;
+ let passed_cnt = 1 + self
+ .peer_exec_bar
+ .values()
+ .filter(|&&e| e >= exec_bar)
+ .count() as u8;
+ if passed_cnt == self.population {
+ // all servers have executed up to exec_bar
+ self.snap_bar = exec_bar;
+ }
+ }
+
+ // if the peer has made a higher ballot number
+ if ballot > self.bal_max_seen {
+ self.bal_max_seen = ballot;
+
+ // clear my leader status if I was one
+ if self.is_leader() {
+ self.control_hub
+ .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?;
+ pf_info!(self.id; "no longer a leader...");
+ }
+
+ // set this peer to be the believed leader
+ self.leader = Some(peer);
+ }
+ }
+
+ // if snap_bar is larger than mine, update snap_bar
+ if snap_bar > self.snap_bar {
+ self.snap_bar = snap_bar;
}
// pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot);
Ok(())
}
+}
+// MultiPaxosReplica control messages handling
+impl MultiPaxosReplica {
/// Handler of ResetState control message.
async fn handle_ctrl_reset_state(
&mut self,
@@ -1263,14 +1496,20 @@ impl MultiPaxosReplica {
_ => Ok(None), // ignore all other types
}
}
+}
+// MultiPaxosReplica recovery from WAL log
+impl MultiPaxosReplica {
/// Apply a durable storage log entry for recovery.
async fn recover_apply_entry(
&mut self,
- entry: LogEntry,
+ entry: WalEntry,
) -> Result<(), SummersetError> {
match entry {
- LogEntry::PrepareBal { slot, ballot } => {
+ WalEntry::PrepareBal { slot, ballot } => {
+ if slot < self.start_slot {
+ return Ok(()); // ignore if slot index outdated
+ }
// locate instance in memory, filling in null instances if needed
while self.start_slot + self.insts.len() <= slot {
self.insts.push(self.null_instance());
@@ -1289,7 +1528,10 @@ impl MultiPaxosReplica {
self.bal_prepared = 0;
}
- LogEntry::AcceptData { slot, ballot, reqs } => {
+ WalEntry::AcceptData { slot, ballot, reqs } => {
+ if slot < self.start_slot {
+ return Ok(()); // ignore if slot index outdated
+ }
// locate instance in memory, filling in null instances if needed
while self.start_slot + self.insts.len() <= slot {
self.insts.push(self.null_instance());
@@ -1315,9 +1557,12 @@ impl MultiPaxosReplica {
assert!(self.bal_prepared <= self.bal_prep_sent);
}
- LogEntry::CommitSlot { slot } => {
+ WalEntry::CommitSlot { slot } => {
+ if slot < self.start_slot {
+ return Ok(()); // ignore if slot index outdated
+ }
assert!(slot < self.start_slot + self.insts.len());
- // update instance state
+ // update instance status
self.insts[slot - self.start_slot].status = Status::Committed;
// submit commands in contiguously committed instance to the
// state machine
@@ -1337,9 +1582,10 @@ impl MultiPaxosReplica {
let _ = self.state_machine.get_result().await?;
}
}
- // update commit_bar and exec_bar
+ // update instance status, commit_bar and exec_bar
self.commit_bar += 1;
self.exec_bar += 1;
+ inst.status = Status::Executed;
}
}
}
@@ -1348,15 +1594,15 @@ impl MultiPaxosReplica {
Ok(())
}
- /// Recover state from durable storage log.
- async fn recover_from_log(&mut self) -> Result<(), SummersetError> {
- assert_eq!(self.log_offset, 0);
+ /// Recover state from durable storage WAL log.
+ async fn recover_from_wal(&mut self) -> Result<(), SummersetError> {
+ assert_eq!(self.wal_offset, 0);
loop {
// using 0 as a special log action ID
self.storage_hub.submit_action(
0,
LogAction::Read {
- offset: self.log_offset,
+ offset: self.wal_offset,
},
)?;
let (_, log_result) = self.storage_hub.get_result().await?;
@@ -1368,7 +1614,7 @@ impl MultiPaxosReplica {
} => {
self.recover_apply_entry(entry).await?;
// update log offset
- self.log_offset = end_offset;
+ self.wal_offset = end_offset;
}
LogResult::Read { entry: None, .. } => {
// end of log reached
@@ -1384,7 +1630,7 @@ impl MultiPaxosReplica {
self.storage_hub.submit_action(
0,
LogAction::Truncate {
- offset: self.log_offset,
+ offset: self.wal_offset,
},
)?;
let (_, log_result) = self.storage_hub.get_result().await?;
@@ -1392,17 +1638,27 @@ impl MultiPaxosReplica {
offset_ok: true, ..
} = log_result
{
+ if self.wal_offset > 0 {
+ pf_info!(self.id; "recovered from wal log: commit {} exec {}",
+ self.commit_bar, self.exec_bar);
+ }
Ok(())
} else {
logged_err!(self.id; "unexpected log result type or failed truncate")
}
}
+}
- /// Dump a new key-value pair to snapshot file.
- async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> {
+// MultiPaxosReplica snapshotting & GC logic
+impl MultiPaxosReplica {
+ /// Dump new key-value pairs to snapshot file.
+ async fn snapshot_dump_kv_pairs(
+ &mut self,
+ new_start_slot: usize,
+ ) -> Result<(), SummersetError> {
// collect all key-value pairs put up to exec_bar
let mut pairs = HashMap::new();
- for slot in self.start_slot..self.exec_bar {
+ for slot in self.start_slot..new_start_slot {
let inst = &self.insts[slot - self.start_slot];
for (_, req) in inst.reqs.clone() {
if let ApiRequest::Req {
@@ -1438,15 +1694,20 @@ impl MultiPaxosReplica {
/// Discard everything older than start_slot in durable WAL log.
async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> {
let cut_offset = if !self.insts.is_empty() {
- self.insts[0].log_offset
+ self.insts[0].wal_offset
} else {
- self.log_offset
+ self.wal_offset
};
// discard the log before cut_offset
if cut_offset > 0 {
- self.storage_hub
- .submit_action(0, LogAction::Discard { offset: cut_offset })?;
+ self.storage_hub.submit_action(
+ 0,
+ LogAction::Discard {
+ offset: cut_offset,
+ keep: 0,
+ },
+ )?;
loop {
let (action_id, log_result) =
self.storage_hub.get_result().await?;
@@ -1459,8 +1720,8 @@ impl MultiPaxosReplica {
now_size,
} = log_result
{
- assert_eq!(self.log_offset - cut_offset, now_size);
- self.log_offset = now_size;
+ assert_eq!(self.wal_offset - cut_offset, now_size);
+ self.wal_offset = now_size;
} else {
return logged_err!(
self.id;
@@ -1472,43 +1733,74 @@ impl MultiPaxosReplica {
}
}
- // update inst.log_offset for all remaining in-mem instances
+ // update inst.wal_offset for all remaining in-mem instances
for inst in &mut self.insts {
- if inst.log_offset > 0 {
- assert!(inst.log_offset >= cut_offset);
- inst.log_offset -= cut_offset;
+ if inst.wal_offset > 0 {
+ assert!(inst.wal_offset >= cut_offset);
+ inst.wal_offset -= cut_offset;
}
}
Ok(())
}
- /// Take a snapshot up to current exec_idx, then discard the in-mem log up
+ /// Take a snapshot up to current exec_bar, then discard the in-mem log up
/// to that index as well as outdate entries in the durable WAL log file.
///
/// NOTE: the current implementation does not guard against crashes in the
- /// middle of taking a snapshot.
+ /// middle of taking a snapshot. Production-quality implementations should
+ /// make the snapshotting action "atomic".
+ ///
+ /// NOTE: the current implementation does not take care of InstallSnapshot
+ /// messages (which are needed when some lagging follower still lacks a
+ /// slot that all other peers have already snapshotted); we assume here
+ /// that failed Accept messages are retried indefinitely until they
+ /// succeed, before the associated data gets discarded from the leader's
+ /// memory.
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> {
- pf_debug!(self.id; "taking new snapshot: start {} exec {}",
- self.start_slot, self.exec_bar);
+ pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}",
+ self.start_slot, self.exec_bar, self.snap_bar);
assert!(self.exec_bar >= self.start_slot);
- if self.exec_bar == self.start_slot {
+
+ let new_start_slot = cmp::min(self.snap_bar, self.exec_bar);
+ if new_start_slot == self.start_slot {
return Ok(());
}
// collect and dump all Puts in executed instances
- if self.is_leader {
+ if self.is_leader() {
// NOTE: broadcast heartbeats here to appease followers
self.bcast_heartbeats()?;
}
- self.snapshot_dump_kv_pairs().await?;
+ self.snapshot_dump_kv_pairs(new_start_slot).await?;
+
+ // write new slot info entry to the head of snapshot
+ self.snapshot_hub.submit_action(
+ 0,
+ LogAction::Write {
+ entry: SnapEntry::SlotInfo {
+ start_slot: new_start_slot,
+ commit_bar: self.commit_bar,
+ },
+ offset: 0,
+ sync: self.config.logger_sync,
+ },
+ )?;
+ let (_, log_result) = self.snapshot_hub.get_result().await?;
+ match log_result {
+ LogResult::Write {
+ offset_ok: true, ..
+ } => {}
+ _ => {
+ return logged_err!(self.id; "unexpected log result type or failed write");
+ }
+ }
// update start_slot and discard all in-memory log instances up to new_start_slot
- self.insts.drain(0..(self.exec_bar - self.start_slot));
- self.start_slot = self.exec_bar;
+ self.insts.drain(0..(new_start_slot - self.start_slot));
+ self.start_slot = new_start_slot;
// discarding everything older than start_slot in WAL log
- if self.is_leader {
+ if self.is_leader() {
// NOTE: broadcast heartbeats here to appease followers
self.bcast_heartbeats()?;
}
@@ -1533,11 +1825,20 @@ impl MultiPaxosReplica {
match log_result {
LogResult::Read {
- entry: Some(SnapEntry::StartSlot { slot }),
+ entry:
+ Some(SnapEntry::SlotInfo {
+ start_slot,
+ commit_bar,
+ }),
end_offset,
} => {
self.snap_offset = end_offset;
- self.start_slot = slot; // get start slot index of in-mem log
+
+ // recover necessary slot indices info
+ self.start_slot = start_slot;
+ self.commit_bar = commit_bar;
+ self.exec_bar = start_slot;
+ self.snap_bar = start_slot;
// repeatedly apply key-value pairs
loop {
@@ -1580,6 +1881,11 @@ impl MultiPaxosReplica {
self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo {
new_start: self.start_slot,
})?;
+
+ if self.start_slot > 0 {
+ pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}",
+ self.start_slot, self.commit_bar, self.exec_bar);
+ }
Ok(())
}
@@ -1588,7 +1894,10 @@ impl MultiPaxosReplica {
self.snapshot_hub.submit_action(
0,
LogAction::Write {
- entry: SnapEntry::StartSlot { slot: 0 },
+ entry: SnapEntry::SlotInfo {
+ start_slot: 0,
+ commit_bar: 0,
+ },
offset: 0,
sync: self.config.logger_sync,
},
@@ -1602,7 +1911,7 @@ impl MultiPaxosReplica {
self.snap_offset = now_size;
Ok(())
} else {
- logged_err!(self.id; "unexpected log result type or failed truncate")
+ logged_err!(self.id; "unexpected log result type or failed write")
}
}
@@ -1628,18 +1937,39 @@ impl GenericReplica for MultiPaxosReplica {
// parse protocol-specific configs
let config = parsed_config!(config_str => ReplicaConfigMultiPaxos;
- batch_interval_us, max_batch_size,
+ batch_interval_ms, max_batch_size,
backer_path, logger_sync,
hb_hear_timeout_min, hb_hear_timeout_max,
hb_send_interval_ms,
snapshot_path, snapshot_interval_s,
perf_storage_a, perf_storage_b,
perf_network_a, perf_network_b)?;
- if config.batch_interval_us == 0 {
+ if config.batch_interval_ms == 0 {
return logged_err!(
id;
- "invalid config.batch_interval_us '{}'",
- config.batch_interval_us
+ "invalid config.batch_interval_ms '{}'",
+ config.batch_interval_ms
+ );
+ }
+ if config.hb_hear_timeout_min < 100 {
+ return logged_err!(
+ id;
+ "invalid config.hb_hear_timeout_min '{}'",
+ config.hb_hear_timeout_min
+ );
+ }
+ if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 {
+ return logged_err!(
+ id;
+ "invalid config.hb_hear_timeout_max '{}'",
+ config.hb_hear_timeout_max
+ );
+ }
+ if config.hb_send_interval_ms == 0 {
+ return logged_err!(
+ id;
+ "invalid config.hb_send_interval_ms '{}'",
+ config.hb_send_interval_ms
);
}
if config.hb_hear_timeout_min < 100 {
@@ -1728,7 +2058,7 @@ impl GenericReplica for MultiPaxosReplica {
let external_api = ExternalApi::new_and_setup(
id,
api_addr,
- Duration::from_micros(config.batch_interval_us),
+ Duration::from_millis(config.batch_interval_ms),
config.max_batch_size,
)
.await?;
@@ -1746,6 +2076,10 @@ impl GenericReplica for MultiPaxosReplica {
));
snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+ let hb_reply_cnts = (0..population)
+ .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) })
+ .collect();
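+        // illustrative: with population = 3 and id = 0 this yields
+        // {1: (1, 0, 0), 2: (1, 0, 0)}; tuple = (#hb replies heard,
+        // #heard at last send, #consecutive sends with no new reply)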
+
Ok(MultiPaxosReplica {
id,
population,
@@ -1759,9 +2093,11 @@ impl GenericReplica for MultiPaxosReplica {
storage_hub,
snapshot_hub,
transport_hub,
+ leader: None,
hb_hear_timer: Timer::new(),
hb_send_interval,
- is_leader: false,
+ hb_reply_cnts,
+ peer_alive: Bitmap::new(population, true),
insts: vec![],
start_slot: 0,
snapshot_interval,
@@ -1770,7 +2106,11 @@ impl GenericReplica for MultiPaxosReplica {
bal_max_seen: 0,
commit_bar: 0,
exec_bar: 0,
- log_offset: 0,
+ peer_exec_bar: (0..population)
+ .filter_map(|s| if s == id { None } else { Some((s, 0)) })
+ .collect(),
+ snap_bar: 0,
+ wal_offset: 0,
snap_offset: 0,
})
}
@@ -1782,8 +2122,8 @@ impl GenericReplica for MultiPaxosReplica {
// recover state from durable snapshot file
self.recover_from_snapshot().await?;
- // recover the tail-piece memory log & state from durable storage log
- self.recover_from_log().await?;
+ // recover the tail-piece memory log & state from durable WAL log
+ self.recover_from_wal().await?;
// kick off leader activity hearing timer
self.kickoff_hb_hear_timer()?;
@@ -1849,7 +2189,7 @@ impl GenericReplica for MultiPaxosReplica {
},
// leader sending heartbeat
- _ = self.hb_send_interval.tick(), if !paused && self.is_leader => {
+ _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => {
if let Err(e) = self.bcast_heartbeats() {
pf_error!(self.id; "error broadcasting heartbeats: {}", e);
}
@@ -1857,7 +2197,7 @@ impl GenericReplica for MultiPaxosReplica {
// autonomous snapshot taking timeout
_ = self.snapshot_interval.tick(), if !paused
- && self.config.snapshot_interval_s > 0 => {
+ && self.config.snapshot_interval_s > 0 => {
if let Err(e) = self.take_new_snapshot().await {
pf_error!(self.id; "error taking a new snapshot: {}", e);
} else {
diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs
new file mode 100644
index 00000000..4ffc04f5
--- /dev/null
+++ b/src/protocols/raft.rs
@@ -0,0 +1,2222 @@
+//! Replication protocol: Raft.
+//!
+//! ATC '14 version of Raft. References:
+//!   - <https://raft.github.io/raft.pdf>
+//!   - <https://raft.github.io/>
+//!   - <https://decentralizedthoughts.github.io/2020-12-12-raft-liveness-full-omission/>
+
+use std::cmp;
+use std::collections::{HashMap, HashSet};
+use std::path::Path;
+use std::net::SocketAddr;
+
+use crate::utils::{SummersetError, Bitmap, Timer};
+use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply};
+use crate::server::{
+ ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId,
+ ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult,
+ LogActionId, TransportHub, GenericReplica,
+};
+use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint};
+use crate::protocols::SmrProtocol;
+
+use rand::prelude::*;
+
+use async_trait::async_trait;
+
+use get_size::GetSize;
+
+use serde::{Serialize, Deserialize};
+
+use tokio::time::{self, Duration, Interval, MissedTickBehavior};
+use tokio::sync::watch;
+
+/// Configuration parameters struct.
+#[derive(Debug, Deserialize)]
+pub struct ReplicaConfigRaft {
+ /// Client request batching interval in millisecs.
+ pub batch_interval_ms: u64,
+
+ /// Client request batching maximum batch size.
+ pub max_batch_size: usize,
+
+ /// Path to backing log file.
+ pub backer_path: String,
+
+ /// Whether to call `fsync()`/`fdatasync()` on logger.
+ pub logger_sync: bool,
+
+ /// Min timeout of not hearing any heartbeat from leader in millisecs.
+ pub hb_hear_timeout_min: u64,
+ /// Max timeout of not hearing any heartbeat from leader in millisecs.
+ pub hb_hear_timeout_max: u64,
+
+ /// Interval of leader sending AppendEntries heartbeats to followers.
+ pub hb_send_interval_ms: u64,
+
+ /// Path to snapshot file.
+ pub snapshot_path: String,
+
+ /// Snapshot self-triggering interval in secs. 0 means never trigger
+ /// snapshotting autonomously.
+ pub snapshot_interval_s: u64,
+
+ // Performance simulation params (all zeros means no perf simulation):
+ pub perf_storage_a: u64,
+ pub perf_storage_b: u64,
+ pub perf_network_a: u64,
+ pub perf_network_b: u64,
+}
+
+#[allow(clippy::derivable_impls)]
+impl Default for ReplicaConfigRaft {
+ fn default() -> Self {
+ ReplicaConfigRaft {
+ batch_interval_ms: 10,
+ max_batch_size: 5000,
+ backer_path: "/tmp/summerset.raft.wal".into(),
+ logger_sync: false,
+ hb_hear_timeout_min: 600,
+ hb_hear_timeout_max: 900,
+ hb_send_interval_ms: 50,
+ snapshot_path: "/tmp/summerset.raft.snap".into(),
+ snapshot_interval_s: 0,
+ perf_storage_a: 0,
+ perf_storage_b: 0,
+ perf_network_a: 0,
+ perf_network_b: 0,
+ }
+ }
+}
+
+/// Term number type, defined for better code readability.
+type Term = u64;
+
+/// Request batch type (i.e., the "command" in an entry).
+///
+/// NOTE: the originally presented Raft algorithm does not explicitly mention
+/// batching, but instead hides it with the heartbeats: every AppendEntries RPC
+/// from the leader basically batches all commands it has received since the
+/// last sent heartbeat. Here, to make this implementation more comparable to
+/// MultiPaxos, we also trigger batching explicitly.
+type ReqBatch = Vec<(ClientId, ApiRequest)>;
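+//
+// Illustrative batch shape (hypothetical IDs; assumes this crate's
+// `ApiRequest::Req` and `Command::Get`/`Put` variants):
+//
+//     let batch: ReqBatch = vec![
+//         (7, ApiRequest::Req { id: 0, cmd: Command::Get { key: "x".into() } }),
+//         (8, ApiRequest::Req { id: 1, cmd: Command::Put { key: "y".into(),
+//                                                          value: "v".into() } }),
+//     ];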
+
+/// In-mem + persistent entry of log, containing a term and a commands batch.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
+struct LogEntry {
+ /// Term number.
+ term: Term,
+
+ /// Batch of client requests.
+ reqs: ReqBatch,
+
+ /// True if from external client, else false.
+ external: bool,
+
+ /// Offset in durable log file of this entry. This field is not maintained
+ /// in durable storage itself, where it is typically 0. It is maintained
+ /// only in the in-memory log.
+ log_offset: usize,
+}
+
+/// Stable storage log entry type.
+///
+/// NOTE: Raft makes the persistent log exactly mirror the in-memory log, so
+/// the backer file is not an append-only WAL during runtime operation; its
+/// tail may get truncated and overwritten.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
+enum DurEntry {
+ /// Durable metadata.
+ Metadata {
+ curr_term: Term,
+        voted_for: Option<ReplicaId>,
+ },
+
+ /// Log entry mirroring in-mem log.
+ LogEntry { entry: LogEntry },
+}
+
+/// Snapshot file entry type.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
+enum SnapEntry {
+ /// Necessary slot indices to remember.
+ SlotInfo {
+ /// First entry at the start of file: number of log entries covered
+ /// by this snapshot file == the start slot index of remaining log.
+ start_slot: usize,
+ },
+
+ /// Set of key-value pairs to apply to the state.
+    KVPairSet { pairs: HashMap<String, String> },
+}
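+
+// Implied snapshot file layout (see the snapshotting logic further below):
+//     offset 0 : SnapEntry::SlotInfo { start_slot }  -- rewritten in place
+//     afterward: SnapEntry::KVPairSet { pairs } ...  -- appended KV dumps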
+
+/// Peer-peer message type.
+#[derive(Debug, Clone, Serialize, Deserialize, GetSize)]
+enum PeerMsg {
+ /// AppendEntries from leader to followers.
+ AppendEntries {
+ term: Term,
+ prev_slot: usize,
+ prev_term: Term,
+        entries: Vec<LogEntry>,
+ leader_commit: usize,
+ /// For conservative snapshotting purpose.
+ last_snap: usize,
+ },
+
+ /// AppendEntries reply from follower to leader.
+ AppendEntriesReply {
+ term: Term,
+ /// For correct tracking of which AppendEntries this reply is for.
+ end_slot: usize,
+ success: bool,
+ },
+
+    /// RequestVote from candidate to all other replicas.
+ RequestVote {
+ term: Term,
+ last_slot: usize,
+ last_term: Term,
+ },
+
+    /// RequestVote reply from a peer to the candidate.
+ RequestVoteReply { term: Term, granted: bool },
+}
+
+/// Replica role type.
+#[derive(
+ Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize,
+)]
+enum Role {
+ Follower,
+ Candidate,
+ Leader,
+}
+
+/// Raft server replica module.
+pub struct RaftReplica {
+ /// Replica ID in cluster.
+ id: ReplicaId,
+
+ /// Total number of replicas in cluster.
+ population: u8,
+
+ /// Majority quorum size.
+ quorum_cnt: u8,
+
+ /// Configuration parameters struct.
+ config: ReplicaConfigRaft,
+
+ /// Address string for client requests API.
+ _api_addr: SocketAddr,
+
+ /// Address string for internal peer-peer communication.
+ _p2p_addr: SocketAddr,
+
+ /// ControlHub module.
+ control_hub: ControlHub,
+
+ /// ExternalApi module.
+ external_api: ExternalApi,
+
+ /// StateMachine module.
+ state_machine: StateMachine,
+
+ /// StorageHub module.
+    storage_hub: StorageHub<DurEntry>,
+
+ /// StorageHub module for the snapshot file.
+    snapshot_hub: StorageHub<SnapEntry>,
+
+ /// TransportHub module.
+    transport_hub: TransportHub<PeerMsg>,
+
+ /// Which role am I in right now?
+ role: Role,
+
+ /// Who do I think is the effective leader of the cluster right now?
+    leader: Option<ReplicaId>,
+
+ /// Timer for hearing heartbeat from leader.
+ hb_hear_timer: Timer,
+
+ /// Interval for sending heartbeat to followers.
+ hb_send_interval: Interval,
+
+ /// Heartbeat reply counters for approximate detection of follower health.
+ /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition).
+    hb_reply_cnts: HashMap<ReplicaId, (u64, u64, u8)>,
+
+ /// Approximate health status tracking of peer replicas.
+ peer_alive: Bitmap,
+
+ /// Latest term seen.
+ curr_term: Term,
+
+ /// Candidate ID that I voted for in current term.
+ voted_for: Option,
+
+ /// Replica IDs that voted for me in current election.
+    votes_granted: HashSet<ReplicaId>,
+
+ /// In-memory log of entries. Slot 0 is a dummy entry to make indexing happy.
+    log: Vec<LogEntry>,
+
+ /// Start slot index of in-mem log after latest snapshot.
+ start_slot: usize,
+
+ /// Timer for taking a new autonomous snapshot.
+ snapshot_interval: Interval,
+
+ /// Slot index of highest log entry known to be committed.
+ last_commit: usize,
+
+ /// Slot index of highest log entry applied to state machine.
+ last_exec: usize,
+
+ /// For each server, index of the next log entry to send.
+    next_slot: HashMap<ReplicaId, usize>,
+
+ /// For each server, index of the highest log entry known to be replicated.
+    match_slot: HashMap<ReplicaId, usize>,
+
+ /// Slot index up to which it is safe to take snapshot.
+ /// NOTE: we are taking a conservative approach here that a snapshot
+ /// covering an entry can be taken only when all servers have durably
+ /// committed that entry.
+ last_snap: usize,
+
+ /// Current durable log file end offset.
+ log_offset: usize,
+
+    /// Offset in the durable log file at which the metadata entry ends.
+ log_meta_end: usize,
+
+ /// Current durable snapshot file offset.
+ snap_offset: usize,
+}
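+
+// Rough mapping to Figure 2 of the Raft paper: persistent state on all
+// servers -> curr_term, voted_for, log; volatile state on all servers ->
+// last_commit (commitIndex), last_exec (lastApplied); volatile state on
+// leaders -> next_slot (nextIndex), match_slot (matchIndex).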
+
+// RaftReplica common helpers
+impl RaftReplica {
+ /// Compose LogActionId from (slot, end_slot) pair & entry type.
+    /// Uses the `Role` enum type to represent different entry types.
+ #[inline]
+ fn make_log_action_id(
+ slot: usize,
+ slot_e: usize,
+ entry_type: Role,
+ ) -> LogActionId {
+ let type_num = match entry_type {
+ Role::Follower => 1,
+ Role::Leader => 2,
+ _ => panic!("unknown log entry type {:?}", entry_type),
+ };
+ ((slot << 33) | (slot_e << 2) | type_num) as LogActionId
+ }
+
+ /// Decompose LogActionId into (slot, end_slot) pair & entry type.
+ #[inline]
+ fn split_log_action_id(log_action_id: LogActionId) -> (usize, usize, Role) {
+ let slot = (log_action_id >> 33) as usize;
+ let slot_e = ((log_action_id & ((1 << 33) - 1)) >> 2) as usize;
+ let type_num = log_action_id & ((1 << 2) - 1);
+ let entry_type = match type_num {
+ 1 => Role::Follower,
+ 2 => Role::Leader,
+ _ => panic!("unknown log entry type num {}", type_num),
+ };
+ (slot, slot_e, entry_type)
+ }
+
+ /// Compose CommandId from slot index & command index within.
+ #[inline]
+ fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId {
+ assert!(slot <= (u32::MAX as usize));
+ assert!(cmd_idx <= (u32::MAX as usize));
+ ((slot << 32) | cmd_idx) as CommandId
+ }
+
+ /// Decompose CommandId into slot index & command index within.
+ #[inline]
+ fn split_command_id(command_id: CommandId) -> (usize, usize) {
+ let slot = (command_id >> 32) as usize;
+ let cmd_idx = (command_id & ((1 << 32) - 1)) as usize;
+ (slot, cmd_idx)
+ }
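+
+    // Round-trip sketch of the packings above (illustrative values):
+    //     make_log_action_id(5, 9, Role::Leader) == (5 << 33) | (9 << 2) | 2,
+    //     and split_log_action_id maps it back to (5, 9, Role::Leader);
+    //     make_command_id(5, 2) == (5 << 32) | 2 -> split back to (5, 2).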
+
+ /// Check if the given term is larger than mine. If so, convert my role
+ /// back to follower. Returns true if my role was not follower but now
+ /// converted to follower, and false otherwise.
+ #[inline]
+ fn check_term(
+ &mut self,
+ peer: ReplicaId,
+ term: Term,
+    ) -> Result<bool, SummersetError> {
+ if term > self.curr_term {
+ self.curr_term = term;
+ self.heard_heartbeat(peer, term)?; // refresh election timer
+ if self.role != Role::Follower {
+ self.role = Role::Follower;
+ pf_trace!(self.id; "converted back to follower");
+ Ok(true)
+ } else {
+ Ok(false)
+ }
+ } else {
+ Ok(false)
+ }
+ }
+}
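+
+// Illustrative check_term outcome: a leader at curr_term = 3 that hears
+// term = 5 from any peer adopts term 5, refreshes its election timer, and
+// steps down to follower (returns Ok(true)); any term <= 3 returns Ok(false).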
+
+// RaftReplica client requests entrance
+impl RaftReplica {
+ /// Handler of client request batch chan recv.
+ fn handle_req_batch(
+ &mut self,
+ req_batch: ReqBatch,
+ ) -> Result<(), SummersetError> {
+ let batch_size = req_batch.len();
+ assert!(batch_size > 0);
+ pf_debug!(self.id; "got request batch of size {}", batch_size);
+
+        // if I'm not a leader, redirect client requests to the likely leader
+ if self.role != Role::Leader {
+ for (client, req) in req_batch {
+ if let ApiRequest::Req { id: req_id, .. } = req {
+ // tell the client to try on known leader or just the
+ // next ID replica
+ let target = if let Some(peer) = self.leader {
+ peer
+ } else {
+ (self.id + 1) % self.population
+ };
+ self.external_api.send_reply(
+ ApiReply::Reply {
+ id: req_id,
+ result: None,
+ redirect: Some(target),
+ },
+ client,
+ )?;
+ pf_trace!(self.id; "redirected client {} to replica {}",
+ client, target);
+ }
+ }
+ return Ok(());
+ }
+
+ // append an entry to in-memory log
+ let entry = LogEntry {
+ term: self.curr_term,
+ reqs: req_batch,
+ external: true,
+ log_offset: 0,
+ };
+ let slot = self.start_slot + self.log.len();
+ self.log.push(entry.clone());
+
+ // submit logger action to make this log entry durable
+ self.storage_hub.submit_action(
+ Self::make_log_action_id(slot, slot, Role::Leader),
+ LogAction::Append {
+ entry: DurEntry::LogEntry { entry },
+ sync: self.config.logger_sync,
+ },
+ )?;
+ pf_trace!(self.id; "submitted leader append log action for slot {}", slot);
+
+ Ok(())
+ }
+}
+
+// RaftReplica durable logging
+impl RaftReplica {
+ /// Handler of leader append logging result chan recv.
+ fn handle_logged_leader_append(
+ &mut self,
+ slot: usize,
+ slot_e: usize,
+ ) -> Result<(), SummersetError> {
+ if slot < self.start_slot || self.role != Role::Leader {
+ return Ok(()); // ignore if outdated
+ }
+ pf_trace!(self.id; "finished leader append logging for slot {} <= {}",
+ slot, slot_e);
+ assert_eq!(slot, slot_e);
+
+ // broadcast AppendEntries messages to followers
+ for peer in 0..self.population {
+ if peer == self.id || self.next_slot[&peer] < 1 {
+ continue;
+ }
+
+ let prev_slot = self.next_slot[&peer] - 1;
+ if prev_slot < self.start_slot {
+ return logged_err!(self.id; "snapshotted slot {} queried", prev_slot);
+ }
+ let prev_term = self.log[prev_slot - self.start_slot].term;
+ let entries = self
+ .log
+ .iter()
+ .skip(self.next_slot[&peer] - self.start_slot)
+ .cloned()
+ .collect();
+
+ if slot >= self.next_slot[&peer] {
+ self.transport_hub.send_msg(
+ PeerMsg::AppendEntries {
+ term: self.curr_term,
+ prev_slot,
+ prev_term,
+ entries,
+ leader_commit: self.last_commit,
+ last_snap: self.last_snap,
+ },
+ peer,
+ )?;
+ pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}",
+ peer, self.next_slot[&peer],
+ self.start_slot + self.log.len() - 1);
+ }
+ }
+
+ // I also heard my own heartbeat
+ self.heard_heartbeat(self.id, self.curr_term)?;
+
+ Ok(())
+ }
+
+ /// Handler of follower append logging result chan recv.
+ fn handle_logged_follower_append(
+ &mut self,
+ slot: usize,
+ slot_e: usize,
+ ) -> Result<(), SummersetError> {
+ if slot < self.start_slot || self.role != Role::Follower {
+ return Ok(()); // ignore if outdated
+ }
+ pf_trace!(self.id; "finished follower append logging for slot {} <= {}",
+ slot, slot_e);
+ assert!(slot <= slot_e);
+
+ // if all consecutive entries are made durable, reply AppendEntries
+ // success back to leader
+ if slot == slot_e {
+ if let Some(leader) = self.leader {
+ self.transport_hub.send_msg(
+ PeerMsg::AppendEntriesReply {
+ term: self.curr_term,
+ end_slot: slot_e,
+ success: true,
+ },
+ leader,
+ )?;
+ pf_trace!(self.id; "sent AppendEntriesReply -> {} up to slot {}",
+ leader, slot_e);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Synthesized handler of durable logging result chan recv.
+ fn handle_log_result(
+ &mut self,
+ action_id: LogActionId,
+        log_result: LogResult<DurEntry>,
+ ) -> Result<(), SummersetError> {
+ let (slot, slot_e, entry_type) = Self::split_log_action_id(action_id);
+ if slot < self.start_slot {
+ return Ok(()); // ignore if slot index outdated
+ }
+ assert!(slot_e < self.start_slot + self.log.len());
+
+ if let LogResult::Append { now_size } = log_result {
+ let entry = &mut self.log[slot - self.start_slot];
+ if entry.log_offset != self.log_offset {
+ // entry has incorrect log_offset bookkept; update it
+ entry.log_offset = self.log_offset;
+ }
+ assert!(now_size > self.log_offset);
+ self.log_offset = now_size;
+ } else {
+ return logged_err!(self.id; "unexpected log result type: {:?}", log_result);
+ }
+
+ match entry_type {
+ Role::Follower => self.handle_logged_follower_append(slot, slot_e),
+ Role::Leader => self.handle_logged_leader_append(slot, slot_e),
+ _ => {
+ logged_err!(self.id; "unexpected log entry type: {:?}", entry_type)
+ }
+ }
+ }
+}
+
+// RaftReplica peer-peer messages handling
+impl RaftReplica {
+ /// Handler of AppendEntries message from leader.
+ #[allow(clippy::too_many_arguments)]
+ async fn handle_msg_append_entries(
+ &mut self,
+ leader: ReplicaId,
+ term: Term,
+ prev_slot: usize,
+ prev_term: Term,
+        mut entries: Vec<LogEntry>,
+ leader_commit: usize,
+ last_snap: usize,
+ ) -> Result<(), SummersetError> {
+ if !entries.is_empty() {
+ pf_trace!(self.id; "received AcceptEntries <- {} for slots {} - {} term {}",
+ leader, prev_slot + 1, prev_slot + entries.len(), term);
+ }
+ if self.check_term(leader, term)? || self.role != Role::Follower {
+ return Ok(());
+ }
+
+ // reply false if term smaller than mine, or if my log does not
+ // contain an entry at prev_slot matching prev_term
+ if term < self.curr_term
+ || prev_slot < self.start_slot
+ || prev_slot >= self.start_slot + self.log.len()
+ || self.log[prev_slot - self.start_slot].term != prev_term
+ {
+ self.transport_hub.send_msg(
+ PeerMsg::AppendEntriesReply {
+ term: self.curr_term,
+ end_slot: prev_slot,
+ success: false,
+ },
+ leader,
+ )?;
+ pf_trace!(self.id; "sent AcceptEntriesReply -> {} term {} end_slot {} fail",
+ leader, self.curr_term, prev_slot);
+
+ if term >= self.curr_term {
+ // also refresh heartbeat timer here since the "decrementing"
+ // procedure for a lagging follower might take long
+ self.heard_heartbeat(leader, term)?;
+ }
+ return Ok(());
+ }
+
+ // update my knowledge of who's the current leader, and reset election
+ // timeout timer
+ self.leader = Some(leader);
+ self.heard_heartbeat(leader, term)?;
+
+ // check if any existing entry conflicts with a new one in `entries`.
+ // If so, truncate everything at and after that entry
+ let mut first_new = prev_slot + 1;
+ for (slot, new_entry) in entries
+ .iter()
+ .enumerate()
+ .map(|(s, e)| (s + prev_slot + 1, e))
+ {
+ if slot >= self.start_slot + self.log.len() {
+ first_new = slot;
+ break;
+ } else if self.log[slot - self.start_slot].term != new_entry.term {
+ let cut_offset = self.log[slot - self.start_slot].log_offset;
+ // do this truncation in-place for simplicity
+ self.storage_hub.submit_action(
+ 0,
+ LogAction::Truncate { offset: cut_offset },
+ )?;
+ loop {
+ let (action_id, log_result) =
+ self.storage_hub.get_result().await?;
+ if action_id != 0 {
+ // normal log action previously in queue; process it
+ self.handle_log_result(action_id, log_result)?;
+ } else {
+ if let LogResult::Truncate {
+ offset_ok: true,
+ now_size,
+ } = log_result
+ {
+ assert_eq!(now_size, cut_offset);
+ self.log_offset = cut_offset;
+ } else {
+ return logged_err!(
+ self.id;
+ "unexpected log result type or failed truncate"
+ );
+ }
+ break;
+ }
+ }
+ // truncate in-mem log as well
+ self.log.truncate(slot - self.start_slot);
+ first_new = slot;
+ break;
+ }
+ }
+
+ // append new entries into my log, and submit logger actions to make
+ // new entries durable
+ let (num_entries, mut num_appended) = (entries.len(), 0);
+ for (slot, mut entry) in entries
+ .drain((first_new - prev_slot - 1)..entries.len())
+ .enumerate()
+ .map(|(s, e)| (s + first_new, e))
+ {
+ entry.log_offset = 0;
+
+ self.log.push(entry.clone());
+ self.storage_hub.submit_action(
+ Self::make_log_action_id(
+ slot,
+ prev_slot + num_entries,
+ Role::Follower,
+ ),
+ LogAction::Append {
+ entry: DurEntry::LogEntry { entry },
+ sync: self.config.logger_sync,
+ },
+ )?;
+
+ num_appended += 1;
+ }
+
+ // even if no entries appended, also send back AppendEntriesReply
+        // as a follower-to-leader reverse heartbeat for peer health
+ // tracking purposes
+ if num_appended == 0 {
+ self.transport_hub.send_msg(
+ PeerMsg::AppendEntriesReply {
+ term: self.curr_term,
+ end_slot: first_new - 1,
+ success: true,
+ },
+ leader,
+ )?;
+ }
+
+ // if leader_commit is larger than my last_commit, update last_commit
+ if leader_commit > self.last_commit {
+ let new_commit = cmp::min(leader_commit, prev_slot + entries.len());
+
+ // submit newly committed entries for state machine execution
+ for slot in (self.last_commit + 1)..=new_commit {
+ let entry = &self.log[slot - self.start_slot];
+ for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() {
+ if let ApiRequest::Req { cmd, .. } = req {
+ self.state_machine.submit_cmd(
+ Self::make_command_id(slot, cmd_idx),
+ cmd.clone(),
+ )?;
+ } else {
+ continue; // ignore other types of requests
+ }
+ }
+ }
+
+ self.last_commit = new_commit;
+ }
+
+ // if last_snap is larger than mine, update last_snap
+ if last_snap > self.last_snap {
+ self.last_snap = last_snap;
+ }
+
+ Ok(())
+ }
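+
+    // Illustrative conflict-resolution trace for the handler above
+    // (hypothetical slots/terms): a follower's log holds terms
+    // [s5: 2, s6: 2, s7: 3] and AppendEntries arrives with prev_slot = 5,
+    // prev_term = 2, and entries for s6..=s8 at term 4. Slot 6 conflicts
+    // (2 != 4), so the durable log is truncated at s6's log_offset, the
+    // in-mem log is truncated to match, first_new = 6, and all three
+    // incoming entries are appended and submitted for logging.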
+
+ /// Handler of AppendEntries reply from follower.
+ fn handle_msg_append_entries_reply(
+ &mut self,
+ peer: ReplicaId,
+ term: Term,
+ end_slot: usize,
+ success: bool,
+ ) -> Result<(), SummersetError> {
+ if !success || self.match_slot[&peer] != end_slot {
+ pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}",
+ peer, term, if success { "ok" } else { "fail" });
+ }
+ if self.check_term(peer, term)? || self.role != Role::Leader {
+ return Ok(());
+ }
+ self.heard_heartbeat(peer, term)?;
+
+ if success {
+ // success: update next_slot and match_slot for follower
+ *self.next_slot.get_mut(&peer).unwrap() = end_slot + 1;
+ *self.match_slot.get_mut(&peer).unwrap() = end_slot;
+
+ // since we updated some match_slot here, check if any additional
+ // entries are now considered committed
+ let mut new_commit = self.last_commit;
+ for slot in
+ (self.last_commit + 1)..(self.start_slot + self.log.len())
+ {
+ let entry = &self.log[slot - self.start_slot];
+ if entry.term != self.curr_term {
+ continue; // cannot decide commit using non-latest term
+ }
+
+ let match_cnt = 1 + self
+ .match_slot
+ .values()
+ .filter(|&&s| s >= slot)
+ .count() as u8;
+ if match_cnt >= self.quorum_cnt {
+ // quorum size reached, set new_commit to here
+ new_commit = slot;
+ }
+ }
+
+ // submit newly committed commands, if any, for execution
+ for slot in (self.last_commit + 1)..=new_commit {
+ let entry = &self.log[slot - self.start_slot];
+ for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() {
+ if let ApiRequest::Req { cmd, .. } = req {
+ self.state_machine.submit_cmd(
+ Self::make_command_id(slot, cmd_idx),
+ cmd.clone(),
+ )?;
+ } else {
+ continue; // ignore other types of requests
+ }
+ }
+ }
+
+ self.last_commit = new_commit;
+
+ // also check if any additional entries are safe to snapshot
+ for slot in (self.last_snap + 1)..=end_slot {
+ let match_cnt = 1 + self
+ .match_slot
+ .values()
+ .filter(|&&s| s >= slot)
+ .count() as u8;
+ if match_cnt == self.population {
+ // all servers have durably stored this entry
+ self.last_snap = slot;
+ }
+ }
+ } else {
+ // failed: decrement next_slot for follower and retry
+ // NOTE: the optimization of fast-backward bypassing (instead of
+ // always decrementing by 1) not implemented
+ if self.next_slot[&peer] == 1 {
+ return Ok(()); // cannot move backward any more
+ }
+ *self.next_slot.get_mut(&peer).unwrap() -= 1;
+
+ let prev_slot = self.next_slot[&peer] - 1;
+ if prev_slot < self.start_slot {
+ *self.next_slot.get_mut(&peer).unwrap() += 1;
+ return logged_err!(self.id; "snapshotted slot {} queried", prev_slot);
+ }
+ let prev_term = self.log[prev_slot - self.start_slot].term;
+ let entries = self
+ .log
+ .iter()
+ .skip(self.next_slot[&peer] - self.start_slot)
+ .cloned()
+ .collect();
+
+ self.transport_hub.send_msg(
+ PeerMsg::AppendEntries {
+ term: self.curr_term,
+ prev_slot,
+ prev_term,
+ entries,
+ leader_commit: self.last_commit,
+ last_snap: self.last_snap,
+ },
+ peer,
+ )?;
+ pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}",
+ peer, self.next_slot[&peer],
+ self.start_slot + self.log.len() - 1);
+ }
+
+ Ok(())
+ }
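+
+    // Illustrative commit advancement (population = 5, quorum_cnt = 3):
+    // with hypothetical match_slot = {1: 8, 2: 7, 3: 4, 4: 2}, slot 7 counts
+    // the leader plus peers 1 and 2 (3 >= 3) and becomes committed, while
+    // slot 8 counts only the leader plus peer 1 (2 < 3) and must wait; per
+    // the Raft commit rule, only entries of curr_term are counted directly.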
+
+ /// Handler of RequestVote message from candidate.
+ fn handle_msg_request_vote(
+ &mut self,
+ candidate: ReplicaId,
+ term: Term,
+ last_slot: usize,
+ last_term: Term,
+ ) -> Result<(), SummersetError> {
+ pf_trace!(self.id; "received RequestVote <- {} with term {} last {} term {}",
+ candidate, term, last_slot, last_term);
+ self.check_term(candidate, term)?;
+
+ // if the given term is smaller than mine, reply false
+ if term < self.curr_term {
+ self.transport_hub.send_msg(
+ PeerMsg::RequestVoteReply {
+ term: self.curr_term,
+ granted: false,
+ },
+ candidate,
+ )?;
+ pf_trace!(self.id; "sent RequestVoteReply -> {} term {} false",
+ candidate, self.curr_term);
+ return Ok(());
+ }
+
+        // if I have not voted for anyone else in my current term, and the
+        // candidate's log is at least as up-to-date as mine, grant vote
+ #[allow(clippy::collapsible_if)]
+ if self.voted_for.is_none() || (self.voted_for.unwrap() == candidate) {
+            if last_term > self.log.last().unwrap().term
+                || (last_term == self.log.last().unwrap().term
+                    && last_slot + 1 >= self.start_slot + self.log.len())
+ {
+ self.transport_hub.send_msg(
+ PeerMsg::RequestVoteReply {
+ term: self.curr_term,
+ granted: true,
+ },
+ candidate,
+ )?;
+ pf_trace!(self.id; "sent RequestVoteReply -> {} term {} granted",
+ candidate, self.curr_term);
+
+ // hear a heartbeat here to prevent me from starting an
+ // election soon
+ self.heard_heartbeat(candidate, term)?;
+ }
+ }
+
+ Ok(())
+ }
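+
+    // Illustrative up-to-date comparison, with my log ending at slot 10 in
+    // term 3:
+    //     candidate last_slot = 9,  last_term = 4 -> grant (higher last term)
+    //     candidate last_slot = 10, last_term = 3 -> grant (same term, log
+    //                                                not shorter than mine)
+    //     candidate last_slot = 8,  last_term = 3 -> reject (shorter log)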
+
+ /// Handler of RequestVote reply from peer.
+ fn handle_msg_request_vote_reply(
+ &mut self,
+ peer: ReplicaId,
+ term: Term,
+ granted: bool,
+ ) -> Result<(), SummersetError> {
+ pf_trace!(self.id; "received RequestVoteReply <- {} with term {} {}",
+ peer, term, if granted { "granted" } else { "false" });
+ if self.check_term(peer, term)? || self.role != Role::Candidate {
+ return Ok(());
+ }
+
+ // bookkeep this vote
+ self.votes_granted.insert(peer);
+
+ // if a majority of servers have voted for me, become the leader
+ if self.votes_granted.len() as u8 >= self.quorum_cnt {
+ self.become_the_leader()?;
+ }
+
+ Ok(())
+ }
+
+ /// Synthesized handler of receiving message from peer.
+ async fn handle_msg_recv(
+ &mut self,
+ peer: ReplicaId,
+ msg: PeerMsg,
+ ) -> Result<(), SummersetError> {
+ match msg {
+ PeerMsg::AppendEntries {
+ term,
+ prev_slot,
+ prev_term,
+ entries,
+ leader_commit,
+ last_snap,
+ } => {
+ self.handle_msg_append_entries(
+ peer,
+ term,
+ prev_slot,
+ prev_term,
+ entries,
+ leader_commit,
+ last_snap,
+ )
+ .await
+ }
+ PeerMsg::AppendEntriesReply {
+ term,
+ end_slot,
+ success,
+ } => self
+ .handle_msg_append_entries_reply(peer, term, end_slot, success),
+ PeerMsg::RequestVote {
+ term,
+ last_slot,
+ last_term,
+ } => self.handle_msg_request_vote(peer, term, last_slot, last_term),
+ PeerMsg::RequestVoteReply { term, granted } => {
+ self.handle_msg_request_vote_reply(peer, term, granted)
+ }
+ }
+ }
+}
+
+// RaftReplica state machine execution
+impl RaftReplica {
+ /// Handler of state machine exec result chan recv.
+ fn handle_cmd_result(
+ &mut self,
+ cmd_id: CommandId,
+ cmd_result: CommandResult,
+ ) -> Result<(), SummersetError> {
+ let (slot, cmd_idx) = Self::split_command_id(cmd_id);
+ if slot < self.start_slot {
+ return Ok(()); // ignore if slot index outdated
+ }
+ assert!(slot < self.start_slot + self.log.len());
+ pf_trace!(self.id; "executed cmd in entry at slot {} idx {}",
+ slot, cmd_idx);
+
+ let entry = &mut self.log[slot - self.start_slot];
+ assert!(cmd_idx < entry.reqs.len());
+ let (client, ref req) = entry.reqs[cmd_idx];
+
+ // reply command result back to client
+ if let ApiRequest::Req { id: req_id, .. } = req {
+ if entry.external && self.external_api.has_client(client) {
+ self.external_api.send_reply(
+ ApiReply::Reply {
+ id: *req_id,
+ result: Some(cmd_result),
+ redirect: None,
+ },
+ client,
+ )?;
+ pf_trace!(self.id; "replied -> client {} for slot {} idx {}",
+ client, slot, cmd_idx);
+ }
+ } else {
+ return logged_err!(self.id; "unexpected API request type");
+ }
+
+ // if all commands in this entry have been executed, update last_exec
+ if cmd_idx == entry.reqs.len() - 1 {
+ pf_debug!(self.id; "executed all cmds in entry at slot {}", slot);
+ self.last_exec = slot;
+ }
+
+ Ok(())
+ }
+}
+
+// RaftReplica leader election timeout logic
+impl RaftReplica {
+ /// Becomes a candidate and starts the election procedure.
+ async fn become_a_candidate(&mut self) -> Result<(), SummersetError> {
+ if self.role != Role::Follower {
+ return Ok(());
+ } else if let Some(peer) = self.leader {
+ // mark old leader as dead
+ if self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, false)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
+ }
+
+ self.role = Role::Candidate;
+
+ // increment current term and vote for myself
+ self.curr_term += 1;
+ self.voted_for = Some(self.id);
+ self.votes_granted = HashSet::from([self.id]);
+ pf_info!(self.id; "starting election with term {}...", self.curr_term);
+
+ // also make the two critical fields durable, synchronously
+ self.storage_hub.submit_action(
+ 0,
+ LogAction::Write {
+ entry: DurEntry::Metadata {
+ curr_term: self.curr_term,
+ voted_for: self.voted_for,
+ },
+ offset: 0,
+ sync: self.config.logger_sync,
+ },
+ )?;
+ loop {
+ let (action_id, log_result) = self.storage_hub.get_result().await?;
+ if action_id != 0 {
+ // normal log action previously in queue; process it
+ self.handle_log_result(action_id, log_result)?;
+ } else {
+ if let LogResult::Write {
+ offset_ok: true, ..
+ } = log_result
+ {
+ } else {
+ return logged_err!(self.id; "unexpected log result type or failed write");
+ }
+ break;
+ }
+ }
+
+ // reset election timeout timer
+ self.heard_heartbeat(self.id, self.curr_term)?;
+
+ // send RequestVote messages to all other peers
+ let last_slot = self.start_slot + self.log.len() - 1;
+ assert!(last_slot >= self.start_slot);
+ let last_term = self.log[last_slot - self.start_slot].term;
+ self.transport_hub.bcast_msg(
+ PeerMsg::RequestVote {
+ term: self.curr_term,
+ last_slot,
+ last_term,
+ },
+ None,
+ )?;
+ pf_trace!(self.id; "broadcast RequestVote with term {} last {} term {}",
+ self.curr_term, last_slot, last_term);
+
+ Ok(())
+ }
+
+ /// Becomes the leader after enough votes granted for me.
+ fn become_the_leader(&mut self) -> Result<(), SummersetError> {
+ pf_info!(self.id; "elected to be leader with term {}", self.curr_term);
+ self.role = Role::Leader;
+
+ // clear peers' heartbeat reply counters, and broadcast a heartbeat now
+ for cnts in self.hb_reply_cnts.values_mut() {
+ *cnts = (1, 0, 0);
+ }
+ self.bcast_heartbeats()?;
+
+ // re-initialize next_slot and match_slot information
+ for slot in self.next_slot.values_mut() {
+ *slot = self.start_slot + self.log.len();
+ }
+ for slot in self.match_slot.values_mut() {
+ *slot = 0;
+ }
+
+ Ok(())
+ }
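+
+    // Per the Raft paper, a new leader initializes next_slot[p] to just past
+    // its own last entry and match_slot[p] to 0; e.g., with a log ending at
+    // slot 14, next_slot[p] = 15 for every peer p (illustrative values).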
+
+ /// Broadcasts empty AppendEntries messages as heartbeats to all peers.
+ fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> {
+ let prev_slot = self.start_slot + self.log.len() - 1;
+ assert!(prev_slot >= self.start_slot);
+ let prev_term = self.log[prev_slot - self.start_slot].term;
+ self.transport_hub.bcast_msg(
+ PeerMsg::AppendEntries {
+ term: self.curr_term,
+ prev_slot,
+ prev_term,
+ entries: vec![],
+ leader_commit: self.last_commit,
+ last_snap: self.last_snap,
+ },
+ None,
+ )?;
+
+ // update max heartbeat reply counters and their repetitions seen
+ for (&peer, cnts) in self.hb_reply_cnts.iter_mut() {
+ if cnts.0 > cnts.1 {
+ // more hb replies have been received from this peer; it is
+ // probably alive
+ cnts.1 = cnts.0;
+ cnts.2 = 0;
+ } else {
+ // did not receive hb reply from this peer at least for the
+ // last sent hb from me; increment repetition count
+ cnts.2 += 1;
+ let repeat_threshold = (self.config.hb_hear_timeout_min
+ / self.config.hb_send_interval_ms)
+ as u8;
+ if cnts.2 > repeat_threshold {
+ // did not receive hb reply from this peer for too many
+ // past hbs sent from me; this peer is probably dead
+ if self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, false)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
+ }
+ }
+ }
+
+ // I also heard this heartbeat from myself
+ self.heard_heartbeat(self.id, self.curr_term)?;
+
+ // pf_trace!(self.id; "broadcast heartbeats term {}", self.curr_term);
+ Ok(())
+ }
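+
+    // With the default config (hb_send_interval_ms = 50, hb_hear_timeout_min
+    // = 600), repeat_threshold = 600 / 50 = 12: a peer whose replies stall
+    // for more than 12 consecutive heartbeat rounds (roughly 600 ms) gets
+    // marked dead in peer_alive until it is heard from again.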
+
+ /// Chooses a random hb_hear_timeout from the min-max range and kicks off
+ /// the hb_hear_timer.
+ fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+ self.hb_hear_timer.cancel()?;
+
+ let timeout_ms = thread_rng().gen_range(
+ self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
+ );
+
+ // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms);
+ self.hb_hear_timer
+ .kickoff(Duration::from_millis(timeout_ms))?;
+ Ok(())
+ }
+
+ /// Heard a heartbeat from some other replica. Resets election timer.
+ fn heard_heartbeat(
+ &mut self,
+ peer: ReplicaId,
+ _term: Term,
+ ) -> Result<(), SummersetError> {
+ if peer != self.id {
+ self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1;
+ if !self.peer_alive.get(peer)? {
+ self.peer_alive.set(peer, true)?;
+ pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive);
+ }
+ }
+
+ // reset hearing timer
+ self.kickoff_hb_hear_timer()?;
+
+ // pf_trace!(self.id; "heard heartbeat <- {} term {}", peer, term);
+ Ok(())
+ }
+}
+
+// RaftReplica control messages handling
+impl RaftReplica {
+ /// Handler of ResetState control message.
+ async fn handle_ctrl_reset_state(
+ &mut self,
+ durable: bool,
+ ) -> Result<(), SummersetError> {
+ pf_warn!(self.id; "server got restart req");
+
+ // send leave notification to peers and wait for their replies
+ self.transport_hub.leave().await?;
+
+ // send leave notification to manager and wait for its reply
+ self.control_hub.send_ctrl(CtrlMsg::Leave)?;
+ while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {}
+
+ // if `durable` is false, truncate backer file
+ if !durable {
+ // use 0 as a special log action ID here
+ self.storage_hub
+ .submit_action(0, LogAction::Truncate { offset: 0 })?;
+ loop {
+ let (action_id, log_result) =
+ self.storage_hub.get_result().await?;
+ if action_id == 0 {
+ if log_result
+ != (LogResult::Truncate {
+ offset_ok: true,
+ now_size: 0,
+ })
+ {
+ return logged_err!(self.id; "failed to truncate log to 0");
+ } else {
+ return Ok(());
+ }
+ }
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Handler of Pause control message.
+ fn handle_ctrl_pause(
+ &mut self,
+ paused: &mut bool,
+ ) -> Result<(), SummersetError> {
+ pf_warn!(self.id; "server got pause req");
+ *paused = true;
+ self.control_hub.send_ctrl(CtrlMsg::PauseReply)?;
+ Ok(())
+ }
+
+ /// Handler of Resume control message.
+ fn handle_ctrl_resume(
+ &mut self,
+ paused: &mut bool,
+ ) -> Result<(), SummersetError> {
+ pf_warn!(self.id; "server got resume req");
+
+ // reset leader heartbeat timer
+ self.kickoff_hb_hear_timer()?;
+
+ *paused = false;
+ self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?;
+ Ok(())
+ }
+
+ /// Handler of TakeSnapshot control message.
+ async fn handle_ctrl_take_snapshot(
+ &mut self,
+ ) -> Result<(), SummersetError> {
+ pf_warn!(self.id; "server told to take snapshot");
+ self.take_new_snapshot().await?;
+
+ self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo {
+ new_start: self.start_slot,
+ })?;
+ Ok(())
+ }
+
+ /// Synthesized handler of manager control messages. If ok, returns
+ /// `Some(true)` if decides to terminate and reboot, `Some(false)` if
+ /// decides to shutdown completely, and `None` if not terminating.
+ async fn handle_ctrl_msg(
+ &mut self,
+ msg: CtrlMsg,
+ paused: &mut bool,
+    ) -> Result<Option<bool>, SummersetError> {