Skip to content

Commit

Permalink
Merge pull request #943 from graydon/bug-937-catchup-fail-and-reset
Browse files Browse the repository at this point in the history
Bug 937 catchup fail and reset

Reviewed-by: jedmccaleb
  • Loading branch information
latobarita committed Dec 4, 2015
2 parents dee36d3 + 45ecb47 commit 62d2481
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 17 deletions.
7 changes: 4 additions & 3 deletions src/history/CatchupStateMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,8 +653,9 @@ CatchupStateMachine::enterRetryingState(uint64_t nseconds)
if (self->mRetryCount++ > kRetryLimit)
{
CLOG(WARNING, "History") << "Retry count " << kRetryLimit
<< " exceeded, restarting catchup";
self->enterBeginState();
<< " exceeded, failing catchup";
self->mError = std::make_error_code(std::errc::timed_out);
self->enterEndState();
}
else if (!anchored)
{
Expand Down Expand Up @@ -1235,7 +1236,7 @@ CatchupStateMachine::applyHistoryOfSingleCheckpoint(uint32_t checkpoint)
void
CatchupStateMachine::enterEndState()
{
assert(mState == CATCHUP_APPLYING);
assert(mState == CATCHUP_APPLYING || (mState == CATCHUP_RETRYING && (!!mError)));
mApplyState.reset();
mState = CATCHUP_END;
CLOG(DEBUG, "History") << "Completed catchup from '" << mArchive->getName()
Expand Down
72 changes: 63 additions & 9 deletions src/history/HistoryTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ class HistoryTests
bool catchupApplication(uint32_t initLedger,
HistoryManager::CatchupMode resumeMode,
Application::pointer app2, bool doStart = true,
uint32_t maxCranks = 0xffffffff);
uint32_t maxCranks = 0xffffffff,
uint32_t gap=0);

bool
flip()
Expand Down Expand Up @@ -415,7 +416,8 @@ bool
HistoryTests::catchupApplication(uint32_t initLedger,
HistoryManager::CatchupMode resumeMode,
Application::pointer app2, bool doStart,
uint32_t maxCranks)
uint32_t maxCranks,
uint32_t gap)
{

auto& lm = app2->getLedgerManager();
Expand Down Expand Up @@ -460,16 +462,25 @@ HistoryTests::catchupApplication(uint32_t initLedger,
app.getHistoryManager().nextCheckpointLedger(initLedger);
for (uint32_t n = initLedger; n <= nextBlockStart; ++n)
{
// Remember the vectors count from 2, not 0.
if (n - 2 >= mLedgerCloseDatas.size())
{
break;
}
// Remember the vectors count from 2, not 0.
auto const& lcd = mLedgerCloseDatas.at(n - 2);
CLOG(INFO, "History")
<< "force-externalizing LedgerCloseData for " << n
<< " has txhash:" << hexAbbrev(lcd.mTxSet->getContentsHash());
lm.externalizeValue(lcd);
if (n == gap)
{
CLOG(INFO, "History")
<< "simulating LedgerClose transmit gap at ledger " << n;
}
else
{
// Remember the vectors count from 2, not 0.
auto const& lcd = mLedgerCloseDatas.at(n - 2);
CLOG(INFO, "History")
<< "force-externalizing LedgerCloseData for " << n
<< " has txhash:" << hexAbbrev(lcd.mTxSet->getContentsHash());
lm.externalizeValue(lcd);
}
}

uint32_t lastLedger = lm.getLastClosedLedgerNum();
Expand All @@ -480,7 +491,7 @@ HistoryTests::catchupApplication(uint32_t initLedger,
LedgerManager::LM_SYNCED_STATE) &&
!app2->getClock().getIOService().stopped() && (--maxCranks != 0))
{
app2->getClock().crank(true);
app2->getClock().crank(false);
}

if (maxCranks == 0)
Expand Down Expand Up @@ -937,3 +948,46 @@ TEST_CASE("persist publish queue", "[history]")
LOG(INFO) << app1->isStopping();
}
}


// The idea with this test is that we join a network and somehow get a gap
// in the SCP voting sequence while we're trying to catch up. This should
// cause catchup to fail, but that failure should itself just flush the
// LedgerManager's buffer and get us kicked back into catchup mode when the
// network moves further ahead.
//
// (Neither the hard failure nor the clear/reset was working when this
// test was written.)

// Exercises a catchup that fails part-way through and is then retried.
//
// Sequence:
//   1. Generate and publish initial history, then bring a fresh in-memory
//      node ("app2") up to date from it.
//   2. Publish more history and start a second catchup on app2 with a
//      bounded crank budget (10000) and a simulated gap: the ledger close
//      for `init + 10` is withheld rather than externalized. This catchup
//      is expected to report failure.
//   3. Publish more history again and run an unconstrained catchup, which
//      is expected to succeed.
//
// Outcomes are checked with REQUIRE rather than C assert() so that they are
// still evaluated when NDEBUG is defined, and so that a failure is reported
// through the test framework instead of aborting the whole test binary.
TEST_CASE_METHOD(HistoryTests, "too far behind / catchup restart",
                 "[history][catchupstall]")
{
    generateAndPublishInitialHistory(1);

    // Catch up successfully the first time
    auto app2 = catchupNewApplication(
        app.getLedgerManager().getCurrentLedgerHeader().ledgerSeq,
        Config::TESTDB_IN_MEMORY_SQLITE, HistoryManager::CATCHUP_COMPLETE,
        "app2");

    // Now generate a little more history
    generateAndPublishHistory(1);

    bool caughtup = false;
    auto init = app2->getLedgerManager().getLastClosedLedgerNum() + 2;

    // Now start a catchup that _fails_ due to a gap: doStart=true, at most
    // 10000 cranks, and the ledger close at init + 10 is never delivered.
    LOG(INFO) << "Starting BROKEN catchup (with gap) from " << init;
    caughtup = catchupApplication(init, HistoryManager::CATCHUP_COMPLETE,
                                  app2, true, 10000, init + 10);

    REQUIRE(!caughtup);

    // Now generate a little more history
    generateAndPublishHistory(1);

    // And catchup successfully, this time starting from the publishing
    // node's last closed ledger and with no gap or crank limit.
    init = app.getLedgerManager().getLastClosedLedgerNum();
    caughtup = catchupApplication(init, HistoryManager::CATCHUP_COMPLETE, app2);
    REQUIRE(caughtup);
}
10 changes: 5 additions & 5 deletions src/ledger/LedgerManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ LedgerManagerImpl::historyCaughtup(asio::error_code const& ec,
{
CLOG(ERROR, "Ledger") << "Error catching up: " << ec.message();
CLOG(ERROR, "Ledger") << "Catchup will restart at next close.";
setState(LM_BOOTING_STATE);
}
else
{
Expand Down Expand Up @@ -589,16 +590,15 @@ LedgerManagerImpl::historyCaughtup(asio::error_code const& ec,
}
}

// we're done processing the ledgers backlog
mSyncingLedgers.clear();

CLOG(INFO, "Ledger")
<< "Caught up to LCL including recent network activity: "
<< ledgerAbbrev(mLastClosedLedger);

mSyncingLedgersSize.set_count(mSyncingLedgers.size());
setState(LM_SYNCED_STATE);
}

// Either way, we're done processing the ledgers backlog
mSyncingLedgers.clear();
mSyncingLedgersSize.set_count(mSyncingLedgers.size());
}

uint64_t
Expand Down

0 comments on commit 62d2481

Please sign in to comment.