Skip to content

Commit

Permalink
Add to recover SSD cache space on server restart (facebookincubator#1…
Browse files Browse the repository at this point in the history
…0967)

Summary:
Pull Request resolved: facebookincubator#10967

The current SSD checkpoint recovery doesn't restore the cached data size of each region. This PR
adds that recovery, along with a unit test.
This PR also fixes a stats update issue: readWithoutChecksumChecks is now included in SsdFile::updateStats.

Reviewed By: zacw7, oerling

Differential Revision: D62482872
  • Loading branch information
xiaoxmeng authored and facebook-github-bot committed Sep 12, 2024
1 parent d1ac079 commit 200f3bd
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 38 deletions.
46 changes: 36 additions & 10 deletions velox/common/caching/SsdFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ std::optional<std::pair<uint64_t, int32_t>> SsdFile::getSpace(
return std::nullopt;
}
}
assert(!writableRegions_.empty());
VELOX_CHECK(!writableRegions_.empty());
const auto region = writableRegions_[0];
const auto offset = regionSizes_[region];
auto available = kRegionSize - offset;
Expand Down Expand Up @@ -554,6 +554,7 @@ void SsdFile::updateStats(SsdCacheStats& stats) const {
stats.readSsdErrors += stats_.readSsdErrors;
stats.readCheckpointErrors += stats_.readCheckpointErrors;
stats.readSsdCorruptions += stats_.readSsdCorruptions;
stats.readWithoutChecksumChecks += stats_.readWithoutChecksumChecks;
}

void SsdFile::clear() {
Expand Down Expand Up @@ -651,8 +652,8 @@ bool SsdFile::removeFileEntries(
VELOX_SSD_CACHE_LOG(INFO)
<< "Removed " << entriesAgedOut << " entries from " << fileName_
<< ". And erased " << toFree.size() << " regions with "
<< kMaxErasedSizePct << "% entries removed.";

<< kMaxErasedSizePct << "% entries removed, and " << entries_.size()
<< " left.";
return true;
}

Expand Down Expand Up @@ -1009,6 +1010,8 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
for (auto region : evicted) {
evictedMap.insert(region);
}

std::vector<uint32_t> regionCacheSizes(numRegions_, 0);
for (;;) {
const auto fileNum = readNumber<uint64_t>(state);
if (fileNum == kCheckpointEndMarker) {
Expand All @@ -1021,15 +1024,32 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
checksum = readNumber<uint32_t>(state);
}
const auto run = SsdRun(fileBits, checksum);
const auto region = regionIndex(run.offset());
// Check that the recovered entry does not fall in an evicted region.
if (evictedMap.find(regionIndex(run.offset())) == evictedMap.end()) {
// The file may have a different id on restore.
auto it = idMap.find(fileNum);
VELOX_CHECK(it != idMap.end());
FileCacheKey key{it->second, offset};
entries_[std::move(key)] = run;
if (evictedMap.find(region) != evictedMap.end()) {
continue;
}
// The file may have a different id on restore.
const auto it = idMap.find(fileNum);
VELOX_CHECK(it != idMap.end());
FileCacheKey key{it->second, offset};
entries_[std::move(key)] = run;
regionCacheSizes[region] += run.size();
regionSizes_[region] = std::max<uint32_t>(
regionSizes_[region], regionOffset(run.offset()) + run.size());
}

// NOTE: we might erase entries from a region for TTL eviction, so we need to
// set the region size to the max end offset of the recovered cache entries in
// the region. Correspondingly, we subtract the cached size from the region
// size to get the erased size.
for (auto region = 0; region < numRegions_; ++region) {
VELOX_CHECK_LE(regionSizes_[region], kRegionSize);
VELOX_CHECK_LE(regionCacheSizes[region], regionSizes_[region]);
erasedRegionSizes_[region] =
regionSizes_[region] - regionCacheSizes[region];
}

++stats_.checkpointsRead;
stats_.entriesRecovered += entries_.size();

Expand All @@ -1042,10 +1062,16 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
writableRegions_.push_back(region);
}
tracker_.setRegionScores(scores);

uint64_t cachedBytes{0};
for (const auto regionSize : regionSizes_) {
cachedBytes += regionSize;
}
VELOX_SSD_CACHE_LOG(INFO) << fmt::format(
"Starting shard {} from checkpoint with {} entries, {} regions with {} free, with checksum write {}, read verification {}, checkpoint file {}",
"Starting shard {} from checkpoint with {} entries, {} cached data, {} regions with {} free, with checksum write {}, read verification {}, checkpoint file {}",
shardId_,
entries_.size(),
succinctBytes(cachedBytes),
numRegions_,
writableRegions_.size(),
checksumEnabled_ ? "enabled" : "disabled",
Expand Down
34 changes: 19 additions & 15 deletions velox/common/caching/SsdFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ struct SsdCacheStats {
return result;
}

/// Resets this object by assigning a default-constructed SsdCacheStats to it.
void clear() {
*this = SsdCacheStats();
}

/// Snapshot stats
tsan_atomic<uint64_t> entriesCached{0};
tsan_atomic<uint64_t> regionsCached{0};
Expand Down Expand Up @@ -301,7 +305,6 @@ class SsdFile {

/// Erases 'key'
bool erase(RawFileCacheKey key);

/// Copies the data in 'ssdPins' into 'pins'. Coalesces IO for nearby
/// entries if they are in ascending order and near enough.
CoalesceIoStats load(
Expand All @@ -322,16 +325,6 @@ class SsdFile {
VELOX_CHECK_GT(regionPins_[regionIndex(offset)], 0);
}

/// Maps the file 'offset' to the number of the region that contains it, i.e.
/// 'offset' divided by kRegionSize with the result narrowed to int32_t.
static int32_t regionIndex(uint64_t offset) {
  return static_cast<int32_t>(offset / kRegionSize);
}

/// Updates the read count of a region by forwarding 'region' and 'size' to
/// 'tracker_'.
void regionRead(int32_t region, int32_t size) {
tracker_.regionRead(region, size);
}

/// Returns the current value of 'maxRegions_'.
int32_t maxRegions() const {
return maxRegions_;
}
Expand Down Expand Up @@ -399,10 +392,6 @@ class SsdFile {
return entries_;
}

/// Returns a copy of 'stats_'.
/// NOTE(review): the 'testing' prefix suggests this is intended for tests
/// only - confirm against callers.
SsdCacheStats testingStats() const {
return stats_;
}

/// Returns the value of 'checksumReadVerificationEnabled_'.
/// NOTE(review): the 'testing' prefix suggests this is intended for tests
/// only - confirm against callers.
bool testingChecksumReadVerificationEnabled() const {
return checksumReadVerificationEnabled_;
}
Expand All @@ -416,6 +405,21 @@ class SsdFile {

static constexpr int kMaxErasedSizePct = 50;

// Updates the read count of a region by forwarding 'region' and 'size' to
// 'tracker_'.
void regionRead(int32_t region, int32_t size) {
tracker_.regionRead(region, size);
}

// Maps the file 'offset' to the number of the region that contains it, i.e.
// 'offset' divided by kRegionSize with the result narrowed to int32_t.
static int32_t regionIndex(uint64_t offset) {
  return static_cast<int32_t>(offset / kRegionSize);
}

// Maps the file 'offset' to its position inside its region, i.e. the
// remainder of 'offset' divided by kRegionSize narrowed to int32_t.
static int32_t regionOffset(uint64_t offset) {
  return static_cast<int32_t>(offset % kRegionSize);
}

// The first 4 bytes of a checkpoint file contains version string to indicate
// if checksum write is enabled or not.
std::string checkpointVersion() const {
Expand Down
Loading

0 comments on commit 200f3bd

Please sign in to comment.