From b86a69804529977e94831da654f455a80a8ab648 Mon Sep 17 00:00:00 2001
From: Jackson Owens
Date: Wed, 26 Jul 2023 16:57:55 -0400
Subject: [PATCH] internal/cache: increase shard count to 4x CPUs

Increase the block cache shard count to reduce mutex contention.
---
 internal/cache/clockpro.go | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/internal/cache/clockpro.go b/internal/cache/clockpro.go
index d15600f6025..8f6831a8c5b 100644
--- a/internal/cache/clockpro.go
+++ b/internal/cache/clockpro.go
@@ -625,7 +625,7 @@ type Metrics struct {
 // Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is
 // used for page replacement
 // (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In
-// order to provide better concurrency, 2 x NumCPUs shards are created, with
+// order to provide better concurrency, 4 x NumCPUs shards are created, with
 // each shard being given 1/n of the target cache size. The Clock-PRO algorithm
 // is run independently on each shard.
 //
@@ -681,7 +681,34 @@ type Cache struct {
 // defer c.Unref()
 // d, err := pebble.Open(pebble.Options{Cache: c})
 func New(size int64) *Cache {
-	return newShards(size, 2*runtime.GOMAXPROCS(0))
+	// How many cache shards should we create?
+	//
+	// Note that the probability that two processors will try to access the
+	// same shard at the same time increases superlinearly with the number of
+	// processors (e.g., consider the birthday problem, where each CPU is a
+	// person and each shard is a possible birthday).
+	//
+	// We could consider growing the number of shards superlinearly, but
+	// increasing the shard count may reduce the effectiveness of the caching
+	// algorithm if frequently-accessed blocks are insufficiently distributed
+	// across shards. If a shard's size is smaller than a single frequently
+	// scanned sstable, then the shard will be unable to hold the entire
+	// frequently-scanned table in memory despite other shards still holding
+	// infrequently accessed blocks.
+	//
+	// Experimentally, we've observed contention contributing to tail latencies
+	// at 2 shards per processor. For now we use 4 shards per processor,
+	// recognizing this may not be the final word.
+	m := 4 * runtime.GOMAXPROCS(0)
+
+	// In tests we can use large CPU machines with small cache sizes and have
+	// many caches in existence at a time. If sharding into m shards would
+	// produce shards that are too small, constrain the number of shards to 4.
+	const minimumShardSize = 4 << 20 // 4 MiB
+	if m > 4 && int(size)/m < minimumShardSize {
+		m = 4
+	}
+	return newShards(size, m)
 }
 
 func newShards(size int64, shards int) *Cache {
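
Note: the following standalone sketch mirrors the shard-count heuristic introduced above, so the 4-MiB clamp is easy to see in isolation. The shardCount helper and the example cache sizes are illustrative assumptions only; they are not part of Pebble's API.

package main

import (
	"fmt"
	"runtime"
)

// shardCount mirrors the heuristic in the patch: 4 shards per processor,
// clamped down to 4 shards whenever the resulting per-shard size would fall
// below 4 MiB. This is a standalone illustration, not Pebble's API.
func shardCount(size int64, procs int) int {
	m := 4 * procs
	const minimumShardSize = 4 << 20 // 4 MiB
	if m > 4 && int(size)/m < minimumShardSize {
		m = 4
	}
	return m
}

func main() {
	// A 1 GiB cache on an 8-CPU machine gets 4*8 = 32 shards of 32 MiB each.
	fmt.Println(shardCount(1<<30, 8)) // 32
	// A 16 MiB test cache on a 32-CPU machine would get 128 shards of 128 KiB
	// each; the clamp reduces this to 4 shards of 4 MiB each.
	fmt.Println(shardCount(16<<20, 32)) // 4
	// Using the current process's GOMAXPROCS, as the patch does.
	fmt.Println(shardCount(1<<30, runtime.GOMAXPROCS(0)))
}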