Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cap eviction effort (CPU under stress) in HyperClockCache #12141

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cache/cache_bench_tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ DEFINE_uint64(cache_size, 1 * GiB,
"Number of bytes to use as a cache of uncompressed data.");
DEFINE_int32(num_shard_bits, -1,
"ShardedCacheOptions::shard_bits. Default = auto");
DEFINE_int32(
eviction_effort_cap,
ROCKSDB_NAMESPACE::HyperClockCacheOptions(1, 1).eviction_effort_cap,
"HyperClockCacheOptions::eviction_effort_cap");

DEFINE_double(resident_ratio, 0.25,
"Ratio of keys fitting in cache to keyspace.");
Expand Down Expand Up @@ -391,6 +395,7 @@ class CacheBench {
FLAGS_cache_size, /*estimated_entry_charge=*/0, FLAGS_num_shard_bits);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
opts.memory_allocator = allocator;
opts.eviction_effort_cap = FLAGS_eviction_effort_cap;
if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
FLAGS_cache_type == "hyper_clock_cache") {
opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0
Expand Down
135 changes: 94 additions & 41 deletions cache/clock_cache.cc

Large diffs are not rendered by default.

68 changes: 49 additions & 19 deletions cache/clock_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <array>
#include <atomic>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <memory>
Expand Down Expand Up @@ -374,6 +375,14 @@ struct ClockHandle : public ClockHandleBasicData {

class BaseClockTable {
public:
  // Options common to all BaseClockTable implementations, carrying the
  // eviction effort cap from HyperClockCacheOptions down to the table.
  struct BaseOpts {
    explicit BaseOpts(int _eviction_effort_cap)
        : eviction_effort_cap(_eviction_effort_cap) {}
    // Convenience conversion from full cache options
    explicit BaseOpts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {}
    // See HyperClockCacheOptions::eviction_effort_cap
    int eviction_effort_cap;
  };

BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
Expand All @@ -386,13 +395,13 @@ class BaseClockTable {
template <class Table>
typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
size_t capacity,
bool strict_capacity_limit,
uint32_t eec_and_scl,
bool allow_uncharged);

template <class Table>
Status Insert(const ClockHandleBasicData& proto,
typename Table::HandleImpl** handle, Cache::Priority priority,
size_t capacity, bool strict_capacity_limit);
size_t capacity, uint32_t eec_and_scl);

void Ref(ClockHandle& handle);

Expand All @@ -406,12 +415,17 @@ class BaseClockTable {

uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); }

uint64_t GetEvictionEffortExceededCount() const {
return eviction_effort_exceeded_count_.LoadRelaxed();
}

struct EvictionData {
size_t freed_charge = 0;
size_t freed_count = 0;
size_t seen_pinned_count = 0;
};

void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
void TrackAndReleaseEvictedEntry(ClockHandle* h);

#ifndef NDEBUG
// Acquire N references
Expand All @@ -436,6 +450,7 @@ class BaseClockTable {
template <class Table>
Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
uint32_t eviction_effort_cap,
typename Table::InsertState& state);

// Helper for updating `usage_` for new entry with given `total_charge`
Expand All @@ -449,6 +464,7 @@ class BaseClockTable {
template <class Table>
bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
uint32_t eviction_effort_cap,
typename Table::InsertState& state);

protected: // data
Expand All @@ -461,9 +477,15 @@ class BaseClockTable {
RelaxedAtomic<uint64_t> clock_pointer_{};

// Counter for number of times we yield to wait on another thread.
// It is normal for this to occur rarely in normal operation.
// (Relaxed: a simple stat counter.)
RelaxedAtomic<uint64_t> yield_count_{};

// Counter for number of times eviction effort cap is exceeded.
// It is normal for this to occur rarely in normal operation.
// (Relaxed: a simple stat counter.)
RelaxedAtomic<uint64_t> eviction_effort_exceeded_count_{};

// TODO: is this separation needed if we don't do background evictions?
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
Expand Down Expand Up @@ -517,17 +539,19 @@ class FixedHyperClockTable : public BaseClockTable {
inline void SetStandalone() { standalone = true; }
}; // struct HandleImpl

struct Opts {
explicit Opts(size_t _estimated_value_size)
: estimated_value_size(_estimated_value_size) {}
explicit Opts(const HyperClockCacheOptions& opts) {
  // Construction options for FixedHyperClockTable, extending BaseOpts
  // with the fixed table's per-entry size estimate.
  struct Opts : public BaseOpts {
    explicit Opts(size_t _estimated_value_size, int _eviction_effort_cap)
        : BaseOpts(_eviction_effort_cap),
          estimated_value_size(_estimated_value_size) {}
    explicit Opts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {
      // Fixed table requires a positive per-entry charge estimate
      // (zero selects the auto table elsewhere)
      assert(opts.estimated_entry_charge > 0);
      estimated_value_size = opts.estimated_entry_charge;
    }
    size_t estimated_value_size;
  };

FixedHyperClockTable(size_t capacity, bool strict_capacity_limit,
FixedHyperClockTable(size_t capacity,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
Expand All @@ -549,7 +573,8 @@ class FixedHyperClockTable : public BaseClockTable {
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
uint32_t eviction_effort_cap);

HandleImpl* Lookup(const UniqueId64x2& hashed_key);

Expand Down Expand Up @@ -803,18 +828,20 @@ class AutoHyperClockTable : public BaseClockTable {
}
}; // struct HandleImpl

struct Opts {
explicit Opts(size_t _min_avg_value_size)
: min_avg_value_size(_min_avg_value_size) {}
  // Construction options for AutoHyperClockTable, extending BaseOpts
  // with the auto table's minimum average value size.
  struct Opts : public BaseOpts {
    explicit Opts(size_t _min_avg_value_size, int _eviction_effort_cap)
        : BaseOpts(_eviction_effort_cap),
          min_avg_value_size(_min_avg_value_size) {}

    explicit Opts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {
      // Auto table is selected by estimated_entry_charge == 0
      assert(opts.estimated_entry_charge == 0);
      min_avg_value_size = opts.min_avg_entry_charge;
    }
    size_t min_avg_value_size;
  };

AutoHyperClockTable(size_t capacity, bool strict_capacity_limit,
AutoHyperClockTable(size_t capacity,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
Expand All @@ -841,7 +868,8 @@ class AutoHyperClockTable : public BaseClockTable {
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
uint32_t eviction_effort_cap);

HandleImpl* Lookup(const UniqueId64x2& hashed_key);

Expand Down Expand Up @@ -906,7 +934,8 @@ class AutoHyperClockTable : public BaseClockTable {
// with proper handling to ensure all existing data is seen even in the
// presence of concurrent insertions, etc. (See implementation.)
template <class OpData>
void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX);
void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX,
EvictionData* data = nullptr);

// An RAII wrapper for locking a chain of entries for removals. See
// implementation.
Expand All @@ -916,7 +945,7 @@ class AutoHyperClockTable : public BaseClockTable {
// implementation.
template <class OpData>
void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock,
size_t home);
size_t home, EvictionData* data);

// Update length_info_ as much as possible without waiting, given a known
// usable (ready for inserts and lookups) grow_home. (Previous grow_homes
Expand Down Expand Up @@ -1078,9 +1107,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
// (Relaxed: eventual consistency/update is OK)
RelaxedAtomic<size_t> capacity_;

// Whether to reject insertion if cache reaches its full capacity.
// Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
// (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
// (Relaxed: eventual consistency/update is OK)
RelaxedAtomic<bool> strict_capacity_limit_;
RelaxedAtomic<uint32_t> eec_and_scl_;
}; // class ClockCacheShard

template <class Table>
Expand Down
2 changes: 2 additions & 0 deletions cache/compressed_secondary_cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,8 @@ class CompressedSecCacheTestWithTiered
/*_capacity=*/0,
/*_estimated_entry_charge=*/256 << 10,
/*_num_shard_bits=*/0);
// eviction_effort_cap setting simply to avoid churn in existing test
hcc_opts.eviction_effort_cap = 100;
TieredCacheOptions opts;
lru_opts.capacity = 0;
lru_opts.num_shard_bits = 0;
Expand Down
60 changes: 58 additions & 2 deletions cache/lru_cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,13 @@ class ClockCacheTest : public testing::Test {
}
}

void NewShard(size_t capacity, bool strict_capacity_limit = true) {
void NewShard(size_t capacity, bool strict_capacity_limit = true,
int eviction_effort_cap = 30) {
DeleteShard();
shard_ =
reinterpret_cast<Shard*>(port::cacheline_aligned_alloc(sizeof(Shard)));

TableOpts opts{1 /*value_size*/};
TableOpts opts{1 /*value_size*/, eviction_effort_cap};
new (shard_)
Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata,
/*allocator*/ nullptr, &eviction_callback_, &hash_seed_, opts);
Expand Down Expand Up @@ -445,12 +446,20 @@ class ClockCacheTest : public testing::Test {
return Slice(reinterpret_cast<const char*>(&hashed_key), 16U);
}

// A bad hash function for testing / stressing collision handling
static inline UniqueId64x2 TestHashedKey(char key) {
// For testing hash near-collision behavior, put the variance in
// hashed_key in bits that are unlikely to be used as hash bits.
return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};
}

// A reasonable hash function, for testing "typical behavior" etc.
template <typename T>
static inline UniqueId64x2 CheapHash(T i) {
return {static_cast<uint64_t>(i) * uint64_t{0x85EBCA77C2B2AE63},
static_cast<uint64_t>(i) * uint64_t{0xC2B2AE3D27D4EB4F}};
}

Shard* shard_ = nullptr;

private:
Expand Down Expand Up @@ -683,6 +692,53 @@ TYPED_TEST(ClockCacheTest, ClockEvictionTest) {
}
}

// Verifies that eviction_effort_cap bounds how far table occupancy can grow
// beyond capacity when most of the cache is pinned, and that
// strict_capacity_limit=true keeps occupancy exactly at capacity.
TYPED_TEST(ClockCacheTest, ClockEvictionEffortCapTest) {
  using HandleImpl = typename ClockCacheTest<TypeParam>::Shard::HandleImpl;
  for (bool strict_capacity_limit : {true, false}) {
    SCOPED_TRACE("strict_capacity_limit = " +
                 std::to_string(strict_capacity_limit));
    // Covers below-minimum values (presumably clamped internally — confirm
    // against clock_cache.cc), the minimum, and large caps
    for (int eec : {-42, 0, 1, 10, 100, 1000}) {
      SCOPED_TRACE("eviction_effort_cap = " + std::to_string(eec));
      constexpr size_t kCapacity = 1000;
      // Start with much larger capacity to ensure that we can go way over
      // capacity without reaching table occupancy limit.
      this->NewShard(3 * kCapacity, strict_capacity_limit, eec);
      auto& shard = *this->shard_;
      shard.SetCapacity(kCapacity);

      // Nearly fill the cache with pinned entries, then add a bunch of
      // non-pinned entries. eviction_effort_cap should affect how many
      // evictable entries are present beyond the cache capacity, despite
      // being evictable.
      constexpr size_t kCount = kCapacity - 1;
      std::unique_ptr<HandleImpl* []> ha { new HandleImpl* [kCount] {} };
      for (size_t i = 0; i < 2 * kCount; ++i) {
        UniqueId64x2 hkey = this->CheapHash(i);
        // First kCount inserts keep a handle (pinned); the remaining
        // kCount release immediately, so they are evictable.
        ASSERT_OK(shard.Insert(
            this->TestKey(hkey), hkey, nullptr /*value*/, &kNoopCacheItemHelper,
            1 /*charge*/, i < kCount ? &ha[i] : nullptr, Cache::Priority::LOW));
      }

      if (strict_capacity_limit) {
        // If strict_capacity_limit is enabled, the cache will never exceed its
        // capacity
        EXPECT_EQ(shard.GetOccupancyCount(), kCapacity);
      } else {
        // Rough inverse relationship between cap and possible memory
        // explosion, which shows up as increased table occupancy count.
        // NOTE(review): the max(1, eec) + 1 and the 1.4/0.6 factors are
        // empirically tuned bounds — confirm against implementation if the
        // clamping behavior of eviction_effort_cap ever changes.
        int effective_eec = std::max(int{1}, eec) + 1;
        EXPECT_NEAR(shard.GetOccupancyCount() * 1.0,
                    kCount * (1 + 1.4 / effective_eec),
                    kCount * (0.6 / effective_eec) + 1.0);
      }

      // Unpin the held handles so the shard can be torn down cleanly
      for (size_t i = 0; i < kCount; ++i) {
        shard.Release(ha[i]);
      }
    }
  }
}

namespace {
struct DeleteCounter {
int deleted = 0;
Expand Down
40 changes: 37 additions & 3 deletions include/rocksdb/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,6 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
// to find the appropriate balance automatically.
// * Cache priorities are less aggressively enforced, which could cause
// cache dilution from long range scans (unless they use fill_cache=false).
// * Can be worse for small caches, because if almost all of a cache shard is
// pinned (more likely with non-partitioned filters), then CLOCK eviction
// becomes very CPU intensive.
//
// See internal cache/clock_cache.h for full description.
struct HyperClockCacheOptions : public ShardedCacheOptions {
Expand Down Expand Up @@ -441,6 +438,43 @@ struct HyperClockCacheOptions : public ShardedCacheOptions {
// load factor for efficient Lookup, Insert, etc.
size_t min_avg_entry_charge = 450;

// A tuning parameter to cap eviction CPU usage in a "thrashing" situation
// by allowing the memory capacity to be exceeded slightly as needed. The
// default setting should offer balanced protection against excessive CPU
// and memory usage under extreme stress conditions, with no effect on
// normal operation. Such stress conditions are proportionally more likely
// with small caches (10s of MB or less) vs. large caches (GB-scale).
// (NOTE: With the unusual setting of strict_capacity_limit=true, this
// parameter is ignored.)
//
// BACKGROUND: Without some kind of limiter, inserting into a CLOCK-based
// cache with no evictable entries (all "pinned") requires scanning the
// entire cache to determine that nothing can be evicted. (By contrast,
// LRU caches can determine no entries are evictable in O(1) time, but
// require more synchronization/coordination on that eviction metadata.)
// This aspect of a CLOCK cache can make a stressed situation worse by
// bogging down the CPU with repeated scans of the cache. And with
// strict_capacity_limit=false (normal setting), finding something evictable
// doesn't change the outcome of insertion: the entry is inserted anyway
// and the cache is allowed to exceed its target capacity if necessary.
//
// SOLUTION: Eviction is aborted upon seeing some number of pinned
// entries before evicting anything, or if the ratio of pinned to evicted
// is too high. This setting `eviction_effort_cap` essentially controls both
// that allowed initial number of pinned entries and the maximum allowed
// ratio. As the pinned size approaches the target cache capacity, roughly
// 1/eviction_effort_cap additional portion of the capacity might be kept
// in memory and evictable in order to keep CLOCK eviction reasonably
// performant. Under the default setting and high stress conditions, this
// memory overhead is around 3-5%. Under normal or even moderate stress
// conditions, the memory overhead is negligible to zero.
//
// A large value like 1000 offers some protection with essentially no
// memory overhead, while the minimum value of 1 could be useful for a
// small cache where roughly doubling in size under stress could be OK to
// keep operations very fast.
int eviction_effort_cap = 30;

HyperClockCacheOptions(
size_t _capacity, size_t _estimated_entry_charge,
int _num_shard_bits = -1, bool _strict_capacity_limit = false,
Expand Down
1 change: 1 addition & 0 deletions unreleased_history/behavior_changes/eviction_effort_cap.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
HyperClockCache now has built-in protection against excessive CPU consumption under the extreme stress condition of no (or very few) evictable cache entries, which can slightly increase memory usage under such conditions. New option `HyperClockCacheOptions::eviction_effort_cap` controls the space-time trade-off of the response. The default should be generally well-balanced, with no measurable effect on normal operation.
Loading